From 2d3c9f0d43758fbd3da8d4a1dc5d039545b39ef9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 8 Apr 2024 11:35:32 +0200 Subject: [PATCH 01/24] refactor(pageserver): use tokio::signal instead of spawn_blocking (#7332) It's just unnecessary to use spawn_blocking there, and with https://github.com/neondatabase/neon/pull/7331 , it will result in really just one executor thread when enabling one-runtime with current_thread executor. --- pageserver/src/bin/pageserver.rs | 66 +++++++++++++++----------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c80230d4d7..0903b206ff 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; +use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tracing::*; @@ -671,42 +672,37 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - { - use signal_hook::consts::*; - let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || { - let mut signals = - signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap(); - return signals - .forever() - .next() - .expect("forever() never returns None unless explicitly closed"); - }); - let signal = BACKGROUND_RUNTIME - .block_on(signal_handler) - .expect("join error"); - match signal { - SIGQUIT => { - info!("Got signal {signal}. Terminating in immediate shutdown mode",); - std::process::exit(111); - } - SIGINT | SIGTERM => { - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. - shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - &tenant_manager, - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); - unreachable!() - } - _ => unreachable!(), - } + { + BACKGROUND_RUNTIME.block_on(async move { + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + _ = sigint.recv() => { "SIGINT" }, + _ = sigterm.recv() => { "SIGTERM" }, + }; + + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + pageserver::shutdown_pageserver( + &tenant_manager, + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + ) + .await; + unreachable!() + }) } } From 47b705cffe0e13182ec41df8da518f310444c8d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 8 Apr 2024 14:59:08 +0200 Subject: [PATCH 02/24] Remove async_trait from CompactionDeltaLayer (#7342) Removes usage of async_trait from the `CompactionDeltaLayer` trait. Split off from #7301 Related earlier work: https://github.com/neondatabase/neon/pull/6305, https://github.com/neondatabase/neon/pull/6464, https://github.com/neondatabase/neon/pull/7303 --- Cargo.lock | 1 - pageserver/compaction/Cargo.toml | 1 - pageserver/compaction/src/helpers.rs | 2 +- pageserver/compaction/src/interface.rs | 7 ++----- pageserver/compaction/src/simulator.rs | 2 -- pageserver/src/tenant/timeline/compaction.rs | 2 -- 6 files changed, 3 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dae406e4ae..67054cf2c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3616,7 +3616,6 @@ dependencies = [ "anyhow", "async-compression", "async-stream", - "async-trait", "byteorder", "bytes", "chrono", diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 47f318db63..0fd1d81845 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -11,7 +11,6 @@ default = [] anyhow.workspace = true async-compression.workspace = true async-stream.workspace = true -async-trait.workspace = true byteorder.workspace = true bytes.workspace = true chrono = { workspace = true, features = ["serde"] } diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 22a410b4af..9de6363d6e 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -180,7 +180,7 @@ where match top.deref_mut() { LazyLoadLayer::Unloaded(ref mut l) => { let fut = l.load_keys(this.ctx); - this.load_future.set(Some(fut)); + this.load_future.set(Some(Box::pin(fut))); continue; } LazyLoadLayer::Loaded(ref mut entries) => { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 2bb2e749c0..5dc62e506f 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -3,7 +3,6 @@ //! //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. -use async_trait::async_trait; use futures::Future; use pageserver_api::{key::Key, keyspace::key_range_size}; use std::ops::Range; @@ -141,18 +140,16 @@ pub trait CompactionLayer { fn is_delta(&self) -> bool; } - -#[async_trait] pub trait CompactionDeltaLayer: CompactionLayer { type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> where Self: 'a; /// Return all keys in this delta layer. - async fn load_keys<'a>( + fn load_keys<'a>( &self, ctx: &E::RequestContext, - ) -> anyhow::Result>>; + ) -> impl Future>>> + Send; } pub trait CompactionImageLayer: CompactionLayer {} diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index def7983e75..6c00df3a65 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -2,7 +2,6 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; -use async_trait::async_trait; use futures::StreamExt; use rand::Rng; use tracing::info; @@ -139,7 +138,6 @@ impl interface::CompactionLayer for Arc { } } -#[async_trait] impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ab001bf10d..8075775bbc 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -12,7 +12,6 @@ use super::layer_manager::LayerManager; use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; -use async_trait::async_trait; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; @@ -1122,7 +1121,6 @@ impl CompactionLayer for ResidentDeltaLayer { } } -#[async_trait] impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; From 1081a4d2462d324961604b9114def1efea096f44 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 8 Apr 2024 16:27:08 +0200 Subject: [PATCH 03/24] pageserver: option to run with just one tokio runtime (#7331) This PR is an off-by-default revision v2 of the (since-reverted) PR #6555 / commit `3220f830b7fbb785d6db8a93775f46314f10a99b`. See that PR for details on why running with a single runtime is desirable and why we should be ready. We reverted #6555 because it showed regressions in prodlike cloudbench, see the revert commit message `ad072de4209193fd21314cf7f03f14df4fa55eb1` for more context. This PR makes it an opt-in choice via an env var. The default is to use the 4 separate runtimes that we have today, there shouldn't be any performance change. I tested manually that the env var & added metric works. ``` # undefined env var => no change to before this PR, uses 4 runtimes ./target/debug/neon_local start # defining the env var enables one-runtime mode, value defines that one runtime's configuration NEON_PAGESERVER_USE_ONE_RUNTIME=current_thread ./target/debug/neon_local start NEON_PAGESERVER_USE_ONE_RUNTIME=multi_thread:1 ./target/debug/neon_local start NEON_PAGESERVER_USE_ONE_RUNTIME=multi_thread:2 ./target/debug/neon_local start NEON_PAGESERVER_USE_ONE_RUNTIME=multi_thread:default ./target/debug/neon_local start ``` I want to use this change to do more manualy testing and potentially testing in staging. Future Work ----------- Testing / deployment ergonomics would be better if this were a variable in `pageserver.toml`. It can be done, but, I don't need it right now, so let's stick with the env var. --- control_plane/src/background_process.rs | 14 ++- libs/utils/src/env.rs | 21 ++++ libs/utils/src/lib.rs | 2 + pageserver/src/metrics.rs | 21 ++++ pageserver/src/task_mgr.rs | 149 +++++++++++++++++------- pageserver/src/tenant/tasks.rs | 3 +- 6 files changed, 169 insertions(+), 41 deletions(-) create mode 100644 libs/utils/src/env.rs diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 2fced7d778..94666f2870 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -86,7 +86,10 @@ where .stdout(process_log_file) .stderr(same_file_for_stderr) .args(args); - let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); + + let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( + fill_rust_env_vars(background_command), + )); filled_cmd.envs(envs); let pid_file_to_check = match &initial_pid_file { @@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { + for (var, val) in std::env::vars() { + if var.starts_with("NEON_PAGESERVER_") { + cmd = cmd.env(var, val); + } + } + cmd +} + /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs new file mode 100644 index 0000000000..b3e326bfd0 --- /dev/null +++ b/libs/utils/src/env.rs @@ -0,0 +1,21 @@ +//! Wrapper around `std::env::var` for parsing environment variables. + +use std::{fmt::Display, str::FromStr}; + +pub fn var(varname: &str) -> Option +where + V: FromStr, + E: Display, +{ + match std::env::var(varname) { + Ok(s) => Some( + s.parse() + .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .unwrap(), + ), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 04ce0626c8..cd5075613e 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -89,6 +89,8 @@ pub mod yielding_loop; pub mod zstd; +pub mod env; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ab9a2e8509..3160f204e2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2100,6 +2100,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { use futures::Future; use pin_project_lite::pin_project; use std::collections::HashMap; +use std::num::NonZeroUsize; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; @@ -2669,6 +2670,26 @@ pub(crate) mod disk_usage_based_eviction { pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); } +static TOKIO_EXECUTOR_THREAD_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tokio_executor_thread_configured_count", + "Total number of configued tokio executor threads in the process. + The `setup` label denotes whether we're running with multiple runtimes or a single runtime.", + &["setup"], + ) + .unwrap() +}); + +pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { + static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let _guard = SERIALIZE.lock().unwrap(); + TOKIO_EXECUTOR_THREAD_COUNT.reset(); + TOKIO_EXECUTOR_THREAD_COUNT + .get_metric_with_label_values(&[setup]) + .unwrap() + .set(u64::try_from(num_threads.get()).unwrap()); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 0cc5611a12..9a1e354ecf 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -33,13 +33,14 @@ use std::collections::HashMap; use std::fmt; use std::future::Future; +use std::num::NonZeroUsize; use std::panic::AssertUnwindSafe; +use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; use pageserver_api::shard::TenantShardId; -use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; @@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; +use utils::env; use utils::id::TimelineId; +use crate::metrics::set_tokio_runtime_setup; + // // There are four runtimes: // @@ -98,52 +102,119 @@ use utils::id::TimelineId; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. // -pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") - // if you change the number of worker threads please change the constant below - .enable_all() - .build() - .expect("Failed to create background op runtime") -}); - -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { - // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); +pub(crate) static TOKIO_WORKER_THREADS: Lazy = Lazy::new(|| { // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // // this will be wrong if any of the runtimes gets their worker threads configured to something // else, but that has not been needed in a long time. - std::env::var("TOKIO_WORKER_THREADS") - .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) + NonZeroUsize::new( + std::env::var("TOKIO_WORKER_THREADS") + .map(|s| s.parse::().unwrap()) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())), + ) + .expect("the max() ensures that this is not zero") }); +enum TokioRuntimeMode { + SingleThreaded, + MultiThreaded { num_workers: NonZeroUsize }, +} + +impl FromStr for TokioRuntimeMode { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "current_thread" => Ok(TokioRuntimeMode::SingleThreaded), + s => match s.strip_prefix("multi_thread:") { + Some("default") => Ok(TokioRuntimeMode::MultiThreaded { + num_workers: *TOKIO_WORKER_THREADS, + }), + Some(suffix) => { + let num_workers = suffix.parse::().map_err(|e| { + format!( + "invalid number of multi-threaded runtime workers ({suffix:?}): {e}", + ) + })?; + Ok(TokioRuntimeMode::MultiThreaded { num_workers }) + } + None => Err(format!("invalid runtime config: {s:?}")), + }, + } + } +} + +static ONE_RUNTIME: Lazy> = Lazy::new(|| { + let thread_name = "pageserver-tokio"; + let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { + // If the env var is not set, leave this static as None. + set_tokio_runtime_setup( + "multiple-runtimes", + NUM_MULTIPLE_RUNTIMES + .checked_mul(*TOKIO_WORKER_THREADS) + .unwrap(), + ); + return None; + }; + Some(match mode { + TokioRuntimeMode::SingleThreaded => { + set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap()); + tokio::runtime::Builder::new_current_thread() + .thread_name(thread_name) + .enable_all() + .build() + .expect("failed to create one single runtime") + } + TokioRuntimeMode::MultiThreaded { num_workers } => { + set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers); + tokio::runtime::Builder::new_multi_thread() + .thread_name(thread_name) + .enable_all() + .worker_threads(num_workers.get()) + .build() + .expect("failed to create one multi-threaded runtime") + } + }) +}); + +/// Declare a lazy static variable named `$varname` that will resolve +/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME` +/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation +/// declares a separate runtime and the lazy static variable `$varname` +/// will resolve to that separate runtime. +/// +/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if +/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime +/// otherwise. +macro_rules! pageserver_runtime { + ($varname:ident, $name:literal) => { + pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| { + if let Some(runtime) = &*ONE_RUNTIME { + return runtime; + } + static RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name($name) + .worker_threads(TOKIO_WORKER_THREADS.get()) + .enable_all() + .build() + .expect(std::concat!("Failed to create runtime ", $name)) + }); + &*RUNTIME + }); + }; +} + +pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker"); +pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); +pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); +pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); +// Bump this number when adding a new pageserver_runtime! +// SAFETY: it's obviously correct +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; + #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e4f5f75132..74ed677ffe 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -18,7 +18,7 @@ use utils::{backoff, completion}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -72,6 +72,7 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation ); + // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { Ok(permit) => permit, Err(_closed) => unreachable!("we never close the semaphore"), From a306d0a54b0e579556893c0344a27664e39e54a1 Mon Sep 17 00:00:00 2001 From: Kevin Mingtarja <69668484+kevinmingtarja@users.noreply.github.com> Date: Mon, 8 Apr 2024 22:53:07 +0800 Subject: [PATCH 04/24] implement Serialize/Deserialize for SystemTime with RFC3339 format (#7203) ## Problem We have two places that use a helper (`ser_rfc3339_millis`) to get serde to stringify SystemTimes into the desired format. ## Summary of changes Created a new module `utils::serde_system_time` and inside it a wrapper type `SystemTime` for `std::time::SystemTime` that serializes/deserializes to the RFC3339 format. This new type is then used in the two places that were previously using the helper for serialization, thereby eliminating the need to decorate structs. Closes #7151. --- Cargo.lock | 1 + libs/pageserver_api/src/models.rs | 30 +--------- libs/pageserver_api/src/models/utilization.rs | 25 ++------- libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 1 + libs/utils/src/serde_system_time.rs | 55 +++++++++++++++++++ pageserver/src/tenant/secondary/downloader.rs | 4 +- pageserver/src/utilization.rs | 2 +- 8 files changed, 67 insertions(+), 52 deletions(-) create mode 100644 libs/utils/src/serde_system_time.rs diff --git a/Cargo.lock b/Cargo.lock index 67054cf2c7..66ff3dedb7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6569,6 +6569,7 @@ dependencies = [ "heapless", "hex", "hex-literal", + "humantime", "hyper", "jsonwebtoken", "leaky-bucket", diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ad4ca6710d..b4909f247f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -20,6 +20,7 @@ use utils::{ history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, + serde_system_time, }; use crate::controller_api::PlacementPolicy; @@ -758,11 +759,7 @@ pub struct WalRedoManagerStatus { #[derive(Default, Debug, Serialize, Deserialize, Clone)] pub struct SecondaryProgress { /// The remote storage LastModified time of the heatmap object we last downloaded. - #[serde( - serialize_with = "opt_ser_rfc3339_millis", - deserialize_with = "opt_deser_rfc3339_millis" - )] - pub heatmap_mtime: Option, + pub heatmap_mtime: Option, /// The number of layers currently on-disk pub layers_downloaded: usize, @@ -775,29 +772,6 @@ pub struct SecondaryProgress { pub bytes_total: u64, } -fn opt_ser_rfc3339_millis( - ts: &Option, - serializer: S, -) -> Result { - match ts { - Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)), - None => serializer.serialize_none(), - } -} - -fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, -{ - let s: Option = serde::de::Deserialize::deserialize(deserializer)?; - match s { - None => Ok(None), - Some(s) => humantime::parse_rfc3339(&s) - .map_err(serde::de::Error::custom) - .map(Some), - } -} - pub mod virtual_file { #[derive( Copy, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index f5984dff5d..e88cab5d6a 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,4 +1,4 @@ -use std::time::SystemTime; +use utils::serde_system_time::SystemTime; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -21,28 +21,9 @@ pub struct PageserverUtilization { /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - #[serde( - serialize_with = "ser_rfc3339_millis", - deserialize_with = "deser_rfc3339_millis" - )] pub captured_at: SystemTime, } -fn ser_rfc3339_millis( - ts: &SystemTime, - serializer: S, -) -> Result { - serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) -} - -fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result -where - D: serde::de::Deserializer<'de>, -{ - let s: String = serde::de::Deserialize::deserialize(deserializer)?; - humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) -} - /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// /// Instead of newtype, use this because a newtype would get require handling deserializing values @@ -69,7 +50,9 @@ mod tests { disk_usage_bytes: u64::MAX, free_space_bytes: 0, utilization_score: u64::MAX, - captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + captured_at: SystemTime( + std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + ), }; let s = serde_json::to_string(&doc).unwrap(); diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index c2d9d9d396..a6a081c5c1 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -22,6 +22,7 @@ camino.workspace = true chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } +humantime.workspace = true hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index cd5075613e..b09350d11e 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -63,6 +63,7 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod serde_system_time; pub mod pageserver_feedback; diff --git a/libs/utils/src/serde_system_time.rs b/libs/utils/src/serde_system_time.rs new file mode 100644 index 0000000000..b0f6934e87 --- /dev/null +++ b/libs/utils/src/serde_system_time.rs @@ -0,0 +1,55 @@ +//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision. + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct SystemTime( + #[serde( + deserialize_with = "deser_rfc3339_millis", + serialize_with = "ser_rfc3339_millis" + )] + pub std::time::SystemTime, +); + +fn ser_rfc3339_millis( + ts: &std::time::SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds. + fn to_millisecond_precision(time: SystemTime) -> SystemTime { + match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) { + Ok(duration) => { + let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis()); + SystemTime( + std::time::SystemTime::UNIX_EPOCH + + std::time::Duration::from_millis(total_millis), + ) + } + Err(_) => time, + } + } + + #[test] + fn test_serialize_deserialize() { + let input = SystemTime(std::time::SystemTime::now()); + let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0)); + let serialized = serde_json::to_string(&input).unwrap(); + assert_eq!(expected_serialized, serialized); + let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap(); + assert_eq!(to_millisecond_precision(input), deserialized); + } +} diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 530e1a3244..5b29c126d1 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, + id::TimelineId, serde_system_time, }; use super::{ @@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> { let mut progress = SecondaryProgress { layers_total: heatmap_stats.layers, bytes_total: heatmap_stats.bytes, - heatmap_mtime: Some(heatmap_mtime), + heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)), layers_downloaded: 0, bytes_downloaded: 0, }; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 5eccf185ac..e6c835aa75 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -41,7 +41,7 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result Date: Mon, 8 Apr 2024 19:01:41 +0100 Subject: [PATCH 05/24] update measured with some more convenient features (#7334) ## Problem Some awkwardness in the measured API. Missing process metrics. ## Summary of changes Update measured to use the new convenience setup features. Added measured-process lib. Added measured support for libmetrics --- Cargo.lock | 175 +++++++++++++++++++++----- Cargo.toml | 3 +- libs/metrics/Cargo.toml | 2 + libs/metrics/src/lib.rs | 146 ++++++++++++++++++++- storage_controller/src/http.rs | 18 ++- storage_controller/src/main.rs | 8 +- storage_controller/src/metrics.rs | 120 +++++------------- storage_controller/src/persistence.rs | 6 +- workspace_hack/Cargo.toml | 5 +- 9 files changed, 345 insertions(+), 138 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 66ff3dedb7..a7e29b1de3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1124,7 +1124,7 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -1462,12 +1462,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -1840,23 +1837,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2294,6 +2280,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -2794,6 +2786,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "lock_api" version = "0.4.10" @@ -2848,11 +2846,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.13" +version = "0.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452" dependencies = [ "bytes", + "crossbeam-utils", "hashbrown 0.14.0", "itoa", "lasso", @@ -2865,16 +2864,27 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.13" +version = "0.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.52", ] +[[package]] +name = "measured-process" +version = "0.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2" +dependencies = [ + "libc", + "measured", + "procfs 0.16.0", +] + [[package]] name = "memchr" version = "2.6.4" @@ -2914,8 +2924,10 @@ version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", - "procfs", + "procfs 0.14.2", "prometheus", "rand 0.8.5", "rand_distr", @@ -3525,7 +3537,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs", + "procfs 0.14.2", "rand 0.8.5", "regex", "remote_storage", @@ -4085,6 +4097,29 @@ dependencies = [ "rustix 0.36.16", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.1", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.28", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -4097,7 +4132,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs", + "procfs 0.14.2", "thiserror", ] @@ -4118,7 +4153,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -4810,6 +4845,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", +] + [[package]] name = "rustls" version = "0.21.9" @@ -5670,7 +5718,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -6930,6 +6978,15 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6960,6 +7017,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6972,6 +7044,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6984,6 +7062,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6996,6 +7080,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -7008,6 +7098,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -7020,6 +7116,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -7032,6 +7134,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -7044,6 +7152,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -7092,7 +7206,6 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", diff --git a/Cargo.toml b/Cargo.toml index 3c6077648e..5db6b7016a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -106,7 +106,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.13", features=["default", "lasso"] } +measured = { version = "0.0.20", features=["lasso"] } +measured-process = { version = "0.0.20" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index f6a49a0166..0bd804051c 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -10,11 +10,13 @@ libc.workspace = true once_cell.workspace = true chrono.workspace = true twox-hash.workspace = true +measured.workspace = true workspace_hack.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true +measured-process.workspace = true [dev-dependencies] rand = "0.8" diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 22b0a18933..6cff28c0ca 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::{Encoding, MetricValue}, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,7 +35,6 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; @@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. -pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> { + enc.write_metric_value(name, labels, MetricValue::Int(x)) +} + +#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} + pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", @@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. @@ -117,7 +250,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); - const BYTES_IN_BLOCK: i64 = 512; DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); @@ -151,6 +283,7 @@ macro_rules! register_int_counter_pair_vec { } }}; } + /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { @@ -188,7 +321,10 @@ impl GenericCounterPairVec

{ /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, @@ -201,7 +337,7 @@ impl GenericCounterPairVec

{ self.get_metric_with_label_values(vals).unwrap() } - pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) { + pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { res[0] = self.inc.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals); } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c59bcaa174..2e83bbc5ed 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -8,6 +8,7 @@ use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; +use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -44,15 +45,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use routerify::Middleware; /// State available to HTTP request handlers -#[derive(Clone)] pub struct HttpState { service: Arc, auth: Option>, + neon_metrics: NeonMetrics, allowlist_routes: Vec, } impl HttpState { - pub fn new(service: Arc, auth: Option>) -> Self { + pub fn new( + service: Arc, + auth: Option>, + build_info: BuildInfo, + ) -> Self { let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) @@ -60,6 +65,7 @@ impl HttpState { Self { service, auth, + neon_metrics: NeonMetrics::new(build_info), allowlist_routes, } } @@ -672,10 +678,11 @@ fn epilogue_metrics_middleware }) } -pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; - let payload = crate::metrics::METRICS_REGISTRY.encode(); + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() .status(200) .header(CONTENT_TYPE, TEXT_FORMAT) @@ -704,6 +711,7 @@ where pub fn make_router( service: Arc, auth: Option>, + build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() .middleware(prologue_metrics_middleware()) @@ -720,7 +728,7 @@ pub fn make_router( } router - .data(Arc::new(HttpState::new(service, auth))) + .data(Arc::new(HttpState::new(service, auth, build_info))) .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 3c03d6efe8..6466b9f7a3 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -3,6 +3,7 @@ use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; +use metrics::BuildInfo; use std::sync::Arc; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; @@ -192,6 +193,11 @@ async fn async_main() -> anyhow::Result<()> { args.listen ); + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + let strict_mode = if args.dev { StrictMode::Dev } else { @@ -253,7 +259,7 @@ async fn async_main() -> anyhow::Result<()> { let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); - let router = make_router(service.clone(), auth) + let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; let router_service = utils::http::RouterService::new(router).unwrap(); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index cabf416b9f..ac9f22c739 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -8,10 +8,8 @@ //! The rest of the code defines label group types and deals with converting outer types to labels. //! use bytes::Bytes; -use measured::{ - label::{LabelValue, StaticLabelSet}, - FixedCardinalityLabel, MetricGroup, -}; +use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; @@ -26,13 +24,15 @@ pub fn preinitialize_metrics() { pub(crate) struct StorageControllerMetrics { pub(crate) metrics_group: StorageControllerMetricGroup, - encoder: Mutex, + encoder: Mutex, } #[derive(measured::MetricGroup)] +#[metric(new())] pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, @@ -43,7 +43,9 @@ pub(crate) struct StorageControllerMetricGroup { /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, + /// HTTP request handler latency across all status codes + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_http_request_latency: measured::HistogramVec, @@ -55,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, @@ -66,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_passthrough_request_latency: measured::HistogramVec, @@ -74,76 +78,34 @@ pub(crate) struct StorageControllerMetricGroup { measured::CounterVec, /// Latency of database queries, broken down by operation. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, } impl StorageControllerMetrics { - pub(crate) fn encode(&self) -> Bytes { + pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { let mut encoder = self.encoder.lock().unwrap(); - self.metrics_group.collect_into(&mut *encoder); + neon_metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + self.metrics_group + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); encoder.finish() } } impl Default for StorageControllerMetrics { fn default() -> Self { - Self { - metrics_group: StorageControllerMetricGroup::new(), - encoder: Mutex::new(measured::text::TextEncoder::new()), - } - } -} + let mut metrics_group = StorageControllerMetricGroup::new(); + metrics_group + .storage_controller_reconcile_complete + .init_all_dense(); -impl StorageControllerMetricGroup { - pub(crate) fn new() -> Self { Self { - storage_controller_reconcile_spawn: measured::Counter::new(), - storage_controller_reconcile_complete: measured::CounterVec::new( - ReconcileCompleteLabelGroupSet { - status: StaticLabelSet::new(), - }, - ), - storage_controller_schedule_optimization: measured::Counter::new(), - storage_controller_http_request_status: measured::CounterVec::new( - HttpRequestStatusLabelGroupSet { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_pageserver_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_pageserver_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_passthrough_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_passthrough_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_database_query_error: measured::CounterVec::new( - DatabaseQueryErrorLabelGroupSet { - operation: StaticLabelSet::new(), - error_type: StaticLabelSet::new(), - }, - ), - storage_controller_database_query_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), + metrics_group, + encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), } } } @@ -157,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup { #[derive(measured::LabelGroup)] #[label(set = HttpRequestStatusLabelGroupSet)] pub(crate) struct HttpRequestStatusLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, pub(crate) status: StatusCode, @@ -166,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> { #[derive(measured::LabelGroup)] #[label(set = HttpRequestLatencyLabelGroupSet)] pub(crate) struct HttpRequestLatencyLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for HttpRequestLatencyLabelGroupSet { - fn default() -> Self { - Self { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup, Clone)] #[label(set = PageserverRequestLabelGroupSet)] pub(crate) struct PageserverRequestLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) pageserver_id: &'a str, - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for PageserverRequestLabelGroupSet { - fn default() -> Self { - Self { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup)] #[label(set = DatabaseQueryErrorLabelGroupSet)] pub(crate) struct DatabaseQueryErrorLabelGroup { @@ -213,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] Success, @@ -221,7 +164,7 @@ pub(crate) enum ReconcileOutcome { Cancel, } -#[derive(FixedCardinalityLabel, Clone)] +#[derive(FixedCardinalityLabel, Copy, Clone)] pub(crate) enum Method { Get, Put, @@ -246,11 +189,12 @@ impl From for Method { } } +#[derive(Clone, Copy)] pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); impl LabelValue for StatusCode { fn visit(&self, v: V) -> V::Output { - v.write_int(self.0.as_u16() as u64) + v.write_int(self.0.as_u16() as i64) } } @@ -268,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode { } } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum DatabaseErrorLabel { Query, Connection, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 55fbfd10bc..5312e1e218 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -79,7 +79,7 @@ pub(crate) enum DatabaseError { Logical(String), } -#[derive(measured::FixedCardinalityLabel, Clone)] +#[derive(measured::FixedCardinalityLabel, Copy, Clone)] pub(crate) enum DatabaseOperation { InsertNode, UpdateNode, @@ -153,9 +153,7 @@ impl Persistence { let latency = &METRICS_REGISTRY .metrics_group .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { - operation: op.clone(), - }); + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let res = self.with_conn(func).await; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7b8228a082..bcbd4daa7e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -37,8 +37,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } -hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -91,7 +90,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } From bcab344490fbb68daf75c98900cdd8e20f6417d6 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 9 Apr 2024 10:50:43 +0100 Subject: [PATCH 06/24] CI(flaky-tests): remove outdated restriction (#7345) ## Problem After switching the default pageserver io-engine to `tokio-epoll-uring` on CI, we tuned a query that finds flaky tests (in https://github.com/neondatabase/neon/pull/7077). It has been almost a month since then, additional query tuning is not required anymore. ## Summary of changes - Remove extra condition from flaky tests query - Also return back parameterisation to the query --- scripts/flaky_tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 853c67d218..878840fcee 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '10' day - AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` + started_at > CURRENT_DATE - INTERVAL '%s' day AND ( (status IN ('failed', 'broken') AND reference = 'refs/heads/main') OR flaky From 4f4f787119c2a353da0a0691714256bec1f82b11 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 9 Apr 2024 12:03:46 +0100 Subject: [PATCH 07/24] Update staging hostname (#7347) ## Problem ``` Could not resolve host: console.stage.neon.tech ``` ## Summary of changes - replace `console.stage.neon.tech` with `console-stage.neon.build` --- .github/actions/neon-branch-create/action.yml | 2 +- .github/actions/neon-branch-delete/action.yml | 2 +- .github/actions/neon-project-create/action.yml | 2 +- .github/actions/neon-project-delete/action.yml | 2 +- scripts/sk_cleanup_tenants/script.py | 2 +- scripts/sk_collect_dumps/readme.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index f1eea34ab9..dea3fc2357 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index f8cd351dd9..8acba7ad00 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -13,7 +13,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ae6464990e..7f0e599b97 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -13,7 +13,7 @@ inputs: default: 15 api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build provisioner: desctiption: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index adc8510a34..b8ec6cac70 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index fa22433614..c20a4bb830 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str) args = parser.parse_args() access_key = os.getenv("CONSOLE_API_TOKEN") -endpoint: str = "https://console.stage.neon.tech/api" +endpoint: str = "https://console-stage.neon.build/api" trash_dir: Path = args.trash_dir dry_run: bool = args.dry_run diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 7494a6cb78..5ae55e058b 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -3,7 +3,7 @@ 3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` # staging: -AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check From dbac2d2c473f3648251f0a64e36d066f444dfe00 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 10 Apr 2024 02:40:14 +0200 Subject: [PATCH 08/24] Proxy read ids from redis (#7205) ## Problem Proxy doesn't know about existing endpoints. ## Summary of changes * Added caching of all available endpoints. * On the high load, use it before going to cplane. * Report metrics for the outcome. * For rate limiter and credentials caching don't distinguish between `-pooled` and not TODOs: * Make metrics more meaningful * Consider integrating it with the endpoint rate limiter * Test it together with cplane in preview --- proxy/src/auth/backend.rs | 4 +- proxy/src/bin/proxy.rs | 15 +- proxy/src/cache.rs | 1 + proxy/src/cache/endpoints.rs | 191 ++++++++++++++++++ proxy/src/config.rs | 69 +++++++ proxy/src/console/provider.rs | 22 +- proxy/src/console/provider/neon.rs | 20 +- proxy/src/context.rs | 15 +- proxy/src/intern.rs | 15 ++ proxy/src/lib.rs | 37 ++++ proxy/src/metrics.rs | 12 ++ proxy/src/proxy.rs | 4 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 10 +- proxy/src/redis/cancellation_publisher.rs | 6 +- .../regress/test_proxy_rate_limiter.py | 84 -------- 16 files changed, 393 insertions(+), 114 deletions(-) create mode 100644 proxy/src/cache/endpoints.rs delete mode 100644 test_runner/regress/test_proxy_rate_limiter.py diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index e421798067..71e9da18bc 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -27,7 +27,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -186,7 +186,7 @@ impl AuthenticationConfig { is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 56a3ef79cd..9302b31d5c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -189,7 +189,9 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -401,6 +403,7 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { + maintenance_tasks.spawn(api.locks.garbage_collect_worker()); if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( @@ -410,6 +413,9 @@ async fn main() -> anyhow::Result<()> { args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + let cache = api.caches.endpoints_cache.clone(); + let con = redis_notifications_client.clone(); + maintenance_tasks.spawn(async move { cache.do_read(con).await }); } } } @@ -489,14 +495,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -507,10 +517,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) + console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout, epoch) .unwrap(), )); - tokio::spawn(locks.garbage_collect_worker(epoch)); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..9bc019c2d8 --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,191 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::REDIS_BROKEN_MESSAGES, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, Normalize, +}; + +#[derive(Deserialize, Debug, Clone)] +#[serde(rename_all(deserialize = "snake_case"))] +pub enum ControlPlaneEventKey { + EndpointCreated, + BranchCreated, + ProjectCreated, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + // If cache is disabled, just collect the metrics and return. + if self.config.disable_cache { + ctx.set_rejected(self.should_reject(endpoint)); + return true; + } + // If the limiter allows, we don't need to check the cache. + if self.limiter.lock().await.check() { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + let endpoint = endpoint.normalize(); + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(&endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey, value: String) { + // Do not do normalization here, we expect the events to be normalized. + match key { + ControlPlaneEventKey::EndpointCreated => { + self.endpoints.insert(EndpointIdInt::from(&value.into())); + } + ControlPlaneEventKey::BranchCreated => { + self.branches.insert(BranchIdInt::from(&value.into())); + } + ControlPlaneEventKey::ProjectCreated => { + self.projects.insert(ProjectIdInt::from(&value.into())); + } + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + self.ready.store(false, Ordering::Release); + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + continue; + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + } + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.initial_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(key: &str, value: &Value) -> anyhow::Result<(ControlPlaneEventKey, String)> { + Ok((serde_json::from_str(key)?, String::from_redis_value(value)?)) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + + if return_when_finish && res.ids.len() <= self.config.default_batch_size { + break; + } + for x in res.ids { + total += 1; + for (k, v) in x.map { + let (key, value) = match Self::parse_key_value(&k, &v) { + Ok(x) => x, + Err(e) => { + REDIS_BROKEN_MESSAGES + .with_label_values(&[&self.config.stream_name]) + .inc(); + tracing::error!("error parsing key-value {k}-{v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key, value); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index fc490c7348..3bdfb3cfad 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,6 +313,75 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index f7d621fb12..ee2bc866ab 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,15 +8,15 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{sync::Arc, time::Duration}; +use std::{convert::Infallible, sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -416,12 +416,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -431,6 +434,7 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -441,6 +445,7 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, + epoch: std::time::Duration, registered: prometheus::IntCounter, unregistered: prometheus::IntCounter, reclamation_lag: prometheus::Histogram, @@ -453,6 +458,7 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, ) -> prometheus::Result { let registered = prometheus::IntCounter::with_opts( prometheus::Opts::new( @@ -497,6 +503,7 @@ impl ApiLocks { node_locks: DashMap::with_shard_amount(shards), permits, timeout, + epoch, lock_acquire_lag, registered, unregistered, @@ -536,12 +543,9 @@ impl ApiLocks { }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { - if self.permits == 0 { - return; - } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + pub async fn garbage_collect_worker(&self) -> anyhow::Result { + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 1a3e2ca795..3a0e5609d8 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,6 +8,7 @@ use super::{ }; use crate::{ auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, + Normalize, }; use crate::{ cache::Cached, @@ -23,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, jwt: String, } @@ -55,6 +56,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -81,7 +91,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -181,7 +193,7 @@ impl super::Api for Api { } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let ep_int = ep.normalize().into(); self.caches.project_info.insert_role_secret( project_id, ep_int, @@ -218,7 +230,7 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let ep_int = ep.normalize().into(); self.caches.project_info.insert_role_secret( project_id, ep_int, diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fec95f4722..85544f1d65 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,9 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + metrics::{ + bool_to_str, LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND, NUM_INVALID_ENDPOINTS, + }, DbName, EndpointId, RoleName, }; @@ -50,6 +52,8 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. + rejected: bool, } #[derive(Clone, Debug)] @@ -93,6 +97,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -113,6 +118,10 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = rejected; + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -178,6 +187,10 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { + let outcome = if self.success { "success" } else { "failure" }; + NUM_INVALID_ENDPOINTS + .with_label_values(&[self.protocol, bool_to_str(self.rejected), outcome]) + .inc(); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod tests { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..3f6d985fe8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,22 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// Endpoints are a bit tricky. Rare they might be branches or projects. +impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 59ee899c08..f299313e0a 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -169,6 +169,18 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { .unwrap() }); +pub static NUM_INVALID_ENDPOINTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_invalid_endpoints_total", + "Number of invalid endpoints (per protocol, per rejected).", + // http/ws/tcp, true/false, success/failure + // TODO(anna): the last dimension is just a proxy to what we actually want to measure. + // We need to measure whether the endpoint was found by cplane or not. + &["protocol", "rejected", "outcome"], + ) + .unwrap() +}); + pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..166e761a4e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use crate::{ proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, + EndpointCacheKey, Normalize, }; use futures::TryFutureExt; use itertools::Itertools; @@ -280,7 +280,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { + if !endpoint_rate_limiter.check(ep.normalize(), 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..a3b83e5e50 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..0503deb311 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -24,13 +24,13 @@ use super::{ RateLimiterConfig, }; -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +50,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. - time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] From 221414de4b0260056e0961528d46c5141825a0a0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 10 Apr 2024 06:31:28 +0100 Subject: [PATCH 09/24] pageserver: time based rolling based on the first write timestamp (#7346) Problem Currently, we base our time based layer rolling decision on the last time we froze a layer. This means that if we roll a layer and then go idle for longer than the checkpoint timeout the next layer will be rolled after the first write. This is of course not desirable. Summary of changes Record the timepoint of the first write to an open layer and use that for time based layer rolling decisions. Note that I had to keep `Timeline::last_freeze_ts` for the sharded tenant disk consistent lsn skip hack. Fixes #7241 --- .../tenant/storage_layer/inmemory_layer.rs | 8 +++ pageserver/src/tenant/timeline.rs | 29 ++++------- .../regress/test_pageserver_layer_rolling.py | 50 ++++++++++++++++--- 3 files changed, 62 insertions(+), 25 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 43942ba2db..29751641b4 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -19,6 +19,7 @@ use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use std::collections::{BinaryHeap, HashMap, HashSet}; use std::sync::{Arc, OnceLock}; +use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) @@ -53,6 +54,8 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + opened_at: Instant, + /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, @@ -460,6 +463,7 @@ impl InMemoryLayer { tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { index: HashMap::new(), file, @@ -520,6 +524,10 @@ impl InMemoryLayer { Ok(()) } + pub(crate) fn get_opened_at(&self) -> Instant { + self.opened_at + } + pub(crate) async fn tick(&self) -> Option { let mut inner = self.inner.write().await; let size = inner.file.len(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d3c8c5f66c..d046a60af4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1257,7 +1257,7 @@ impl Timeline { checkpoint_distance, self.get_last_record_lsn(), self.last_freeze_at.load(), - *self.last_freeze_ts.read().unwrap(), + open_layer.get_opened_at(), ) { match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { @@ -1622,7 +1622,7 @@ impl Timeline { checkpoint_distance: u64, projected_lsn: Lsn, last_freeze_at: Lsn, - last_freeze_ts: Instant, + opened_at: Instant, ) -> bool { let distance = projected_lsn.widening_sub(last_freeze_at); @@ -1648,13 +1648,13 @@ impl Timeline { ); true - } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - projected_lsn, - layer_size, - last_freeze_ts.elapsed() - ); + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); true } else { @@ -4703,23 +4703,16 @@ struct TimelineWriterState { max_lsn: Option, // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. cached_last_freeze_at: Lsn, - cached_last_freeze_ts: Instant, } impl TimelineWriterState { - fn new( - open_layer: Arc, - current_size: u64, - last_freeze_at: Lsn, - last_freeze_ts: Instant, - ) -> Self { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { Self { open_layer, current_size, prev_lsn: None, max_lsn: None, cached_last_freeze_at: last_freeze_at, - cached_last_freeze_ts: last_freeze_ts, } } } @@ -4818,12 +4811,10 @@ impl<'a> TimelineWriter<'a> { let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); self.write_guard.replace(TimelineWriterState::new( layer, initial_size, last_freeze_at, - last_freeze_ts, )); Ok(()) @@ -4870,7 +4861,7 @@ impl<'a> TimelineWriter<'a> { self.get_checkpoint_distance(), lsn, state.cached_last_freeze_at, - state.cached_last_freeze_ts, + state.open_layer.get_opened_at(), ) { OpenLayerAction::Roll } else { diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c7e1e88468..c5dc0f2919 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,6 +1,7 @@ import asyncio import os -from typing import Tuple +import time +from typing import Optional, Tuple import psutil import pytest @@ -20,20 +21,30 @@ ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 -async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + with env.endpoints.create_start("main", tenant_id=tenant) as ep: conn = await ep.connect_async() try: await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") await conn.execute( - f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" ) finally: await conn.close(timeout=10) last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return tenant, timeline, last_flush_lsn + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn async def workload( @@ -89,7 +100,9 @@ def assert_dirty_bytes(env, v): def assert_dirty_bytes_nonzero(env): - assert get_dirty_bytes(env) > 0 + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes @pytest.mark.parametrize("immediate_shutdown", [True, False]) @@ -182,6 +195,31 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): log.info("Waiting for background checkpoints...") wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. + + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + @pytest.mark.skipif( # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is From fd88d4608c3e8a8cb8579786a7b507a436033efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 10 Apr 2024 09:12:07 +0200 Subject: [PATCH 10/24] Add command to time travel recover prefixes (#7322) Adds another tool to the DR toolbox: ability in pagectl to recover arbitrary prefixes in remote storage. Requires remote storage config, the prefix, and the travel-to timestamp parameter to be specified as cli args. The done-if-after parameter is also supported. Example invocation (after `aws login --profile dev`): ``` RUST_LOG=remote_storage=debug AWS_PROFILE=dev cargo run -p pagectl time-travel-remote-prefix 'remote_storage = { bucket_name = "neon-test-bucket-name", bucket_region = "us-east-2" }' wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/ 2024-04-05T17:00:00Z ``` This has been written to resolve a customer recovery case: https://neondb.slack.com/archives/C033RQ5SPDH/p1712256888468009 There is validation of the prefix to prevent accidentially specifying too generic prefixes, which can cause corruption and data loss if used wrongly. Still, the validation is not perfect and it is important that the command is used with caution. If possible, `time_travel_remote_storage` should be used instead which has additional checks in place. --- Cargo.lock | 5 ++ pageserver/ctl/Cargo.toml | 5 ++ pageserver/ctl/src/main.rs | 166 ++++++++++++++++++++++++++++++++++++- 3 files changed, 175 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index a7e29b1de3..4c2bcf250e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3477,12 +3477,17 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", "tokio", + "tokio-util", + "toml_edit", "utils", "workspace_hack", ] diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index c5cd451e8d..843f5dd862 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -12,9 +12,14 @@ bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true +humantime.workspace = true pageserver = { path = ".." } +pageserver_api.workspace = true +remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true tokio.workspace = true +tokio-util.workspace = true +toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e73d961e36..1fb75584fc 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,6 +9,11 @@ mod index_part; mod layer_map_analyzer; mod layers; +use std::{ + str::FromStr, + time::{Duration, SystemTime}, +}; + use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; @@ -20,8 +25,16 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::TimelineId, + logging::{self, LogFormat, TracingErrorLayerEnablement}, + lsn::Lsn, + project_git_version, +}; project_git_version!(GIT_VERSION); @@ -43,6 +56,7 @@ enum Commands { #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), + TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] @@ -68,6 +82,26 @@ struct PrintLayerFileCmd { path: Utf8PathBuf, } +/// Roll back the time for the specified prefix using S3 history. +/// +/// The command is fairly low level and powerful. Validation is only very light, +/// so it is more powerful, and thus potentially more dangerous. +#[derive(Parser)] +struct TimeTravelRemotePrefixCmd { + /// A configuration string for the remote_storage configuration. + /// + /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` + config_toml_str: String, + /// remote prefix to time travel recover. For safety reasons, we require it to contain + /// a timeline or tenant ID in the prefix. + prefix: String, + /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. + travel_to: String, + /// Timestamp of the start of the operation, must be after any changes we want to roll back and after. + /// You can use a few seconds before invoking the command. Same format as `travel_to`. + done_if_after: Option, +} + #[derive(Parser)] struct AnalyzeLayerMapCmd { /// Pageserver data path @@ -78,6 +112,14 @@ struct AnalyzeLayerMapCmd { #[tokio::main] async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let cli = CliOpts::parse(); match cli.command { @@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> { print_layerfile(&cmd.path).await?; } } + Commands::TimeTravelRemotePrefix(cmd) => { + let timestamp = humantime::parse_rfc3339(&cmd.travel_to) + .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; + + let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { + humantime::parse_rfc3339(done_if_after).map_err(|_e| { + anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) + })? + } else { + const SAFETY_MARGIN: Duration = Duration::from_secs(3); + tokio::time::sleep(SAFETY_MARGIN).await; + // Convert to string representation and back to get rid of sub-second values + let done_if_after = SystemTime::now(); + tokio::time::sleep(SAFETY_MARGIN).await; + done_if_after + }; + + let timestamp = strip_subsecond(timestamp); + let done_if_after = strip_subsecond(done_if_after); + + let Some(prefix) = validate_prefix(&cmd.prefix) else { + println!("specified prefix '{}' failed validation", cmd.prefix); + return Ok(()); + }; + let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_item = toml_document + .get("remote_storage") + .expect("need remote_storage"); + let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let cancel = CancellationToken::new(); + storage + .unwrap() + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .await?; + } }; Ok(()) } @@ -185,3 +263,89 @@ fn handle_metadata( Ok(()) } + +/// Ensures that the given S3 prefix is sufficiently constrained. +/// The command is very risky already and we don't want to expose something +/// that allows usually unintentional and quite catastrophic time travel of +/// an entire bucket, which would be a major catastrophy and away +/// by only one character change (similar to "rm -r /home /username/foobar"). +fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} From c0ff4f18dcb60d2b8035a8d83b693e5e81ceaeff Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 10 Apr 2024 09:23:59 +0100 Subject: [PATCH 11/24] proxy: hyper1 for only proxy (#7073) ## Problem hyper1 offers control over the HTTP connection that hyper0_14 does not. We're blocked on switching all services to hyper1 because of how we use tonic, but no reason we can't switch proxy over. ## Summary of changes 1. hyper0.14 -> hyper1 1. self managed server 2. Remove the `WithConnectionGuard` wrapper from `protocol2` 2. Remove TLS listener as it's no longer necessary 3. include first session ID in connection startup logs --- Cargo.lock | 214 +++++++++++++---- Cargo.toml | 3 +- proxy/Cargo.toml | 4 + proxy/src/protocol2.rs | 105 +-------- proxy/src/serverless.rs | 315 ++++++++++++++------------ proxy/src/serverless/http_util.rs | 92 ++++++++ proxy/src/serverless/sql_over_http.rs | 44 ++-- proxy/src/serverless/tls_listener.rs | 123 ---------- workspace_hack/Cargo.toml | 3 +- 9 files changed, 458 insertions(+), 445 deletions(-) create mode 100644 proxy/src/serverless/http_util.rs delete mode 100644 proxy/src/serverless/tls_listener.rs diff --git a/Cargo.lock b/Cargo.lock index 4c2bcf250e..bdf2b08c5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,6 +270,12 @@ dependencies = [ "critical-section", ] +[[package]] +name = "atomic-take" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" + [[package]] name = "autocfg" version = "1.1.0" @@ -298,7 +304,7 @@ dependencies = [ "fastrand 2.0.0", "hex", "http 0.2.9", - "hyper", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", @@ -335,7 +341,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "http 0.2.9", - "http-body", + "http-body 0.4.5", "percent-encoding", "pin-project-lite", "tracing", @@ -386,7 +392,7 @@ dependencies = [ "aws-types", "bytes", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "regex-lite", @@ -514,7 +520,7 @@ dependencies = [ "crc32fast", "hex", "http 0.2.9", - "http-body", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -546,7 +552,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -585,10 +591,10 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "once_cell", "pin-project-lite", @@ -626,7 +632,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "itoa", "num-integer", "pin-project-lite", @@ -675,8 +681,8 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", "matchit", "memchr", @@ -691,7 +697,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite", + "tokio-tungstenite 0.20.0", "tower", "tower-layer", "tower-service", @@ -707,7 +713,7 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -1196,7 +1202,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "notify", "num_cpus", @@ -1313,7 +1319,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "once_cell", "pageserver_api", @@ -2199,6 +2205,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -2370,6 +2395,29 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", +] + [[package]] name = "http-types" version = "2.12.0" @@ -2428,9 +2476,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2442,6 +2490,26 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", +] + [[package]] name = "hyper-rustls" version = "0.24.0" @@ -2449,7 +2517,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "log", "rustls 0.21.9", "rustls-native-certs 0.6.2", @@ -2463,7 +2531,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -2476,7 +2544,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper", + "hyper 0.14.26", "native-tls", "tokio", "tokio-native-tls", @@ -2484,15 +2552,33 @@ dependencies = [ [[package]] name = "hyper-tungstenite" -version = "0.11.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" +checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" dependencies = [ - "hyper", + "http-body-util", + "hyper 1.2.0", + "hyper-util", "pin-project-lite", "tokio", - "tokio-tungstenite", - "tungstenite", + "tokio-tungstenite 0.21.0", + "tungstenite 0.21.0", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", + "pin-project-lite", + "socket2 0.5.5", + "tokio", ] [[package]] @@ -3523,7 +3609,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper", + "hyper 0.14.26", "itertools", "leaky-bucket", "md5", @@ -4202,6 +4288,7 @@ dependencies = [ "anyhow", "async-compression", "async-trait", + "atomic-take", "aws-config", "aws-sdk-iam", "aws-sigv4", @@ -4225,9 +4312,12 @@ dependencies = [ "hmac", "hostname", "http 1.1.0", + "http-body-util", "humantime", - "hyper", + "hyper 0.14.26", + "hyper 1.2.0", "hyper-tungstenite", + "hyper-util", "ipnet", "itertools", "lasso", @@ -4560,7 +4650,7 @@ dependencies = [ "futures-util", "http-types", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "metrics", "once_cell", @@ -4590,10 +4680,10 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "hyper-tls", "ipnet", @@ -4651,7 +4741,7 @@ dependencies = [ "futures", "getrandom 0.2.11", "http 0.2.9", - "hyper", + "hyper 0.14.26", "parking_lot 0.11.2", "reqwest", "reqwest-middleware", @@ -4738,7 +4828,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -5043,7 +5133,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5528,9 +5618,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" @@ -5622,7 +5712,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5653,7 +5743,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "lasso", "measured", @@ -5682,7 +5772,7 @@ dependencies = [ "anyhow", "clap", "comfy-table", - "hyper", + "hyper 0.14.26", "pageserver_api", "pageserver_client", "reqwest", @@ -6165,7 +6255,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.20.1", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.21.0", ] [[package]] @@ -6232,10 +6334,10 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", @@ -6421,7 +6523,7 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = [ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", @@ -6458,6 +6560,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "url", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -6623,7 +6744,7 @@ dependencies = [ "hex", "hex-literal", "humantime", - "hyper", + "hyper 0.14.26", "jsonwebtoken", "leaky-bucket", "metrics", @@ -7214,7 +7335,7 @@ dependencies = [ "hashbrown 0.14.0", "hex", "hmac", - "hyper", + "hyper 0.14.26", "indexmap 1.9.3", "itertools", "libc", @@ -7252,7 +7373,6 @@ dependencies = [ "tower", "tracing", "tracing-core", - "tungstenite", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 5db6b7016a..feea17ab05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } +atomic-take = "1.1.0" azure_core = "0.18" azure_identity = "0.18" azure_storage = "0.18" @@ -97,7 +98,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +hyper-tungstenite = "0.13.0" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b327890be2..12bd67ea36 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -12,6 +12,7 @@ testing = [] anyhow.workspace = true async-compression.workspace = true async-trait.workspace = true +atomic-take.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true @@ -36,6 +37,9 @@ http.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true +hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } +http-body-util = { version = "0.1" } ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 700c8c8681..70f9b4bfab 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -5,19 +5,13 @@ use std::{ io, net::SocketAddr, pin::{pin, Pin}, - sync::Mutex, task::{ready, Context, Poll}, }; use bytes::{Buf, BytesMut}; -use hyper::server::accept::Accept; -use hyper::server::conn::{AddrIncoming, AddrStream}; -use metrics::IntCounterPairGuard; +use hyper::server::conn::AddrIncoming; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -use uuid::Uuid; - -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, @@ -331,103 +325,6 @@ impl AsyncRead for WithClientIp { } } -impl Accept for ProxyProtocolAccept { - type Conn = WithConnectionGuard>; - - type Error = io::Error; - - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - - let conn_id = uuid::Uuid::new_v4(); - let span = tracing::info_span!("http_conn", ?conn_id); - { - let _enter = span.enter(); - tracing::info!("accepted new TCP connection"); - } - - let Some(conn) = conn else { - return Poll::Ready(None); - }; - - Poll::Ready(Some(Ok(WithConnectionGuard { - inner: WithClientIp::new(conn), - connection_id: Uuid::new_v4(), - gauge: Mutex::new(Some( - NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[self.protocol]) - .guard(), - )), - span, - }))) - } -} - -pin_project! { - pub struct WithConnectionGuard { - #[pin] - pub inner: T, - pub connection_id: Uuid, - pub gauge: Mutex>, - pub span: tracing::Span, - } - - impl PinnedDrop for WithConnectionGuard { - fn drop(this: Pin<&mut Self>) { - let _enter = this.span.enter(); - tracing::info!("HTTP connection closed") - } - } -} - -impl AsyncWrite for WithConnectionGuard { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - -impl AsyncRead for WithConnectionGuard { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - self.project().inner.poll_read(cx, buf) - } -} - #[cfg(test)] mod tests { use std::pin::pin; diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a2010fd613..f275caa7eb 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -4,42 +4,48 @@ mod backend; mod conn_pool; +mod http_util; mod json; mod sql_over_http; -pub mod tls_listener; mod websocket; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::timeout; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; -use tracing::instrument::Instrumented; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; +use crate::metrics::{NUM_CLIENT_CONNECTION_GAUGE, TLS_HANDSHAKE_FAILURES}; +use crate::protocol2::WithClientIp; +use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use hyper::{ - server::conn::{AddrIncoming, AddrStream}, - Body, Method, Request, Response, -}; +use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; +use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::Arc; -use std::task::Poll; -use tls_listener::TlsListener; -use tokio::net::TcpListener; -use tokio_util::sync::{CancellationToken, DropGuard}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use utils::http::{error::ApiError, json::json_response}; +use utils::http::error::ApiError; pub const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -91,161 +97,174 @@ pub async fn task_main( tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - protocol: "http", - }; + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + connections.close(); // allows `connections.wait to complete` - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` + let server = Builder::new(hyper_util::rt::TokioExecutor::new()); - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; + } + let conn_id = uuid::Uuid::new_v4(); + let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream< - WithConnectionGuard>, - >| { - let (conn, _) = stream.get_ref(); + connections.spawn( + connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + cancellation_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span), + ); + } - // this is jank. should dissapear with hyper 1.0 migration. - let gauge = conn - .gauge - .lock() - .expect("lock should not be poisoned") - .take() - .expect("gauge should be set on connection start"); - - // Cancel all current inflight HTTP requests if the HTTP connection is closed. - let http_cancellation_token = CancellationToken::new(); - let cancel_connection = http_cancellation_token.clone().drop_guard(); - - let span = conn.span.clone(); - let client_addr = conn.inner.client_addr(); - let remote_addr = conn.inner.inner.remote_addr(); - let backend = backend.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new( - hyper::service::service_fn(move |req: Request| { - let backend = backend.clone(); - let ws_connections2 = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - let http_cancellation_token = http_cancellation_token.child_token(); - - // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. - // By spawning the future, we ensure it never gets cancelled until it decides to. - ws_connections.spawn( - async move { - // Cancel the current inflight HTTP request if the requets stream is closed. - // This is slightly different to `_cancel_connection` in that - // h2 can cancel individual requests with a `RST_STREAM`. - let _cancel_session = http_cancellation_token.clone().drop_guard(); - - let res = request_handler( - req, - config, - backend, - ws_connections2, - cancellation_handler, - peer_addr.ip(), - endpoint_rate_limiter, - http_cancellation_token, - ) - .await - .map_or_else(|e| e.into_response(), |r| r); - - _cancel_session.disarm(); - - res - } - .in_current_span(), - ) - }), - gauge, - cancel_connection, - span, - )) - } - }, - ); - - hyper::Server::builder(tls_listener) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; - - // await websocket connections - ws_connections.wait().await; + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, -} +/// Handles the TCP lifecycle. +/// +/// 1. Parses PROXY protocol V2 +/// 2. Handles TLS handshake +/// 3. Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. With websocket upgrade support. +#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + endpoint_rate_limiter: Arc, + cancellation_token: CancellationToken, + server: Builder, + tls_acceptor: TlsAcceptor, + conn: TcpStream, + peer_addr: SocketAddr, +) { + let session_id = uuid::Uuid::new_v4(); -impl MetricService { - fn new( - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, - ) -> MetricService { - MetricService { - inner, - _gauge, - _cancel, - span, + let _gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["http"]) + .guard(); + + // handle PROXY protocol + let mut conn = WithClientIp::new(conn); + let peer = match conn.wait_for_addr().await { + Ok(peer) => peer, + Err(e) => { + tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return; } - } -} + }; -impl hyper::service::Service> for MetricService -where - S: hyper::service::Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = Instrumented; + let peer_addr = peer.unwrap_or(peer_addr).ip(); + info!(?session_id, %peer_addr, "accepted new TCP connection"); - fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - self.inner.poll_ready(cx) - } + // try upgrade to TLS, but with a timeout. + let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { + Ok(Ok(conn)) => { + info!(?session_id, %peer_addr, "accepted new TLS connection"); + conn + } + // The handshake failed + Ok(Err(e)) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + // The handshake timed out + Err(e) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + }; - fn call(&mut self, req: Request) -> Self::Future { - self.span - .in_scope(|| self.inner.call(req)) - .instrument(self.span.clone()) + let session_id = AtomicTake::new(session_id); + + // Cancel all current inflight HTTP requests if the HTTP connection is closed. + let http_cancellation_token = CancellationToken::new(); + let _cancel_connection = http_cancellation_token.clone().drop_guard(); + + let conn = server.serve_connection_with_upgrades( + hyper_util::rt::TokioIo::new(conn), + hyper1::service::service_fn(move |req: hyper1::Request| { + // First HTTP request shares the same session ID + let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let http_request_token = http_cancellation_token.child_token(); + let cancel_request = http_request_token.clone().drop_guard(); + + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. + let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + endpoint_rate_limiter.clone(), + http_request_token, + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, cancellation_handler: Arc, + session_id: uuid::Uuid, peer_addr: IpAddr, endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, -) -> Result, ApiError> { - let session_id = uuid::Uuid::new_v4(); - +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -282,14 +301,14 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") @@ -299,7 +318,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..ab9127b13e --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,92 @@ +//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. + +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] +struct HttpErrorBody { + pub msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg }.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 00dffd5784..7f7f93988c 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,18 +1,22 @@ use std::pin::pin; use std::sync::Arc; +use bytes::Bytes; use futures::future::select; use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; -use hyper::body::HttpBody; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{Body, HeaderMap, Request}; +use http_body_util::BodyExt; +use http_body_util::Full; +use hyper1::body::Body; +use hyper1::body::Incoming; +use hyper1::header; +use hyper1::http::HeaderName; +use hyper1::http::HeaderValue; +use hyper1::Response; +use hyper1::StatusCode; +use hyper1::{HeaderMap, Request}; use serde_json::json; use serde_json::Value; use tokio::time; @@ -29,7 +33,6 @@ use tracing::error; use tracing::info; use url::Url; use utils::http::error::ApiError; -use utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; @@ -52,6 +55,7 @@ use crate::RoleName; use super::backend::PoolingBackend; use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; use super::json::JsonConversionError; @@ -218,10 +222,10 @@ fn get_conn_info( pub async fn handle( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - request: Request, + request: Request, backend: Arc, cancel: CancellationToken, -) -> Result, ApiError> { +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &mut ctx, request, backend).await; let mut response = match result { @@ -332,10 +336,9 @@ pub async fn handle( } }; - response.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -396,7 +399,7 @@ impl UserFacingError for SqlOverHttpError { #[derive(Debug, thiserror::Error)] pub enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper::Error), + Read(#[from] hyper1::Error), #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } @@ -437,7 +440,7 @@ struct HttpHeaders { } impl HttpHeaders { - fn try_parse(headers: &hyper::http::HeaderMap) -> Result { + fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); @@ -488,9 +491,9 @@ async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - request: Request, + request: Request, backend: Arc, -) -> Result, SqlOverHttpError> { +) -> Result>, SqlOverHttpError> { let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[ctx.protocol]) .guard(); @@ -528,7 +531,7 @@ async fn handle_inner( } let fetch_and_process_request = async { - let body = hyper::body::to_bytes(request.into_body()).await?; + let body = request.into_body().collect().await?.to_bytes(); info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly @@ -596,7 +599,7 @@ async fn handle_inner( let body = serde_json::to_string(&result).expect("json serialization should not fail"); let len = body.len(); let response = response - .body(Body::from(body)) + .body(Full::new(Bytes::from(body))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -639,6 +642,7 @@ impl QueryData { } // The query was cancelled. Either::Right((_cancelled, query)) => { + tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { tracing::error!(?err, "could not cancel query"); } diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs deleted file mode 100644 index 33f194dd59..0000000000 --- a/proxy/src/serverless/tls_listener.rs +++ /dev/null @@ -1,123 +0,0 @@ -use std::{ - convert::Infallible, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use hyper::server::{accept::Accept, conn::AddrStream}; -use pin_project_lite::pin_project; -use tokio::{ - io::{AsyncRead, AsyncWrite}, - task::JoinSet, - time::timeout, -}; -use tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn, Instrument}; - -use crate::{ - metrics::TLS_HANDSHAKE_FAILURES, - protocol2::{WithClientIp, WithConnectionGuard}, -}; - -pin_project! { - /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself - /// encrypted using TLS. - pub(crate) struct TlsListener { - #[pin] - listener: A, - tls: TlsAcceptor, - waiting: JoinSet>>, - timeout: Duration, - } -} - -impl TlsListener { - /// Create a `TlsListener` with default options. - pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { - TlsListener { - listener, - tls, - waiting: JoinSet::new(), - timeout, - } - } -} - -impl Accept for TlsListener -where - A: Accept>>, - A::Error: std::error::Error, - A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, -{ - type Conn = TlsStream; - - type Error = Infallible; - - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let mut this = self.project(); - - loop { - match this.listener.as_mut().poll_accept(cx) { - Poll::Pending => break, - Poll::Ready(Some(Ok(mut conn))) => { - let t = *this.timeout; - let tls = this.tls.clone(); - let span = conn.span.clone(); - this.waiting.spawn(async move { - let peer_addr = match conn.inner.wait_for_addr().await { - Ok(Some(addr)) => addr, - Err(e) => { - tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; - } - Ok(None) => conn.inner.inner.remote_addr() - }; - - let accept = tls.accept(conn); - match timeout(t, accept).await { - Ok(Ok(conn)) => { - info!(%peer_addr, "accepted new TLS connection"); - Some(conn) - }, - // The handshake failed, try getting another connection from the queue - Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); - None - } - // The handshake timed out, try getting another connection from the queue - Err(_) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: timeout"); - None - } - } - }.instrument(span)); - } - Poll::Ready(Some(Err(e))) => { - tracing::error!("error accepting TCP connection: {e}"); - continue; - } - Poll::Ready(None) => return Poll::Ready(None), - } - } - - loop { - return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), - // The handshake failed to complete, try getting another connection from the queue - Poll::Ready(Some(Ok(None))) => continue, - // The handshake panicked or was cancelled. ignore and get another connection - Poll::Ready(Some(Err(e))) => { - tracing::warn!("handshake aborted: {e}"); - continue; - } - _ => Poll::Pending, - }; - } - } -} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index bcbd4daa7e..d6e2cc2996 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -63,7 +63,7 @@ scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } @@ -75,7 +75,6 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive"] } From 5efe95a008bb6a19ec9676a0c7b1a5516f85e4c1 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:30:09 +0200 Subject: [PATCH 12/24] proxy: fix credentials cache lookup (#7349) ## Problem Incorrect processing of `-pooler` connections. ## Summary of changes Fix TODO: add e2e tests for caching --- proxy/src/cache/endpoints.rs | 5 ++--- proxy/src/console/provider/neon.rs | 32 ++++++++++++++++++------------ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 9bc019c2d8..31e3ef6891 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -21,7 +21,7 @@ use crate::{ metrics::REDIS_BROKEN_MESSAGES, rate_limiter::GlobalRateLimiter, redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, Normalize, + EndpointId, }; #[derive(Deserialize, Debug, Clone)] @@ -72,9 +72,8 @@ impl EndpointsCache { !rejected } fn should_reject(&self, endpoint: &EndpointId) -> bool { - let endpoint = endpoint.normalize(); if endpoint.is_endpoint() { - !self.endpoints.contains(&EndpointIdInt::from(&endpoint)) + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) } else if endpoint.is_branch() { !self .branches diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 3a0e5609d8..68b91447f9 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -59,7 +59,7 @@ impl Api { if !self .caches .endpoints_cache - .is_valid(ctx, &user_info.endpoint) + .is_valid(ctx, &user_info.endpoint.normalize()) .await { info!("endpoint is not valid, skipping the request"); @@ -186,23 +186,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.normalize().into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - ep_int, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -216,8 +220,8 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -230,16 +234,18 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.normalize().into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( From 0bb04ebe19c1dd024c7762926ecce166f4259d82 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 10 Apr 2024 12:12:55 +0200 Subject: [PATCH 13/24] Revert "Proxy read ids from redis (#7205)" (#7350) This reverts commit dbac2d2c473f3648251f0a64e36d066f444dfe00. ## Problem Proxy pods fails to install in k8s clusters, cplane release blocking. ## Summary of changes Revert --- proxy/src/auth/backend.rs | 4 +- proxy/src/bin/proxy.rs | 15 +- proxy/src/cache.rs | 1 - proxy/src/cache/endpoints.rs | 190 ------------------ proxy/src/config.rs | 69 ------- proxy/src/console/provider.rs | 22 +- proxy/src/console/provider/neon.rs | 46 ++--- proxy/src/context.rs | 15 +- proxy/src/intern.rs | 15 -- proxy/src/lib.rs | 37 ---- proxy/src/metrics.rs | 12 -- proxy/src/proxy.rs | 4 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 10 +- proxy/src/redis/cancellation_publisher.rs | 6 +- .../regress/test_proxy_rate_limiter.py | 84 ++++++++ 16 files changed, 124 insertions(+), 408 deletions(-) delete mode 100644 proxy/src/cache/endpoints.rs create mode 100644 test_runner/regress/test_proxy_rate_limiter.py diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 71e9da18bc..e421798067 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -27,7 +27,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -186,7 +186,7 @@ impl AuthenticationConfig { is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint.normalize()); + let endpoint_int = EndpointIdInt::from(endpoint); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 9302b31d5c..56a3ef79cd 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -189,9 +189,7 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - /// cache for all valid endpoints - #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] - endpoint_cache_config: String, + #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -403,7 +401,6 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - maintenance_tasks.spawn(api.locks.garbage_collect_worker()); if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( @@ -413,9 +410,6 @@ async fn main() -> anyhow::Result<()> { args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - let cache = api.caches.endpoints_cache.clone(); - let con = redis_notifications_client.clone(); - maintenance_tasks.spawn(async move { cache.do_read(con).await }); } } } @@ -495,18 +489,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, - endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -517,9 +507,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout, epoch) + console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) .unwrap(), )); + tokio::spawn(locks.garbage_collect_worker(epoch)); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index d1d4087241..fc5f416395 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,5 +1,4 @@ pub mod common; -pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs deleted file mode 100644 index 31e3ef6891..0000000000 --- a/proxy/src/cache/endpoints.rs +++ /dev/null @@ -1,190 +0,0 @@ -use std::{ - convert::Infallible, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, -}; - -use dashmap::DashSet; -use redis::{ - streams::{StreamReadOptions, StreamReadReply}, - AsyncCommands, FromRedisValue, Value, -}; -use serde::Deserialize; -use tokio::sync::Mutex; - -use crate::{ - config::EndpointCacheConfig, - context::RequestMonitoring, - intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::REDIS_BROKEN_MESSAGES, - rate_limiter::GlobalRateLimiter, - redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, -}; - -#[derive(Deserialize, Debug, Clone)] -#[serde(rename_all(deserialize = "snake_case"))] -pub enum ControlPlaneEventKey { - EndpointCreated, - BranchCreated, - ProjectCreated, -} - -pub struct EndpointsCache { - config: EndpointCacheConfig, - endpoints: DashSet, - branches: DashSet, - projects: DashSet, - ready: AtomicBool, - limiter: Arc>, -} - -impl EndpointsCache { - pub fn new(config: EndpointCacheConfig) -> Self { - Self { - limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( - config.limiter_info.clone(), - ))), - config, - endpoints: DashSet::new(), - branches: DashSet::new(), - projects: DashSet::new(), - ready: AtomicBool::new(false), - } - } - pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { - if !self.ready.load(Ordering::Acquire) { - return true; - } - // If cache is disabled, just collect the metrics and return. - if self.config.disable_cache { - ctx.set_rejected(self.should_reject(endpoint)); - return true; - } - // If the limiter allows, we don't need to check the cache. - if self.limiter.lock().await.check() { - return true; - } - let rejected = self.should_reject(endpoint); - ctx.set_rejected(rejected); - !rejected - } - fn should_reject(&self, endpoint: &EndpointId) -> bool { - if endpoint.is_endpoint() { - !self.endpoints.contains(&EndpointIdInt::from(endpoint)) - } else if endpoint.is_branch() { - !self - .branches - .contains(&BranchIdInt::from(&endpoint.as_branch())) - } else { - !self - .projects - .contains(&ProjectIdInt::from(&endpoint.as_project())) - } - } - fn insert_event(&self, key: ControlPlaneEventKey, value: String) { - // Do not do normalization here, we expect the events to be normalized. - match key { - ControlPlaneEventKey::EndpointCreated => { - self.endpoints.insert(EndpointIdInt::from(&value.into())); - } - ControlPlaneEventKey::BranchCreated => { - self.branches.insert(BranchIdInt::from(&value.into())); - } - ControlPlaneEventKey::ProjectCreated => { - self.projects.insert(ProjectIdInt::from(&value.into())); - } - } - } - pub async fn do_read( - &self, - mut con: ConnectionWithCredentialsProvider, - ) -> anyhow::Result { - let mut last_id = "0-0".to_string(); - loop { - self.ready.store(false, Ordering::Release); - if let Err(e) = con.connect().await { - tracing::error!("error connecting to redis: {:?}", e); - continue; - } - if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { - tracing::error!("error reading from redis: {:?}", e); - } - } - } - async fn read_from_stream( - &self, - con: &mut ConnectionWithCredentialsProvider, - last_id: &mut String, - ) -> anyhow::Result<()> { - tracing::info!("reading endpoints/branches/projects from redis"); - self.batch_read( - con, - StreamReadOptions::default().count(self.config.initial_batch_size), - last_id, - true, - ) - .await?; - tracing::info!("ready to filter user requests"); - self.ready.store(true, Ordering::Release); - self.batch_read( - con, - StreamReadOptions::default() - .count(self.config.initial_batch_size) - .block(self.config.xread_timeout.as_millis() as usize), - last_id, - false, - ) - .await - } - fn parse_key_value(key: &str, value: &Value) -> anyhow::Result<(ControlPlaneEventKey, String)> { - Ok((serde_json::from_str(key)?, String::from_redis_value(value)?)) - } - async fn batch_read( - &self, - conn: &mut ConnectionWithCredentialsProvider, - opts: StreamReadOptions, - last_id: &mut String, - return_when_finish: bool, - ) -> anyhow::Result<()> { - let mut total: usize = 0; - loop { - let mut res: StreamReadReply = conn - .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) - .await?; - if res.keys.len() != 1 { - anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); - } - - let res = res.keys.pop().expect("Checked length above"); - - if return_when_finish && res.ids.len() <= self.config.default_batch_size { - break; - } - for x in res.ids { - total += 1; - for (k, v) in x.map { - let (key, value) = match Self::parse_key_value(&k, &v) { - Ok(x) => x, - Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[&self.config.stream_name]) - .inc(); - tracing::error!("error parsing key-value {k}-{v:?}: {e:?}"); - continue; - } - }; - self.insert_event(key, value); - } - if total.is_power_of_two() { - tracing::debug!("endpoints read {}", total); - } - *last_id = x.id; - } - } - tracing::info!("read {} endpoints/branches/projects from redis", total); - Ok(()) - } -} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 3bdfb3cfad..fc490c7348 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,75 +313,6 @@ impl CertResolver { } } -#[derive(Debug)] -pub struct EndpointCacheConfig { - /// Batch size to receive all endpoints on the startup. - pub initial_batch_size: usize, - /// Batch size to receive endpoints. - pub default_batch_size: usize, - /// Timeouts for the stream read operation. - pub xread_timeout: Duration, - /// Stream name to read from. - pub stream_name: String, - /// Limiter info (to distinguish when to enable cache). - pub limiter_info: Vec, - /// Disable cache. - /// If true, cache is ignored, but reports all statistics. - pub disable_cache: bool, -} - -impl EndpointCacheConfig { - /// Default options for [`crate::console::provider::NodeInfoCache`]. - /// Notice that by default the limiter is empty, which means that cache is disabled. - pub const CACHE_DEFAULT_OPTIONS: &'static str = - "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s"; - - /// Parse cache options passed via cmdline. - /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. - fn parse(options: &str) -> anyhow::Result { - let mut initial_batch_size = None; - let mut default_batch_size = None; - let mut xread_timeout = None; - let mut stream_name = None; - let mut limiter_info = vec![]; - let mut disable_cache = false; - - for option in options.split(',') { - let (key, value) = option - .split_once('=') - .with_context(|| format!("bad key-value pair: {option}"))?; - - match key { - "initial_batch_size" => initial_batch_size = Some(value.parse()?), - "default_batch_size" => default_batch_size = Some(value.parse()?), - "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), - "stream_name" => stream_name = Some(value.to_string()), - "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), - "disable_cache" => disable_cache = value.parse()?, - unknown => bail!("unknown key: {unknown}"), - } - } - RateBucketInfo::validate(&mut limiter_info)?; - - Ok(Self { - initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, - default_batch_size: default_batch_size.context("missing `default_batch_size`")?, - xread_timeout: xread_timeout.context("missing `xread_timeout`")?, - stream_name: stream_name.context("missing `stream_name`")?, - disable_cache, - limiter_info, - }) - } -} - -impl FromStr for EndpointCacheConfig { - type Err = anyhow::Error; - - fn from_str(options: &str) -> Result { - let error = || format!("failed to parse endpoint cache options '{options}'"); - Self::parse(options).with_context(error) - } -} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index ee2bc866ab..f7d621fb12 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,15 +8,15 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, + config::{CacheOptions, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{convert::Infallible, sync::Arc, time::Duration}; +use std::{sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -416,15 +416,12 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, - /// List of all valid endpoints. - pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, - endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -434,7 +431,6 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), - endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -445,7 +441,6 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, - epoch: std::time::Duration, registered: prometheus::IntCounter, unregistered: prometheus::IntCounter, reclamation_lag: prometheus::Histogram, @@ -458,7 +453,6 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, - epoch: std::time::Duration, ) -> prometheus::Result { let registered = prometheus::IntCounter::with_opts( prometheus::Opts::new( @@ -503,7 +497,6 @@ impl ApiLocks { node_locks: DashMap::with_shard_amount(shards), permits, timeout, - epoch, lock_acquire_lag, registered, unregistered, @@ -543,9 +536,12 @@ impl ApiLocks { }) } - pub async fn garbage_collect_worker(&self) -> anyhow::Result { - let mut interval = - tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); + pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + if self.permits == 0 { + return; + } + + let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 68b91447f9..1a3e2ca795 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,7 +8,6 @@ use super::{ }; use crate::{ auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, - Normalize, }; use crate::{ cache::Cached, @@ -24,7 +23,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - pub locks: &'static ApiLocks, + locks: &'static ApiLocks, jwt: String, } @@ -56,15 +55,6 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - if !self - .caches - .endpoints_cache - .is_valid(ctx, &user_info.endpoint.normalize()) - .await - { - info!("endpoint is not valid, skipping the request"); - return Ok(AuthInfo::default()); - } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -91,9 +81,7 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => { - return Ok(AuthInfo::default()); - } + Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), _otherwise => return Err(e.into()), }, }; @@ -186,27 +174,23 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let normalized_ep = &user_info.endpoint.normalize(); + let ep = &user_info.endpoint; let user = &user_info.user; - if let Some(role_secret) = self - .caches - .project_info - .get_role_secret(normalized_ep, user) - { + if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( project_id, - normalized_ep_int, + ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - normalized_ep_int, + ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -220,8 +204,8 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let normalized_ep = &user_info.endpoint.normalize(); - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + let ep = &user_info.endpoint; + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -234,18 +218,16 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( project_id, - normalized_ep_int, + ep_int, user.into(), auth_info.secret.clone(), ); - self.caches.project_info.insert_allowed_ips( - project_id, - normalized_ep_int, - allowed_ips.clone(), - ); + self.caches + .project_info + .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); ctx.set_project_id(project_id); } Ok(( diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 85544f1d65..fec95f4722 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,9 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{ - bool_to_str, LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND, NUM_INVALID_ENDPOINTS, - }, + metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, DbName, EndpointId, RoleName, }; @@ -52,8 +50,6 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, - // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. - rejected: bool, } #[derive(Clone, Debug)] @@ -97,7 +93,6 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -118,10 +113,6 @@ impl RequestMonitoring { ) } - pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = rejected; - } - pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -187,10 +178,6 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { - let outcome = if self.success { "success" } else { "failure" }; - NUM_INVALID_ENDPOINTS - .with_label_values(&[self.protocol, bool_to_str(self.rejected), outcome]) - .inc(); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index e38135dd22..a6519bdff9 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,11 +160,6 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } -impl From for EndpointIdInt { - fn from(value: EndpointId) -> Self { - EndpointIdTag::get_interner().get_or_intern(&value) - } -} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -180,11 +175,6 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } -impl From for BranchIdInt { - fn from(value: BranchId) -> Self { - BranchIdTag::get_interner().get_or_intern(&value) - } -} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -200,11 +190,6 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } -impl From for ProjectIdInt { - fn from(value: ProjectId) -> Self { - ProjectIdTag::get_interner().get_or_intern(&value) - } -} #[cfg(test)] mod tests { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 3f6d985fe8..da7c7f3ed2 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,24 +127,6 @@ macro_rules! smol_str_wrapper { }; } -const POOLER_SUFFIX: &str = "-pooler"; - -pub trait Normalize { - fn normalize(&self) -> Self; -} - -impl + From> Normalize for S { - fn normalize(&self) -> Self { - if self.as_ref().ends_with(POOLER_SUFFIX) { - let mut s = self.as_ref().to_string(); - s.truncate(s.len() - POOLER_SUFFIX.len()); - s.into() - } else { - self.clone() - } - } -} - // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -158,22 +140,3 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); - -// Endpoints are a bit tricky. Rare they might be branches or projects. -impl EndpointId { - pub fn is_endpoint(&self) -> bool { - self.0.starts_with("ep-") - } - pub fn is_branch(&self) -> bool { - self.0.starts_with("br-") - } - pub fn is_project(&self) -> bool { - !self.is_endpoint() && !self.is_branch() - } - pub fn as_branch(&self) -> BranchId { - BranchId(self.0.clone()) - } - pub fn as_project(&self) -> ProjectId { - ProjectId(self.0.clone()) - } -} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f299313e0a..59ee899c08 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -169,18 +169,6 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { .unwrap() }); -pub static NUM_INVALID_ENDPOINTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_invalid_endpoints_total", - "Number of invalid endpoints (per protocol, per rejected).", - // http/ws/tcp, true/false, success/failure - // TODO(anna): the last dimension is just a proxy to what we actually want to measure. - // We need to measure whether the endpoint was found by cplane or not. - &["protocol", "rejected", "outcome"], - ) - .unwrap() -}); - pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 166e761a4e..6051c0a812 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use crate::{ proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, Normalize, + EndpointCacheKey, }; use futures::TryFutureExt; use itertools::Itertools; @@ -280,7 +280,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep.normalize(), 1) { + if !endpoint_rate_limiter.check(ep, 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index a3b83e5e50..13dffffca0 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 0503deb311..f590896dd9 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -24,13 +24,13 @@ use super::{ RateLimiterConfig, }; -pub struct GlobalRateLimiter { +pub struct RedisRateLimiter { data: Vec, - info: Vec, + info: &'static [RateBucketInfo], } -impl GlobalRateLimiter { - pub fn new(info: Vec) -> Self { +impl RedisRateLimiter { + pub fn new(info: &'static [RateBucketInfo]) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +50,7 @@ impl GlobalRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(&self.info) + .zip(self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 7baf104374..422789813c 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: GlobalRateLimiter, + limiter: RedisRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: GlobalRateLimiter::new(info.into()), + limiter: RedisRateLimiter::new(info), }) } diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py new file mode 100644 index 0000000000..f39f0cad07 --- /dev/null +++ b/test_runner/regress/test_proxy_rate_limiter.py @@ -0,0 +1,84 @@ +import asyncio +import time +from pathlib import Path +from typing import Iterator + +import pytest +from fixtures.neon_fixtures import ( + PSQL, + NeonProxy, +) +from fixtures.port_distributor import PortDistributor +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.response import Response + + +def waiting_handler(status_code: int) -> Response: + # wait more than timeout to make sure that both (two) connections are open. + # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. + time.sleep(2) + return Response(status=status_code) + + +@pytest.fixture(scope="function") +def proxy_with_rate_limit( + port_distributor: PortDistributor, + neon_binpath: Path, + httpserver_listen_address, + test_output_dir: Path, +) -> Iterator[NeonProxy]: + """Neon proxy that routes directly to vanilla postgres.""" + + proxy_port = port_distributor.get_port() + mgmt_port = port_distributor.get_port() + http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + (host, port) = httpserver_listen_address + endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + + with NeonProxy( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + proxy_port=proxy_port, + http_port=http_port, + mgmt_port=mgmt_port, + external_http_port=external_http_port, + auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), + ) as proxy: + proxy.start() + yield proxy + + +@pytest.mark.asyncio +async def test_proxy_rate_limit( + httpserver: HTTPServer, + proxy_with_rate_limit: NeonProxy, +): + uri = "/billing/api/v1/usage_events/proxy_get_role_secret" + # mock control plane service + httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( + lambda _: Response(status=200) + ) + httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( + lambda _: waiting_handler(429) + ) + httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( + lambda _: waiting_handler(500) + ) + + psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) + f = await psql.run("select 42;") + await proxy_with_rate_limit.find_auth_link(uri, f) + # Limit should be 2. + + # Run two queries in parallel. + f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) + await proxy_with_rate_limit.find_auth_link(uri, f1) + await proxy_with_rate_limit.find_auth_link(uri, f2) + + # Now limit should be 0. + f = await psql.run("select 42;") + await proxy_with_rate_limit.find_auth_link(uri, f) + + # There last query shouldn't reach the http-server. + assert httpserver.assertions == [] From f86845f64b9576d05b06de9c33dec3c6be19c47c Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 10 Apr 2024 06:13:48 -0700 Subject: [PATCH 14/24] compute_ctl: Auto-set dynamic_shared_memory_type (#7348) Part of neondatabase/cloud#12047. The basic idea is that for our VMs, we want to enable swap and disable Linux memory overcommit. Alongside these, we should set postgres' dynamic_shared_memory_type to mmap, but we want to avoid setting it to mmap if swap is not enabled. Implementing this in the control plane would be fiddly, but it's relatively straightforward to add to compute_ctl. --- compute_tools/src/config.rs | 25 +++++++++++++++++++++++-- compute_tools/src/pg_helpers.rs | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index f1fd8637f5..89c866b20c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,8 +6,8 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; +use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -92,6 +92,27 @@ pub fn write_postgres_conf( } } + if cfg!(target_os = "linux") { + // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is + // disabled), then the control plane has enabled swap and we should set + // dynamic_shared_memory_type = 'mmap'. + // + // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. + let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") + // ignore any errors - they may be expected to occur under certain situations (e.g. when + // not running in Linux). + .unwrap_or_else(|_| String::new()); + if overcommit_memory_contents.trim() == "2" { + let opt = GenericOption { + name: "dynamic_shared_memory_type".to_owned(), + value: Some("mmap".to_owned()), + vartype: "enum".to_owned(), + }; + + write!(file, "{}", opt.to_pg_setting())?; + } + } + // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5deb50d6b7..fa0822748b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String { format!("'{}'", res) } -trait GenericOptionExt { +pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } From d47e4a2a4148ff0b6467d5bda504401b90bb00da Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 11 Apr 2024 07:47:45 +0300 Subject: [PATCH 15/24] Remember last written LSN when it is first requested (#7343) ## Problem See https://neondb.slack.com/archives/C03QLRH7PPD/p1712529369520409 In case of statements CREATE TABLE AS SELECT... or INSERT FROM SELECT... we are fetching data from source table and storing it in destination table. It cause problems with prefetch last-written-lsn is known for the pages of source table (which for example happens after compute restart). In this case we get get global value of last-written-lsn which is changed frequently as far as we are writing pages of destination table. As a result request-isn for the prefetch and request-let when this page is actually needed are different and we got exported prefetch request. So it actually disarms prefetch. ## Summary of changes Proposed simple patch stores last-written LSN for the page when it is not found. So next time we will request last-written LSN for this page, we will get the same value (certainly if the page was not changed). ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a7b4c66156..d9149dc59a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a7b4c66156bce00afa60e5592d4284ba9e40b4cf +Subproject commit d9149dc59abcbeeb26293707509aef51752db28f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 64b8c7bccc..85d809c124 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed +Subproject commit 85d809c124a898847a97d66a211f7d5ef4f8e0cb diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3946b2e2ea..261497dd63 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3946b2e2ea71d07af092099cb5bcae76a69b90d6 +Subproject commit 261497dd63ace434045058b1453bcbaaa83f23e5 diff --git a/vendor/revisions.json b/vendor/revisions.json index 75dc095168..dfc0aa04c3 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", - "postgres-v15": "64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed", - "postgres-v14": "a7b4c66156bce00afa60e5592d4284ba9e40b4cf" + "postgres-v16": "261497dd63ace434045058b1453bcbaaa83f23e5", + "postgres-v15": "85d809c124a898847a97d66a211f7d5ef4f8e0cb", + "postgres-v14": "d9149dc59abcbeeb26293707509aef51752db28f" } From db72543f4d4d3300d48375db177c8ee598ed4049 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 11 Apr 2024 12:31:27 +0200 Subject: [PATCH 16/24] Reenable test_forward_compatibility (#7358) It was disabled due to https://github.com/neondatabase/neon/pull/6530 breaking forward compatiblity. Now that we have deployed it to production, we can reenable the test --- test_runner/regress/test_compatibility.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 208263a22a..ddad98a5fa 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -192,9 +192,6 @@ def test_backward_compatibility( assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530 -# The test is disabled until the next release deployment -@pytest.mark.xfail @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") From 1628b5b145b335e4a26fcdb1ccdf4263ab8745cf Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Apr 2024 17:14:09 +0300 Subject: [PATCH 17/24] compute hook: use shared client with explicit timeout (#7359) ## Problem We are seeing some mysterious long waits when sending requests. ## Summary of changes - To eliminate risk that we are incurring some unreasonable overheads from setup, e.g. DNS, use a single Client (internally a pool) instead of repeatedly constructing a fresh one. - To make it clearer where a timeout is occurring, apply a 10 second timeout to requests as we send them. --- storage_controller/src/compute_hook.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index eb0c4472e4..1ed8998713 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -17,6 +17,8 @@ use crate::service::Config; const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); +const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + pub(crate) const API_CONCURRENCY: usize = 32; struct UnshardedComputeHookTenant { @@ -242,6 +244,10 @@ pub(super) struct ComputeHook { // This lock is only used in testing enviroments, to serialize calls into neon_lock neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, } impl ComputeHook { @@ -251,12 +257,18 @@ impl ComputeHook { .clone() .map(|jwt| format!("Bearer {}", jwt)); + let client = reqwest::ClientBuilder::new() + .timeout(NOTIFY_REQUEST_TIMEOUT) + .build() + .expect("Failed to construct HTTP client"); + Self { state: Default::default(), config, authorization_header, neon_local_lock: Default::default(), api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, } } @@ -310,12 +322,11 @@ impl ComputeHook { async fn do_notify_iteration( &self, - client: &reqwest::Client, url: &String, reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = client.request(Method::PUT, url); + let req = self.client.request(Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { @@ -381,8 +392,6 @@ impl ComputeHook { reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let client = reqwest::Client::new(); - // We hold these semaphore units across all retries, rather than only across each // HTTP request: this is to preserve fairness and avoid a situation where a retry might // time out waiting for a semaphore. @@ -394,7 +403,7 @@ impl ComputeHook { .map_err(|_| NotifyError::ShuttingDown)?; backoff::retry( - || self.do_notify_iteration(&client, url, reconfigure_request, cancel), + || self.do_notify_iteration(url, reconfigure_request, cancel), |e| { matches!( e, From 99a56b56064264fd73a7dc3ce5606469725cc4cb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 11 Apr 2024 15:23:08 +0100 Subject: [PATCH 18/24] CI(build-build-tools-image): Do not cancel concurrent workflows (#7226) ## Problem `build-build-tools-image` workflow is designed to be run only in one example per the whole repository. Currently, the job gets cancelled if a newer one is scheduled, here's an example: https://github.com/neondatabase/neon/actions/runs/8419610607 ## Summary of changes - Explicitly set `cancel-in-progress: false` for all jobs that aren't supposed to be cancelled --- .github/workflows/approved-for-ci-run.yml | 1 + .github/workflows/build-build-tools-image.yml | 1 + .github/workflows/pin-build-tools-image.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 69c48d86b9..ab616d17e2 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -18,6 +18,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 251423e701..c527cef1ac 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -21,6 +21,7 @@ defaults: concurrency: group: build-build-tools-image-${{ inputs.image-tag }} + cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c941692066..d495a158e8 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -20,6 +20,7 @@ defaults: concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false permissions: {} From 5299f917d6d2be5d87b56d236342d48682a5c9f4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 11 Apr 2024 17:26:01 +0100 Subject: [PATCH 19/24] proxy: replace prometheus with measured (#6717) ## Problem My benchmarks show that prometheus is not very good. https://github.com/conradludgate/measured We're already using it in storage_controller and it seems to be working well. ## Summary of changes Replace prometheus with my new measured crate in proxy only. Apologies for the large diff. I tried to keep it as minimal as I could. The label types add a bit of boiler plate (but reduce the chance we mistype the labels), and some of our custom metrics like CounterPair and HLL needed to be rewritten. --- Cargo.lock | 13 +- Cargo.toml | 4 +- libs/metrics/src/hll.rs | 395 ++++------------ libs/metrics/src/lib.rs | 172 ++++++- proxy/Cargo.toml | 1 + proxy/src/auth/backend.rs | 10 +- proxy/src/auth/credentials.rs | 21 +- proxy/src/bin/pg_sni_router.rs | 7 +- proxy/src/bin/proxy.rs | 40 +- proxy/src/cancellation.rs | 34 +- proxy/src/compute.rs | 9 +- proxy/src/console/messages.rs | 5 +- proxy/src/console/provider.rs | 63 +-- proxy/src/console/provider/neon.rs | 32 +- proxy/src/context.rs | 24 +- proxy/src/context/parquet.rs | 2 +- proxy/src/error.rs | 9 +- proxy/src/http.rs | 21 +- proxy/src/http/health_server.rs | 89 +++- proxy/src/jemalloc.rs | 178 +++---- proxy/src/metrics.rs | 658 +++++++++++++++----------- proxy/src/proxy.rs | 30 +- proxy/src/proxy/connect_compute.rs | 8 +- proxy/src/proxy/passthrough.rs | 16 +- proxy/src/proxy/wake_compute.rs | 31 +- proxy/src/rate_limiter/limiter.rs | 30 +- proxy/src/redis/notifications.rs | 10 +- proxy/src/serverless.rs | 28 +- proxy/src/serverless/conn_pool.rs | 51 +- proxy/src/serverless/sql_over_http.rs | 27 +- proxy/src/serverless/websocket.rs | 9 +- proxy/src/stream.rs | 4 +- 32 files changed, 1127 insertions(+), 904 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bdf2b08c5c..6faf4b72f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2932,9 +2932,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.20" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452" +checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" dependencies = [ "bytes", "crossbeam-utils", @@ -2950,9 +2950,9 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.20" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def" +checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -2962,9 +2962,9 @@ dependencies = [ [[package]] name = "measured-process" -version = "0.0.20" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2" +checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" dependencies = [ "libc", "measured", @@ -4322,6 +4322,7 @@ dependencies = [ "itertools", "lasso", "md5", + "measured", "metrics", "native-tls", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index feea17ab05..8310d2d522 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,8 +107,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.20", features=["lasso"] } -measured-process = { version = "0.0.20" } +measured = { version = "0.0.21", features=["lasso"] } +measured-process = { version = "0.0.21" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index dfb4461ce9..f53511ab5c 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -7,14 +7,19 @@ //! use significantly less memory than this, but can only approximate the cardinality. use std::{ - collections::HashMap, - hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}, - sync::{atomic::AtomicU8, Arc, RwLock}, + hash::{BuildHasher, BuildHasherDefault, Hash}, + sync::atomic::AtomicU8, }; -use prometheus::{ - core::{self, Describer}, - proto, Opts, +use measured::{ + label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, + metric::{ + group::{Encoding, MetricValue}, + name::MetricNameEncoder, + Metric, MetricType, MetricVec, + }, + text::TextEncoder, + LabelGroup, }; use twox_hash::xxh3; @@ -93,203 +98,25 @@ macro_rules! register_hll { /// ``` /// /// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLogVec { - core: Arc>, +pub type HyperLogLogVec = MetricVec, L>; +pub type HyperLogLog = Metric>; + +pub struct HyperLogLogState { + shards: [AtomicU8; N], } - -struct HyperLogLogVecCore { - pub children: RwLock, BuildHasherDefault>>, - pub desc: core::Desc, - pub opts: Opts, -} - -impl core::Collector for HyperLogLogVec { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] - } - - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - for child in self.core.children.read().unwrap().values() { - child.core.collect_into(&mut metrics); - } - m.set_metric(metrics); - - vec![m] +impl Default for HyperLogLogState { + fn default() -> Self { + #[allow(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU8 = AtomicU8::new(0); + Self { shards: [ZERO; N] } } } -impl HyperLogLogVec { - /// Create a new [`HyperLogLogVec`] based on the provided - /// [`Opts`] and partitioned by the given label names. At least one label name must be - /// provided. - pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result { - assert!(N.is_power_of_two()); - let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect(); - let opts = opts.variable_labels(variable_names); - - let desc = opts.describe()?; - let v = HyperLogLogVecCore { - children: RwLock::new(HashMap::default()), - desc, - opts, - }; - - Ok(Self { core: Arc::new(v) }) - } - - /// `get_metric_with_label_values` returns the [`HyperLogLog

`] for the given slice - /// of label values (same order as the VariableLabels in Desc). If that combination of - /// label values is accessed for the first time, a new [`HyperLogLog

`] is created. - /// - /// An error is returned if the number of label values is not the same as the - /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - self.core.get_metric_with_label_values(vals) - } - - /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error - /// occurs. - pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { - self.get_metric_with_label_values(vals).unwrap() - } +impl MetricType for HyperLogLogState { + type Metadata = (); } -impl HyperLogLogVecCore { - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - let h = self.hash_label_values(vals)?; - - if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { - return Ok(metric); - } - - self.get_or_create_metric(h, vals) - } - - pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { - if vals.len() != self.desc.variable_labels.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: self.desc.variable_labels.len(), - got: vals.len(), - }); - } - - let mut h = xxh3::Hash64::default(); - for val in vals { - h.write(val.as_bytes()); - } - - Ok(h.finish()) - } - - fn get_or_create_metric( - &self, - hash: u64, - label_values: &[&str], - ) -> prometheus::Result> { - let mut children = self.children.write().unwrap(); - // Check exist first. - if let Some(metric) = children.get(&hash).cloned() { - return Ok(metric); - } - - let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; - children.insert(hash, metric.clone()); - Ok(metric) - } -} - -/// HLL is a probabilistic cardinality measure. -/// -/// How to use this time-series for a metric name `my_metrics_total_hll`: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// If you want an estimate over time, you can use the following query: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max ( -/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) -/// ) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// In the case of low cardinality, you might want to use the linear counting approximation: -/// -/// ```promql -/// # LinearCounting(m, V) = m log (m / V) -/// shards_count * ln(shards_count / -/// # calculate V = how many shards contain a 0 -/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) -/// ) -/// ``` -/// -/// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLog { - core: Arc>, -} - -impl HyperLogLog { - /// Create a [`HyperLogLog`] with the `name` and `help` arguments. - pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { - assert!(N.is_power_of_two()); - let opts = Opts::new(name, help); - Self::with_opts(opts) - } - - /// Create a [`HyperLogLog`] with the `opts` options. - pub fn with_opts(opts: Opts) -> prometheus::Result { - Self::with_opts_and_label_values(&opts, &[]) - } - - fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { - let desc = opts.describe()?; - let labels = make_label_pairs(&desc, label_values)?; - - let v = HyperLogLogCore { - shards: [0; N].map(AtomicU8::new), - desc, - labels, - }; - Ok(Self { core: Arc::new(v) }) - } - +impl HyperLogLogState { pub fn measure(&self, item: &impl Hash) { // changing the hasher will break compatibility with previous measurements. self.record(BuildHasherDefault::::default().hash_one(item)); @@ -299,42 +126,11 @@ impl HyperLogLog { let p = N.ilog2() as u8; let j = hash & (N as u64 - 1); let rho = (hash >> p).leading_zeros() as u8 + 1 - p; - self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); - } -} - -struct HyperLogLogCore { - shards: [AtomicU8; N], - desc: core::Desc, - labels: Vec, -} - -impl core::Collector for HyperLogLog { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); } - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - self.core.collect_into(&mut metrics); - m.set_metric(metrics); - - vec![m] - } -} - -impl HyperLogLogCore { - fn collect_into(&self, metrics: &mut Vec) { - self.shards.iter().enumerate().for_each(|(i, x)| { - let mut shard_label = proto::LabelPair::default(); - shard_label.set_name("hll_shard".to_owned()); - shard_label.set_value(format!("{i}")); - + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // This seems like it would be a race condition, @@ -344,85 +140,90 @@ impl HyperLogLogCore { // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // this would mean that a dev port-forwarding the metrics url won't break the sampling. - let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); - - let mut m = proto::Metric::default(); - let mut c = proto::Gauge::default(); - c.set_value(v as f64); - m.set_gauge(c); - - let mut labels = Vec::with_capacity(self.labels.len() + 1); - labels.extend_from_slice(&self.labels); - labels.push(shard_label); - - m.set_label(labels); - metrics.push(m); + x.swap(0, std::sync::atomic::Ordering::Relaxed) }) } } - -fn make_label_pairs( - desc: &core::Desc, - label_values: &[&str], -) -> prometheus::Result> { - if desc.variable_labels.len() != label_values.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: desc.variable_labels.len(), - got: label_values.len(), - }); +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } - let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); - if total_len == 0 { - return Ok(vec![]); - } + struct HllShardLabel { + hll_shard: i64, + } - if desc.variable_labels.is_empty() { - return Ok(desc.const_label_pairs.clone()); - } + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } - let mut label_pairs = Vec::with_capacity(total_len); - for (i, n) in desc.variable_labels.iter().enumerate() { - let mut label_pair = proto::LabelPair::default(); - label_pair.set_name(n.clone()); - label_pair.set_value(label_values[i].to_owned()); - label_pairs.push(label_pair); + self.take_sample() + .into_iter() + .enumerate() + .try_for_each(|(hll_shard, val)| { + enc.write_metric_value( + name.by_ref(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + MetricValue::Int(val as i64), + ) + }) } - - for label_pair in &desc.const_label_pairs { - label_pairs.push(label_pair.clone()); - } - label_pairs.sort(); - Ok(label_pairs) } #[cfg(test)] mod tests { use std::collections::HashSet; - use prometheus::{proto, Opts}; + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use rand::{rngs::StdRng, Rng, SeedableRng}; use rand_distr::{Distribution, Zipf}; use crate::HyperLogLogVec; - fn collect(hll: &HyperLogLogVec<32>) -> Vec { - let mut metrics = vec![]; - hll.core - .children - .read() - .unwrap() - .values() - .for_each(|c| c.core.collect_into(&mut metrics)); - metrics + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, } - fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + // cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. + ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { let mut buckets = [0.0; 32]; - for metric in metrics.chunks_exact(32) { - if filter(&metric[0]) { - for (i, m) in metric.iter().enumerate() { - buckets[i] = f64::max(buckets[i], m.get_gauge().get_value()); - } + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); } } @@ -437,7 +238,7 @@ mod tests { } fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { - let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + let hll = HyperLogLogVec::, 32>::new(); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut set_a = HashSet::new(); @@ -445,18 +246,20 @@ mod tests { for x in iter.by_ref().take(n) { set_a.insert(x.to_bits()); - hll.with_label_values(&["a"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); } for x in iter.by_ref().take(n) { set_b.insert(x.to_bits()); - hll.with_label_values(&["b"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); } let merge = &set_a | &set_b; - let metrics = collect(&hll); - let len = get_cardinality(&metrics, |_| true); - let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); - let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 6cff28c0ca..2cf3cdeaa7 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -5,7 +5,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] use measured::{ - label::{LabelGroupVisitor, LabelName, NoLabels}, + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, metric::{ counter::CounterState, gauge::GaugeState, @@ -40,7 +40,7 @@ pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub use hll::{HyperLogLog, HyperLogLogVec}; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; @@ -421,3 +421,171 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: ::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. +struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 12bd67ea36..6b8f2ecbf4 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -44,6 +44,7 @@ ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true +measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index e421798067..229d499e30 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -13,7 +13,7 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; use crate::intern::EndpointIdInt; -use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED}; +use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::stream::Stream; @@ -210,8 +210,12 @@ impl AuthenticationConfig { enabled = self.rate_limiter_enabled, "rate limiting authentication" ); - AUTH_RATE_LIMIT_HITS.inc(); - ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint); + Metrics::get().proxy.requests_auth_rate_limits_total.inc(); + Metrics::get() + .proxy + .endpoints_auth_rate_limits + .get_metric() + .measure(endpoint); if self.rate_limiter_enabled { return Err(auth::AuthError::too_many_connections()); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 89773aa1ff..783a1a5a21 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -4,7 +4,7 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + metrics::{Metrics, SniKind}, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, EndpointId, RoleName, @@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint { ctx.set_endpoint_id(ep.clone()); } + let metrics = Metrics::get(); info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with password hack"); } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index c28814b1c8..58737efe46 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -176,7 +176,12 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + proxy::metrics::Protocol::SniRouter, + "sni", + ); handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 56a3ef79cd..3392c21075 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -18,7 +18,8 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; -use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use proxy::rate_limiter::AuthRateLimiter; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; @@ -249,14 +250,18 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); - match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { - Ok(t) => { - t.start(); + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None } - Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), - } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; @@ -349,7 +354,7 @@ async fn main() -> anyhow::Result<()> { >::new( cancel_map.clone(), redis_publisher, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT, + proxy::metrics::CancellationSource::FromClient, )); // client facing tasks. these will exit on error or on cancellation @@ -387,7 +392,14 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { @@ -507,8 +519,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), + console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + &Metrics::get().wake_compute_lock, + ) + .unwrap(), )); tokio::spawn(locks.garbage_collect_worker(epoch)); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 6151513614..34512e9f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use crate::{ error::ReportableError, - metrics::NUM_CANCELLATION_REQUESTS, + metrics::{CancellationRequest, CancellationSource, Metrics}, redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }, @@ -28,7 +28,7 @@ pub struct CancellationHandler

{ client: P, /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. - from: &'static str, + from: CancellationSource, } #[derive(Debug, Error)] @@ -89,9 +89,13 @@ impl CancellationHandler

{ // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "not_found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); match self.client.try_publish(key, session_id).await { Ok(()) => {} // do nothing Err(e) => { @@ -103,9 +107,13 @@ impl CancellationHandler

{ } return Ok(()); }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -122,7 +130,7 @@ impl CancellationHandler

{ } impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: &'static str) -> Self { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { Self { map, client: (), @@ -132,7 +140,7 @@ impl CancellationHandler<()> { } impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: &'static str) -> Self { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { Self { map, client, from } } } @@ -192,15 +200,13 @@ impl

Drop for Session

{ #[cfg(test)] mod tests { - use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS; - use super::*; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( CancelMap::default(), - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + CancellationSource::FromRedis, )); let session = cancellation_handler.clone().get_session(); @@ -214,7 +220,7 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new(Default::default(), "local"); + let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ee33b97fbd..149a619316 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -4,12 +4,11 @@ use crate::{ console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_DB_CONNECTIONS_GAUGE, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -249,7 +248,7 @@ pub struct PostgresConnection { /// Labels for proxy's metrics. pub aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { @@ -295,9 +294,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), }; Ok(connection) diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 45161f5ac8..9869b95768 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,3 +1,4 @@ +use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use std::fmt; @@ -102,7 +103,7 @@ pub struct MetricsAuxInfo { pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] @@ -110,9 +111,11 @@ pub enum ColdStartInfo { /// Compute was already running Warm, #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] /// Compute was not running but there was an available VM VmPoolHit, #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] /// Compute was not running and there were no VMs available VmPoolMiss, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index f7d621fb12..b9502f0722 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -13,6 +13,7 @@ use crate::{ config::{CacheOptions, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, + metrics::ApiLockMetrics, scram, EndpointCacheKey, }; use dashmap::DashMap; @@ -441,10 +442,7 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + metrics: &'static ApiLockMetrics, } impl ApiLocks { @@ -453,54 +451,14 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + metrics: &'static ApiLockMetrics, ) -> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), permits, timeout, - lock_acquire_lag, - registered, - unregistered, - reclamation_lag, + metrics, }) } @@ -520,7 +478,7 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); + self.metrics.semaphores_registered.inc(); Arc::new(Semaphore::new(self.permits)) }) .clone() @@ -528,8 +486,9 @@ impl ApiLocks { }; let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: Some(permit??), @@ -554,13 +513,13 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 1a3e2ca795..9ac1900324 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,13 +7,14 @@ use super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, -}; -use crate::{ - cache::Cached, - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + auth::backend::ComputeUserInfo, + compute, + console::messages::ColdStartInfo, + http, + metrics::{CacheOutcome, Metrics}, + scram, }; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -95,7 +96,10 @@ impl Api { Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { secret, allowed_ips, @@ -206,14 +210,16 @@ impl super::Api for Api { ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let ep = &user_info.endpoint; if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fec95f4722..0094235921 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + metrics::{LatencyTimer, Metrics, Protocol}, DbName, EndpointId, RoleName, }; @@ -29,7 +29,7 @@ static LOG_CHAN: OnceCell> = OnceCell::ne pub struct RequestMonitoring { pub peer_addr: IpAddr, pub session_id: Uuid, - pub protocol: &'static str, + pub protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, pub span: Span, @@ -65,7 +65,7 @@ impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { let span = info_span!( @@ -102,7 +102,7 @@ impl RequestMonitoring { #[cfg(test)] pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } pub fn console_application_name(&self) -> String { @@ -134,9 +134,9 @@ impl RequestMonitoring { pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); self.endpoint_id = Some(endpoint_id); } } @@ -158,13 +158,11 @@ impl RequestMonitoring { } pub fn set_error_kind(&mut self, kind: ErrorKind) { - ERROR_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .inc(); + Metrics::get().proxy.errors_total.inc(kind); if let Some(ep) = &self.endpoint_id { - ENDPOINT_ERRORS_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .measure(ep); + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); } self.error_kind = Some(kind); } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index eb77409429..e061216d15 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -111,7 +111,7 @@ impl From<&RequestMonitoring> for RequestData { super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", }), - protocol: value.protocol, + protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 4614f3913d..fdfe50a494 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. pub fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. Not necessarily user error + #[label(rename = "clientdisconnect")] ClientDisconnect, /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] ServiceRateLimit, /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, /// Postgres error diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..95ca0ccd5c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -13,7 +13,11 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{ + metrics::{ConsoleRequest, Metrics}, + rate_limiter, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, @@ -90,13 +94,14 @@ impl Endpoint { /// Execute a [request](reqwest::Request). pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index cbb17ebcb7..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,30 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use std::{convert::Infallible, net::TcpListener}; -use tracing::info; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; use utils::http::{ - endpoint::{self, prometheus_metrics_handler, request_span}, + endpoint::{self, request_span}, error::ApiError, json::json_response, RouterBuilder, RouterService, }; +use crate::jemalloc; + async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + endpoint::make_router() - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) .get("/v1/status", status_handler) } -pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) @@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index ed20798d56..3243e6a140 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,27 +1,45 @@ -use std::time::Duration; +use std::marker::PhantomData; -use metrics::IntGauge; -use prometheus::{register_int_gauge_with_registry, Registry}; +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, + MetricEncoding, MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { epoch: epoch_mib, - active: stats::active_mib, - active_gauge: IntGauge, - allocated: stats::allocated_mib, - allocated_gauge: IntGauge, - mapped: stats::mapped_mib, - mapped_gauge: IntGauge, - metadata: stats::metadata_mib, - metadata_gauge: IntGauge, - resident: stats::resident_mib, - resident_gauge: IntGauge, - retained: stats::retained_mib, - retained_gauge: IntGauge, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } } impl MetricRecorder { - pub fn new(registry: &Registry) -> Result { + pub fn new() -> Result { tracing::info!( config = config::malloc_conf::read()?, version = version::read()?, @@ -30,71 +48,69 @@ impl MetricRecorder { Ok(Self { epoch: epoch::mib()?, - active: stats::active::mib()?, - active_gauge: register_int_gauge_with_registry!( - "jemalloc_active_bytes", - "Total number of bytes in active pages allocated by the process", - registry - )?, - allocated: stats::allocated::mib()?, - allocated_gauge: register_int_gauge_with_registry!( - "jemalloc_allocated_bytes", - "Total number of bytes allocated by the process", - registry - )?, - mapped: stats::mapped::mib()?, - mapped_gauge: register_int_gauge_with_registry!( - "jemalloc_mapped_bytes", - "Total number of bytes in active extents mapped by the allocator", - registry - )?, - metadata: stats::metadata::mib()?, - metadata_gauge: register_int_gauge_with_registry!( - "jemalloc_metadata_bytes", - "Total number of bytes dedicated to jemalloc metadata", - registry - )?, - resident: stats::resident::mib()?, - resident_gauge: register_int_gauge_with_registry!( - "jemalloc_resident_bytes", - "Total number of bytes in physically resident data pages mapped by the allocator", - registry - )?, - retained: stats::retained::mib()?, - retained_gauge: register_int_gauge_with_registry!( - "jemalloc_retained_bytes", - "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", - registry - )?, - }) - } - - fn _poll(&self) -> Result<(), anyhow::Error> { - self.epoch.advance()?; - self.active_gauge.set(self.active.read()? as i64); - self.allocated_gauge.set(self.allocated.read()? as i64); - self.mapped_gauge.set(self.mapped.read()? as i64); - self.metadata_gauge.set(self.metadata.read()? as i64); - self.resident_gauge.set(self.resident.read()? as i64); - self.retained_gauge.set(self.retained.read()? as i64); - Ok(()) - } - - #[inline] - pub fn poll(&self) { - if let Err(error) = self._poll() { - tracing::warn!(%error, "Failed to poll jemalloc stats"); - } - } - - pub fn start(self) -> tokio::task::JoinHandle<()> { - tokio::task::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(15)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - loop { - self.poll(); - interval.tick().await; - } + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, }) } } + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 59ee899c08..78840f5983 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,176 +1,356 @@ -use ::metrics::{ - exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, -}; -use metrics::{ - register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter, - IntCounterPair, -}; +use std::sync::OnceLock; + +use lasso::ThreadedRodeo; +use measured::{ + label::StaticLabelSet, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, + LabelGroup, MetricGroup, +}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; use tokio::time::{self, Instant}; use crate::console::messages::ColdStartInfo; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +pub struct Metrics { + #[metric(namespace = "proxy")] + pub proxy: ProxyMetrics, -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); + // the one metric not called proxy_.... + pub semaphore_control_plane_limit: GaugeVec>, +} -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 6 * 2 * 2 = 72 counters - &["protocol", "cold_start_info", "outcome", "excluded"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); +impl Metrics { + pub fn get() -> &'static Self { + static SELF: OnceLock = OnceLock::new(); + SELF.get_or_init(|| Metrics { + proxy: ProxyMetrics::default(), + wake_compute_lock: ApiLockMetrics::new(), + semaphore_control_plane_limit: GaugeVec::default(), + }) + } +} -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], +#[derive(MetricGroup)] +#[metric(new())] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, + + /// Time it took for proxy to establish a connection to the compute endpoint. + // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); + /// Size of the HTTP request body lengths. + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); + /// Time it takes to reclaim unused connection pools. + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_http_conn_content_length_bytes", - "Number of bytes the HTTP response content consumes", - // request/response - &["direction"], - // smallest bucket = 16 bytes - // largest bucket = 4^12 * 16 bytes = 256MB - exponential_buckets(16.0, 4.0, 12).unwrap() - ) - .unwrap() -}); + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); + /// Number of connection failures (per kind). + pub connection_failures_total: CounterVec>, -pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { - register_int_gauge!( - "proxy_http_pool_opened_connections", - "Number of opened connections to a database.", - ) - .unwrap() -}); + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, -pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_cancellation_requests_total", - "Number of cancellation requests (per found/not_found).", - &["source", "kind"], - ) - .unwrap() -}); + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). + pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ProxyMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + Hit, + Miss, +} + +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "limit")] +pub enum RateLimit { + Actual, + Expected, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} pub enum Waiting { Cplane, @@ -185,20 +365,6 @@ struct Accumulated { compute: time::Duration, } -enum Outcome { - Success, - Failed, -} - -impl Outcome { - fn as_str(&self) -> &'static str { - match self { - Outcome::Success => "success", - Outcome::Failed => "failed", - } - } -} - pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, @@ -207,9 +373,9 @@ pub struct LatencyTimer { // accumulated time on the stopwatch accumulated: Accumulated, // label data - protocol: &'static str, + protocol: Protocol, cold_start_info: ColdStartInfo, - outcome: Outcome, + outcome: ConnectOutcome, } pub struct LatencyTimerPause<'a> { @@ -219,7 +385,7 @@ pub struct LatencyTimerPause<'a> { } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, @@ -227,7 +393,7 @@ impl LatencyTimer { protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: Outcome::Failed, + outcome: ConnectOutcome::Failed, } } @@ -248,7 +414,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = Outcome::Success; + self.outcome = ConnectOutcome::Success; } } @@ -263,128 +429,54 @@ impl Drop for LatencyTimerPause<'_> { } } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +enum ConnectOutcome { + Success, + Failed, +} + impl Drop for LatencyTimer { fn drop(&mut self) { let duration = self .stop .unwrap_or_else(time::Instant::now) .duration_since(self.start); - // Excluding cplane communication from the accumulated time. - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client", - ]) - .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + // Exclude client and cplane communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client_and_cplane", - ]) - .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } - -pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_connecting_endpoints", - "HLL approximate cardinality of endpoints that are connecting", - &["protocol"], - ) - .unwrap() -}); - -pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_errors_total", - "Number of errors by a given classification", - &["type"], - ) - .unwrap() -}); - -pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_endpoints_affected_by_errors", - "Number of endpoints affected by errors of a given classification", - &["type"], - ) - .unwrap() -}); - -pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_redis_errors_total", - "Number of errors by a given classification", - &["channel"], - ) - .unwrap() -}); - -pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_tls_handshake_failures", - "Number of TLS handshake failures", - ) - .unwrap() -}); - -pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy> = Lazy::new(|| { - register_hll!( - 32, - "proxy_endpoints_auth_rate_limits", - "Number of endpoints affected by authentication rate limits", - ) - .unwrap() -}); - -pub static AUTH_RATE_LIMIT_HITS: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_requests_auth_rate_limits_total", - "Number of connection requests affected by authentication rate limits", - ) - .unwrap() -}); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..5598215b6b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -15,7 +15,7 @@ use crate::{ config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, error::ReportableError, - metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, + metrics::{Metrics, NumClientConnectionsGuard}, protocol2::WithClientIp, proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, @@ -24,7 +24,6 @@ use crate::{ }; use futures::TryFutureExt; use itertools::Itertools; -use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; @@ -79,9 +78,10 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["tcp"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); @@ -113,7 +113,12 @@ pub async fn task_main( }, }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let mut ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); let span = ctx.span.clone(); let res = handle_client( @@ -237,14 +242,17 @@ pub async fn handle_client( stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, - conn_gauge: IntCounterPairGuard, + conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!("handling interactive connection from client"); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); + let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); + // let _client_gauge = metrics.client_connections.guard(proto); + let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 4c0d68ce0b..33f394c550 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -4,7 +4,7 @@ use crate::{ console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::NUM_CONNECTION_FAILURES, + metrics::{ConnectionFailureKind, Metrics}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, @@ -27,10 +27,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { warn!("invalidating stalled compute node info cache entry"); } let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + true => ConnectionFailureKind::ComputeCached, + false => ConnectionFailureKind::ComputeUncached, }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); node_info.invalidate() } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c81a1a8292..62de79946f 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -2,11 +2,10 @@ use crate::{ cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, - metrics::NUM_BYTES_PROXIED_COUNTER, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, stream::Stream, usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; -use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; @@ -23,24 +22,25 @@ pub async fn proxy_pass( branch_id: aux.branch_id, }); - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); + metrics.get_metric(m_sent).inc_by(cnt as u64); usage.record_egress(cnt as u64); }, ); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let m_recv = metrics.with_labels(Direction::Rx); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); + metrics.get_metric(m_recv).inc_by(cnt as u64); }, ); @@ -60,8 +60,8 @@ pub struct ProxyPassthrough { pub compute: PostgresConnection, pub aux: MetricsAuxInfo, - pub req: IntCounterPairGuard, - pub conn: IntCounterPairGuard, + pub req: NumConnectionRequestsGuard<'static>, + pub conn: NumClientConnectionsGuard<'static>, pub cancel: cancellation::Session

, } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index bfe4b7ec3a..f8154b1a94 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,6 +1,6 @@ use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind}; use crate::proxy::retry::retry_after; use hyper::StatusCode; use std::ops::ControlFlow; @@ -57,39 +57,46 @@ pub fn handle_try_wake( fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; - let retry = bool_to_str(retry); let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, ref text, }) if text.contains("written data quota exceeded") || text.contains("the limit for current plan reached") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::UNPROCESSABLE_ENTITY, ref text, }) if text.contains("compute time quota of non-primary branches is exceeded") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, .. - }) => "api_console_locked", + }) => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - }) => "api_console_bad_request", + }) => WakeupFailureKind::ApiConsoleBadRequest, WakeComputeError::ApiError(ApiError::Console { status, .. }) if status.is_server_error() => { - "api_console_other_server_error" + WakeupFailureKind::ApiConsoleOtherServerError } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", + WakeComputeError::ApiError(ApiError::Console { .. }) => { + WakeupFailureKind::ApiConsoleOtherError + } + WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..aba5120f38 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -17,7 +17,13 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; use tokio::time::{timeout, Duration, Instant}; use tracing::info; -use crate::{intern::EndpointIdInt, EndpointId}; +use crate::{ + intern::EndpointIdInt, + { + metrics::{Metrics, RateLimit}, + EndpointId, + }, +}; use super::{ limit_algorithm::{LimitAlgorithm, Sample}, @@ -457,12 +463,9 @@ impl Limiter { } new_limit }; - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["expected"]) - .set(new_limit as i64); - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["actual"]) - .set(actual_limit as i64); + let metric = &Metrics::get().semaphore_control_plane_limit; + metric.set(RateLimit::Expected, new_limit as i64); + metric.set(RateLimit::Actual, actual_limit as i64); self.limits.store(new_limit, Ordering::Release); #[cfg(test)] if let Some(n) = &self.notifier { @@ -519,7 +522,10 @@ impl reqwest_middleware::Middleware for Limiter { extensions: &mut task_local_extensions::Extensions, next: reqwest_middleware::Next<'_>, ) -> reqwest_middleware::Result { - let start = Instant::now(); + let timer = Metrics::get() + .proxy + .control_plane_token_acquire_seconds + .start_timer(); let token = self .acquire_timeout(self.config.timeout) .await @@ -533,8 +539,12 @@ impl reqwest_middleware::Middleware for Limiter { .into(), ) })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); + let duration = timer.observe(); + info!( + ?duration, + "waiting for token to connect to the control plane" + ); + match next.run(req, extensions).await { Ok(response) => { self.release(token, Some(Outcome::from_reqwest_response(&response))) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 8b7e3e3419..5a38530faf 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -11,7 +11,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES}, + metrics::{Metrics, RedisErrors}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -104,9 +104,9 @@ impl MessageHandler { let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[msg.get_channel_name()]) - .inc(); + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); tracing::error!("broken message: {e}"); return Ok(()); } @@ -183,7 +183,7 @@ where cache, Arc::new(CancellationHandler::<()>::new( cancel_map, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + crate::metrics::CancellationSource::FromRedis, )), region_id, ); diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index f275caa7eb..24c94fadd8 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -32,7 +32,7 @@ use tokio_util::task::TaskTracker; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::metrics::{NUM_CLIENT_CONNECTION_GAUGE, TLS_HANDSHAKE_FAILURES}; +use crate::metrics::Metrics; use crate::protocol2::WithClientIp; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; @@ -156,9 +156,10 @@ async fn connection_handler( ) { let session_id = uuid::Uuid::new_v4(); - let _gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["http"]) - .guard(); + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); // handle PROXY protocol let mut conn = WithClientIp::new(conn); @@ -181,13 +182,13 @@ async fn connection_handler( } // The handshake failed Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); + Metrics::get().proxy.tls_handshake_failures.inc(); warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } // The handshake timed out Err(e) => { - TLS_HANDSHAKE_FAILURES.inc(); + Metrics::get().proxy.tls_handshake_failures.inc(); warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } @@ -274,7 +275,13 @@ async fn request_handler( // Check if the request is a websocket upgrade request. if hyper_tungstenite::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); + let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); @@ -302,7 +309,12 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { - let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 35311facb8..131f088880 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,6 +1,5 @@ use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; @@ -18,11 +17,10 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE, - DbName, EndpointCacheKey, RoleName, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; use tracing::{debug, error, warn, Span}; @@ -78,7 +76,7 @@ pub struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, } @@ -110,7 +108,11 @@ impl EndpointConnPool { let removed = old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); } *total_conns -= removed; removed > 0 @@ -156,7 +158,11 @@ impl EndpointConnPool { pool.total_conns += 1; pool.global_connections_count .fetch_add(1, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -176,7 +182,11 @@ impl Drop for EndpointConnPool { if self.total_conns > 0 { self.global_connections_count .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); } } } @@ -215,7 +225,11 @@ impl DbUserConnPool { removed += 1; } global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } @@ -303,7 +317,10 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; shard.retain(|endpoint, x| { @@ -331,7 +348,7 @@ impl GlobalConnPool { let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); // Do logging outside of the lock. if clients_removed > 0 { @@ -339,7 +356,11 @@ impl GlobalConnPool { .global_connections_count .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - clients_removed; - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); } let removed = current_len - new_len; @@ -410,7 +431,7 @@ impl GlobalConnPool { pools: HashMap::new(), total_conns: 0, max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); @@ -450,9 +471,7 @@ pub fn poll_client( conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); let mut session_id = ctx.session_id; let (tx, mut rx) = tokio::sync::watch::channel(session_id); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7f7f93988c..a66edb2c66 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -43,8 +43,8 @@ use crate::context::RequestMonitoring; use crate::error::ErrorKind; use crate::error::ReportableError; use crate::error::UserFacingError; -use crate::metrics::HTTP_CONTENT_LENGTH; -use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::metrics::HttpDirection; +use crate::metrics::Metrics; use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; @@ -494,10 +494,11 @@ async fn handle_inner( request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - info!("handling interactive connection from client"); + let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params @@ -520,9 +521,10 @@ async fn handle_inner( None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH - .with_label_values(&["request"]) - .observe(request_content_length as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -607,9 +609,10 @@ async fn handle_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... // moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); - HTTP_CONTENT_LENGTH - .with_label_values(&["response"]) - .observe(len as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Response, len as f64); Ok(response) } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index ada6c974f4..d054877126 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,7 +3,7 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, - metrics::NUM_CLIENT_CONNECTION_GAUGE, + metrics::Metrics, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -139,9 +139,10 @@ pub async fn serve_websocket( endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["ws"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); let res = handle_client( config, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index b6b7a85659..fdd2be3ee5 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::metrics::Metrics; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -228,7 +228,7 @@ impl Stream { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), + .inspect_err(|_| Metrics::get().proxy.tls_handshake_failures.inc())?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } From 40f15c31235242ffdefc8b3662ba252cec55377e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 11 Apr 2024 20:24:34 +0200 Subject: [PATCH 20/24] Read cplane events from regional redis (#7352) ## Problem Actually read redis events. ## Summary of changes This is revert of https://github.com/neondatabase/neon/pull/7350 + fixes. * Fixed events parsing * Added timeout after connection failure * Separated regional and global redis clients. --- proxy/src/auth/backend.rs | 4 +- proxy/src/bin/proxy.rs | 61 +++-- proxy/src/cache.rs | 1 + proxy/src/cache/endpoints.rs | 226 ++++++++++++++++++ proxy/src/config.rs | 74 ++++++ proxy/src/console/provider.rs | 17 +- proxy/src/console/provider/neon.rs | 47 ++-- proxy/src/context.rs | 22 +- proxy/src/intern.rs | 15 ++ proxy/src/lib.rs | 37 +++ proxy/src/metrics.rs | 13 +- proxy/src/proxy.rs | 4 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 10 +- proxy/src/redis/cancellation_publisher.rs | 6 +- .../regress/test_proxy_rate_limiter.py | 84 ------- 16 files changed, 479 insertions(+), 144 deletions(-) create mode 100644 proxy/src/cache/endpoints.rs delete mode 100644 test_runner/regress/test_proxy_rate_limiter.py diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 229d499e30..ab5dd4544b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -27,7 +27,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -186,7 +186,7 @@ impl AuthenticationConfig { is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3392c21075..2e749fc7e8 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -190,7 +190,9 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -301,27 +303,27 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let redis_notifications_client = - match (args.redis_notifications, (args.redis_host, args.redis_port)) { - (Some(url), _) => { - info!("Starting redis notifications listener ({url})"); - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } - (None, (Some(host), Some(port))) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), - ), + let regional_redis_client = match (args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + elasticache_credentials_provider.clone(), ), - (None, (None, None)) => { - warn!("Redis is disabled"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }; + ), + (None, None) => { + warn!("Redis events from console are disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -340,8 +342,7 @@ async fn main() -> anyhow::Result<()> { let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x))); - let redis_publisher = match &redis_notifications_client { + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), @@ -416,13 +417,18 @@ async fn main() -> anyhow::Result<()> { if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( - redis_notifications_client.clone(), + redis_notifications_client, cache.clone(), cancel_map.clone(), args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + maintenance_tasks.spawn(async move { cache.do_read(con).await }); + } } } @@ -501,14 +507,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -524,11 +534,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { permits, shards, timeout, + epoch, &Metrics::get().wake_compute_lock, ) .unwrap(), )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..f3f9e9395f --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,226 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + // If cache is disabled, just collect the metrics and return. + if self.config.disable_cache { + ctx.set_rejected(self.should_reject(endpoint)); + return true; + } + // If the limiter allows, we don't need to check the cache. + if self.limiter.lock().await.check() { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. + if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + self.ready.store(false, Ordering::Release); + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + continue; + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. + continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index fc490c7348..b4b2ce8dbd 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,6 +313,80 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index b9502f0722..3fa7221f98 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,9 +8,9 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, metrics::ApiLockMetrics, @@ -417,12 +417,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -432,6 +435,7 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -442,6 +446,7 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, + epoch: std::time::Duration, metrics: &'static ApiLockMetrics, } @@ -451,6 +456,7 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, metrics: &'static ApiLockMetrics, ) -> prometheus::Result { Ok(Self { @@ -458,6 +464,7 @@ impl ApiLocks { node_locks: DashMap::with_shard_amount(shards), permits, timeout, + epoch, metrics, }) } @@ -495,12 +502,12 @@ impl ApiLocks { }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + pub async fn garbage_collect_worker(&self) { if self.permits == 0 { return; } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 9ac1900324..138acdf578 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::ColdStartInfo, http, metrics::{CacheOutcome, Metrics}, - scram, + scram, Normalize, }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; @@ -24,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, jwt: String, } @@ -56,6 +56,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -82,7 +91,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -178,23 +189,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - ep_int, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -208,8 +223,8 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { Metrics::get() .proxy .allowed_ips_cache_misses @@ -224,16 +239,18 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 0094235921..dc475d57ed 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, Metrics, Protocol}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, DbName, EndpointId, RoleName, }; @@ -50,6 +50,8 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. + rejected: bool, } #[derive(Clone, Debug)] @@ -93,6 +95,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -113,6 +116,10 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = rejected; + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -176,6 +183,19 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: self.rejected.into(), + outcome, + }); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod tests { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..3f6d985fe8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,22 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// Endpoints are a bit tricky. Rare they might be branches or projects. +impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 78840f5983..b96950b0a2 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -120,6 +120,9 @@ pub struct ProxyMetrics { /// Number of endpoints affected by authentication rate limits pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, } #[derive(MetricGroup)] @@ -430,7 +433,7 @@ impl Drop for LatencyTimerPause<'_> { } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] -enum ConnectOutcome { +pub enum ConnectOutcome { Success, Failed, } @@ -480,3 +483,11 @@ impl From for Bool { } } } + +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] +pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5598215b6b..42fb10b326 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use crate::{ proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, + EndpointCacheKey, Normalize, }; use futures::TryFutureExt; use itertools::Itertools; @@ -288,7 +288,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { + if !endpoint_rate_limiter.check(ep.normalize(), 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..a3b83e5e50 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index aba5120f38..7e9370f606 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -30,13 +30,13 @@ use super::{ RateLimiterConfig, }; -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -56,7 +56,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. - time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] From e92fb94149967d5eca3eccddcdd718149d3d7031 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 11 Apr 2024 21:55:05 +0100 Subject: [PATCH 21/24] proxy: fix overloaded db connection closure (#7364) ## Problem possible for the database connections to not close in time. ## Summary of changes force the closing of connections if the client has hung up --- proxy/src/serverless/conn_pool.rs | 36 +++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 131f088880..798e488509 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -15,6 +15,7 @@ use std::{ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; @@ -488,15 +489,32 @@ pub fn poll_client( let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session_id = *rx.borrow_and_update(); - info!(%session_id, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) + } + _ => {} } // 5 minute idle connection timeout @@ -551,6 +569,7 @@ pub fn poll_client( let inner = ClientInner { inner: client, session: tx, + cancel, aux, conn_id, }; @@ -560,10 +579,18 @@ pub fn poll_client( struct ClientInner { inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + pub trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; @@ -716,6 +743,7 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: CancellationToken::new(), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), From 94505fd67288e0301c32763348c7b75f0b63e514 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 11 Apr 2024 23:35:30 +0100 Subject: [PATCH 22/24] CI: speed up Allure reports upload (#7362) ## Problem `create-test-report` job takes more than 8 minutes, the longest step is uploading Allure report to S3: Before: ``` + aws s3 cp --recursive --only-show-errors /tmp/pr-7362-1712847045/report s3://neon-github-public-dev/reports/pr-7362/8647730612 real 6m10.572s user 6m37.717s sys 1m9.429s ``` After: ``` + s5cmd --log error cp '/tmp/pr-7362-1712858221/report/*' s3://neon-github-public-dev/reports/pr-7362/8650636861/ real 0m9.698s user 1m9.438s sys 0m6.419s ``` ## Summary of changes - Add `s5cmd`(https://github.com/peak/s5cmd) to build-tools image - Use `s5cmd` instead of `aws s3` for uploading Allure reports --- .github/actions/allure-report-generate/action.yml | 2 +- Dockerfile.build-tools | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 1ecb5ecc7e..f84beff20c 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -150,7 +150,7 @@ runs: # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # and to keep files on the host to upload them to the database - time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}" + time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" # Generate redirect cat < ${WORKDIR}/index.html diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 1ed6f87473..a082f15c34 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,6 +58,12 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM ENV LLVM_VERSION=17 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ From e8338c60f9c048e27c38fb8212ac96b542cbfcff Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 11 Apr 2024 23:42:18 -0500 Subject: [PATCH 23/24] Fix typo in pg_ctl shutdown mode (#7365) The allowed modes as of Postgres 17 are: smart, fast, and immediate. $ cargo neon stop Finished dev [unoptimized + debuginfo] target(s) in 0.24s Running `target/debug/neon_local stop` postgres stop failed: pg_ctl failed, exit code: exit status: 1, stdout: , stderr: pg_ctl: unrecognized shutdown mode "fast " Try "pg_ctl --help" for more information. --- control_plane/src/bin/neon_local.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 56495dd2da..68a5474c87 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1231,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } From 5288f9621e2c84e912ca972e3a7bbf597884be49 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:15:40 +0100 Subject: [PATCH 24/24] build(deps): bump idna from 3.3 to 3.7 (#7367) --- poetry.lock | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7b49daf42a..aca88073a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -1191,13 +1191,13 @@ files = [ [[package]] name = "idna" -version = "3.3" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] @@ -2182,6 +2182,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2652,6 +2653,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},