diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index 0ef4080495..3fc5b076f3 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -91,12 +91,12 @@
 use std::{sync::Arc, time::Duration};
 
-use once_cell::sync::Lazy;
+
 use tracing::warn;
 use utils::{id::TimelineId, shard::TenantShardId};
 
 use crate::{
-    metrics::{StorageIoSizeMetrics, TimelineMetrics},
+    metrics::TimelineMetrics,
     task_mgr::TaskKind,
     tenant::Timeline,
 };
@@ -122,38 +122,34 @@ pub struct RequestContext {
 #[derive(Clone)]
 pub(crate) enum Scope {
     Global {
-        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+
     },
     SecondaryTenant {
-        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+
     },
     SecondaryTimeline {
-        io_size_metrics: crate::metrics::StorageIoSizeMetrics,
+
     },
     Timeline {
-        // We wrap the `Arc`s inside another Arc to avoid child
+        // We wrap the `Arc`s inside another Arc to avoid child
         // context creation contending for the ref counters of the Arc,
         // which are shared among all tasks that operate on the timeline, especially
         // concurrent page_service connections.
         #[allow(clippy::redundant_allocation)]
-        arc_arc: Arc<Arc<TimelineMetrics>>,
-    },
+        arc_arc: Arc<Arc<TimelineMetrics>>, },
     #[cfg(test)]
     UnitTest {
-        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+
     },
     DebugTools {
-        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
+
     },
 }
 
-static GLOBAL_IO_SIZE_METRICS: Lazy<StorageIoSizeMetrics> =
-    Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));
 
 impl Scope {
     pub(crate) fn new_global() -> Self {
         Scope::Global {
-            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
         }
     }
 
     /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
@@ -173,18 +169,13 @@ impl Scope {
         }
     }
 
     pub(crate) fn new_secondary_timeline(
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
+        _tenant_shard_id: &TenantShardId,
+        _timeline_id: &TimelineId,
     ) -> Self {
         // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_id = tenant_shard_id.shard_slug().to_string();
-        let timeline_id = timeline_id.to_string();
-        let io_size_metrics =
-            crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
-        Scope::SecondaryTimeline { io_size_metrics }
+        Scope::SecondaryTimeline { }
     }
 
     pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
         // Before propagating metrics via RequestContext, the labels were inferred from file path.
@@ -197,19 +188,19 @@ impl Scope {
         // like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
         // at this point, so, we were able to completely side-step tenant-scoped stuff there).
         Scope::SecondaryTenant {
-            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
+
         }
     }
 
     #[cfg(test)]
     pub(crate) fn new_unit_test() -> Self {
         Scope::UnitTest {
-            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
+
         }
     }
 
     pub(crate) fn new_debug_tools() -> Self {
         Scope::DebugTools {
-            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
+
         }
     }
 }
@@ -528,41 +519,6 @@ impl RequestContext {
         self.read_path_debug
     }
 
-    pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
-        match &self.scope {
-            Scope::Global { io_size_metrics } => {
-                let is_unit_test = cfg!(test);
-                let is_regress_test_build = cfg!(feature = "testing");
-                if is_unit_test || is_regress_test_build {
-                    panic!("all VirtualFile instances are timeline-scoped");
-                } else {
-                    use once_cell::sync::Lazy;
-                    use std::sync::Mutex;
-                    use std::time::Duration;
-                    use utils::rate_limit::RateLimit;
-                    static LIMIT: Lazy<Mutex<RateLimit>> =
-                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
-                    let mut guard = LIMIT.lock().unwrap();
-                    guard.call2(|rate_limit_stats| {
-                        warn!(
-                            %rate_limit_stats,
-                            backtrace=%std::backtrace::Backtrace::force_capture(),
-                            "all VirtualFile instances are timeline-scoped",
-                        );
-                    });
-
-                    io_size_metrics
-                }
-            }
-            Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
-            Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
-            Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
-            #[cfg(test)]
-            Scope::UnitTest { io_size_metrics } => io_size_metrics,
-            Scope::DebugTools { io_size_metrics } => io_size_metrics,
-        }
-    }
-
     pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
         if duration == Duration::ZERO {
             return;
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 8c88863dd8..8338ce4fbb 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -7,10 +7,10 @@ use enum_map::{Enum as _, EnumMap};
 use futures::Future;
 use metrics::{
     CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter,
-    IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    IntCounterPairVec, IntCounterVec, IntGaugeVec, UIntGauge, UIntGaugeVec,
     register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
     register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
-    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
+    register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::config::{
@@ -626,59 +626,6 @@ pub(crate) enum StorageIoOperation {
 }
 
-#[derive(Clone, Copy)]
-#[repr(usize)]
-pub(crate) enum StorageIoSizeOperation {
-    Read,
-    Write,
-}
-
-impl StorageIoSizeOperation {
-    pub(crate) const VARIANTS: &'static [&'static str] = &["read", "write"];
-
-    fn as_str(&self) -> &'static str {
-        Self::VARIANTS[*self as usize]
-    }
-}
-
-// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
-pub(crate) static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_io_operations_bytes_total",
-        "Total amount of bytes read/written in IO operations",
-        &["operation", "tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
-#[derive(Clone, Debug)]
-pub(crate) struct StorageIoSizeMetrics {
-    pub read: UIntGauge,
-    pub write: UIntGauge,
-}
-
-impl StorageIoSizeMetrics {
-    pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
-        let read = STORAGE_IO_SIZE
-            .get_metric_with_label_values(&[
-                StorageIoSizeOperation::Read.as_str(),
-                tenant_id,
-                shard_id,
-                timeline_id,
-            ])
-            .unwrap();
-        let write = STORAGE_IO_SIZE
-            .get_metric_with_label_values(&[
-                StorageIoSizeOperation::Write.as_str(),
-                tenant_id,
-                shard_id,
-                timeline_id,
-            ])
-            .unwrap();
-        Self { read, write }
-    }
-}
-
 #[cfg(not(test))]
 pub(crate) mod virtual_file_descriptor_cache {
     use super::*;
@@ -941,19 +888,9 @@ static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec>
     .expect("failed to define a metric")
 });
 
-pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "pageserver_page_service_config_max_batch_size",
-        "Configured maximum batch size for the server-side batching functionality of page_service. \
-         Labels expose more of the configuration parameters.",
-        &["mode", "execution", "batching"]
-    )
-    .expect("failed to define a metric")
-});
 
 fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
-    PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
-    let (label_values, value) = match conf {
+    let (_label_values, _value) = match conf {
         PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
         PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
             max_batch_size,
@@ -975,9 +912,6 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
             ([mode, execution, batching], max_batch_size.get())
         }
     };
-    PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
-        .with_label_values(&label_values)
-        .set(value.try_into().unwrap());
 }
 
 static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1329,14 +1263,6 @@ pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(
     .expect("failed to define a metric")
 });
 
-pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!(
-        "pageserver_walreceiver_active_managers",
-        "Number of active walreceiver managers"
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
@@ -1346,21 +1272,6 @@ pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_walreceiver_candidates_events_total",
-        "Number of walreceiver candidate events",
-        &["event"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
-    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
-
-pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
-    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
-
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
@@ -1670,7 +1581,6 @@ pub(crate) struct TimelineMetrics {
     pub garbage_collect_histo: StorageTimeMetrics,
     pub find_gc_cutoffs_histo: StorageTimeMetrics,
     /// copy of LayeredTimeline.current_logical_size
-    pub storage_io_size: StorageIoSizeMetrics,
     shutdown: std::sync::atomic::AtomicBool,
 }
 
@@ -1726,9 +1636,6 @@ impl TimelineMetrics {
             &timeline_id,
         );
 
-
-        let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
-
         TimelineMetrics {
             tenant_id,
             shard_id,
@@ -1740,7 +1647,6 @@ impl TimelineMetrics {
             imitate_logical_size_histo,
             garbage_collect_histo,
             find_gc_cutoffs_histo,
-            storage_io_size,
             shutdown: std::sync::atomic::AtomicBool::default(),
         }
     }
@@ -1835,10 +1741,6 @@ impl TimelineMetrics {
         ]);
         }
 
-        for op in StorageIoSizeOperation::VARIANTS {
-            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
-        }
-
         let _ =
             WAIT_LSN_IN_PROGRESS_MICROS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
 
@@ -2259,8 +2161,6 @@ pub fn preinitialize_metrics(
     [
         &WALRECEIVER_STARTED_CONNECTIONS,
         &WALRECEIVER_BROKER_UPDATES,
-        &WALRECEIVER_CANDIDATES_ADDED,
-        &WALRECEIVER_CANDIDATES_REMOVED,
         &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
         &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
         &REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index d0ac49b1a8..847e15aa49 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -4,7 +4,6 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};
 
-use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation};
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
@@ -317,8 +316,8 @@ impl SecondaryDetail {
     }
 
     fn clear_timeline_metrics(
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
+        _tenant_shard_id: &TenantShardId,
+        _timeline_id: &TimelineId,
         detail: SecondaryDetailTimeline,
         resident_metric: &UIntGauge,
     ) {
@@ -330,17 +329,7 @@ impl SecondaryDetail {
                 .sum(),
         );
 
-        let shard_id = format!("{}", tenant_shard_id.shard_slug());
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let timeline_id = timeline_id.to_string();
-        for op in StorageIoSizeOperation::VARIANTS {
-            let _ = STORAGE_IO_SIZE.remove_label_values(&[
-                op,
-                tenant_id.as_str(),
-                shard_id.as_str(),
-                timeline_id.as_str(),
-            ]);
-        }
+
     }
 
     /// Additionally returns the total number of layers, used for more stable relative access time
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index b698514be1..cad475813b 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -39,10 +39,6 @@ use utils::postgres_client::{
 use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
 use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::metrics::{
-    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
-    WALRECEIVER_CANDIDATES_REMOVED,
-};
 use crate::task_mgr::TaskKind;
 use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id};
 
@@ -76,11 +72,6 @@ pub(super) async fn connection_manager_loop_step(
         }
     }
 
-    WALRECEIVER_ACTIVE_MANAGERS.inc();
-    scopeguard::defer! {
-        WALRECEIVER_ACTIVE_MANAGERS.dec();
-    }
-
     let id = TenantTimelineId {
         tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
         timeline_id: connection_manager_state.timeline.timeline_id,
     };
@@ -728,8 +719,6 @@ impl ConnectionManagerState {
             }
         };
 
-        WALRECEIVER_BROKER_UPDATES.inc();
-
         trace!(
             "safekeeper info update: standby_horizon(cutoff)={}",
             timeline_update.standby_horizon
@@ -756,7 +745,6 @@ impl ConnectionManagerState {
                 %new_safekeeper_id,
                 "New SK node was added",
            );
-            WALRECEIVER_CANDIDATES_ADDED.inc();
         }
     }
 
@@ -1044,7 +1032,6 @@ impl ConnectionManagerState {
                     "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"
                 );
                 self.wal_connection_retries.remove(&node_id);
-                WALRECEIVER_CANDIDATES_REMOVED.inc();
             }
         }
     }
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 1f8d5cdcf7..8c9dc0d768 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -29,7 +29,6 @@ pub use pageserver_api::models::virtual_file as api;
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
 
-use crate::assert_u64_eq_usize::UsizeIsU64;
 use crate::context::RequestContext;
 use crate::page_cache::{PAGE_SZ, PageWriteGuard};
 pub(crate) mod io_engine;
@@ -905,7 +904,7 @@ impl VirtualFileInner {
         &self,
         buf: tokio_epoll_uring::Slice<Buf>,
         offset: u64,
-        ctx: &RequestContext,
+        _ctx: &RequestContext,
     ) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
     where
         Buf: tokio_epoll_uring::IoBufMut + Send,
@@ -922,9 +921,7 @@ impl VirtualFileInner {
         observe_duration!(StorageIoOperation::Read, {
             let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
             let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
-            if let Ok(size) = res {
-                ctx.io_size_metrics().read.add(size.into_u64());
-            }
+
             (buf, res)
         })
     }
@@ -945,7 +942,7 @@ impl VirtualFileInner {
         &self,
         buf: FullSlice<Buf>,
         offset: u64,
-        ctx: &RequestContext,
+        _ctx: &RequestContext,
     ) -> (FullSlice<Buf>, Result<usize, Error>) {
         let file_guard = match self.lock_file().await {
             Ok(file_guard) => file_guard,
@@ -954,9 +951,7 @@ impl VirtualFileInner {
         observe_duration!(StorageIoOperation::Write, {
             let ((_file_guard, buf), result) = io_engine::get().write_at(file_guard, offset, buf).await;
-            if let Ok(size) = result {
-                ctx.io_size_metrics().write.add(size.into_u64());
-            }
+
             (buf, result)
         })
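
Illustrative sketch (not part of the patch above): the StorageIoSizeMetrics code being removed follows the usual labelled-vector pattern — resolve the per-timeline child metric once via get_metric_with_label_values() so the hot I/O path only does an atomic add, and call remove_label_values() on timeline shutdown so the series does not linger in the exporter output. A minimal stand-alone version of that pattern, written against the prometheus crate directly rather than the repo's metrics wrapper (the metric name and label values below are made up):

    use once_cell::sync::Lazy;
    use prometheus::{IntGaugeVec, register_int_gauge_vec};

    static IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
        register_int_gauge_vec!(
            "example_io_bytes_total",
            "Bytes read/written, labelled per timeline",
            &["operation", "tenant_id", "shard_id", "timeline_id"]
        )
        .expect("failed to define a metric")
    });

    fn main() {
        // Resolve the labelled child once and keep the handle; afterwards the
        // hot path only performs an atomic add, not a label lookup.
        let read = IO_BYTES
            .get_metric_with_label_values(&["read", "tenant", "0001", "timeline"])
            .unwrap();
        read.add(4096);

        // On teardown the label set has to be dropped explicitly, otherwise the
        // stale series keeps being exported with its last value.
        let _ = IO_BYTES.remove_label_values(&["read", "tenant", "0001", "timeline"]);
    }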