remove some more metrics, still compiles

BodoBolero
2025-04-22 18:32:04 +02:00
parent 6d77432ed2
commit a12369be43
5 changed files with 25 additions and 198 deletions

View File

@@ -91,12 +91,12 @@
use std::{sync::Arc, time::Duration};
use once_cell::sync::Lazy;
use tracing::warn;
use utils::{id::TimelineId, shard::TenantShardId};
use crate::{
metrics::{StorageIoSizeMetrics, TimelineMetrics},
metrics::TimelineMetrics,
task_mgr::TaskKind,
tenant::Timeline,
};
@@ -122,38 +122,34 @@ pub struct RequestContext {
#[derive(Clone)]
pub(crate) enum Scope {
Global {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
SecondaryTenant {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
SecondaryTimeline {
io_size_metrics: crate::metrics::StorageIoSizeMetrics,
},
Timeline {
// We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
// context creation contending for the ref counters of the Arc<TimelineMetrics>,
// which are shared among all tasks that operate on the timeline, especially
// concurrent page_service connections.
#[allow(clippy::redundant_allocation)]
arc_arc: Arc<Arc<TimelineMetrics>>,
},
#[cfg(test)]
UnitTest {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
DebugTools {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
}
static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));
impl Scope {
pub(crate) fn new_global() -> Self {
Scope::Global {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
/// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
@@ -173,18 +169,13 @@ impl Scope {
}
}
pub(crate) fn new_secondary_timeline(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
_tenant_shard_id: &TenantShardId,
_timeline_id: &TimelineId,
) -> Self {
// TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = tenant_shard_id.shard_slug().to_string();
let timeline_id = timeline_id.to_string();
let io_size_metrics =
crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
Scope::SecondaryTimeline { io_size_metrics }
Scope::SecondaryTimeline { }
}
pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
// Before propagating metrics via RequestContext, the labels were inferred from file path.
@@ -197,19 +188,19 @@ impl Scope {
// like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
// at this point, so, we were able to completely side-step tenant-scoped stuff there).
Scope::SecondaryTenant {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
#[cfg(test)]
pub(crate) fn new_unit_test() -> Self {
Scope::UnitTest {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
pub(crate) fn new_debug_tools() -> Self {
Scope::DebugTools {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
}
@@ -528,41 +519,6 @@ impl RequestContext {
self.read_path_debug
}
pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
match &self.scope {
Scope::Global { io_size_metrics } => {
let is_unit_test = cfg!(test);
let is_regress_test_build = cfg!(feature = "testing");
if is_unit_test || is_regress_test_build {
panic!("all VirtualFile instances are timeline-scoped");
} else {
use once_cell::sync::Lazy;
use std::sync::Mutex;
use std::time::Duration;
use utils::rate_limit::RateLimit;
static LIMIT: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
let mut guard = LIMIT.lock().unwrap();
guard.call2(|rate_limit_stats| {
warn!(
%rate_limit_stats,
backtrace=%std::backtrace::Backtrace::force_capture(),
"all VirtualFile instances are timeline-scoped",
);
});
io_size_metrics
}
}
Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
#[cfg(test)]
Scope::UnitTest { io_size_metrics } => io_size_metrics,
Scope::DebugTools { io_size_metrics } => io_size_metrics,
}
}
pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
if duration == Duration::ZERO {
return;
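
Note: the comment on Scope::Timeline explains the double wrapping. The inner Arc<TimelineMetrics> is shared by every task operating on the timeline, so cloning it on each child-context creation would make concurrent page_service connections contend on a single atomic refcount; cloning an outer Arc that is private to the context tree touches a separate counter. A minimal sketch of the pattern, with a stand-in TimelineMetrics rather than the real struct:

use std::sync::Arc;

struct TimelineMetrics; // stand-in for the real metrics struct

struct Scope {
    // Only the outer Arc's refcount is touched when child contexts
    // are created; the widely shared inner Arc is left alone.
    arc_arc: Arc<Arc<TimelineMetrics>>,
}

impl Scope {
    // The root scope clones the shared inner Arc exactly once.
    fn new_timeline(shared: &Arc<TimelineMetrics>) -> Self {
        Scope {
            arc_arc: Arc::new(Arc::clone(shared)),
        }
    }

    // Child contexts clone only the outer Arc, avoiding contention
    // on the refcount shared among all tasks on the timeline.
    fn child(&self) -> Self {
        Scope {
            arc_arc: Arc::clone(&self.arc_arc),
        }
    }
}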

View File

@@ -7,10 +7,10 @@ use enum_map::{Enum as _, EnumMap};
use futures::Future;
use metrics::{
CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter,
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
IntCounterPairVec, IntCounterVec, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
};
use once_cell::sync::Lazy;
use pageserver_api::config::{
@@ -626,59 +626,6 @@ pub(crate) enum StorageIoOperation {
}
#[derive(Clone, Copy)]
#[repr(usize)]
pub(crate) enum StorageIoSizeOperation {
Read,
Write,
}
impl StorageIoSizeOperation {
pub(crate) const VARIANTS: &'static [&'static str] = &["read", "write"];
fn as_str(&self) -> &'static str {
Self::VARIANTS[*self as usize]
}
}
// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
pub(crate) static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
&["operation", "tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
#[derive(Clone, Debug)]
pub(crate) struct StorageIoSizeMetrics {
pub read: UIntGauge,
pub write: UIntGauge,
}
impl StorageIoSizeMetrics {
pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
let read = STORAGE_IO_SIZE
.get_metric_with_label_values(&[
StorageIoSizeOperation::Read.as_str(),
tenant_id,
shard_id,
timeline_id,
])
.unwrap();
let write = STORAGE_IO_SIZE
.get_metric_with_label_values(&[
StorageIoSizeOperation::Write.as_str(),
tenant_id,
shard_id,
timeline_id,
])
.unwrap();
Self { read, write }
}
}
#[cfg(not(test))]
pub(crate) mod virtual_file_descriptor_cache {
use super::*;
@@ -941,19 +888,9 @@ static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec>
.expect("failed to define a metric")
});
pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_page_service_config_max_batch_size",
"Configured maximum batch size for the server-side batching functionality of page_service. \
Labels expose more of the configuration parameters.",
&["mode", "execution", "batching"]
)
.expect("failed to define a metric")
});
fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
let (label_values, value) = match conf {
let (_label_values, _value) = match conf {
PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
max_batch_size,
@@ -975,9 +912,6 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
([mode, execution, batching], max_batch_size.get())
}
};
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
.with_label_values(&label_values)
.set(value.try_into().unwrap());
}
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1329,14 +1263,6 @@ pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"pageserver_walreceiver_active_managers",
"Number of active walreceiver managers"
)
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
@@ -1346,21 +1272,6 @@ pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_walreceiver_candidates_events_total",
"Number of walreceiver candidate events",
&["event"]
)
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
// Metrics collected on WAL redo operations
//
// We collect the time spent in actual WAL redo ('redo'), and time waiting
@@ -1670,7 +1581,6 @@ pub(crate) struct TimelineMetrics {
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
/// copy of LayeredTimeline.current_logical_size
pub storage_io_size: StorageIoSizeMetrics,
shutdown: std::sync::atomic::AtomicBool,
}
@@ -1726,9 +1636,6 @@ impl TimelineMetrics {
&timeline_id,
);
let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
TimelineMetrics {
tenant_id,
shard_id,
@@ -1740,7 +1647,6 @@ impl TimelineMetrics {
imitate_logical_size_histo,
garbage_collect_histo,
find_gc_cutoffs_histo,
storage_io_size,
shutdown: std::sync::atomic::AtomicBool::default(),
}
}
@@ -1835,10 +1741,6 @@ impl TimelineMetrics {
]);
}
for op in StorageIoSizeOperation::VARIANTS {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
}
let _ =
WAIT_LSN_IN_PROGRESS_MICROS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2259,8 +2161,6 @@ pub fn preinitialize_metrics(
[
&WALRECEIVER_STARTED_CONNECTIONS,
&WALRECEIVER_BROKER_UPDATES,
&WALRECEIVER_CANDIDATES_ADDED,
&WALRECEIVER_CANDIDATES_REMOVED,
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
&REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
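
Note: the deleted StorageIoSizeMetrics shows the usual lifecycle for per-timeline series in a labelled vec: resolve the child gauges once at timeline creation so the hot I/O path increments plain handles with no label lookup, then remove the label values at shutdown so the series don't accumulate in the exporter (the same cleanup that clear_timeline_metrics in the next file loses). A sketch of that lifecycle, using the upstream prometheus crate's IntGaugeVec in place of the workspace metrics crate's UIntGaugeVec:

use once_cell::sync::Lazy;
use prometheus::{IntGauge, IntGaugeVec, register_int_gauge_vec};

static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
        &["operation", "tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

struct StorageIoSizeMetrics {
    read: IntGauge,
    write: IntGauge,
}

impl StorageIoSizeMetrics {
    // Resolve the labelled children once; afterwards incrementing
    // them is a plain atomic add, no label lookup involved.
    fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
        let read =
            STORAGE_IO_SIZE.with_label_values(&["read", tenant_id, shard_id, timeline_id]);
        let write =
            STORAGE_IO_SIZE.with_label_values(&["write", tenant_id, shard_id, timeline_id]);
        Self { read, write }
    }
}

// On timeline shutdown the series must be dropped explicitly,
// otherwise the label set lingers in the exporter forever.
fn clear(tenant_id: &str, shard_id: &str, timeline_id: &str) {
    for op in ["read", "write"] {
        let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
    }
}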

View File

@@ -4,7 +4,6 @@ use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime};
use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation};
use camino::Utf8PathBuf;
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
@@ -317,8 +316,8 @@ impl SecondaryDetail {
}
fn clear_timeline_metrics(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
_tenant_shard_id: &TenantShardId,
_timeline_id: &TimelineId,
detail: SecondaryDetailTimeline,
resident_metric: &UIntGauge,
) {
@@ -330,17 +329,7 @@ impl SecondaryDetail {
.sum(),
);
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let tenant_id = tenant_shard_id.tenant_id.to_string();
let timeline_id = timeline_id.to_string();
for op in StorageIoSizeOperation::VARIANTS {
let _ = STORAGE_IO_SIZE.remove_label_values(&[
op,
tenant_id.as_str(),
shard_id.as_str(),
timeline_id.as_str(),
]);
}
}
/// Additionally returns the total number of layers, used for more stable relative access time

View File

@@ -39,10 +39,6 @@ use utils::postgres_client::{
use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
WALRECEIVER_CANDIDATES_REMOVED,
};
use crate::task_mgr::TaskKind;
use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id};
@@ -76,11 +72,6 @@ pub(super) async fn connection_manager_loop_step(
}
}
WALRECEIVER_ACTIVE_MANAGERS.inc();
scopeguard::defer! {
WALRECEIVER_ACTIVE_MANAGERS.dec();
}
let id = TenantTimelineId {
tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
timeline_id: connection_manager_state.timeline.timeline_id,
@@ -728,8 +719,6 @@ impl ConnectionManagerState {
}
};
WALRECEIVER_BROKER_UPDATES.inc();
trace!(
"safekeeper info update: standby_horizon(cutoff)={}",
timeline_update.standby_horizon
@@ -756,7 +745,6 @@ impl ConnectionManagerState {
%new_safekeeper_id,
"New SK node was added",
);
WALRECEIVER_CANDIDATES_ADDED.inc();
}
}
@@ -1044,7 +1032,6 @@ impl ConnectionManagerState {
"Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"
);
self.wal_connection_retries.remove(&node_id);
WALRECEIVER_CANDIDATES_REMOVED.inc();
}
}
}
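
Note: the deleted WALRECEIVER_ACTIVE_MANAGERS accounting relied on scopeguard::defer! so the gauge is decremented on every way out of connection_manager_loop_step, including `?` and panics, not just the happy path. A sketch of the pattern, assuming the scopeguard and prometheus crates and a hypothetical metric name:

use once_cell::sync::Lazy;
use prometheus::{IntGauge, register_int_gauge};

static ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
        "walreceiver_active_managers",
        "Number of active walreceiver managers"
    )
    .expect("failed to define a metric")
});

async fn connection_manager_loop_step() {
    ACTIVE_MANAGERS.inc();
    // The deferred closure runs when the guard is dropped, so the
    // gauge is decremented on every exit path from this function.
    scopeguard::defer! {
        ACTIVE_MANAGERS.dec();
    }
    // ... manage safekeeper connections ...
}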

View File

@@ -29,7 +29,6 @@ pub use pageserver_api::models::virtual_file as api;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
use crate::assert_u64_eq_usize::UsizeIsU64;
use crate::context::RequestContext;
use crate::page_cache::{PAGE_SZ, PageWriteGuard};
pub(crate) mod io_engine;
@@ -905,7 +904,7 @@ impl VirtualFileInner {
&self,
buf: tokio_epoll_uring::Slice<Buf>,
offset: u64,
ctx: &RequestContext,
_ctx: &RequestContext,
) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
where
Buf: tokio_epoll_uring::IoBufMut + Send,
@@ -922,9 +921,7 @@ impl VirtualFileInner {
observe_duration!(StorageIoOperation::Read, {
let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
if let Ok(size) = res {
ctx.io_size_metrics().read.add(size.into_u64());
}
(buf, res)
})
}
@@ -945,7 +942,7 @@ impl VirtualFileInner {
&self,
buf: FullSlice<B>,
offset: u64,
ctx: &RequestContext,
_ctx: &RequestContext,
) -> (FullSlice<B>, Result<usize, Error>) {
let file_guard = match self.lock_file().await {
Ok(file_guard) => file_guard,
@@ -954,9 +951,7 @@ impl VirtualFileInner {
observe_duration!(StorageIoOperation::Write, {
let ((_file_guard, buf), result) =
io_engine::get().write_at(file_guard, offset, buf).await;
if let Ok(size) = result {
ctx.io_size_metrics().write.add(size.into_u64());
}
(buf, result)
})
}