remove some more metrics

This commit is contained in:
BodoBolero
2025-04-17 14:16:07 +02:00
parent e019b82d87
commit ef81d0b81d
6 changed files with 17 additions and 522 deletions

View File

@@ -21,7 +21,7 @@ use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use postgres_backend::{QueryError, is_expected_io_error};
use pq_proto::framed::ConnectionError;
use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
use strum::{IntoEnumIterator as _, VariantNames};
use strum_macros::{IntoStaticStr, VariantNames};
use utils::id::TimelineId;
@@ -191,39 +191,6 @@ pub(crate) struct ScanLatency {
map: EnumMap<TaskKind, Option<Histogram>>,
}
pub(crate) struct PageCacheSizeMetrics {
pub max_bytes: UIntGauge,
pub current_bytes_immutable: UIntGauge,
}
static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_page_cache_size_current_bytes",
"Current size of the page cache in bytes, by key kind",
&["key_kind"]
)
.expect("failed to define a metric")
});
pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
Lazy::new(|| PageCacheSizeMetrics {
max_bytes: {
register_uint_gauge!(
"pageserver_page_cache_size_max_bytes",
"Maximum size of the page cache in bytes"
)
.expect("failed to define a metric")
},
current_bytes_immutable: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
});
#[derive(IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum PageCacheErrorKind {
@@ -272,44 +239,6 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::n
pub(crate) mod wait_ondemand_download_time {
use super::*;
const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[
0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, // 10 ms - 100ms
0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, // 100ms to 1s
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, // 1s to 10s
10.0, 20.0, 30.0, 40.0, 50.0, 60.0, // 10s to 1m
];
/// The task kinds for which we want to track wait times for on-demand downloads.
/// Other task kinds' wait times are accumulated in label value `unknown`.
pub(crate) const WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS: [TaskKind; 2] = [
TaskKind::PageRequestHandler,
TaskKind::WalReceiverConnectionHandler,
];
pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL: Lazy<Vec<Histogram>> = Lazy::new(|| {
let histo = register_histogram_vec!(
"pageserver_wait_ondemand_download_seconds_global",
"Observations are individual tasks' wait times for on-demand downloads. \
If N tasks coalesce on an on-demand download, and it takes 10s, than we observe N * 10s.",
&["task_kind"],
WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS.into(),
)
.expect("failed to define a metric");
WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
.iter()
.map(|task_kind| histo.with_label_values(&[task_kind.into()]))
.collect::<Vec<_>>()
});
pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_SUM: Lazy<CounterVec> = Lazy::new(|| {
register_counter_vec!(
// use a name that _could_ be evolved into a per-timeline histogram later
"pageserver_wait_ondemand_download_seconds_sum",
"Like `pageserver_wait_ondemand_download_seconds_global` but per timeline",
&["tenant_id", "shard_id", "timeline_id", "task_kind"],
)
.unwrap()
});
pub struct WaitOndemandDownloadTimeSum {
}
@@ -324,19 +253,10 @@ pub(crate) mod wait_ondemand_download_time {
}
}
pub(crate) fn shutdown_timeline(tenant_id: &str, shard_id: &str, timeline_id: &str) {
for task_kind in WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS {
let _ = WAIT_ONDEMAND_DOWNLOAD_TIME_SUM.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
task_kind.into(),
]);
}
pub(crate) fn shutdown_timeline(_tenant_id: &str, _shard_id: &str, _timeline_id: &str) {
}
pub(crate) fn preinitialize_global_metrics() {
Lazy::force(&WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL);
}
}
@@ -588,119 +508,15 @@ pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy<IntCounter> = Lazy::new(|| {
});
pub(crate) mod initial_logical_size {
use metrics::{IntCounter, IntCounterVec, register_int_counter, register_int_counter_vec};
use metrics::{IntCounter, register_int_counter};
use once_cell::sync::Lazy;
pub(crate) struct StartCalculation(IntCounterVec);
pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
StartCalculation(
register_int_counter_vec!(
"pageserver_initial_logical_size_start_calculation",
"Incremented each time we start an initial logical size calculation attempt. \
The `circumstances` label provides some additional details.",
&["attempt", "circumstances"]
)
.unwrap(),
)
});
struct DropCalculation {
first: IntCounter,
retry: IntCounter,
}
static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
let vec = register_int_counter_vec!(
"pageserver_initial_logical_size_drop_calculation",
"Incremented each time we abort a started size calculation attmpt.",
&["attempt"]
)
.unwrap();
DropCalculation {
first: vec.with_label_values(&["first"]),
retry: vec.with_label_values(&["retry"]),
}
});
pub(crate) struct Calculated {
pub(crate) births: IntCounter,
pub(crate) deaths: IntCounter,
}
pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
births: register_int_counter!(
"pageserver_initial_logical_size_finish_calculation",
"Incremented every time we finish calculation of initial logical size.\
If everything is working well, this should happen at most once per Timeline object."
)
.unwrap(),
deaths: register_int_counter!(
"pageserver_initial_logical_size_drop_finished_calculation",
"Incremented when we drop a finished initial logical size calculation result.\
Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
)
.unwrap(),
});
pub(crate) struct OngoingCalculationGuard {
inc_drop_calculation: Option<IntCounter>,
}
#[derive(strum_macros::IntoStaticStr)]
pub(crate) enum StartCircumstances {
EmptyInitial,
SkippedConcurrencyLimiter,
AfterBackgroundTasksRateLimit,
}
impl StartCalculation {
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
let circumstances_label: &'static str = circumstances.into();
self.0
.with_label_values(&["first", circumstances_label])
.inc();
OngoingCalculationGuard {
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
}
}
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
let circumstances_label: &'static str = circumstances.into();
self.0
.with_label_values(&["retry", circumstances_label])
.inc();
OngoingCalculationGuard {
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
}
}
}
impl Drop for OngoingCalculationGuard {
fn drop(&mut self) {
if let Some(counter) = self.inc_drop_calculation.take() {
counter.inc();
}
}
}
impl OngoingCalculationGuard {
pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
drop(self.inc_drop_calculation.take());
CALCULATED.births.inc();
FinishedCalculationGuard {
inc_on_drop: CALCULATED.deaths.clone(),
}
}
}
pub(crate) struct FinishedCalculationGuard {
inc_on_drop: IntCounter,
}
impl Drop for FinishedCalculationGuard {
fn drop(&mut self) {
self.inc_on_drop.inc();
}
}
// context: https://github.com/neondatabase/neon/issues/5963
pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
@@ -773,25 +589,6 @@ static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_evictions_with_low_residence_duration",
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
Residence duration is determined using the `residence_duration_data_source`.",
&["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
)
.expect("failed to define a metric")
});
pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_unexpected_ondemand_downloads_count",
"Number of unexpected on-demand downloads. \
We log more context for each increment, so, forgo any labels in this metric.",
)
.expect("failed to define a metric")
});
/// How long did we take to start up? Broken down by labels to describe
/// different phases of startup.
pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
@@ -869,134 +666,6 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
}
});
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub(crate) struct EvictionsWithLowResidenceDuration {
data_source: &'static str,
threshold: Duration,
counter: Option<IntCounter>,
}
pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
data_source: &'static str,
threshold: Duration,
}
impl EvictionsWithLowResidenceDurationBuilder {
pub fn new(data_source: &'static str, threshold: Duration) -> Self {
Self {
data_source,
threshold,
}
}
fn build(
&self,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
) -> EvictionsWithLowResidenceDuration {
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
.get_metric_with_label_values(&[
tenant_id,
shard_id,
timeline_id,
self.data_source,
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
])
.unwrap();
EvictionsWithLowResidenceDuration {
data_source: self.data_source,
threshold: self.threshold,
counter: Some(counter),
}
}
}
impl EvictionsWithLowResidenceDuration {
fn threshold_label_value(threshold: Duration) -> String {
format!("{}", threshold.as_secs())
}
pub fn observe(&self, observed_value: Duration) {
if observed_value < self.threshold {
self.counter
.as_ref()
.expect("nobody calls this function after `remove_from_vec`")
.inc();
}
}
pub fn change_threshold(
&mut self,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
new_threshold: Duration,
) {
if new_threshold == self.threshold {
return;
}
let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
self.data_source,
new_threshold,
)
.build(tenant_id, shard_id, timeline_id);
std::mem::swap(self, &mut with_new);
with_new.remove(tenant_id, shard_id, timeline_id);
}
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
let Some(_counter) = self.counter.take() else {
return;
};
let threshold = Self::threshold_label_value(self.threshold);
let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
self.data_source,
&threshold,
]);
match removed {
Err(e) => {
// this has been hit in staging as
// <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
// because we can be in the drop path already, don't risk:
// - "double-panic => illegal instruction" or
// - future "drop panick => abort"
//
// so just nag: (the error has the labels)
tracing::warn!(
"failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"
);
}
Ok(()) => {
// to help identify cases where we double-remove the same values, let's log all
// deletions?
tracing::info!(
"removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}",
self.data_source
);
}
}
}
}
// Metrics collected on disk IO operations
//
// Roughly logarithmic scale.
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
0.000030, // 30 usec
0.001000, // 1000 usec
0.030, // 30 ms
1.000, // 1000 ms
30.000, // 30000 ms
];
/// VirtualFile fs operation variants.
///
@@ -1024,52 +693,6 @@ pub(crate) enum StorageIoOperation {
Metadata,
}
impl StorageIoOperation {
pub fn as_str(&self) -> &'static str {
match self {
StorageIoOperation::Open => "open",
StorageIoOperation::OpenAfterReplace => "open-after-replace",
StorageIoOperation::Close => "close",
StorageIoOperation::CloseByReplace => "close-by-replace",
StorageIoOperation::Read => "read",
StorageIoOperation::Write => "write",
StorageIoOperation::Seek => "seek",
StorageIoOperation::Fsync => "fsync",
StorageIoOperation::Metadata => "metadata",
}
}
}
/// Tracks time taken by fs operations near VirtualFile.
#[derive(Debug)]
pub(crate) struct StorageIoTime {
metrics: [Histogram; StorageIoOperation::COUNT],
}
impl StorageIoTime {
fn new() -> Self {
let storage_io_histogram_vec = register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric");
let metrics = std::array::from_fn(|i| {
let op = StorageIoOperation::from_repr(i).unwrap();
storage_io_histogram_vec
.get_metric_with_label_values(&[op.as_str()])
.unwrap()
});
Self { metrics }
}
pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
&self.metrics[op as usize]
}
}
pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
#[derive(Clone, Copy)]
#[repr(usize)]
@@ -2190,7 +1813,6 @@ pub(crate) struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_lsn_gauge: IntGauge,
pub disk_consistent_lsn_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
@@ -2203,7 +1825,6 @@ pub(crate) struct TimelineMetrics {
pub aux_file_size_gauge: IntGauge,
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
pub evictions: IntCounter,
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
/// Number of valid LSN leases.
pub valid_lsn_lease_count_gauge: UIntGauge,
pub wal_records_received: IntCounter,
@@ -2218,7 +1839,6 @@ impl TimelineMetrics {
pub fn new(
tenant_shard_id: &TenantShardId,
timeline_id_raw: &TimelineId,
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", tenant_shard_id.shard_slug());
@@ -2277,9 +1897,6 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let last_record_lsn_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2332,8 +1949,6 @@ impl TimelineMetrics {
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
.build(&tenant_id, &shard_id, &timeline_id);
let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2373,7 +1988,6 @@ impl TimelineMetrics {
garbage_collect_histo,
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_lsn_gauge,
disk_consistent_lsn_gauge,
pitr_history_size,
archival_size,
@@ -2385,9 +1999,6 @@ impl TimelineMetrics {
aux_file_size_gauge,
directory_entries_count_gauge,
evictions,
evictions_with_low_residence_duration: std::sync::RwLock::new(
evictions_with_low_residence_duration,
),
storage_io_size,
valid_lsn_lease_count_gauge,
wal_records_received,
@@ -2482,11 +2093,6 @@ impl TimelineMetrics {
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
.unwrap()
.remove(tenant_id, shard_id, timeline_id);
// The following metrics are born outside of the TimelineMetrics lifecycle but still
// removed at the end of it. The idea is to have the metrics outlive the
// entity during which they're observed, e.g., the smgr metrics shall
@@ -3244,7 +2850,6 @@ pub fn preinitialize_metrics(
// counters
[
&UNEXPECTED_ONDEMAND_DOWNLOADS,
&WALRECEIVER_STARTED_CONNECTIONS,
&WALRECEIVER_BROKER_UPDATES,
&WALRECEIVER_CANDIDATES_ADDED,

View File

@@ -77,7 +77,6 @@ use anyhow::Context;
use once_cell::sync::OnceCell;
use crate::context::RequestContext;
use crate::metrics::PageCacheSizeMetrics;
use crate::virtual_file::{IoBufferMut, IoPageSlice};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -206,7 +205,6 @@ pub struct PageCache {
/// This is interpreted modulo the page cache size.
next_evict_slot: AtomicUsize,
size_metrics: &'static PageCacheSizeMetrics,
}
struct PinnedSlotsPermit {
@@ -500,7 +498,6 @@ impl PageCache {
let mut map = self.immutable_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
self.size_metrics.current_bytes_immutable.sub_page_sz(1);
}
}
}
@@ -518,7 +515,6 @@ impl PageCache {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
entry.insert(slot_idx);
self.size_metrics.current_bytes_immutable.add_page_sz(1);
None
}
}
@@ -608,10 +604,6 @@ impl PageCache {
// this is avoided.
let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
size_metrics.max_bytes.set_page_sz(num_pages);
size_metrics.current_bytes_immutable.set_page_sz(0);
let slots = page_buffer
.chunks_exact_mut(PAGE_SZ)
.map(|chunk| {
@@ -633,31 +625,8 @@ impl PageCache {
immutable_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),
size_metrics,
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
}
}
}
trait PageSzBytesMetric {
fn set_page_sz(&self, count: usize);
fn add_page_sz(&self, count: usize);
fn sub_page_sz(&self, count: usize);
}
#[inline(always)]
fn count_times_page_sz(count: usize) -> u64 {
u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
}
impl PageSzBytesMetric for metrics::UIntGauge {
fn set_page_sz(&self, count: usize) {
self.set(count_times_page_sz(count));
}
fn add_page_sz(&self, count: usize) {
self.add(count_times_page_sz(count));
}
fn sub_page_sz(&self, count: usize) {
self.sub(count_times_page_sz(count));
}
}

View File

@@ -1121,7 +1121,6 @@ impl LayerInner {
"unexpectedly on-demand downloading for task kind {:?}",
ctx.task_kind()
);
crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
let really_error =
matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
@@ -1570,17 +1569,7 @@ impl LayerInner {
Ok(elapsed) => {
let accessed_and_visible = self.access_stats.accessed()
&& self.access_stats.visibility() == LayerVisibilityHint::Visible;
if accessed_and_visible {
// Only layers used for reads contribute to our "low residence" metric that is used
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
// to be rapidly evicted without contributing to this metric.
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
}
tracing::info!(
residence_millis = elapsed.as_millis(),

View File

@@ -2718,15 +2718,6 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
}
fn get_evictions_low_residence_duration_metric_threshold(
tenant_conf: &pageserver_api::models::TenantConfig,
default_tenant_conf: &pageserver_api::config::TenantConfigToml,
) -> Duration {
tenant_conf
.evictions_low_residence_duration_metric_threshold
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
}
fn get_image_layer_creation_check_threshold(&self) -> u8 {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2802,28 +2793,8 @@ impl Timeline {
// The threshold is embedded in the metric. So, we need to update it.
{
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
&new_conf.tenant_conf,
&self.conf.default_tenant_conf,
);
let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
let timeline_id_str = self.timeline_id.to_string();
self.remote_client.update_config(&new_conf.location);
self.metrics
.evictions_with_low_residence_duration
.write()
.unwrap()
.change_threshold(
&tenant_id_str,
&shard_id_str,
&timeline_id_str,
new_threshold,
);
}
}
@@ -2857,13 +2828,6 @@ impl Timeline {
let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
let evictions_low_residence_duration_metric_threshold = {
let loaded_tenant_conf = tenant_conf.load();
Self::get_evictions_low_residence_duration_metric_threshold(
&loaded_tenant_conf.tenant_conf,
&conf.default_tenant_conf,
)
};
if let Some(ancestor) = &ancestor {
let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
@@ -2876,10 +2840,6 @@ impl Timeline {
let metrics = Arc::new(TimelineMetrics::new(
&tenant_shard_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
evictions_low_residence_duration_metric_threshold,
),
));
let aux_file_metrics = metrics.aux_file_size_gauge.clone();
@@ -3016,10 +2976,6 @@ impl Timeline {
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
result
.metrics
.last_record_lsn_gauge
.set(disk_consistent_lsn.0 as i64);
result
})
}
@@ -3489,7 +3445,7 @@ impl Timeline {
self.current_logical_size.initialized.add_permits(1);
}
let try_once = |attempt: usize| {
let try_once = |_attempt: usize| {
let background_ctx = &background_ctx;
let self_ref = &self;
let skip_concurrency_limiter = &skip_concurrency_limiter;
@@ -3500,7 +3456,7 @@ impl Timeline {
);
use crate::metrics::initial_logical_size::StartCircumstances;
let (_maybe_permit, circumstances) = tokio::select! {
let (_maybe_permit, _circumstances) = tokio::select! {
permit = wait_for_permit => {
(Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
}
@@ -3517,12 +3473,6 @@ impl Timeline {
}
};
let metrics_guard = if attempt == 1 {
crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
} else {
crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
};
let io_concurrency = IoConcurrency::spawn_from_conf(
self_ref.conf,
self_ref
@@ -3549,7 +3499,7 @@ impl Timeline {
// TODO: add aux file size to logical size
Ok((calculated_size, metrics_guard))
Ok(calculated_size)
}
};
@@ -3586,7 +3536,7 @@ impl Timeline {
}
};
let (calculated_size, metrics_guard) = match retrying.await {
let calculated_size = match retrying.await {
ControlFlow::Continue(calculated_size) => calculated_size,
ControlFlow::Break(()) => return,
};
@@ -3605,8 +3555,7 @@ impl Timeline {
self.current_logical_size
.initial_logical_size
.set((calculated_size, metrics_guard.calculation_result_saved()))
.ok()
.set((calculated_size,))
.expect("only this task sets it");
}
@@ -4503,8 +4452,6 @@ impl Timeline {
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());
self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
self.last_record_lsn.advance(new_lsn);
}

View File

@@ -23,7 +23,6 @@ pub(super) struct LogicalSize {
/// the initial size at a different LSN.
pub initial_logical_size: OnceCell<(
u64,
crate::metrics::initial_logical_size::FinishedCalculationGuard,
)>,
/// Cancellation for the best-effort logical size calculation.
@@ -130,11 +129,7 @@ impl CurrentLogicalSize {
impl LogicalSize {
pub(super) fn empty_initial() -> Self {
Self {
initial_logical_size: OnceCell::with_value((0, {
crate::metrics::initial_logical_size::START_CALCULATION
.first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
.calculation_result_saved()
})),
initial_logical_size: OnceCell::with_value((0,)),
cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
initial_part_end: None,
size_added_after_initial: AtomicI64::new(0),
@@ -159,7 +154,7 @@ impl LogicalSize {
// ^^^ keep this type explicit so that the casts in this function break if
// we change the type.
match self.initial_logical_size.get() {
Some((initial_size, _)) => {
Some((initial_size, )) => {
CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
.unwrap()))
@@ -181,7 +176,7 @@ impl LogicalSize {
/// available for re-use. This doesn't contain the incremental part.
pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
match self.initial_part_end {
Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, )| *s),
_ => None,
}
}

View File

@@ -27,12 +27,10 @@ use owned_buffers_io::io_buf_ext::FullSlice;
use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
pub use pageserver_api::models::virtual_file as api;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::time::Instant;
use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
use crate::assert_u64_eq_usize::UsizeIsU64;
use crate::context::RequestContext;
use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
use crate::page_cache::{PAGE_SZ, PageWriteGuard};
pub(crate) mod io_engine;
pub use io_engine::{
@@ -431,9 +429,7 @@ impl OpenFiles {
if let Some(old_file) = slot_guard.file.take() {
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
// distinguish the two.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::CloseByReplace)
.observe_closure_duration(|| drop(old_file));
drop(old_file);
}
// Prepare the slot for reuse and return it
@@ -532,13 +528,9 @@ impl<T> MaybeFatalIo<T> for std::io::Result<T> {
/// where "support" means that we measure wall clock time.
macro_rules! observe_duration {
($op:expr, $($body:tt)*) => {{
let instant = Instant::now();
let result = $($body)*;
let elapsed = instant.elapsed().as_secs_f64();
STORAGE_IO_TIME_METRIC
.get($op)
.observe(elapsed);
result
$($body)*
}}
}
@@ -1263,9 +1255,7 @@ impl Drop for VirtualFileInner {
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
if let Some(fd) = slot_guard.file.take() {
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(fd));
drop(fd);
}
}
}