diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index 141d8a6d01..0ff8ec8be3 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -103,9 +103,10 @@ static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
     .expect("Failed to register maxrss_kb int gauge")
 });
 
-pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
-    0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
-];
+/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher,
+/// especially during many concurrent disk operations.
+pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] =
+    &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0];
 
 pub struct BuildInfo {
     pub revision: &'static str,
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 1e965393e3..a484c45af8 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -5,15 +5,15 @@
 use std::{
     time::{Instant, SystemTime},
 };
 
-use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS};
+use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_FSYNC_SECONDS_BUCKETS};
 use anyhow::Result;
 use futures::Future;
 use metrics::{
     core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
     proto::MetricFamily,
-    register_int_counter, register_int_counter_pair, register_int_counter_pair_vec,
-    register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
-    IntGaugeVec,
+    register_histogram_vec, register_int_counter, register_int_counter_pair,
+    register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter,
+    IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
 };
 use once_cell::sync::Lazy;
@@ -48,7 +48,7 @@ pub static WRITE_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "safekeeper_write_wal_seconds",
         "Seconds spent writing and syncing WAL to a disk in a single request",
-        DISK_WRITE_SECONDS_BUCKETS.to_vec()
+        DISK_FSYNC_SECONDS_BUCKETS.to_vec()
     )
     .expect("Failed to register safekeeper_write_wal_seconds histogram")
 });
@@ -56,7 +56,7 @@ pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "safekeeper_flush_wal_seconds",
         "Seconds spent syncing WAL to a disk",
-        DISK_WRITE_SECONDS_BUCKETS.to_vec()
+        DISK_FSYNC_SECONDS_BUCKETS.to_vec()
     )
     .expect("Failed to register safekeeper_flush_wal_seconds histogram")
 });
@@ -64,10 +64,26 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "safekeeper_persist_control_file_seconds",
         "Seconds to persist and sync control file",
-        DISK_WRITE_SECONDS_BUCKETS.to_vec()
+        DISK_FSYNC_SECONDS_BUCKETS.to_vec()
     )
     .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec")
 });
+pub static WAL_STORAGE_OPERATION_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "safekeeper_wal_storage_operation_seconds",
+        "Seconds spent on WAL storage operations",
+        &["operation"]
+    )
+    .expect("Failed to register safekeeper_wal_storage_operation_seconds histogram vec")
+});
+pub static MISC_OPERATION_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "safekeeper_misc_operation_seconds",
+        "Seconds spent on miscellaneous operations",
+        &["operation"]
+    )
+    .expect("Failed to register safekeeper_misc_operation_seconds histogram vec")
+});
 pub static PG_IO_BYTES: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "safekeeper_pg_io_bytes_total",
@@ -126,7 +142,7 @@ pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
"safekeeper_broker_push_update_seconds", "Seconds to push all timeline updates to the broker", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") }); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 4d0992e8bd..33ec39b852 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -15,6 +15,7 @@ use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; use crate::control_file; +use crate::metrics::MISC_OPERATION_SECONDS; use crate::send_wal::HotStandbyFeedback; use crate::state::TimelineState; @@ -696,6 +697,10 @@ where &mut self, msg: &ProposerElected, ) -> Result> { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["handle_elected"]) + .start_timer(); + info!("received ProposerElected {:?}", msg); if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index e0f7b65aef..dca6414082 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -189,7 +189,12 @@ where /// Persist given state. c.f. start_change. pub async fn finish_change(&mut self, s: &TimelinePersistentState) -> Result<()> { - self.pers.persist(s).await?; + if s.eq(&*self.pers) { + // nothing to do if state didn't change + } else { + self.pers.persist(s).await?; + } + // keep in memory values up to date self.inmem.commit_lsn = s.commit_lsn; self.inmem.backup_lsn = s.backup_lsn; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f632cd6fb3..6b83270c18 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -39,7 +39,7 @@ use crate::wal_backup::{self}; use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; -use crate::metrics::{FullTimelineInfo, WalStorageMetrics}; +use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -856,28 +856,40 @@ impl Timeline { } debug!("requesting WalResidentTimeline guard"); + let started_at = Instant::now(); + let status_before = self.mgr_status.get(); - // Wait 5 seconds for the guard to be acquired, should be enough for uneviction. - // If it times out, most likely there is a deadlock in the manager task. - let res = tokio::time::timeout( - Duration::from_secs(5), + // Wait 30 seconds for the guard to be acquired. It can time out if someone is + // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task + // is stuck. 
+        let res = tokio::time::timeout_at(
+            started_at + Duration::from_secs(30),
             self.manager_ctl.wal_residence_guard(),
         )
         .await;
 
         let guard = match res {
-            Ok(Ok(guard)) => guard,
+            Ok(Ok(guard)) => {
+                let finished_at = Instant::now();
+                let elapsed = finished_at - started_at;
+                MISC_OPERATION_SECONDS
+                    .with_label_values(&["wal_residence_guard"])
+                    .observe(elapsed.as_secs_f64());
+
+                guard
+            }
             Ok(Err(e)) => {
                 warn!(
-                    "error while acquiring WalResidentTimeline guard (current state {:?}): {}",
-                    self.mgr_status.get(),
-                    e
+                    "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
+                    status_before,
+                    self.mgr_status.get()
                 );
                 return Err(e);
             }
             Err(_) => {
                 warn!(
-                    "timeout while acquiring WalResidentTimeline guard (current state {:?})",
+                    "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
+                    status_before,
                     self.mgr_status.get()
                 );
                 anyhow::bail!("timeout while acquiring WalResidentTimeline guard");
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index c3abeac644..66c62ce197 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -22,7 +22,7 @@
 use utils::lsn::Lsn;
 use crate::{
     control_file::{FileStorage, Storage},
-    metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
+    metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
     recovery::recovery_main,
     remove_wal::calc_horizon_lsn,
     safekeeper::Term,
@@ -357,6 +357,10 @@ impl Manager {
 
     /// Get a snapshot of the timeline state.
     async fn state_snapshot(&self) -> StateSnapshot {
+        let _timer = MISC_OPERATION_SECONDS
+            .with_label_values(&["state_snapshot"])
+            .start_timer();
+
         StateSnapshot::new(
             self.tli.read_shared_state().await,
             self.conf.heartbeat_timeout,
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 74c4693ccd..ded8571a3e 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -23,7 +23,9 @@
 use tokio::io::{AsyncReadExt, AsyncSeekExt};
 use tracing::*;
 use utils::crashsafe::durable_rename;
-use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
+use crate::metrics::{
+    time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS,
+};
 use crate::state::TimelinePersistentState;
 use crate::wal_backup::{read_object, remote_timeline_path};
 use crate::SafeKeeperConf;
@@ -331,6 +333,10 @@ impl Storage for PhysicalStorage {
     }
 
     async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> {
+        let _timer = WAL_STORAGE_OPERATION_SECONDS
+            .with_label_values(&["initialize_first_segment"])
+            .start_timer();
+
         let segno = init_lsn.segment_number(self.wal_seg_size);
         let (mut file, _) = self.open_or_create(segno).await?;
         let major_pg_version = self.pg_version / 10000;
@@ -422,6 +428,10 @@ impl Storage for PhysicalStorage {
     /// Truncate written WAL by removing all WAL segments after the given LSN.
     /// end_pos must point to the end of the WAL record.
     async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
+        let _timer = WAL_STORAGE_OPERATION_SECONDS
+            .with_label_values(&["truncate_wal"])
+            .start_timer();
+
         // Streaming must not create a hole, so truncate cannot be called on non-written lsn
         if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
             bail!(
@@ -497,6 +507,10 @@ async fn remove_segments_from_disk(
     wal_seg_size: usize,
     remove_predicate: impl Fn(XLogSegNo) -> bool,
 ) -> Result<()> {
+    let _timer = WAL_STORAGE_OPERATION_SECONDS
+        .with_label_values(&["remove_segments_from_disk"])
+        .start_timer();
+
     let mut n_removed = 0;
     let mut min_removed = u64::MAX;
     let mut max_removed = u64::MIN;
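
For reference, the timing pattern used throughout this diff is a labelled `HistogramVec` plus a scoped `start_timer()` guard that records the elapsed time when it is dropped. The sketch below is illustrative only: it uses the upstream `prometheus` crate directly rather than the repo's `metrics` wrapper, and the metric name, label value, and `do_work` function are invented for the example.

```rust
// Assumed dependencies (approximate versions): prometheus = "0.13", once_cell = "1".
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, Encoder, HistogramVec, TextEncoder};

// Hypothetical counterpart of MISC_OPERATION_SECONDS / WAL_STORAGE_OPERATION_SECONDS:
// one histogram family with a single "operation" label distinguishing what is timed.
static OPERATION_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "example_operation_seconds",
        "Seconds spent on example operations",
        &["operation"]
    )
    .expect("Failed to register example_operation_seconds histogram vec")
});

fn do_work() {
    // The guard observes the elapsed time under the "do_work" label when it is
    // dropped at the end of the scope, mirroring the `_timer` guards added in
    // safekeeper.rs, timeline_manager.rs and wal_storage.rs.
    let _timer = OPERATION_SECONDS
        .with_label_values(&["do_work"])
        .start_timer();

    std::thread::sleep(std::time::Duration::from_millis(5));
}

fn main() {
    do_work();

    // Dump the default registry in Prometheus text format to see the recorded sample.
    let mut buf = Vec::new();
    TextEncoder::new()
        .encode(&prometheus::gather(), &mut buf)
        .expect("Failed to encode metrics");
    println!("{}", String::from_utf8(buf).unwrap());
}
```

A drop-based timer guard is convenient here because it records the duration on every exit path, including early returns and `?` propagation, without extra bookkeeping at each return site.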