mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 03:52:56 +00:00
Add disk write/sync metrics to Safekeeper (#745)
This commit is contained in:
@@ -19,7 +19,10 @@ use lazy_static::lazy_static;
|
||||
|
||||
use crate::replication::HotStandbyFeedback;
|
||||
use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
|
||||
use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec};
|
||||
use zenith_metrics::{
|
||||
register_gauge_vec, register_histogram_vec, Gauge, GaugeVec, Histogram, HistogramVec,
|
||||
DISK_WRITE_SECONDS_BUCKETS,
|
||||
};
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::pq_proto::SystemId;
|
||||
@@ -299,11 +302,27 @@ lazy_static! {
|
||||
&["ztli"]
|
||||
)
|
||||
.expect("Failed to register safekeeper_commit_lsn gauge vec");
|
||||
static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_write_wal_bytes",
|
||||
"Bytes written to WAL in a single request, grouped by timeline",
|
||||
&["timeline_id"],
|
||||
vec![1.0, 10.0, 100.0, 1024.0, 8192.0, 128.0 * 1024.0, 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0]
|
||||
)
|
||||
.expect("Failed to register safekeeper_write_wal_bytes histogram vec");
|
||||
static ref WRITE_WAL_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_write_wal_seconds",
|
||||
"Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline",
|
||||
&["timeline_id"],
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_write_wal_seconds histogram vec");
|
||||
}
|
||||
|
||||
struct SafeKeeperMetrics {
|
||||
flush_lsn: Gauge,
|
||||
commit_lsn: Gauge,
|
||||
write_wal_bytes: Histogram,
|
||||
write_wal_seconds: Histogram,
|
||||
}
|
||||
|
||||
impl SafeKeeperMetrics {
|
||||
@@ -312,6 +331,8 @@ impl SafeKeeperMetrics {
|
||||
SafeKeeperMetrics {
|
||||
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&ztli_str]),
|
||||
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&ztli_str]),
|
||||
write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&ztli_str]),
|
||||
write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&ztli_str]),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -319,6 +340,8 @@ impl SafeKeeperMetrics {
|
||||
SafeKeeperMetrics {
|
||||
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&["n/a"]),
|
||||
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&["n/a"]),
|
||||
write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&["n/a"]),
|
||||
write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&["n/a"]),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -472,8 +495,14 @@ where
|
||||
// do the job
|
||||
let mut last_rec_lsn = Lsn(0);
|
||||
if !msg.wal_data.is_empty() {
|
||||
self.storage
|
||||
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
|
||||
self.metrics
|
||||
.write_wal_bytes
|
||||
.observe(msg.wal_data.len() as f64);
|
||||
{
|
||||
let _timer = self.metrics.write_wal_seconds.start_timer();
|
||||
self.storage
|
||||
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
|
||||
}
|
||||
|
||||
// figure out last record's end lsn for reporting (if we got the
|
||||
// whole record)
|
||||
|
||||
@@ -12,6 +12,7 @@ use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use std::time::Duration;
|
||||
use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS};
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
@@ -73,6 +74,23 @@ pub enum CreateControlFile {
|
||||
False,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref PERSIST_SYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_persist_sync_control_file_seconds",
|
||||
"Seconds to persist and sync control file, grouped by timeline",
|
||||
&["timeline_id"],
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_persist_sync_control_file_seconds histogram vec");
|
||||
static ref PERSIST_NOSYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_persist_nosync_control_file_seconds",
|
||||
"Seconds to persist and sync control file, grouped by timeline",
|
||||
&["timeline_id"],
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_persist_nosync_control_file_seconds histogram vec");
|
||||
}
|
||||
|
||||
impl SharedState {
|
||||
/// Get combined stateof all alive replicas
|
||||
pub fn get_replicas_state(&self) -> ReplicaState {
|
||||
@@ -109,9 +127,14 @@ impl SharedState {
|
||||
create: CreateControlFile,
|
||||
) -> Result<Self> {
|
||||
let (cf, state) = SharedState::load_control_file(conf, timelineid, create)?;
|
||||
let timelineid_str = format!("{}", timelineid);
|
||||
let storage = FileStorage {
|
||||
control_file: cf,
|
||||
conf: conf.clone(),
|
||||
persist_sync_control_file_seconds: PERSIST_SYNC_CONTROL_FILE_SECONDS
|
||||
.with_label_values(&[&timelineid_str]),
|
||||
persist_nosync_control_file_seconds: PERSIST_NOSYNC_CONTROL_FILE_SECONDS
|
||||
.with_label_values(&[&timelineid_str]),
|
||||
};
|
||||
let (flush_lsn, tli) = if state.server.wal_seg_size != 0 {
|
||||
let wal_dir = conf.workdir.join(format!("{}", timelineid));
|
||||
@@ -376,10 +399,18 @@ impl GlobalTimelines {
|
||||
struct FileStorage {
|
||||
control_file: File,
|
||||
conf: SafeKeeperConf,
|
||||
persist_sync_control_file_seconds: Histogram,
|
||||
persist_nosync_control_file_seconds: Histogram,
|
||||
}
|
||||
|
||||
impl Storage for FileStorage {
|
||||
fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()> {
|
||||
let _timer = if sync {
|
||||
&self.persist_sync_control_file_seconds
|
||||
} else {
|
||||
&self.persist_nosync_control_file_seconds
|
||||
}
|
||||
.start_timer();
|
||||
self.control_file.seek(SeekFrom::Start(0))?;
|
||||
s.ser_into(&mut self.control_file)?;
|
||||
if sync {
|
||||
|
||||
@@ -74,6 +74,10 @@ lazy_static! {
|
||||
.expect("Failed to register maxrss_kb int gauge");
|
||||
}
|
||||
|
||||
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
];
|
||||
|
||||
// Records I/O stats in a "cross-platform" way.
|
||||
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
|
||||
// An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned.
|
||||
|
||||
Reference in New Issue
Block a user