Add metrics to WAL redo.

Track the time spent on replaying WAL records by the special Postgres
process, the time spent waiting for acces to the Postgres process (since
there is only one per tenant), and the number of records replayed.
This commit is contained in:
Heikki Linnakangas
2021-08-16 15:49:17 +03:00
parent 047a05efb2
commit 7ee8de3725

View File

@@ -20,6 +20,7 @@
//!
use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use serde::{Deserialize, Serialize};
use std::cell::RefCell;
@@ -36,6 +37,7 @@ use tokio::io::AsyncBufReadExt;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::process::{ChildStdin, ChildStdout, Command};
use tokio::time::timeout;
use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
@@ -103,6 +105,27 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
static TIMEOUT: Duration = Duration::from_secs(20);
// Metrics collected on WAL redo operations
//
// We collect the time spent in actual WAL redo ('redo'), and time waiting
// for access to the postgres process ('wait') since there is only one for
// each tenant.
lazy_static! {
static ref WAL_REDO_TIME: Histogram =
register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo")
.expect("failed to define a metric");
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
"pageserver_wal_redo_wait_time",
"Time spent waiting for access to the WAL redo process"
)
.expect("failed to define a metric");
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
"pageserver_wal_records_replayed",
"Number of WAL records replayed"
)
.unwrap();
}
///
/// This is the real implementation that uses a Postgres process to
/// perform WAL replay. Only one thread can use the processs at a time,
@@ -156,6 +179,9 @@ impl WalRedoManager for PostgresRedoManager {
base_img: Option<Bytes>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
let start_time;
let lock_time;
let end_time;
let request = WalRedoRequest {
rel,
@@ -165,16 +191,29 @@ impl WalRedoManager for PostgresRedoManager {
records,
};
// launch the WAL redo process on first use
let mut process_guard = self.process.lock().unwrap();
if process_guard.is_none() {
let p = self.runtime
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
*process_guard = Some(p);
}
let process = (*process_guard).as_ref().unwrap();
start_time = Instant::now();
let result = {
let mut process_guard = self.process.lock().unwrap();
lock_time = Instant::now();
self.runtime.block_on(self.handle_apply_request(&process, &request))
// launch the WAL redo process on first use
if process_guard.is_none() {
let p = self
.runtime
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
*process_guard = Some(p);
}
let process = (*process_guard).as_ref().unwrap();
self.runtime
.block_on(self.handle_apply_request(&process, &request))
};
end_time = Instant::now();
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
result
}
}
@@ -257,6 +296,8 @@ impl PostgresRedoManager {
for record in records {
let mut buf = record.rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
@@ -557,6 +598,8 @@ impl PostgresRedoProcess {
for rec in records.iter() {
let r = rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
stdin
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.await?;