mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 20:42:54 +00:00
feat(walredo): various observability improvements (#6573)
- log when we start walredo process - include tenant shard id in walredo argv - dump some basic walredo state in tenant details api - more suitable walredo process launch histogram buckets - avoid duplicate tracing labels in walredo launch spans
This commit is contained in:
committed by
GitHub
parent
be30388901
commit
7a70ef991f
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -3494,6 +3494,7 @@ dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"const_format",
|
||||
"enum-map",
|
||||
"hex",
|
||||
|
||||
@@ -20,6 +20,7 @@ strum_macros.workspace = true
|
||||
hex.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -454,6 +454,8 @@ pub struct TenantDetails {
|
||||
#[serde(flatten)]
|
||||
pub tenant_info: TenantInfo,
|
||||
|
||||
pub walredo: Option<WalRedoManagerStatus>,
|
||||
|
||||
pub timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
@@ -641,6 +643,12 @@ pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerStatus {
|
||||
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
pub pid: Option<u32>,
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub enum PagestreamFeMessage {
|
||||
|
||||
@@ -959,6 +959,7 @@ async fn tenant_status(
|
||||
attachment_status: state.attachment_status(),
|
||||
generation: tenant.generation().into(),
|
||||
},
|
||||
walredo: tenant.wal_redo_manager_status(),
|
||||
timelines: tenant.list_timeline_ids(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1651,11 +1651,18 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_process_launch_duration",
|
||||
"Histogram of the duration of successful WalRedoProcess::launch calls",
|
||||
redo_histogram_time_buckets!(),
|
||||
vec![
|
||||
0.0002, 0.0004, 0.0006, 0.0008, 0.0010,
|
||||
0.0020, 0.0040, 0.0060, 0.0080, 0.0100,
|
||||
0.0200, 0.0400, 0.0600, 0.0800, 0.1000,
|
||||
0.2000, 0.4000, 0.6000, 0.8000, 1.0000,
|
||||
1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000
|
||||
],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
@@ -20,6 +20,7 @@ use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
@@ -364,6 +365,14 @@ impl WalRedoManager {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
match self {
|
||||
WalRedoManager::Prod(m) => m.status(),
|
||||
#[cfg(test)]
|
||||
WalRedoManager::Test(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
|
||||
@@ -1957,6 +1966,10 @@ impl Tenant {
|
||||
self.generation
|
||||
}
|
||||
|
||||
pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
|
||||
self.walredo_mgr.status()
|
||||
}
|
||||
|
||||
/// Changes tenant status to active, unless shutdown was already requested.
|
||||
///
|
||||
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
|
||||
|
||||
@@ -22,6 +22,7 @@ use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use nix::poll::*;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use serde::Serialize;
|
||||
use std::collections::VecDeque;
|
||||
@@ -179,6 +180,20 @@ impl PostgresRedoManager {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
Some(WalRedoManagerStatus {
|
||||
last_redo_at: {
|
||||
let at = *self.last_redo_at.lock().unwrap();
|
||||
at.and_then(|at| {
|
||||
let age = at.elapsed();
|
||||
// map any chrono errors silently to None here
|
||||
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
|
||||
})
|
||||
},
|
||||
pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl PostgresRedoManager {
|
||||
@@ -243,8 +258,7 @@ impl PostgresRedoManager {
|
||||
let mut proc_guard = self.redo_process.write().unwrap();
|
||||
match &*proc_guard {
|
||||
None => {
|
||||
let timer =
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
WalRedoProcess::launch(
|
||||
self.conf,
|
||||
@@ -253,7 +267,14 @@ impl PostgresRedoManager {
|
||||
)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
timer.observe_duration();
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
|
||||
.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
*proc_guard = Some(Arc::clone(&proc));
|
||||
proc
|
||||
}
|
||||
@@ -669,7 +690,11 @@ impl WalRedoProcess {
|
||||
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
|
||||
Reference in New Issue
Block a user