mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 06:52:55 +00:00
Add more redo metrics: (#2645)
- Measure size of redo WAL (new histogram), with bounds between 24B-32kB - Add 2 more buckets at the upper end of the redo time histogram We often (>0.1% of several hours each day) take more than 250ms to do the redo round-trip to the postgres process. We need to measure these redo times more precisely.
This commit is contained in:
@@ -277,11 +277,15 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
/// smallest redo processing times. These buckets allow us to measure down
|
||||
/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
|
||||
/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
|
||||
///
|
||||
/// Values up to 1s are recorded because metrics show that we have redo
|
||||
/// durations and lock times larger than 0.250s.
|
||||
macro_rules! redo_histogram_time_buckets {
|
||||
() => {
|
||||
vec![
|
||||
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
|
||||
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
|
||||
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
|
||||
1.000_000,
|
||||
]
|
||||
};
|
||||
}
|
||||
@@ -296,6 +300,17 @@ macro_rules! redo_histogram_count_buckets {
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! redo_bytes_histogram_count_buckets {
|
||||
() => {
|
||||
// powers of (2^.5), from 2^4.5 to 2^15 (22 buckets)
|
||||
// rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too.
|
||||
vec![
|
||||
24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0,
|
||||
2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0,
|
||||
]
|
||||
};
|
||||
}
|
||||
|
||||
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_seconds",
|
||||
@@ -323,6 +338,15 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_bytes_histogram",
|
||||
"Histogram of number of records replayed per redo",
|
||||
redo_bytes_histogram_count_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_replayed_wal_records_total",
|
||||
|
||||
@@ -39,7 +39,8 @@ use utils::crashsafe_dir::path_with_suffix_extension;
|
||||
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
|
||||
|
||||
use crate::metrics::{
|
||||
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME,
|
||||
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
|
||||
WAL_REDO_WAIT_TIME,
|
||||
};
|
||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
@@ -244,12 +245,23 @@ impl PostgresRedoManager {
|
||||
let end_time = Instant::now();
|
||||
let duration = end_time.duration_since(lock_time);
|
||||
|
||||
let len = records.len();
|
||||
let nbytes = records.iter().fold(0, |acumulator, record| {
|
||||
acumulator
|
||||
+ match &record.1 {
|
||||
NeonWalRecord::Postgres { rec, .. } => rec.len(),
|
||||
_ => unreachable!("Only PostgreSQL records are accepted in this batch"),
|
||||
}
|
||||
});
|
||||
|
||||
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
||||
WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64);
|
||||
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
|
||||
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
|
||||
|
||||
debug!(
|
||||
"postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
|
||||
len,
|
||||
nbytes,
|
||||
duration.as_micros(),
|
||||
lsn
|
||||
);
|
||||
@@ -258,8 +270,9 @@ impl PostgresRedoManager {
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
error!(
|
||||
"error applying {} WAL records to reconstruct page image at LSN {}",
|
||||
"error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
nbytes,
|
||||
lsn
|
||||
);
|
||||
let process = process_guard.take().unwrap();
|
||||
|
||||
Reference in New Issue
Block a user