mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 03:30:36 +00:00
metrics: smgr query time: add a pre-aggregated histogram (#5064)
When doing global queries in VictoriaMetrics, the per-timeline histograms make us run into cardinality limits. We don't want to give them up just yet because we don't have an alternative for drilling down on timeline-specific performance issues. So, add a pre-aggregated histogram and add observations to it whenever we add observations to the per-timeline histogram. While we're at it, switch to using a strummed enum for the operation type names.
This commit is contained in:
committed by
GitHub
parent
933a869f00
commit
8cd20485f8
@@ -6,7 +6,7 @@ use metrics::{
|
||||
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use strum::VariantNames;
|
||||
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -570,23 +570,160 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
|
||||
"get_rel_exists",
|
||||
"get_rel_size",
|
||||
"get_page_at_lsn",
|
||||
"get_db_size",
|
||||
];
|
||||
#[derive(Debug)]
|
||||
struct GlobalAndPerTimelineHistogram {
|
||||
global: Histogram,
|
||||
per_tenant_timeline: Histogram,
|
||||
}
|
||||
|
||||
pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
impl GlobalAndPerTimelineHistogram {
|
||||
fn observe(&self, value: f64) {
|
||||
self.global.observe(value);
|
||||
self.per_tenant_timeline.observe(value);
|
||||
}
|
||||
}
|
||||
|
||||
struct GlobalAndPerTimelineHistogramTimer<'a> {
|
||||
h: &'a GlobalAndPerTimelineHistogram,
|
||||
start: std::time::Instant,
|
||||
}
|
||||
|
||||
impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
self.h.observe(elapsed.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
IntoStaticStr,
|
||||
strum_macros::EnumCount,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::FromRepr,
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum SmgrQueryType {
|
||||
GetRelExists,
|
||||
GetRelSize,
|
||||
GetPageAtLsn,
|
||||
GetDbSize,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct SmgrQueryTimePerTimeline {
|
||||
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
|
||||
}
|
||||
|
||||
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
"Time spent on smgr query handling",
|
||||
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds_global",
|
||||
"Time spent on smgr query handling, aggregated by query type.",
|
||||
&["smgr_query_type"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let metrics = std::array::from_fn(|i| {
|
||||
let op = SmgrQueryType::from_repr(i).unwrap();
|
||||
let global = SMGR_QUERY_TIME_GLOBAL
|
||||
.get_metric_with_label_values(&[op.into()])
|
||||
.unwrap();
|
||||
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
|
||||
.get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
GlobalAndPerTimelineHistogram {
|
||||
global,
|
||||
per_tenant_timeline,
|
||||
}
|
||||
});
|
||||
Self { metrics }
|
||||
}
|
||||
pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
|
||||
let metric = &self.metrics[op as usize];
|
||||
GlobalAndPerTimelineHistogramTimer {
|
||||
h: metric,
|
||||
start: std::time::Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod smgr_query_time_tests {
|
||||
use strum::IntoEnumIterator;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// Regression test, we used hard-coded string constants before using an enum.
|
||||
#[test]
|
||||
fn op_label_name() {
|
||||
use super::SmgrQueryType::*;
|
||||
let expect: [(super::SmgrQueryType, &'static str); 4] = [
|
||||
(GetRelExists, "get_rel_exists"),
|
||||
(GetRelSize, "get_rel_size"),
|
||||
(GetPageAtLsn, "get_page_at_lsn"),
|
||||
(GetDbSize, "get_db_size"),
|
||||
];
|
||||
for (op, expect) in expect {
|
||||
let actual: &'static str = op.into();
|
||||
assert_eq!(actual, expect);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() {
|
||||
let ops: Vec<_> = super::SmgrQueryType::iter().collect();
|
||||
|
||||
for op in &ops {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
|
||||
let get_counts = || {
|
||||
let global: u64 = ops
|
||||
.iter()
|
||||
.map(|op| metrics.metrics[*op as usize].global.get_sample_count())
|
||||
.sum();
|
||||
let per_tenant_timeline: u64 = ops
|
||||
.iter()
|
||||
.map(|op| {
|
||||
metrics.metrics[*op as usize]
|
||||
.per_tenant_timeline
|
||||
.get_sample_count()
|
||||
})
|
||||
.sum();
|
||||
(global, per_tenant_timeline)
|
||||
};
|
||||
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(pre_per_tenant_timeline, 0);
|
||||
|
||||
let timer = metrics.start_timer(*op);
|
||||
drop(timer);
|
||||
|
||||
let (post_global, post_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(post_per_tenant_timeline, 1);
|
||||
assert!(post_global > pre_global);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep in sync with control plane Go code so that we can validate
|
||||
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
@@ -1045,6 +1182,12 @@ impl Drop for TimelineMetrics {
|
||||
.write()
|
||||
.unwrap()
|
||||
.remove(tenant_id, timeline_id);
|
||||
|
||||
// The following metrics are born outside of the TimelineMetrics lifecycle but still
|
||||
// removed at the end of it. The idea is to have the metrics outlive the
|
||||
// entity during which they're observed, e.g., the smgr metrics shall
|
||||
// outlive an individual smgr connection, but not the timeline.
|
||||
|
||||
for op in StorageTimeOperation::VARIANTS {
|
||||
let _ =
|
||||
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
@@ -1056,8 +1199,12 @@ impl Drop for TimelineMetrics {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in SMGR_QUERY_TIME_OPERATIONS {
|
||||
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
for op in SmgrQueryType::iter() {
|
||||
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
op.into(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,7 +50,8 @@ use crate::basebackup;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant;
|
||||
@@ -306,39 +307,6 @@ async fn page_service_conn_main(
|
||||
}
|
||||
}
|
||||
|
||||
struct PageRequestMetrics {
|
||||
get_rel_exists: metrics::Histogram,
|
||||
get_rel_size: metrics::Histogram,
|
||||
get_page_at_lsn: metrics::Histogram,
|
||||
get_db_size: metrics::Histogram,
|
||||
}
|
||||
|
||||
impl PageRequestMetrics {
|
||||
fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
|
||||
let get_rel_exists =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]);
|
||||
|
||||
let get_rel_size =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]);
|
||||
|
||||
let get_page_at_lsn =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]);
|
||||
|
||||
let get_db_size =
|
||||
SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]);
|
||||
|
||||
Self {
|
||||
get_rel_exists,
|
||||
get_rel_size,
|
||||
get_page_at_lsn,
|
||||
get_db_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct PageServerHandler {
|
||||
_conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
@@ -406,7 +374,7 @@ impl PageServerHandler {
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
|
||||
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
|
||||
loop {
|
||||
let msg = tokio::select! {
|
||||
@@ -446,21 +414,21 @@ impl PageServerHandler {
|
||||
|
||||
let response = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let _timer = metrics.get_rel_exists.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
|
||||
self.handle_get_rel_exists_request(&timeline, &req, &ctx)
|
||||
.await
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let _timer = metrics.get_rel_size.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
|
||||
self.handle_get_nblocks_request(&timeline, &req, &ctx).await
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
let _timer = metrics.get_page_at_lsn.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
|
||||
self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
|
||||
.await
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
let _timer = metrics.get_db_size.start_timer();
|
||||
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
|
||||
self.handle_db_size_request(&timeline, &req, &ctx).await
|
||||
}
|
||||
};
|
||||
@@ -984,8 +952,8 @@ where
|
||||
false
|
||||
};
|
||||
|
||||
metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*crate::metrics::BASEBACKUP_QUERY_TIME,
|
||||
::metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
|
||||
@@ -70,6 +70,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_getpage_reconstruct_seconds_count",
|
||||
"pageserver_getpage_reconstruct_seconds_sum",
|
||||
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
|
||||
*histogram("pageserver_smgr_query_seconds_global"),
|
||||
*histogram("pageserver_read_num_fs_layers"),
|
||||
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
|
||||
*histogram("pageserver_wait_lsn_seconds"),
|
||||
|
||||
Reference in New Issue
Block a user