pageserver: expose prometheus metrics for startup time (#4893)

## Problem

Currently to know how long pageserver startup took requires inspecting
logs.

## Summary of changes

`pageserver_startup_duration_ms` metric is added, with label `phase` for
different phases of startup.

These are broken down by phase, where the phases correspond to the
existing wait points in the code:
- Start of doing I/O
- When tenant load is done
- When initial size calculation is done
- When background jobs start
- Then "complete" when everything is done.

`pageserver_startup_is_loading` is a 0/1 gauge that indicates whether we are in the initial load of tenants.

`pageserver_tenant_activation_seconds` is a histogram of time in seconds taken to activate a tenant.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
This commit is contained in:
John Spray
2023-08-08 10:41:37 +01:00
committed by GitHub
parent 6d17d6c775
commit 4dc644612b
5 changed files with 105 additions and 27 deletions

View File

@@ -9,8 +9,10 @@ use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;
use metrics::set_build_info_metric;
@@ -224,6 +226,19 @@ fn start_pageserver(
launch_ts: &'static LaunchTimestamp,
conf: &'static PageServerConf,
) -> anyhow::Result<()> {
// Monotonic time for later calculating startup duration
let started_startup_at = Instant::now();
let startup_checkpoint = move |phase: &str, human_phase: &str| {
let elapsed = started_startup_at.elapsed();
let secs = elapsed.as_secs_f64();
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
info!(
elapsed_ms = elapsed.as_millis(),
"{human_phase} ({secs:.3}s since start)"
)
};
// Print version and launch timestamp to the log,
// and expose them as prometheus metrics.
// A changed version string indicates changed software.
@@ -333,6 +348,11 @@ fn start_pageserver(
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
STARTUP_IS_LOADING.set(1);
// Startup staging or optimizing:
//
// We want to minimize downtime for `page_service` connections, and trying not to overload
@@ -358,7 +378,6 @@ fn start_pageserver(
};
// Scan the local 'tenants/' directory and start loading the tenants
let init_started_at = std::time::Instant::now();
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -376,18 +395,13 @@ fn start_pageserver(
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
init_done_rx.wait().await;
startup_checkpoint("initial_tenant_load", "Initial load completed");
STARTUP_IS_LOADING.set(0);
// initial logical sizes can now start, as they were waiting on init_done_rx.
scopeguard::ScopeGuard::into_inner(guard);
let init_done = std::time::Instant::now();
let elapsed = init_done - init_started_at;
tracing::info!(
elapsed_millis = elapsed.as_millis(),
"Initial load completed"
);
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
let timeout = conf.background_task_maximum_delay;
@@ -396,12 +410,7 @@ fn start_pageserver(
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
Ok(_) => {
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed"
);
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
None
}
Err(_) => {
@@ -417,6 +426,7 @@ fn start_pageserver(
// allow background jobs to start
drop(background_jobs_can_start);
startup_checkpoint("background_jobs_can_start", "Starting background jobs");
if let Some(init_sizes_done) = init_sizes_done {
// ending up here is not a bug; at the latest logical sizes will be queried by
@@ -426,14 +436,11 @@ fn start_pageserver(
scopeguard::ScopeGuard::into_inner(guard);
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed after timeout (background jobs already started)"
);
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
}
startup_checkpoint("complete", "Startup complete");
};
async move {

View File

@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
pub(crate) mod metrics;
pub mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;

View File

@@ -1,9 +1,9 @@
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use strum::VariantNames;
@@ -394,6 +394,35 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
.expect("failed to define a metric")
});
/// How long did we take to start up? Broken down by labels to describe
/// different phases of startup.
pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
register_gauge_vec!(
"pageserver_startup_duration_seconds",
"Time taken by phases of pageserver startup, in seconds",
&["phase"]
)
.expect("Failed to register pageserver_startup_duration_seconds metric")
});
pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_startup_is_loading",
"1 while in initial startup load of tenants, 0 at other times"
)
.expect("Failed to register pageserver_startup_is_loading")
});
/// How long did tenants take to go from construction to active state?
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_tenant_activation_seconds",
"Time taken by tenants to activate, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register pageserver_tenant_activation_seconds metric")
});
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub struct EvictionsWithLowResidenceDuration {

View File

@@ -56,6 +56,7 @@ use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
use crate::repository::GcResult;
use crate::task_mgr;
@@ -1639,6 +1640,8 @@ impl Tenant {
post_state = <&'static str>::from(&*current_state),
"activation attempt finished"
);
TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
});
}
}

View File

@@ -12,6 +12,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
env.neon_cli.create_branch("test_pageserver_restart")
endpoint = env.endpoints.create_start("test_pageserver_restart")
pageserver_http = env.pageserver.http_client()
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
@@ -52,8 +53,11 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
# pageserver does if a compute node connects and sends a request for the tenant
# while it's still in Loading state. (It waits for the loading to finish, and then
# processes the request.)
tenant_load_delay_ms = 5000
env.pageserver.stop()
env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-loading-tenant=return(5000)"})
env.pageserver.start(
extra_env_vars={"FAILPOINTS": f"before-loading-tenant=return({tenant_load_delay_ms})"}
)
# Check that it's in Loading state
client = env.pageserver.http_client()
@@ -65,6 +69,41 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)
# Validate startup time metrics
metrics = pageserver_http.get_metrics()
# Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value
expectations = {
"initial": lambda t, p: True, # make no assumptions about the initial time point, it could be 0 in theory
# Initial tenant load should reflect the delay we injected
"initial_tenant_load": lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p,
# Subsequent steps should occur in expected order
"initial_logical_sizes": lambda t, p: t > 0 and t >= p,
"background_jobs_can_start": lambda t, p: t > 0 and t >= p,
"complete": lambda t, p: t > 0 and t >= p,
}
prev_value = None
for sample in metrics.query_all("pageserver_startup_duration_seconds"):
labels = dict(sample.labels)
phase = labels["phase"]
log.info(f"metric {phase}={sample.value}")
assert phase in expectations, f"Unexpected phase {phase}"
assert expectations[phase](
sample.value, prev_value
), f"Unexpected value for {phase}: {sample.value}"
prev_value = sample.value
# Startup is complete, this metric should exist but be zero
assert metrics.query_one("pageserver_startup_is_loading").value == 0
# This histogram should have been populated, although we aren't specific about exactly
# which bucket values: just nonzero
assert any(
bucket.value > 0
for bucket in metrics.query_all("pageserver_tenant_activation_seconds_bucket")
)
# Test that repeatedly kills and restarts the page server, while the
# safekeeper and compute node keep running.