diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 3569043101..a86993d7f3 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -9,8 +9,10 @@ use clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; +use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use remote_storage::GenericRemoteStorage; +use tokio::time::Instant; use tracing::*; use metrics::set_build_info_metric; @@ -224,6 +226,19 @@ fn start_pageserver( launch_ts: &'static LaunchTimestamp, conf: &'static PageServerConf, ) -> anyhow::Result<()> { + // Monotonic time for later calculating startup duration + let started_startup_at = Instant::now(); + + let startup_checkpoint = move |phase: &str, human_phase: &str| { + let elapsed = started_startup_at.elapsed(); + let secs = elapsed.as_secs_f64(); + STARTUP_DURATION.with_label_values(&[phase]).set(secs); + info!( + elapsed_ms = elapsed.as_millis(), + "{human_phase} ({secs:.3}s since start)" + ) + }; + // Print version and launch timestamp to the log, // and expose them as prometheus metrics. // A changed version string indicates changed software. @@ -333,6 +348,11 @@ fn start_pageserver( // Set up remote storage client let remote_storage = create_remote_storage_client(conf)?; + // Up to this point no significant I/O has been done: this should have been fast. Record + // duration prior to starting I/O intensive phase of startup. + startup_checkpoint("initial", "Starting loading tenants"); + STARTUP_IS_LOADING.set(1); + // Startup staging or optimizing: // // We want to minimize downtime for `page_service` connections, and trying not to overload @@ -358,7 +378,6 @@ fn start_pageserver( }; // Scan the local 'tenants/' directory and start loading the tenants - let init_started_at = std::time::Instant::now(); let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( @@ -376,18 +395,13 @@ fn start_pageserver( let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed")); init_done_rx.wait().await; + startup_checkpoint("initial_tenant_load", "Initial load completed"); + STARTUP_IS_LOADING.set(0); + // initial logical sizes can now start, as they were waiting on init_done_rx. scopeguard::ScopeGuard::into_inner(guard); - let init_done = std::time::Instant::now(); - let elapsed = init_done - init_started_at; - - tracing::info!( - elapsed_millis = elapsed.as_millis(), - "Initial load completed" - ); - let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait()); let timeout = conf.background_task_maximum_delay; @@ -396,12 +410,7 @@ fn start_pageserver( let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await { Ok(_) => { - let now = std::time::Instant::now(); - tracing::info!( - from_init_done_millis = (now - init_done).as_millis(), - from_init_millis = (now - init_started_at).as_millis(), - "Initial logical sizes completed" - ); + startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed"); None } Err(_) => { @@ -417,6 +426,7 @@ fn start_pageserver( // allow background jobs to start drop(background_jobs_can_start); + startup_checkpoint("background_jobs_can_start", "Starting background jobs"); if let Some(init_sizes_done) = init_sizes_done { // ending up here is not a bug; at the latest logical sizes will be queried by @@ -426,14 +436,11 @@ fn start_pageserver( scopeguard::ScopeGuard::into_inner(guard); - let now = std::time::Instant::now(); - tracing::info!( - from_init_done_millis = (now - init_done).as_millis(), - from_init_millis = (now - init_started_at).as_millis(), - "Initial logical sizes completed after timeout (background jobs already started)" - ); + startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)"); } + + startup_checkpoint("complete", "Startup complete"); }; async move { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 1b1d0acaee..a3a9497d57 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub mod keyspace; -pub(crate) mod metrics; +pub mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 84f2d923e2..f64d11d058 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,9 +1,9 @@ use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ - register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, - register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, - register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter, - IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, + register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, + register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram, + HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use strum::VariantNames; @@ -394,6 +394,35 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| .expect("failed to define a metric") }); +/// How long did we take to start up? Broken down by labels to describe +/// different phases of startup. +pub static STARTUP_DURATION: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_startup_duration_seconds", + "Time taken by phases of pageserver startup, in seconds", + &["phase"] + ) + .expect("Failed to register pageserver_startup_duration_seconds metric") +}); + +pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_startup_is_loading", + "1 while in initial startup load of tenants, 0 at other times" + ) + .expect("Failed to register pageserver_startup_is_loading") +}); + +/// How long did tenants take to go from construction to active state? +pub(crate) static TENANT_ACTIVATION: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_tenant_activation_seconds", + "Time taken by tenants to activate, in seconds", + CRITICAL_OP_BUCKETS.into() + ) + .expect("Failed to register pageserver_tenant_activation_seconds metric") +}); + /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. #[derive(Debug)] pub struct EvictionsWithLowResidenceDuration { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 9fd6005330..a9885a5f70 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -56,6 +56,7 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir; use crate::is_uninit_mark; +use crate::metrics::TENANT_ACTIVATION; use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; @@ -1639,6 +1640,8 @@ impl Tenant { post_state = <&'static str>::from(&*current_state), "activation attempt finished" ); + + TENANT_ACTIVATION.observe(elapsed.as_secs_f64()); }); } } diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index b8ddbe3ec1..1e41ebd15b 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -12,6 +12,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env.neon_cli.create_branch("test_pageserver_restart") endpoint = env.endpoints.create_start("test_pageserver_restart") + pageserver_http = env.pageserver.http_client() pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -52,8 +53,11 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # pageserver does if a compute node connects and sends a request for the tenant # while it's still in Loading state. (It waits for the loading to finish, and then # processes the request.) + tenant_load_delay_ms = 5000 env.pageserver.stop() - env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-loading-tenant=return(5000)"}) + env.pageserver.start( + extra_env_vars={"FAILPOINTS": f"before-loading-tenant=return({tenant_load_delay_ms})"} + ) # Check that it's in Loading state client = env.pageserver.http_client() @@ -65,6 +69,41 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): cur.execute("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) + # Validate startup time metrics + metrics = pageserver_http.get_metrics() + + # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value + expectations = { + "initial": lambda t, p: True, # make no assumptions about the initial time point, it could be 0 in theory + # Initial tenant load should reflect the delay we injected + "initial_tenant_load": lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p, + # Subsequent steps should occur in expected order + "initial_logical_sizes": lambda t, p: t > 0 and t >= p, + "background_jobs_can_start": lambda t, p: t > 0 and t >= p, + "complete": lambda t, p: t > 0 and t >= p, + } + + prev_value = None + for sample in metrics.query_all("pageserver_startup_duration_seconds"): + labels = dict(sample.labels) + phase = labels["phase"] + log.info(f"metric {phase}={sample.value}") + assert phase in expectations, f"Unexpected phase {phase}" + assert expectations[phase]( + sample.value, prev_value + ), f"Unexpected value for {phase}: {sample.value}" + prev_value = sample.value + + # Startup is complete, this metric should exist but be zero + assert metrics.query_one("pageserver_startup_is_loading").value == 0 + + # This histogram should have been populated, although we aren't specific about exactly + # which bucket values: just nonzero + assert any( + bucket.value > 0 + for bucket in metrics.query_all("pageserver_tenant_activation_seconds_bucket") + ) + # Test that repeatedly kills and restarts the page server, while the # safekeeper and compute node keep running.