From e9a9e6be30aad84bd1e4ca87704ffd35f8a2f627 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Dec 2023 14:15:38 +0000 Subject: [PATCH] pageserver: additional starts for tenant startup --- pageserver/src/metrics.rs | 28 ++++++++++++++++++++++++---- pageserver/src/tenant.rs | 14 +++++++++----- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ba6fd00bd1..32b41db183 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -684,14 +684,34 @@ pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_startup_is_loading") }); -/// How long did tenants take to go from construction to active state? -pub(crate) static TENANT_ACTIVATION: Lazy = Lazy::new(|| { - register_histogram!( +/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things +/// like how long it took to load. +pub(crate) struct TenantMetrics { + /// How long did tenants take to go from construction to active state? + pub(crate) activation: Histogram, + pub(crate) preload: Histogram, + pub(crate) attach: Histogram, +} + +pub(crate) static TENANT: Lazy = Lazy::new(|| TenantMetrics { + activation: register_histogram!( "pageserver_tenant_activation_seconds", "Time taken by tenants to activate, in seconds", CRITICAL_OP_BUCKETS.into() ) - .expect("Failed to register pageserver_tenant_activation_seconds metric") + .expect("Failed to register metric"), + preload: register_histogram!( + "pageserver_tenant_preload_seconds", + "Time taken by tenants to load remote metadata on startup/attach, in seconds", + CRITICAL_OP_BUCKETS.into() + ) + .expect("Failed to register metric"), + attach: register_histogram!( + "pageserver_tenant_attach_seconds", + "Time taken by tenants to intialize, after remote metadata is already loaded", + CRITICAL_OP_BUCKETS.into() + ) + .expect("Failed to register metric"), }); /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 49652f57d2..4baf02f7aa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -61,7 +61,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; -use crate::metrics::TENANT_ACTIVATION; +use crate::metrics::TENANT; use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; @@ -228,7 +228,7 @@ pub struct Tenant { /// The value creation timestamp, used to measure activation delay, see: /// - loading_started_at: Instant, + constructed_at: Instant, state: watch::Sender, @@ -702,6 +702,7 @@ impl Tenant { AttachType::Normal }; + let preload_timer = TENANT.preload.start_timer(); let preload = match mode { SpawnMode::Create => {None}, SpawnMode::Normal => { @@ -724,6 +725,7 @@ impl Tenant { } } }; + preload_timer.observe_duration(); // Remote preload is complete. drop(remote_load_completion); @@ -775,6 +777,7 @@ impl Tenant { } } + let attach_timer = TENANT.attach.start_timer(); match tenant_clone.attach(preload, &ctx).await { Ok(()) => { info!("attach finished, activating"); @@ -784,6 +787,7 @@ impl Tenant { make_broken(&tenant_clone, anyhow::anyhow!(e)); } } + attach_timer.observe_duration(); // If we are doing an opportunistic warmup attachment at startup, initialize // logical size at the same time. This is better than starting a bunch of idle tenants @@ -1941,7 +1945,7 @@ impl Tenant { ); *current_state = TenantState::Active; - let elapsed = self.loading_started_at.elapsed(); + let elapsed = self.constructed_at.elapsed(); let total_timelines = timelines_accessor.len(); // log a lot of stuff, because some tenants sometimes suffer from user-visible @@ -1956,7 +1960,7 @@ impl Tenant { "activation attempt finished" ); - TENANT_ACTIVATION.observe(elapsed.as_secs_f64()); + TENANT.activation.observe(elapsed.as_secs_f64()); }); } } @@ -2564,7 +2568,7 @@ impl Tenant { conf, // using now here is good enough approximation to catch tenants with really long // activation times. - loading_started_at: Instant::now(), + constructed_at: Instant::now(), tenant_conf: Arc::new(RwLock::new(attached_conf)), timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()),