diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 32b41db183..45c01b71d1 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -686,14 +686,23 @@ pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { /// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things /// like how long it took to load. +/// +/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant +/// metrics are rather expensive, and usually fine grained stuff makes more sense +/// at a timeline level than tenant level. pub(crate) struct TenantMetrics { /// How long did tenants take to go from construction to active state? pub(crate) activation: Histogram, pub(crate) preload: Histogram, pub(crate) attach: Histogram, + + /// How many tenants are included in the initial startup of the pageserver? + pub(crate) startup_scheduled: IntCounter, + pub(crate) startup_complete: IntCounter, } -pub(crate) static TENANT: Lazy = Lazy::new(|| TenantMetrics { +pub(crate) static TENANT: Lazy = Lazy::new(|| { + TenantMetrics { activation: register_histogram!( "pageserver_tenant_activation_seconds", "Time taken by tenants to activate, in seconds", @@ -712,6 +721,17 @@ pub(crate) static TENANT: Lazy = Lazy::new(|| TenantMetrics { CRITICAL_OP_BUCKETS.into() ) .expect("Failed to register metric"), + startup_scheduled: register_int_counter!( + "pageserver_tenant_startup_scheduled", + "Number of tenants included in pageserver startup (doesn't count tenants attached later)" + ).expect("Failed to register metric"), + startup_complete: register_int_counter!( + "pageserver_tenant_startup_complete", + "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \ + should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \ + tenants: such cases will lead to this metric never reaching the scheduled count." 
+ ).expect("Failed to register metric"), +} }); /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. @@ -2233,6 +2253,9 @@ pub fn preinitialize_metrics() { // Deletion queue stats Lazy::force(&DELETION_QUEUE); + // Tenant stats + Lazy::force(&TENANT); + // Tenant manager stats Lazy::force(&TENANT_MANAGER); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4baf02f7aa..6279f05571 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -629,6 +629,11 @@ impl Tenant { "attach tenant", false, async move { + scopeguard::defer! { + tracing::info!("Increment complete count"); + TENANT.startup_complete.inc(); + } + // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. let make_broken = |t: &Tenant, err: anyhow::Error| { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 0108717e65..e38dff5a0c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -28,7 +28,7 @@ use crate::control_plane_client::{ ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; -use crate::metrics::TENANT_MANAGER as METRICS; +use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt, @@ -434,6 +434,7 @@ pub async fn init_tenant_mgr( tenant_configs.len(), init_order.warmup_limit.available_permits() ); + TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); // Construct `Tenant` objects and start them running for (tenant_shard_id, location_conf) in tenant_configs { diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 19c29de44b..3a504a3eca 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -757,7 +757,7 @@ def 
test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Empty tenants are not subject to waiting for logical size calculations, because # those hapen on timeline level - branch_name = f"{tenant_id}-main" + branch_name = "main" timeline_id = TimelineId.generate() env.neon_cli.create_timeline( new_branch_name=branch_name, tenant_id=tenant_id, timeline_id=timeline_id @@ -795,15 +795,22 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert len([s for s in states.values() if s == "Active"]) == expect_activated assert len([s for s in states.values() if s == "Attaching"]) == n_tenants - expect_activated + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants + ) + + # This is zero, and subsequent checks are expect_activated - 1, because this counter does not + # count how many tenants are Active, it counts how many have finished warmup. The first tenant + # that reached Active is still stuck in its logical size calculation, and has therefore not finished warmup. + assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == 0 + # If a client accesses one of the blocked tenants, it should skip waiting for warmup and # go active as fast as it can. 
stuck_tenant_id = list( [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] )[0][0] - endpoint = env.endpoints.create_start( - branch_name=f"{stuck_tenant_id}-main", tenant_id=stuck_tenant_id - ) + endpoint = env.endpoints.create_start(branch_name="main", tenant_id=stuck_tenant_id) endpoint.safe_psql_many( [ "CREATE TABLE foo (x INTEGER)", @@ -815,6 +822,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # That one that we successfully accessed is now Active expect_activated += 1 assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") + == expect_activated - 1 + ) # The ones we didn't touch are still in Attaching assert ( @@ -834,6 +845,11 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): == n_tenants - expect_activated ) + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") + == expect_activated - 1 + ) + # When we unblock logical size calculation, all tenants should proceed to active state via # the warmup route. pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) @@ -848,3 +864,8 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() wait_until(10, 1, all_active) + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants + ) + assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants