pageserver: additional starts for tenant startup

This commit is contained in:
John Spray
2023-12-15 14:15:38 +00:00
parent e8d4c214a3
commit e9a9e6be30
2 changed files with 33 additions and 9 deletions

View File

@@ -684,14 +684,34 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
.expect("Failed to register pageserver_startup_is_loading")
});
/// How long did tenants take to go from construction to active state?
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
/// like how long it took to load.
pub(crate) struct TenantMetrics {
/// How long did tenants take to go from construction to active state?
pub(crate) activation: Histogram,
pub(crate) preload: Histogram,
pub(crate) attach: Histogram,
}
pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| TenantMetrics {
activation: register_histogram!(
"pageserver_tenant_activation_seconds",
"Time taken by tenants to activate, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register pageserver_tenant_activation_seconds metric")
.expect("Failed to register metric"),
preload: register_histogram!(
"pageserver_tenant_preload_seconds",
"Time taken by tenants to load remote metadata on startup/attach, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
attach: register_histogram!(
"pageserver_tenant_attach_seconds",
"Time taken by tenants to intialize, after remote metadata is already loaded",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
});
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.

View File

@@ -61,7 +61,7 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
use crate::metrics::TENANT;
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
use crate::repository::GcResult;
use crate::task_mgr;
@@ -228,7 +228,7 @@ pub struct Tenant {
/// The value creation timestamp, used to measure activation delay, see:
/// <https://github.com/neondatabase/neon/issues/4025>
loading_started_at: Instant,
constructed_at: Instant,
state: watch::Sender<TenantState>,
@@ -702,6 +702,7 @@ impl Tenant {
AttachType::Normal
};
let preload_timer = TENANT.preload.start_timer();
let preload = match mode {
SpawnMode::Create => {None},
SpawnMode::Normal => {
@@ -724,6 +725,7 @@ impl Tenant {
}
}
};
preload_timer.observe_duration();
// Remote preload is complete.
drop(remote_load_completion);
@@ -775,6 +777,7 @@ impl Tenant {
}
}
let attach_timer = TENANT.attach.start_timer();
match tenant_clone.attach(preload, &ctx).await {
Ok(()) => {
info!("attach finished, activating");
@@ -784,6 +787,7 @@ impl Tenant {
make_broken(&tenant_clone, anyhow::anyhow!(e));
}
}
attach_timer.observe_duration();
// If we are doing an opportunistic warmup attachment at startup, initialize
// logical size at the same time. This is better than starting a bunch of idle tenants
@@ -1941,7 +1945,7 @@ impl Tenant {
);
*current_state = TenantState::Active;
let elapsed = self.loading_started_at.elapsed();
let elapsed = self.constructed_at.elapsed();
let total_timelines = timelines_accessor.len();
// log a lot of stuff, because some tenants sometimes suffer from user-visible
@@ -1956,7 +1960,7 @@ impl Tenant {
"activation attempt finished"
);
TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
TENANT.activation.observe(elapsed.as_secs_f64());
});
}
}
@@ -2564,7 +2568,7 @@ impl Tenant {
conf,
// using now here is good enough approximation to catch tenants with really long
// activation times.
loading_started_at: Instant::now(),
constructed_at: Instant::now(),
tenant_conf: Arc::new(RwLock::new(attached_conf)),
timelines: Mutex::new(HashMap::new()),
timelines_creating: Mutex::new(HashSet::new()),