mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 09:22:55 +00:00
counters metrics for tenants completing warmup
This commit is contained in:
@@ -686,14 +686,23 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
|
||||
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
|
||||
/// like how long it took to load.
|
||||
///
|
||||
/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
|
||||
/// metrics are rather expensive, and usually fine grained stuff makes more sense
|
||||
/// at a timeline level than tenant level.
|
||||
pub(crate) struct TenantMetrics {
|
||||
/// How long did tenants take to go from construction to active state?
|
||||
pub(crate) activation: Histogram,
|
||||
pub(crate) preload: Histogram,
|
||||
pub(crate) attach: Histogram,
|
||||
|
||||
/// How many tenants are included in the initial startup of the pageserver?
|
||||
pub(crate) startup_scheduled: IntCounter,
|
||||
pub(crate) startup_complete: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| TenantMetrics {
|
||||
pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
|
||||
TenantMetrics {
|
||||
activation: register_histogram!(
|
||||
"pageserver_tenant_activation_seconds",
|
||||
"Time taken by tenants to activate, in seconds",
|
||||
@@ -712,6 +721,17 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| TenantMetrics {
|
||||
CRITICAL_OP_BUCKETS.into()
|
||||
)
|
||||
.expect("Failed to register metric"),
|
||||
startup_scheduled: register_int_counter!(
|
||||
"pageserver_tenant_startup_scheduled",
|
||||
"Number of tenants included in pageserver startup (doesn't count tenants attached later)"
|
||||
).expect("Failed to register metric"),
|
||||
startup_complete: register_int_counter!(
|
||||
"pageserver_tenant_startup_complete",
|
||||
"Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
|
||||
should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \
|
||||
tenants: such cases will lead to this metric never reaching the scheduled count."
|
||||
).expect("Failed to register metric"),
|
||||
}
|
||||
});
|
||||
|
||||
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
@@ -2233,6 +2253,9 @@ pub fn preinitialize_metrics() {
|
||||
// Deletion queue stats
|
||||
Lazy::force(&DELETION_QUEUE);
|
||||
|
||||
// Tenant stats
|
||||
Lazy::force(&TENANT);
|
||||
|
||||
// Tenant manager stats
|
||||
Lazy::force(&TENANT_MANAGER);
|
||||
|
||||
|
||||
@@ -629,6 +629,11 @@ impl Tenant {
|
||||
"attach tenant",
|
||||
false,
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
tracing::info!("Increment complete count");
|
||||
TENANT.startup_complete.inc();
|
||||
}
|
||||
|
||||
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
|
||||
let make_broken =
|
||||
|t: &Tenant, err: anyhow::Error| {
|
||||
|
||||
@@ -28,7 +28,7 @@ use crate::control_plane_client::{
|
||||
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::TENANT_MANAGER as METRICS;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
|
||||
@@ -434,6 +434,7 @@ pub async fn init_tenant_mgr(
|
||||
tenant_configs.len(),
|
||||
init_order.warmup_limit.available_permits()
|
||||
);
|
||||
TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
|
||||
|
||||
// Construct `Tenant` objects and start them running
|
||||
for (tenant_shard_id, location_conf) in tenant_configs {
|
||||
|
||||
@@ -757,7 +757,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Empty tenants are not subject to waiting for logical size calculations, because
|
||||
# those happen on timeline level
|
||||
branch_name = f"{tenant_id}-main"
|
||||
branch_name = "main"
|
||||
timeline_id = TimelineId.generate()
|
||||
env.neon_cli.create_timeline(
|
||||
new_branch_name=branch_name, tenant_id=tenant_id, timeline_id=timeline_id
|
||||
@@ -795,15 +795,22 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
assert len([s for s in states.values() if s == "Active"]) == expect_activated
|
||||
assert len([s for s in states.values() if s == "Attaching"]) == n_tenants - expect_activated
|
||||
|
||||
assert (
|
||||
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
|
||||
)
|
||||
|
||||
# This is zero, and subsequent checks are expect_activated - 1, because this counter does not
|
||||
# count how many tenants are Active, it counts how many have finished warmup. The first tenant
|
||||
# that reached Active is still stuck in its local size calculation, and has therefore not finished warmup.
|
||||
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == 0
|
||||
|
||||
# If a client accesses one of the blocked tenants, it should skip waiting for warmup and
|
||||
# go active as fast as it can.
|
||||
stuck_tenant_id = list(
|
||||
[(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
|
||||
)[0][0]
|
||||
|
||||
endpoint = env.endpoints.create_start(
|
||||
branch_name=f"{stuck_tenant_id}-main", tenant_id=stuck_tenant_id
|
||||
)
|
||||
endpoint = env.endpoints.create_start(branch_name="main", tenant_id=stuck_tenant_id)
|
||||
endpoint.safe_psql_many(
|
||||
[
|
||||
"CREATE TABLE foo (x INTEGER)",
|
||||
@@ -815,6 +822,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
# That one that we successfully accessed is now Active
|
||||
expect_activated += 1
|
||||
assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
|
||||
assert (
|
||||
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
|
||||
== expect_activated - 1
|
||||
)
|
||||
|
||||
# The ones we didn't touch are still in Attaching
|
||||
assert (
|
||||
@@ -834,6 +845,11 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
== n_tenants - expect_activated
|
||||
)
|
||||
|
||||
assert (
|
||||
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
|
||||
== expect_activated - 1
|
||||
)
|
||||
|
||||
# When we unblock logical size calculation, all tenants should proceed to active state via
|
||||
# the warmup route.
|
||||
pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
|
||||
@@ -848,3 +864,8 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
wait_until(10, 1, all_active)
|
||||
|
||||
assert (
|
||||
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
|
||||
)
|
||||
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
|
||||
|
||||
Reference in New Issue
Block a user