mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-12 07:00:36 +00:00
Add metrics for tenants state (#3448)
## Describe your changes Added a metric that allow to monitor tenants state ## Issue ticket number and link https://github.com/neondatabase/neon/issues/3161 ## Checklist before requesting a review - [X] I have performed a self-review of my code. - [X] I have added an e2e test for it. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
This commit is contained in:
@@ -29,6 +29,14 @@ pub enum TenantState {
|
||||
Broken,
|
||||
}
|
||||
|
||||
pub mod state {
|
||||
pub const LOADING: &str = "loading";
|
||||
pub const ATTACHING: &str = "attaching";
|
||||
pub const ACTIVE: &str = "active";
|
||||
pub const STOPPING: &str = "stopping";
|
||||
pub const BROKEN: &str = "broken";
|
||||
}
|
||||
|
||||
impl TenantState {
|
||||
pub fn has_in_progress_downloads(&self) -> bool {
|
||||
match self {
|
||||
@@ -39,6 +47,16 @@ impl TenantState {
|
||||
Self::Broken => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
TenantState::Loading => state::LOADING,
|
||||
TenantState::Attaching => state::ATTACHING,
|
||||
TenantState::Active => state::ACTIVE,
|
||||
TenantState::Stopping => state::STOPPING,
|
||||
TenantState::Broken => state::BROKEN,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A state of a timeline in pageserver's memory.
|
||||
|
||||
@@ -5,6 +5,7 @@ use metrics::{
|
||||
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::state;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// Prometheus histogram buckets (in seconds) that capture the majority of
|
||||
@@ -112,6 +113,24 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define current logical size metric")
|
||||
});
|
||||
|
||||
// Metrics collected on tenant states.
|
||||
const TENANT_STATE_OPTIONS: &[&str] = &[
|
||||
state::LOADING,
|
||||
state::ATTACHING,
|
||||
state::ACTIVE,
|
||||
state::STOPPING,
|
||||
state::BROKEN,
|
||||
];
|
||||
|
||||
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_states_count",
|
||||
"Count of tenants per state",
|
||||
&["tenant_id", "state"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
|
||||
// or in testing they estimate how much we would upload if we did.
|
||||
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
@@ -496,6 +515,10 @@ impl Drop for TimelineMetrics {
|
||||
|
||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]);
|
||||
let tid = tenant_id.to_string();
|
||||
for state in TENANT_STATE_OPTIONS {
|
||||
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
|
||||
}
|
||||
}
|
||||
|
||||
use futures::Future;
|
||||
|
||||
@@ -51,6 +51,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::metrics::TENANT_STATE_METRIC;
|
||||
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr;
|
||||
@@ -1736,7 +1737,33 @@ impl Tenant {
|
||||
tenant_id: TenantId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> Tenant {
|
||||
let (state, _) = watch::channel(state);
|
||||
let (state, mut rx) = watch::channel(state);
|
||||
|
||||
tokio::spawn(async move {
|
||||
let current_state = *rx.borrow_and_update();
|
||||
let tid = tenant_id.to_string();
|
||||
TENANT_STATE_METRIC
|
||||
.with_label_values(&[&tid, current_state.as_str()])
|
||||
.inc();
|
||||
loop {
|
||||
match rx.changed().await {
|
||||
Ok(()) => {
|
||||
let new_state = *rx.borrow();
|
||||
TENANT_STATE_METRIC
|
||||
.with_label_values(&[&tid, current_state.as_str()])
|
||||
.dec();
|
||||
TENANT_STATE_METRIC
|
||||
.with_label_values(&[&tid, new_state.as_str()])
|
||||
.inc();
|
||||
}
|
||||
Err(_sender_dropped_error) => {
|
||||
info!("Tenant dropped the state updates sender, quitting waiting for tenant state change");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Tenant {
|
||||
tenant_id,
|
||||
conf,
|
||||
|
||||
@@ -69,5 +69,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_wait_lsn_seconds_sum",
|
||||
"pageserver_created_persistent_files_total",
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"pageserver_tenant_states_count",
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
from contextlib import closing
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
@@ -259,7 +260,7 @@ def test_pageserver_with_empty_tenants(
|
||||
files_in_timelines_dir == 0
|
||||
), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
|
||||
|
||||
# Trigger timeline reinitialization after pageserver restart
|
||||
# Trigger timeline re-initialization after pageserver restart
|
||||
env.postgres.stop_all()
|
||||
env.pageserver.stop()
|
||||
|
||||
@@ -278,7 +279,51 @@ def test_pageserver_with_empty_tenants(
|
||||
broken_tenant["state"] == "Broken"
|
||||
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
|
||||
|
||||
broken_tenant_status = client.tenant_status(tenant_without_timelines_dir)
|
||||
assert (
|
||||
broken_tenant_status["state"] == "Broken"
|
||||
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
|
||||
|
||||
assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*")
|
||||
|
||||
[loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
|
||||
assert (
|
||||
loaded_tenant["state"] == "Active"
|
||||
), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"
|
||||
|
||||
loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir)
|
||||
assert (
|
||||
loaded_tenant_status["state"] == "Active"
|
||||
), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active"
|
||||
|
||||
time.sleep(1) # to allow metrics propagation
|
||||
|
||||
ps_metrics = parse_metrics(client.get_metrics(), "pageserver")
|
||||
broken_tenants_metric_filter = {
|
||||
"tenant_id": str(tenant_without_timelines_dir),
|
||||
"state": "broken",
|
||||
}
|
||||
active_tenants_metric_filter = {
|
||||
"tenant_id": str(tenant_with_empty_timelines_dir),
|
||||
"state": "active",
|
||||
}
|
||||
|
||||
tenant_active_count = int(
|
||||
ps_metrics.query_one(
|
||||
"pageserver_tenant_states_count", filter=active_tenants_metric_filter
|
||||
).value
|
||||
)
|
||||
|
||||
assert (
|
||||
tenant_active_count == 1
|
||||
), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active"
|
||||
|
||||
tenant_broken_count = int(
|
||||
ps_metrics.query_one(
|
||||
"pageserver_tenant_states_count", filter=broken_tenants_metric_filter
|
||||
).value
|
||||
)
|
||||
|
||||
assert (
|
||||
tenant_broken_count == 1
|
||||
), f"Tenant {tenant_without_timelines_dir} should have metric as broken"
|
||||
|
||||
Reference in New Issue
Block a user