mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-15 20:20:38 +00:00
Add metrics for tenants state (#3448)
## Describe your changes Added a metric that allows monitoring tenant states ## Issue ticket number and link https://github.com/neondatabase/neon/issues/3161 ## Checklist before requesting a review - [X] I have performed a self-review of my code. - [X] I have added an e2e test for it. - [ ] Do we need to implement analytics? If so, did you add the relevant metrics to the dashboard? - [ ] If this PR requires a public announcement, mark it with the /release-notes label and add several sentences in this section.
This commit is contained in:
@@ -29,6 +29,14 @@ pub enum TenantState {
|
|||||||
Broken,
|
Broken,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Lowercase string labels, one per `TenantState` variant.
///
/// `TenantState::as_str` returns these values, and the pageserver metrics code
/// uses them as the values of the `state` label.
pub mod state {
|
||||||
|
pub const LOADING: &str = "loading";
|
||||||
|
pub const ATTACHING: &str = "attaching";
|
||||||
|
pub const ACTIVE: &str = "active";
|
||||||
|
pub const STOPPING: &str = "stopping";
|
||||||
|
pub const BROKEN: &str = "broken";
|
||||||
|
}
|
||||||
|
|
||||||
impl TenantState {
|
impl TenantState {
|
||||||
pub fn has_in_progress_downloads(&self) -> bool {
|
pub fn has_in_progress_downloads(&self) -> bool {
|
||||||
match self {
|
match self {
|
||||||
@@ -39,6 +47,16 @@ impl TenantState {
|
|||||||
Self::Broken => false,
|
Self::Broken => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn as_str(&self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
TenantState::Loading => state::LOADING,
|
||||||
|
TenantState::Attaching => state::ATTACHING,
|
||||||
|
TenantState::Active => state::ACTIVE,
|
||||||
|
TenantState::Stopping => state::STOPPING,
|
||||||
|
TenantState::Broken => state::BROKEN,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A state of a timeline in pageserver's memory.
|
/// A state of a timeline in pageserver's memory.
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use metrics::{
|
|||||||
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||||
};
|
};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
|
use pageserver_api::models::state;
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
|
|
||||||
/// Prometheus histogram buckets (in seconds) that capture the majority of
|
/// Prometheus histogram buckets (in seconds) that capture the majority of
|
||||||
@@ -112,6 +113,24 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
|||||||
.expect("failed to define current logical size metric")
|
.expect("failed to define current logical size metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Metrics collected on tenant states.
|
||||||
|
/// Every value the `state` label of the tenant state metric can take.
///
/// `remove_tenant_metrics` iterates over this list to drop one series per state,
/// so it has to list every constant from `pageserver_api::models::state`.
const TENANT_STATE_OPTIONS: &[&str] = &[
|
||||||
|
state::LOADING,
|
||||||
|
state::ATTACHING,
|
||||||
|
state::ACTIVE,
|
||||||
|
state::STOPPING,
|
||||||
|
state::BROKEN,
|
||||||
|
];
|
||||||
|
|
||||||
|
/// Unsigned gauge vector registered as `pageserver_tenant_states_count`.
///
/// Series are keyed by the labels `tenant_id` and `state`; because `tenant_id`
/// is a label, each series describes a single tenant in a single state.
///
/// The gauge is registered lazily on first access, and that first access panics
/// if registration fails.
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||||
|
register_uint_gauge_vec!(
|
||||||
|
"pageserver_tenant_states_count",
|
||||||
|
"Count of tenants per state",
|
||||||
|
&["tenant_id", "state"]
|
||||||
|
)
|
||||||
|
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||||
|
});
|
||||||
|
|
||||||
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
|
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
|
||||||
// or in testing they estimate how much we would upload if we did.
|
// or in testing they estimate how much we would upload if we did.
|
||||||
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
|
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
@@ -496,6 +515,10 @@ impl Drop for TimelineMetrics {
|
|||||||
|
|
||||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||||
let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]);
|
let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]);
|
||||||
|
let tid = tenant_id.to_string();
|
||||||
|
for state in TENANT_STATE_OPTIONS {
|
||||||
|
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
use futures::Future;
|
use futures::Future;
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ use crate::config::PageServerConf;
|
|||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
use crate::import_datadir;
|
use crate::import_datadir;
|
||||||
use crate::is_uninit_mark;
|
use crate::is_uninit_mark;
|
||||||
|
use crate::metrics::TENANT_STATE_METRIC;
|
||||||
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
|
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
|
||||||
use crate::repository::GcResult;
|
use crate::repository::GcResult;
|
||||||
use crate::task_mgr;
|
use crate::task_mgr;
|
||||||
@@ -1736,7 +1737,33 @@ impl Tenant {
|
|||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
) -> Tenant {
|
) -> Tenant {
|
||||||
let (state, _) = watch::channel(state);
|
let (state, mut rx) = watch::channel(state);
|
||||||
|
|
||||||
|
// Background task that mirrors this tenant's state into TENANT_STATE_METRIC:
// exactly one `state` series per tenant should hold 1 at any time.
tokio::spawn(async move {
    // `current_state` names the series that currently holds this tenant's count.
    // It must advance on every transition; otherwise later transitions would
    // decrement the initial state's gauge again (underflowing the unsigned gauge)
    // and leave the real previous state stuck at 1.
    let mut current_state = *rx.borrow_and_update();
    let tid = tenant_id.to_string();
    TENANT_STATE_METRIC
        .with_label_values(&[&tid, current_state.as_str()])
        .inc();
    loop {
        match rx.changed().await {
            Ok(()) => {
                let new_state = *rx.borrow();
                // Move the count from the previous state's series to the new one.
                TENANT_STATE_METRIC
                    .with_label_values(&[&tid, current_state.as_str()])
                    .dec();
                TENANT_STATE_METRIC
                    .with_label_values(&[&tid, new_state.as_str()])
                    .inc();
                current_state = new_state;
            }
            // The sender is gone, so no further state changes can arrive.
            Err(_sender_dropped_error) => {
                info!("Tenant dropped the state updates sender, quitting waiting for tenant state change");
                return;
            }
        }
    }
});
|
||||||
|
|
||||||
Tenant {
|
Tenant {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
conf,
|
conf,
|
||||||
|
|||||||
@@ -69,5 +69,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
|||||||
"pageserver_wait_lsn_seconds_sum",
|
"pageserver_wait_lsn_seconds_sum",
|
||||||
"pageserver_created_persistent_files_total",
|
"pageserver_created_persistent_files_total",
|
||||||
"pageserver_written_persistent_bytes_total",
|
"pageserver_written_persistent_bytes_total",
|
||||||
|
"pageserver_tenant_states_count",
|
||||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
import time
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -259,7 +260,7 @@ def test_pageserver_with_empty_tenants(
|
|||||||
files_in_timelines_dir == 0
|
files_in_timelines_dir == 0
|
||||||
), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
|
), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
|
||||||
|
|
||||||
# Trigger timeline reinitialization after pageserver restart
|
# Trigger timeline re-initialization after pageserver restart
|
||||||
env.postgres.stop_all()
|
env.postgres.stop_all()
|
||||||
env.pageserver.stop()
|
env.pageserver.stop()
|
||||||
|
|
||||||
@@ -278,7 +279,51 @@ def test_pageserver_with_empty_tenants(
|
|||||||
broken_tenant["state"] == "Broken"
|
broken_tenant["state"] == "Broken"
|
||||||
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
|
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
|
||||||
|
|
||||||
|
broken_tenant_status = client.tenant_status(tenant_without_timelines_dir)
|
||||||
|
assert (
|
||||||
|
broken_tenant_status["state"] == "Broken"
|
||||||
|
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
|
||||||
|
|
||||||
|
assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*")
|
||||||
|
|
||||||
[loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
|
[loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
|
||||||
# The message must be an f-string, otherwise the tenant id placeholder is printed literally.
assert (
    loaded_tenant["state"] == "Active"
), f"Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"
|
||||||
|
|
||||||
|
loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir)
|
||||||
|
# This tenant has an empty timelines/ directory (it is not the one missing the directory).
assert (
    loaded_tenant_status["state"] == "Active"
), f"Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active"
|
||||||
|
|
||||||
|
time.sleep(1) # to allow metrics propagation
|
||||||
|
|
||||||
|
ps_metrics = parse_metrics(client.get_metrics(), "pageserver")
|
||||||
|
broken_tenants_metric_filter = {
|
||||||
|
"tenant_id": str(tenant_without_timelines_dir),
|
||||||
|
"state": "broken",
|
||||||
|
}
|
||||||
|
active_tenants_metric_filter = {
|
||||||
|
"tenant_id": str(tenant_with_empty_timelines_dir),
|
||||||
|
"state": "active",
|
||||||
|
}
|
||||||
|
|
||||||
|
tenant_active_count = int(
|
||||||
|
ps_metrics.query_one(
|
||||||
|
"pageserver_tenant_states_count", filter=active_tenants_metric_filter
|
||||||
|
).value
|
||||||
|
)
|
||||||
|
|
||||||
|
assert (
|
||||||
|
tenant_active_count == 1
|
||||||
|
), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active"
|
||||||
|
|
||||||
|
tenant_broken_count = int(
|
||||||
|
ps_metrics.query_one(
|
||||||
|
"pageserver_tenant_states_count", filter=broken_tenants_metric_filter
|
||||||
|
).value
|
||||||
|
)
|
||||||
|
|
||||||
|
assert (
|
||||||
|
tenant_broken_count == 1
|
||||||
|
), f"Tenant {tenant_without_timelines_dir} should have metric as broken"
|
||||||
|
|||||||
Reference in New Issue
Block a user