Add metrics for tenants state (#3448)

## Describe your changes
Added a metric that allows monitoring the state of each tenant.
## Issue ticket number and link
https://github.com/neondatabase/neon/issues/3161

## Checklist before requesting a review
- [X] I have performed a self-review of my code.
- [X] I have added an e2e test for it.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
This commit is contained in:
Shany Pozin
2023-01-29 14:04:06 +02:00
committed by GitHub
parent 67d418e91c
commit ddb9c2fe94
5 changed files with 116 additions and 2 deletions

View File

@@ -29,6 +29,14 @@ pub enum TenantState {
Broken, Broken,
} }
/// String names for the tenant states.
///
/// `TenantState::as_str` returns one of these constants for each variant, and
/// the pageserver metrics code uses the same constants as the values of the
/// `state` label, so both sides share a single spelling.
pub mod state {
/// Name returned for `TenantState::Loading`.
pub const LOADING: &str = "loading";
/// Name returned for `TenantState::Attaching`.
pub const ATTACHING: &str = "attaching";
/// Name returned for `TenantState::Active`.
pub const ACTIVE: &str = "active";
/// Name returned for `TenantState::Stopping`.
pub const STOPPING: &str = "stopping";
/// Name returned for `TenantState::Broken`.
pub const BROKEN: &str = "broken";
}
impl TenantState { impl TenantState {
pub fn has_in_progress_downloads(&self) -> bool { pub fn has_in_progress_downloads(&self) -> bool {
match self { match self {
@@ -39,6 +47,16 @@ impl TenantState {
Self::Broken => false, Self::Broken => false,
} }
} }
pub fn as_str(&self) -> &'static str {
match self {
TenantState::Loading => state::LOADING,
TenantState::Attaching => state::ATTACHING,
TenantState::Active => state::ACTIVE,
TenantState::Stopping => state::STOPPING,
TenantState::Broken => state::BROKEN,
}
}
} }
/// A state of a timeline in pageserver's memory. /// A state of a timeline in pageserver's memory.

View File

@@ -5,6 +5,7 @@ use metrics::{
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
}; };
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use pageserver_api::models::state;
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
/// Prometheus histogram buckets (in seconds) that capture the majority of /// Prometheus histogram buckets (in seconds) that capture the majority of
@@ -112,6 +113,24 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric") .expect("failed to define current logical size metric")
}); });
// Metrics collected on tenant states.

/// Every value the `state` label of `TENANT_STATE_METRIC` can take.
///
/// `remove_tenant_metrics` iterates over this list to drop all of a tenant's
/// state series, so it must contain every string that `TenantState::as_str`
/// can return; a state missing here would leave a stale series behind.
const TENANT_STATE_OPTIONS: &[&str] = &[
state::LOADING,
state::ATTACHING,
state::ACTIVE,
state::STOPPING,
state::BROKEN,
];
/// Gauge vector exported as `pageserver_tenant_states_count`.
///
/// Series are keyed by the `tenant_id` and `state` labels, so each series
/// describes a single tenant in a single state. Registration happens lazily on
/// first access and panics if the metric cannot be registered.
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_states_count",
"Count of tenants per state",
&["tenant_id", "state"]
)
.expect("Failed to register pageserver_tenant_states_count metric")
});
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did. // or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| { static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -496,6 +515,10 @@ impl Drop for TimelineMetrics {
/// Removes the metric series that are labelled with `tenant_id`.
///
/// This drops the tenant's "gc" `STORAGE_TIME` series and one
/// `TENANT_STATE_METRIC` series per entry in `TENANT_STATE_OPTIONS`.
/// Removal is best-effort: a series that was never created is ignored.
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
    // Stringify the id once; every label lookup below uses the same value.
    let tid = tenant_id.to_string();

    let _ = STORAGE_TIME.remove_label_values(&["gc", &tid, "-"]);

    for state in TENANT_STATE_OPTIONS {
        let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
    }
}
use futures::Future; use futures::Future;

View File

@@ -51,6 +51,7 @@ use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext}; use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir; use crate::import_datadir;
use crate::is_uninit_mark; use crate::is_uninit_mark;
use crate::metrics::TENANT_STATE_METRIC;
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
use crate::repository::GcResult; use crate::repository::GcResult;
use crate::task_mgr; use crate::task_mgr;
@@ -1736,7 +1737,33 @@ impl Tenant {
tenant_id: TenantId, tenant_id: TenantId,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
) -> Tenant { ) -> Tenant {
let (state, _) = watch::channel(state); let (state, mut rx) = watch::channel(state);
// Background task that mirrors this tenant's state into TENANT_STATE_METRIC.
// It counts the tenant under its initial state, then on every state change
// moves that count from the previous state's series to the new state's
// series. The task ends once the sender half of the watch channel is dropped.
tokio::spawn(async move {
    let tid = tenant_id.to_string();
    // The state whose series currently holds this tenant's count. It must be
    // advanced on every transition; otherwise each change would decrement the
    // initial state's gauge again instead of the state actually being left.
    let mut current_state = *rx.borrow_and_update();
    TENANT_STATE_METRIC
        .with_label_values(&[&tid, current_state.as_str()])
        .inc();
    loop {
        match rx.changed().await {
            Ok(()) => {
                let new_state = *rx.borrow();
                TENANT_STATE_METRIC
                    .with_label_values(&[&tid, current_state.as_str()])
                    .dec();
                TENANT_STATE_METRIC
                    .with_label_values(&[&tid, new_state.as_str()])
                    .inc();
                // Remember where the count lives now for the next transition.
                current_state = new_state;
            }
            Err(_sender_dropped_error) => {
                info!("Tenant dropped the state updates sender, quitting waiting for tenant state change");
                return;
            }
        }
    }
});
Tenant { Tenant {
tenant_id, tenant_id,
conf, conf,

View File

@@ -69,5 +69,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_wait_lsn_seconds_sum", "pageserver_wait_lsn_seconds_sum",
"pageserver_created_persistent_files_total", "pageserver_created_persistent_files_total",
"pageserver_written_persistent_bytes_total", "pageserver_written_persistent_bytes_total",
"pageserver_tenant_states_count",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
) )

View File

@@ -1,5 +1,6 @@
import os import os
import shutil import shutil
import time
from contextlib import closing from contextlib import closing
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
@@ -259,7 +260,7 @@ def test_pageserver_with_empty_tenants(
files_in_timelines_dir == 0 files_in_timelines_dir == 0
), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
# Trigger timeline reinitialization after pageserver restart # Trigger timeline re-initialization after pageserver restart
env.postgres.stop_all() env.postgres.stop_all()
env.pageserver.stop() env.pageserver.stop()
@@ -278,7 +279,51 @@ def test_pageserver_with_empty_tenants(
broken_tenant["state"] == "Broken" broken_tenant["state"] == "Broken"
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
broken_tenant_status = client.tenant_status(tenant_without_timelines_dir)
assert (
broken_tenant_status["state"] == "Broken"
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*")
[loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
assert ( assert (
loaded_tenant["state"] == "Active" loaded_tenant["state"] == "Active"
), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation" ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"
# The tenant whose timelines/ directory exists but is empty must also report
# Active through the per-tenant status endpoint.
loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir)
assert (
    loaded_tenant_status["state"] == "Active"
), f"Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active"
time.sleep(1) # to allow metrics propagation
ps_metrics = parse_metrics(client.get_metrics(), "pageserver")
broken_tenants_metric_filter = {
"tenant_id": str(tenant_without_timelines_dir),
"state": "broken",
}
active_tenants_metric_filter = {
"tenant_id": str(tenant_with_empty_timelines_dir),
"state": "active",
}
tenant_active_count = int(
ps_metrics.query_one(
"pageserver_tenant_states_count", filter=active_tenants_metric_filter
).value
)
assert (
tenant_active_count == 1
), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active"
tenant_broken_count = int(
ps_metrics.query_one(
"pageserver_tenant_states_count", filter=broken_tenants_metric_filter
).value
)
assert (
tenant_broken_count == 1
), f"Tenant {tenant_without_timelines_dir} should have metric as broken"