Add metrics for tenants state (#3448)

## Describe your changes
Added a metric that allow to monitor tenants state 
## Issue ticket number and link
https://github.com/neondatabase/neon/issues/3161

## Checklist before requesting a review
- [X] I have performed a self-review of my code.
- [X] I have added an e2e test for it.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
This commit is contained in:
Shany Pozin
2023-01-29 14:04:06 +02:00
committed by GitHub
parent 67d418e91c
commit ddb9c2fe94
5 changed files with 116 additions and 2 deletions

View File

@@ -29,6 +29,14 @@ pub enum TenantState {
Broken,
}
pub mod state {
pub const LOADING: &str = "loading";
pub const ATTACHING: &str = "attaching";
pub const ACTIVE: &str = "active";
pub const STOPPING: &str = "stopping";
pub const BROKEN: &str = "broken";
}
impl TenantState {
pub fn has_in_progress_downloads(&self) -> bool {
match self {
@@ -39,6 +47,16 @@ impl TenantState {
Self::Broken => false,
}
}
pub fn as_str(&self) -> &'static str {
match self {
TenantState::Loading => state::LOADING,
TenantState::Attaching => state::ATTACHING,
TenantState::Active => state::ACTIVE,
TenantState::Stopping => state::STOPPING,
TenantState::Broken => state::BROKEN,
}
}
}
/// A state of a timeline in pageserver's memory.

View File

@@ -5,6 +5,7 @@ use metrics::{
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::models::state;
use utils::id::{TenantId, TimelineId};
/// Prometheus histogram buckets (in seconds) that capture the majority of
@@ -112,6 +113,24 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric")
});
// Metrics collected on tenant states.
const TENANT_STATE_OPTIONS: &[&str] = &[
state::LOADING,
state::ATTACHING,
state::ACTIVE,
state::STOPPING,
state::BROKEN,
];
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_states_count",
"Count of tenants per state",
&["tenant_id", "state"]
)
.expect("Failed to register pageserver_tenant_states_count metric")
});
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -496,6 +515,10 @@ impl Drop for TimelineMetrics {
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]);
let tid = tenant_id.to_string();
for state in TENANT_STATE_OPTIONS {
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
}
}
use futures::Future;

View File

@@ -51,6 +51,7 @@ use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_STATE_METRIC;
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
use crate::repository::GcResult;
use crate::task_mgr;
@@ -1736,7 +1737,33 @@ impl Tenant {
tenant_id: TenantId,
remote_storage: Option<GenericRemoteStorage>,
) -> Tenant {
let (state, _) = watch::channel(state);
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
let current_state = *rx.borrow_and_update();
let tid = tenant_id.to_string();
TENANT_STATE_METRIC
.with_label_values(&[&tid, current_state.as_str()])
.inc();
loop {
match rx.changed().await {
Ok(()) => {
let new_state = *rx.borrow();
TENANT_STATE_METRIC
.with_label_values(&[&tid, current_state.as_str()])
.dec();
TENANT_STATE_METRIC
.with_label_values(&[&tid, new_state.as_str()])
.inc();
}
Err(_sender_dropped_error) => {
info!("Tenant dropped the state updates sender, quitting waiting for tenant state change");
return;
}
}
}
});
Tenant {
tenant_id,
conf,

View File

@@ -69,5 +69,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_wait_lsn_seconds_sum",
"pageserver_created_persistent_files_total",
"pageserver_written_persistent_bytes_total",
"pageserver_tenant_states_count",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
)

View File

@@ -1,5 +1,6 @@
import os
import shutil
import time
from contextlib import closing
from datetime import datetime
from pathlib import Path
@@ -259,7 +260,7 @@ def test_pageserver_with_empty_tenants(
files_in_timelines_dir == 0
), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
# Trigger timeline reinitialization after pageserver restart
# Trigger timeline re-initialization after pageserver restart
env.postgres.stop_all()
env.pageserver.stop()
@@ -278,7 +279,51 @@ def test_pageserver_with_empty_tenants(
broken_tenant["state"] == "Broken"
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
broken_tenant_status = client.tenant_status(tenant_without_timelines_dir)
assert (
broken_tenant_status["state"] == "Broken"
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*")
[loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
assert (
loaded_tenant["state"] == "Active"
), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"
loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir)
assert (
loaded_tenant_status["state"] == "Active"
), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active"
time.sleep(1) # to allow metrics propagation
ps_metrics = parse_metrics(client.get_metrics(), "pageserver")
broken_tenants_metric_filter = {
"tenant_id": str(tenant_without_timelines_dir),
"state": "broken",
}
active_tenants_metric_filter = {
"tenant_id": str(tenant_with_empty_timelines_dir),
"state": "active",
}
tenant_active_count = int(
ps_metrics.query_one(
"pageserver_tenant_states_count", filter=active_tenants_metric_filter
).value
)
assert (
tenant_active_count == 1
), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active"
tenant_broken_count = int(
ps_metrics.query_one(
"pageserver_tenant_states_count", filter=broken_tenants_metric_filter
).value
)
assert (
tenant_broken_count == 1
), f"Tenant {tenant_without_timelines_dir} should have metric as broken"