mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-22 23:50:39 +00:00
## Problem For computes running inside NeonVM, the actual compute image tag is buried inside the NeonVM spec, and we cannot get it as part of the standard k8s container metrics (they always report the image and tag of the NeonVM runner container). The workaround we currently use is to extract the info about running computes from the control plane database with SQL. It has several drawbacks: i) it's complicated, with a separate DB per region; ii) it's slow; iii) it's still an indirect source of info, i.e. the k8s state could be different from what the control plane expects. ## Summary of changes Add a new `compute_ctl_up` gauge metric with `build_tag` and `status` labels. It will help us both to get an overview of the tags/versions of all running computes and to break them down by current status (`empty`, `running`, `failed`, etc.). Later, we could introduce low-cardinality (no endpoint or compute ids) streaming aggregates for such metrics, so they will be blazingly fast and usable for monitoring the fleet-wide state.
93 lines
3.2 KiB
Rust
93 lines
3.2 KiB
Rust
use metrics::core::{AtomicF64, Collector, GenericGauge};
|
|
use metrics::proto::MetricFamily;
|
|
use metrics::{
|
|
IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter_vec,
|
|
register_int_gauge_vec, register_uint_gauge_vec,
|
|
};
|
|
use once_cell::sync::Lazy;
|
|
|
|
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
|
register_uint_gauge_vec!(
|
|
"compute_installed_extensions",
|
|
"Number of databases where the version of extension is installed",
|
|
&["extension_name", "version", "owned_by_superuser"]
|
|
)
|
|
.expect("failed to define a metric")
|
|
});
|
|
|
|
/// Control plane RPCs.
///
/// Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH,
/// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec,
/// so it is fair to call each of them an 'RPC' (Remote Procedure Call).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CPlaneRequestRPC {
    /// The `GetSpec` operation.
    GetSpec,
}

impl CPlaneRequestRPC {
    /// Returns the name of this RPC as a string.
    ///
    /// Every name is a string literal, so the returned slice is `'static` and
    /// is not tied to the lifetime of the enum value it was obtained from.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::GetSpec => "GetSpec",
        }
    }
}
|
|
|
|
/// Placeholder string `"unknown"` for an HTTP status.
///
/// NOTE(review): presumably used as the `http_status` label value when no real
/// status code is available — confirm against callers.
pub const UNKNOWN_HTTP_STATUS: &str = "unknown";
|
|
|
|
pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
|
register_int_counter_vec!(
|
|
"compute_ctl_cplane_requests_total",
|
|
"Total number of control plane requests made by compute_ctl by status",
|
|
&["rpc", "http_status"]
|
|
)
|
|
.expect("failed to define a metric")
|
|
});
|
|
|
|
/// Total number of failed database migrations. Per-compute, this is actually a boolean metric,
|
|
/// either empty or with a single value (1, migration_id) because we stop at the first failure.
|
|
/// Yet, the sum over the fleet will provide the total number of failures.
|
|
pub(crate) static DB_MIGRATION_FAILED: Lazy<IntCounterVec> = Lazy::new(|| {
|
|
register_int_counter_vec!(
|
|
"compute_ctl_db_migration_failed_total",
|
|
"Total number of failed database migrations",
|
|
&["migration_id"]
|
|
)
|
|
.expect("failed to define a metric")
|
|
});
|
|
|
|
pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
|
register_int_counter_vec!(
|
|
"compute_ctl_remote_ext_requests_total",
|
|
"Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
|
|
&["http_status", "filename"]
|
|
)
|
|
.expect("failed to define a metric")
|
|
});
|
|
|
|
// Size of audit log directory in bytes
|
|
pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
|
|
register_gauge!(
|
|
"compute_audit_log_dir_size",
|
|
"Size of audit log directory in bytes",
|
|
)
|
|
.expect("failed to define a metric")
|
|
});
|
|
|
|
// Report that `compute_ctl` is up and what's the current compute status.
|
|
pub(crate) static COMPUTE_CTL_UP: Lazy<IntGaugeVec> = Lazy::new(|| {
|
|
register_int_gauge_vec!(
|
|
"compute_ctl_up",
|
|
"Whether compute_ctl is running",
|
|
&["build_tag", "status"]
|
|
)
|
|
.expect("failed to define a metric")
|
|
});
|
|
|
|
pub fn collect() -> Vec<MetricFamily> {
|
|
let mut metrics = COMPUTE_CTL_UP.collect();
|
|
metrics.extend(INSTALLED_EXTENSIONS.collect());
|
|
metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
|
|
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
|
|
metrics.extend(DB_MIGRATION_FAILED.collect());
|
|
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
|
|
metrics
|
|
}
|