mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-26 09:30:37 +00:00
Proxy metrics (#3290)
Implement proxy metrics collection. Only collect metric for outbound traffic. Add proxy CLI parameters: - metric-collection-endpoint - metric-collection-interval. Add test_proxy_metric_collection test. Move shared consumption metrics code to libs/consumption_metrics. Refactor the code.
This commit is contained in:
committed by
GitHub
parent
5c6a7a17cb
commit
2cbe84b78f
@@ -5,6 +5,12 @@ use std::sync::Arc;
|
||||
pub struct ProxyConfig {
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
pub auth_backend: auth::BackendType<'static, ()>,
|
||||
pub metric_collection_config: Option<MetricCollectionConfig>,
|
||||
}
|
||||
|
||||
pub struct MetricCollectionConfig {
|
||||
pub endpoint: reqwest::Url,
|
||||
pub interval: std::time::Duration,
|
||||
}
|
||||
|
||||
pub struct TlsConfig {
|
||||
|
||||
@@ -11,6 +11,7 @@ mod config;
|
||||
mod console;
|
||||
mod error;
|
||||
mod http;
|
||||
mod metrics;
|
||||
mod mgmt;
|
||||
mod parse;
|
||||
mod proxy;
|
||||
@@ -20,14 +21,14 @@ mod stream;
|
||||
mod url;
|
||||
mod waiters;
|
||||
|
||||
use ::metrics::set_build_info_metric;
|
||||
use anyhow::{bail, Context};
|
||||
use clap::{self, Arg};
|
||||
use config::ProxyConfig;
|
||||
use futures::FutureExt;
|
||||
use metrics::set_build_info_metric;
|
||||
use std::{borrow::Cow, future::Future, net::SocketAddr};
|
||||
use tokio::{net::TcpListener, task::JoinError};
|
||||
use tracing::info;
|
||||
use tracing::{info, info_span, Instrument};
|
||||
use utils::project_git_version;
|
||||
use utils::sentry_init::{init_sentry, release_name};
|
||||
|
||||
@@ -65,6 +66,22 @@ async fn main() -> anyhow::Result<()> {
|
||||
let mgmt_address: SocketAddr = arg_matches.get_one::<String>("mgmt").unwrap().parse()?;
|
||||
let http_address: SocketAddr = arg_matches.get_one::<String>("http").unwrap().parse()?;
|
||||
|
||||
let metric_collection_config = match
|
||||
(
|
||||
arg_matches.get_one::<String>("metric-collection-endpoint"),
|
||||
arg_matches.get_one::<String>("metric-collection-interval"),
|
||||
) {
|
||||
|
||||
(Some(endpoint), Some(interval)) => {
|
||||
Some(config::MetricCollectionConfig {
|
||||
endpoint: endpoint.parse()?,
|
||||
interval: humantime::parse_duration(interval)?,
|
||||
})
|
||||
}
|
||||
(None, None) => None,
|
||||
_ => bail!("either both or neither metric-collection-endpoint and metric-collection-interval must be specified"),
|
||||
};
|
||||
|
||||
let auth_backend = match arg_matches
|
||||
.get_one::<String>("auth-backend")
|
||||
.unwrap()
|
||||
@@ -95,6 +112,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig {
|
||||
tls_config,
|
||||
auth_backend,
|
||||
metric_collection_config,
|
||||
}));
|
||||
|
||||
info!("Version: {GIT_VERSION}");
|
||||
@@ -126,6 +144,21 @@ async fn main() -> anyhow::Result<()> {
|
||||
)));
|
||||
}
|
||||
|
||||
if let Some(metric_collection_config) = &config.metric_collection_config {
|
||||
let hostname = hostname::get()?
|
||||
.into_string()
|
||||
.map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?;
|
||||
|
||||
tasks.push(tokio::spawn(
|
||||
metrics::collect_metrics(
|
||||
&metric_collection_config.endpoint,
|
||||
metric_collection_config.interval,
|
||||
hostname,
|
||||
)
|
||||
.instrument(info_span!("collect_metrics")),
|
||||
));
|
||||
}
|
||||
|
||||
let tasks = tasks.into_iter().map(flatten_err);
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
@@ -199,6 +232,16 @@ fn cli() -> clap::Command {
|
||||
.alias("ssl-cert") // backwards compatibility
|
||||
.help("path to TLS cert for client postgres connections"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("metric-collection-endpoint")
|
||||
.long("metric-collection-endpoint")
|
||||
.help("metric collection HTTP endpoint"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("metric-collection-interval")
|
||||
.long("metric-collection-interval")
|
||||
.help("metric collection interval"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
196
proxy/src/metrics.rs
Normal file
196
proxy/src/metrics.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
//!
|
||||
//! Periodically collect proxy consumption metrics
|
||||
//! and push them to a HTTP endpoint.
|
||||
//!
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use serde::Serialize;
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
use tracing::{debug, error, log::info, trace};
|
||||
|
||||
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
|
||||
///
|
||||
/// Key that uniquely identifies the object, this metric describes.
|
||||
/// Currently, endpoint_id is enough, but this may change later,
|
||||
/// so keep it in a named struct.
|
||||
///
|
||||
/// Both the proxy and the ingestion endpoint will live in the same region (or cell)
|
||||
/// so while the project-id is unique across regions the whole pipeline will work correctly
|
||||
/// because we enrich the event with project_id in the control-plane endpoint.
|
||||
///
|
||||
#[derive(Eq, Hash, PartialEq, Serialize)]
|
||||
pub struct Ids {
|
||||
pub endpoint_id: String,
|
||||
}
|
||||
|
||||
pub async fn collect_metrics(
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
metric_collection_interval: Duration,
|
||||
hostname: String,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("collect_metrics has shut down");
|
||||
}
|
||||
|
||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||
|
||||
info!(
|
||||
"starting collect_metrics. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
);
|
||||
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = ticker.tick() => {
|
||||
|
||||
match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
|
||||
{
|
||||
Err(e) => {
|
||||
error!("Failed to send consumption metrics: {} ", e);
|
||||
},
|
||||
Ok(_) => { trace!("collect_metrics_iteration completed successfully") },
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
|
||||
let metrics = prometheus::default_registry().gather();
|
||||
|
||||
for m in metrics {
|
||||
if m.get_name() == "proxy_io_bytes_per_client" {
|
||||
for ms in m.get_metric() {
|
||||
let direction = ms
|
||||
.get_label()
|
||||
.iter()
|
||||
.find(|l| l.get_name() == "direction")
|
||||
.unwrap()
|
||||
.get_value();
|
||||
|
||||
// Only collect metric for outbound traffic
|
||||
if direction == "tx" {
|
||||
let endpoint_id = ms
|
||||
.get_label()
|
||||
.iter()
|
||||
.find(|l| l.get_name() == "endpoint_id")
|
||||
.unwrap()
|
||||
.get_value();
|
||||
let value = ms.get_counter().get_value() as u64;
|
||||
|
||||
debug!("endpoint_id:val - {}: {}", endpoint_id, value);
|
||||
current_metrics.push((
|
||||
Ids {
|
||||
endpoint_id: endpoint_id.to_string(),
|
||||
},
|
||||
(value, Utc::now()),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
current_metrics
|
||||
}
|
||||
|
||||
pub async fn collect_metrics_iteration(
|
||||
client: &reqwest::Client,
|
||||
cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
hostname: String,
|
||||
) -> anyhow::Result<()> {
|
||||
info!(
|
||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
);
|
||||
|
||||
let current_metrics = gather_proxy_io_bytes_per_client();
|
||||
|
||||
let metrics_to_send: Vec<Event<Ids>> = current_metrics
|
||||
.iter()
|
||||
.filter_map(|(curr_key, (curr_val, curr_time))| {
|
||||
let mut start_time = *curr_time;
|
||||
let mut value = *curr_val;
|
||||
|
||||
if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
|
||||
// Only send metrics updates if the metric has changed
|
||||
if curr_val - prev_val > 0 {
|
||||
value = curr_val - prev_val;
|
||||
start_time = *prev_time;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
Some(Event {
|
||||
kind: EventType::Incremental {
|
||||
start_time,
|
||||
stop_time: *curr_time,
|
||||
},
|
||||
metric: PROXY_IO_BYTES_PER_CLIENT,
|
||||
idempotency_key: idempotency_key(hostname.clone()),
|
||||
value,
|
||||
extra: Ids {
|
||||
endpoint_id: curr_key.endpoint_id.clone(),
|
||||
},
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
if metrics_to_send.is_empty() {
|
||||
trace!("no new metrics to send");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Send metrics.
|
||||
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
||||
for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
|
||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk })
|
||||
.expect("ProxyConsumptionMetric should not fail serialization");
|
||||
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&chunk_json)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let res = match res {
|
||||
Ok(x) => x,
|
||||
Err(err) => {
|
||||
error!("failed to send metrics: {:?}", err);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if res.status().is_success() {
|
||||
// update cached metrics after they were sent successfully
|
||||
for send_metric in chunk {
|
||||
let stop_time = match send_metric.kind {
|
||||
EventType::Incremental { stop_time, .. } => stop_time,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
cached_metrics
|
||||
.entry(Ids {
|
||||
endpoint_id: send_metric.extra.endpoint_id.clone(),
|
||||
})
|
||||
// update cached value (add delta) and time
|
||||
.and_modify(|e| {
|
||||
e.0 += send_metric.value;
|
||||
e.1 = stop_time
|
||||
})
|
||||
// cache new metric
|
||||
.or_insert((send_metric.value, stop_time));
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user