Proxy metrics (#3290)

Implement proxy metrics collection.
Only collect metrics for outbound traffic.

Add proxy CLI parameters:
- metric-collection-endpoint
- metric-collection-interval.

Add test_proxy_metric_collection test.

Move shared consumption metrics code to libs/consumption_metrics.
Refactor the code.
This commit is contained in:
Anastasia Lubennikova
2023-01-16 17:17:28 +02:00
committed by GitHub
parent 5c6a7a17cb
commit 2cbe84b78f
13 changed files with 586 additions and 214 deletions

View File

@@ -5,6 +5,12 @@ use std::sync::Arc;
/// Top-level proxy configuration.
///
/// `main` builds a single instance from the CLI arguments and leaks it
/// (`Box::leak`) to obtain a shared reference to it.
pub struct ProxyConfig {
/// TLS settings; presumably `None` means TLS is not configured — TODO confirm.
pub tls_config: Option<TlsConfig>,
/// Authentication backend selected via the `auth-backend` CLI argument.
pub auth_backend: auth::BackendType<'static, ()>,
/// Metric collection settings. `None` when neither `metric-collection-endpoint`
/// nor `metric-collection-interval` was given; the collection task is only
/// spawned when this is `Some`.
pub metric_collection_config: Option<MetricCollectionConfig>,
}
/// Settings for the periodic metric collection task.
///
/// Both values come from CLI arguments and must be specified together.
pub struct MetricCollectionConfig {
/// HTTP endpoint the collected metrics are sent to
/// (parsed from `metric-collection-endpoint`).
pub endpoint: reqwest::Url,
/// Period between two collection passes
/// (parsed from `metric-collection-interval` with `humantime::parse_duration`).
pub interval: std::time::Duration,
}
pub struct TlsConfig {

View File

@@ -11,6 +11,7 @@ mod config;
mod console;
mod error;
mod http;
mod metrics;
mod mgmt;
mod parse;
mod proxy;
@@ -20,14 +21,14 @@ mod stream;
mod url;
mod waiters;
use ::metrics::set_build_info_metric;
use anyhow::{bail, Context};
use clap::{self, Arg};
use config::ProxyConfig;
use futures::FutureExt;
use metrics::set_build_info_metric;
use std::{borrow::Cow, future::Future, net::SocketAddr};
use tokio::{net::TcpListener, task::JoinError};
use tracing::info;
use tracing::{info, info_span, Instrument};
use utils::project_git_version;
use utils::sentry_init::{init_sentry, release_name};
@@ -65,6 +66,22 @@ async fn main() -> anyhow::Result<()> {
let mgmt_address: SocketAddr = arg_matches.get_one::<String>("mgmt").unwrap().parse()?;
let http_address: SocketAddr = arg_matches.get_one::<String>("http").unwrap().parse()?;
let metric_collection_config = match
(
arg_matches.get_one::<String>("metric-collection-endpoint"),
arg_matches.get_one::<String>("metric-collection-interval"),
) {
(Some(endpoint), Some(interval)) => {
Some(config::MetricCollectionConfig {
endpoint: endpoint.parse()?,
interval: humantime::parse_duration(interval)?,
})
}
(None, None) => None,
_ => bail!("either both or neither metric-collection-endpoint and metric-collection-interval must be specified"),
};
let auth_backend = match arg_matches
.get_one::<String>("auth-backend")
.unwrap()
@@ -95,6 +112,7 @@ async fn main() -> anyhow::Result<()> {
let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig {
tls_config,
auth_backend,
metric_collection_config,
}));
info!("Version: {GIT_VERSION}");
@@ -126,6 +144,21 @@ async fn main() -> anyhow::Result<()> {
)));
}
if let Some(metric_collection_config) = &config.metric_collection_config {
let hostname = hostname::get()?
.into_string()
.map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?;
tasks.push(tokio::spawn(
metrics::collect_metrics(
&metric_collection_config.endpoint,
metric_collection_config.interval,
hostname,
)
.instrument(info_span!("collect_metrics")),
));
}
let tasks = tasks.into_iter().map(flatten_err);
set_build_info_metric(GIT_VERSION);
@@ -199,6 +232,16 @@ fn cli() -> clap::Command {
.alias("ssl-cert") // backwards compatibility
.help("path to TLS cert for client postgres connections"),
)
.arg(
Arg::new("metric-collection-endpoint")
.long("metric-collection-endpoint")
.help("metric collection HTTP endpoint"),
)
.arg(
Arg::new("metric-collection-interval")
.long("metric-collection-interval")
.help("metric collection interval"),
)
}
#[test]

196
proxy/src/metrics.rs Normal file
View File

@@ -0,0 +1,196 @@
//!
//! Periodically collect proxy consumption metrics
//! and push them to an HTTP endpoint.
//!
use chrono::{DateTime, Utc};
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use serde::Serialize;
use std::{collections::HashMap, time::Duration};
use tracing::{debug, error, log::info, trace};
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
///
/// Key that uniquely identifies the object this metric describes.
/// Currently, endpoint_id is enough, but this may change later,
/// so keep it in a named struct.
///
/// Both the proxy and the ingestion endpoint will live in the same region (or cell),
/// so, while the project-id is unique across regions, the whole pipeline will work correctly
/// because we enrich the event with project_id in the control-plane endpoint.
///
/// `Eq` + `Hash` let it serve as the key of the cache of already reported values;
/// `Serialize` lets it be attached to every event as its `extra` payload.
///
#[derive(Eq, Hash, PartialEq, Serialize)]
pub struct Ids {
/// Value of the `endpoint_id` label of the gathered metric sample.
pub endpoint_id: String,
}
pub async fn collect_metrics(
metric_collection_endpoint: &reqwest::Url,
metric_collection_interval: Duration,
hostname: String,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("collect_metrics has shut down");
}
let mut ticker = tokio::time::interval(metric_collection_interval);
info!(
"starting collect_metrics. metric_collection_endpoint: {}",
metric_collection_endpoint
);
// define client here to reuse it for all requests
let client = reqwest::Client::new();
let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
loop {
tokio::select! {
_ = ticker.tick() => {
match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
{
Err(e) => {
error!("Failed to send consumption metrics: {} ", e);
},
Ok(_) => { trace!("collect_metrics_iteration completed successfully") },
}
}
}
}
}
pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
let metrics = prometheus::default_registry().gather();
for m in metrics {
if m.get_name() == "proxy_io_bytes_per_client" {
for ms in m.get_metric() {
let direction = ms
.get_label()
.iter()
.find(|l| l.get_name() == "direction")
.unwrap()
.get_value();
// Only collect metric for outbound traffic
if direction == "tx" {
let endpoint_id = ms
.get_label()
.iter()
.find(|l| l.get_name() == "endpoint_id")
.unwrap()
.get_value();
let value = ms.get_counter().get_value() as u64;
debug!("endpoint_id:val - {}: {}", endpoint_id, value);
current_metrics.push((
Ids {
endpoint_id: endpoint_id.to_string(),
},
(value, Utc::now()),
));
}
}
}
}
current_metrics
}
pub async fn collect_metrics_iteration(
client: &reqwest::Client,
cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
metric_collection_endpoint: &reqwest::Url,
hostname: String,
) -> anyhow::Result<()> {
info!(
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
metric_collection_endpoint
);
let current_metrics = gather_proxy_io_bytes_per_client();
let metrics_to_send: Vec<Event<Ids>> = current_metrics
.iter()
.filter_map(|(curr_key, (curr_val, curr_time))| {
let mut start_time = *curr_time;
let mut value = *curr_val;
if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
// Only send metrics updates if the metric has changed
if curr_val - prev_val > 0 {
value = curr_val - prev_val;
start_time = *prev_time;
} else {
return None;
}
};
Some(Event {
kind: EventType::Incremental {
start_time,
stop_time: *curr_time,
},
metric: PROXY_IO_BYTES_PER_CLIENT,
idempotency_key: idempotency_key(hostname.clone()),
value,
extra: Ids {
endpoint_id: curr_key.endpoint_id.clone(),
},
})
})
.collect();
if metrics_to_send.is_empty() {
trace!("no new metrics to send");
return Ok(());
}
// Send metrics.
// Split into chunks of 1000 metrics to avoid exceeding the max request size
for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk })
.expect("ProxyConsumptionMetric should not fail serialization");
let res = client
.post(metric_collection_endpoint.clone())
.json(&chunk_json)
.send()
.await;
let res = match res {
Ok(x) => x,
Err(err) => {
error!("failed to send metrics: {:?}", err);
continue;
}
};
if res.status().is_success() {
// update cached metrics after they were sent successfully
for send_metric in chunk {
let stop_time = match send_metric.kind {
EventType::Incremental { stop_time, .. } => stop_time,
_ => unreachable!(),
};
cached_metrics
.entry(Ids {
endpoint_id: send_metric.extra.endpoint_id.clone(),
})
// update cached value (add delta) and time
.and_modify(|e| {
e.0 += send_metric.value;
e.1 = stop_time
})
// cache new metric
.or_insert((send_metric.value, stop_time));
}
} else {
error!("metrics endpoint refused the sent metrics: {:?}", res);
}
}
Ok(())
}