mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 14:00:38 +00:00
proxy: replace prometheus with measured (#6717)
## Problem My benchmarks show that prometheus is not very good. https://github.com/conradludgate/measured We're already using it in storage_controller and it seems to be working well. ## Summary of changes Replace prometheus with my new measured crate in proxy only. Apologies for the large diff. I tried to keep it as minimal as I could. The label types add a bit of boiler plate (but reduce the chance we mistype the labels), and some of our custom metrics like CounterPair and HLL needed to be rewritten.
This commit is contained in:
13
Cargo.lock
generated
13
Cargo.lock
generated
@@ -2932,9 +2932,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "measured"
|
||||
version = "0.0.20"
|
||||
version = "0.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452"
|
||||
checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"crossbeam-utils",
|
||||
@@ -2950,9 +2950,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "measured-derive"
|
||||
version = "0.0.20"
|
||||
version = "0.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def"
|
||||
checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"proc-macro2",
|
||||
@@ -2962,9 +2962,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "measured-process"
|
||||
version = "0.0.20"
|
||||
version = "0.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2"
|
||||
checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"measured",
|
||||
@@ -4322,6 +4322,7 @@ dependencies = [
|
||||
"itertools",
|
||||
"lasso",
|
||||
"md5",
|
||||
"measured",
|
||||
"metrics",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
|
||||
@@ -107,8 +107,8 @@ lasso = "0.7"
|
||||
leaky-bucket = "1.0.1"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
measured = { version = "0.0.20", features=["lasso"] }
|
||||
measured-process = { version = "0.0.20" }
|
||||
measured = { version = "0.0.21", features=["lasso"] }
|
||||
measured-process = { version = "0.0.21" }
|
||||
memoffset = "0.8"
|
||||
native-tls = "0.2"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
|
||||
@@ -7,14 +7,19 @@
|
||||
//! use significantly less memory than this, but can only approximate the cardinality.
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
|
||||
sync::{atomic::AtomicU8, Arc, RwLock},
|
||||
hash::{BuildHasher, BuildHasherDefault, Hash},
|
||||
sync::atomic::AtomicU8,
|
||||
};
|
||||
|
||||
use prometheus::{
|
||||
core::{self, Describer},
|
||||
proto, Opts,
|
||||
use measured::{
|
||||
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
||||
metric::{
|
||||
group::{Encoding, MetricValue},
|
||||
name::MetricNameEncoder,
|
||||
Metric, MetricType, MetricVec,
|
||||
},
|
||||
text::TextEncoder,
|
||||
LabelGroup,
|
||||
};
|
||||
use twox_hash::xxh3;
|
||||
|
||||
@@ -93,203 +98,25 @@ macro_rules! register_hll {
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLogVec<const N: usize> {
|
||||
core: Arc<HyperLogLogVecCore<N>>,
|
||||
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
|
||||
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
|
||||
|
||||
pub struct HyperLogLogState<const N: usize> {
|
||||
shards: [AtomicU8; N],
|
||||
}
|
||||
|
||||
struct HyperLogLogVecCore<const N: usize> {
|
||||
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
|
||||
pub desc: core::Desc,
|
||||
pub opts: Opts,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
for child in self.core.children.read().unwrap().values() {
|
||||
child.core.collect_into(&mut metrics);
|
||||
}
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
impl<const N: usize> Default for HyperLogLogState<N> {
|
||||
fn default() -> Self {
|
||||
#[allow(clippy::declare_interior_mutable_const)]
|
||||
const ZERO: AtomicU8 = AtomicU8::new(0);
|
||||
Self { shards: [ZERO; N] }
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogVec<N> {
|
||||
/// Create a new [`HyperLogLogVec`] based on the provided
|
||||
/// [`Opts`] and partitioned by the given label names. At least one label name must be
|
||||
/// provided.
|
||||
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
|
||||
let opts = opts.variable_labels(variable_names);
|
||||
|
||||
let desc = opts.describe()?;
|
||||
let v = HyperLogLogVecCore {
|
||||
children: RwLock::new(HashMap::default()),
|
||||
desc,
|
||||
opts,
|
||||
};
|
||||
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
|
||||
/// of label values (same order as the VariableLabels in Desc). If that combination of
|
||||
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
|
||||
///
|
||||
/// An error is returned if the number of label values is not the same as the
|
||||
/// number of VariableLabels in Desc.
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
self.core.get_metric_with_label_values(vals)
|
||||
}
|
||||
|
||||
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
|
||||
/// occurs.
|
||||
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
impl<const N: usize> MetricType for HyperLogLogState<N> {
|
||||
type Metadata = ();
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogVecCore<N> {
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let h = self.hash_label_values(vals)?;
|
||||
|
||||
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
self.get_or_create_metric(h, vals)
|
||||
}
|
||||
|
||||
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
|
||||
if vals.len() != self.desc.variable_labels.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: self.desc.variable_labels.len(),
|
||||
got: vals.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let mut h = xxh3::Hash64::default();
|
||||
for val in vals {
|
||||
h.write(val.as_bytes());
|
||||
}
|
||||
|
||||
Ok(h.finish())
|
||||
}
|
||||
|
||||
fn get_or_create_metric(
|
||||
&self,
|
||||
hash: u64,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let mut children = self.children.write().unwrap();
|
||||
// Check exist first.
|
||||
if let Some(metric) = children.get(&hash).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
|
||||
children.insert(hash, metric.clone());
|
||||
Ok(metric)
|
||||
}
|
||||
}
|
||||
|
||||
/// HLL is a probabilistic cardinality measure.
|
||||
///
|
||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// If you want an estimate over time, you can use the following query:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (
|
||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
||||
/// ) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
||||
///
|
||||
/// ```promql
|
||||
/// # LinearCounting(m, V) = m log (m / V)
|
||||
/// shards_count * ln(shards_count /
|
||||
/// # calculate V = how many shards contain a 0
|
||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
||||
/// )
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLog<const N: usize> {
|
||||
core: Arc<HyperLogLogCore<N>>,
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLog<N> {
|
||||
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
|
||||
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let opts = Opts::new(name, help);
|
||||
Self::with_opts(opts)
|
||||
}
|
||||
|
||||
/// Create a [`HyperLogLog`] with the `opts` options.
|
||||
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
|
||||
Self::with_opts_and_label_values(&opts, &[])
|
||||
}
|
||||
|
||||
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
|
||||
let desc = opts.describe()?;
|
||||
let labels = make_label_pairs(&desc, label_values)?;
|
||||
|
||||
let v = HyperLogLogCore {
|
||||
shards: [0; N].map(AtomicU8::new),
|
||||
desc,
|
||||
labels,
|
||||
};
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogState<N> {
|
||||
pub fn measure(&self, item: &impl Hash) {
|
||||
// changing the hasher will break compatibility with previous measurements.
|
||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||
@@ -299,42 +126,11 @@ impl<const N: usize> HyperLogLog<N> {
|
||||
let p = N.ilog2() as u8;
|
||||
let j = hash & (N as u64 - 1);
|
||||
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
||||
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
struct HyperLogLogCore<const N: usize> {
|
||||
shards: [AtomicU8; N],
|
||||
desc: core::Desc,
|
||||
labels: Vec<proto::LabelPair>,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLog<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
self.core.collect_into(&mut metrics);
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogCore<N> {
|
||||
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
|
||||
self.shards.iter().enumerate().for_each(|(i, x)| {
|
||||
let mut shard_label = proto::LabelPair::default();
|
||||
shard_label.set_name("hll_shard".to_owned());
|
||||
shard_label.set_value(format!("{i}"));
|
||||
|
||||
fn take_sample(&self) -> [u8; N] {
|
||||
self.shards.each_ref().map(|x| {
|
||||
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
||||
|
||||
// This seems like it would be a race condition,
|
||||
@@ -344,85 +140,90 @@ impl<const N: usize> HyperLogLogCore<N> {
|
||||
|
||||
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
||||
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
||||
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
let mut m = proto::Metric::default();
|
||||
let mut c = proto::Gauge::default();
|
||||
c.set_value(v as f64);
|
||||
m.set_gauge(c);
|
||||
|
||||
let mut labels = Vec::with_capacity(self.labels.len() + 1);
|
||||
labels.extend_from_slice(&self.labels);
|
||||
labels.push(shard_label);
|
||||
|
||||
m.set_label(labels);
|
||||
metrics.push(m);
|
||||
x.swap(0, std::sync::atomic::Ordering::Relaxed)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn make_label_pairs(
|
||||
desc: &core::Desc,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<Vec<proto::LabelPair>> {
|
||||
if desc.variable_labels.len() != label_values.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: desc.variable_labels.len(),
|
||||
got: label_values.len(),
|
||||
});
|
||||
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
||||
for HyperLogLogState<N>
|
||||
{
|
||||
fn write_type(
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut TextEncoder<W>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
enc.write_type(&name, measured::text::MetricType::Gauge)
|
||||
}
|
||||
fn collect_into(
|
||||
&self,
|
||||
_: &(),
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut TextEncoder<W>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
struct I64(i64);
|
||||
impl LabelValue for I64 {
|
||||
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
|
||||
v.write_int(self.0)
|
||||
}
|
||||
}
|
||||
|
||||
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
|
||||
if total_len == 0 {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
struct HllShardLabel {
|
||||
hll_shard: i64,
|
||||
}
|
||||
|
||||
if desc.variable_labels.is_empty() {
|
||||
return Ok(desc.const_label_pairs.clone());
|
||||
}
|
||||
impl LabelGroup for HllShardLabel {
|
||||
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
|
||||
const LE: &LabelName = LabelName::from_str("hll_shard");
|
||||
v.write_value(LE, &I64(self.hll_shard));
|
||||
}
|
||||
}
|
||||
|
||||
let mut label_pairs = Vec::with_capacity(total_len);
|
||||
for (i, n) in desc.variable_labels.iter().enumerate() {
|
||||
let mut label_pair = proto::LabelPair::default();
|
||||
label_pair.set_name(n.clone());
|
||||
label_pair.set_value(label_values[i].to_owned());
|
||||
label_pairs.push(label_pair);
|
||||
self.take_sample()
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.try_for_each(|(hll_shard, val)| {
|
||||
enc.write_metric_value(
|
||||
name.by_ref(),
|
||||
labels.by_ref().compose_with(HllShardLabel {
|
||||
hll_shard: hll_shard as i64,
|
||||
}),
|
||||
MetricValue::Int(val as i64),
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
for label_pair in &desc.const_label_pairs {
|
||||
label_pairs.push(label_pair.clone());
|
||||
}
|
||||
label_pairs.sort();
|
||||
Ok(label_pairs)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use prometheus::{proto, Opts};
|
||||
use measured::{label::StaticLabelSet, FixedCardinalityLabel};
|
||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||
use rand_distr::{Distribution, Zipf};
|
||||
|
||||
use crate::HyperLogLogVec;
|
||||
|
||||
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
|
||||
let mut metrics = vec![];
|
||||
hll.core
|
||||
.children
|
||||
.read()
|
||||
.unwrap()
|
||||
.values()
|
||||
.for_each(|c| c.core.collect_into(&mut metrics));
|
||||
metrics
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy)]
|
||||
#[label(singleton = "x")]
|
||||
enum Label {
|
||||
A,
|
||||
B,
|
||||
}
|
||||
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
|
||||
|
||||
fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
|
||||
// cannot go through the `hll.collect_family_into` interface yet...
|
||||
// need to see if I can fix the conflicting impls problem in measured.
|
||||
(
|
||||
hll.get_metric(hll.with_labels(Label::A)).take_sample(),
|
||||
hll.get_metric(hll.with_labels(Label::B)).take_sample(),
|
||||
)
|
||||
}
|
||||
|
||||
fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
|
||||
let mut buckets = [0.0; 32];
|
||||
for metric in metrics.chunks_exact(32) {
|
||||
if filter(&metric[0]) {
|
||||
for (i, m) in metric.iter().enumerate() {
|
||||
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
|
||||
}
|
||||
for &sample in samples {
|
||||
for (i, m) in sample.into_iter().enumerate() {
|
||||
buckets[i] = f64::max(buckets[i], m as f64);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -437,7 +238,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
|
||||
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
|
||||
let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();
|
||||
|
||||
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
|
||||
let mut set_a = HashSet::new();
|
||||
@@ -445,18 +246,20 @@ mod tests {
|
||||
|
||||
for x in iter.by_ref().take(n) {
|
||||
set_a.insert(x.to_bits());
|
||||
hll.with_label_values(&["a"]).measure(&x.to_bits());
|
||||
hll.get_metric(hll.with_labels(Label::A))
|
||||
.measure(&x.to_bits());
|
||||
}
|
||||
for x in iter.by_ref().take(n) {
|
||||
set_b.insert(x.to_bits());
|
||||
hll.with_label_values(&["b"]).measure(&x.to_bits());
|
||||
hll.get_metric(hll.with_labels(Label::B))
|
||||
.measure(&x.to_bits());
|
||||
}
|
||||
let merge = &set_a | &set_b;
|
||||
|
||||
let metrics = collect(&hll);
|
||||
let len = get_cardinality(&metrics, |_| true);
|
||||
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
|
||||
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
|
||||
let (a, b) = collect(&hll);
|
||||
let len = get_cardinality(&[a, b]);
|
||||
let len_a = get_cardinality(&[a]);
|
||||
let len_b = get_cardinality(&[b]);
|
||||
|
||||
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
use measured::{
|
||||
label::{LabelGroupVisitor, LabelName, NoLabels},
|
||||
label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
|
||||
metric::{
|
||||
counter::CounterState,
|
||||
gauge::GaugeState,
|
||||
@@ -40,7 +40,7 @@ pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod more_process_metrics;
|
||||
|
||||
@@ -421,3 +421,171 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
|
||||
|
||||
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
|
||||
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
|
||||
|
||||
pub trait CounterPairAssoc {
|
||||
const INC_NAME: &'static MetricName;
|
||||
const DEC_NAME: &'static MetricName;
|
||||
|
||||
const INC_HELP: &'static str;
|
||||
const DEC_HELP: &'static str;
|
||||
|
||||
type LabelGroupSet: LabelGroupSet;
|
||||
}
|
||||
|
||||
pub struct CounterPairVec<A: CounterPairAssoc> {
|
||||
vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc> Default for CounterPairVec<A>
|
||||
where
|
||||
A::LabelGroupSet: Default,
|
||||
{
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
vec: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc> CounterPairVec<A> {
|
||||
pub fn guard(
|
||||
&self,
|
||||
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
|
||||
) -> MeasuredCounterPairGuard<'_, A> {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).inc.inc();
|
||||
MeasuredCounterPairGuard { vec: &self.vec, id }
|
||||
}
|
||||
pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).inc.inc();
|
||||
}
|
||||
pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).dec.inc();
|
||||
}
|
||||
pub fn remove_metric(
|
||||
&self,
|
||||
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
|
||||
) -> Option<MeasuredCounterPairState> {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.remove_metric(id)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
|
||||
where
|
||||
T: ::measured::metric::group::Encoding,
|
||||
A: CounterPairAssoc,
|
||||
::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
|
||||
{
|
||||
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
|
||||
// write decrement first to avoid a race condition where inc - dec < 0
|
||||
T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
|
||||
self.vec
|
||||
.collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
|
||||
|
||||
T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
|
||||
self.vec
|
||||
.collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup, Default)]
|
||||
pub struct MeasuredCounterPairState {
|
||||
pub inc: CounterState,
|
||||
pub dec: CounterState,
|
||||
}
|
||||
|
||||
impl measured::metric::MetricType for MeasuredCounterPairState {
|
||||
type Metadata = ();
|
||||
}
|
||||
|
||||
pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
|
||||
vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
|
||||
id: measured::metric::LabelId<A::LabelGroupSet>,
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
|
||||
fn drop(&mut self) {
|
||||
self.vec.get_metric(self.id).dec.inc();
|
||||
}
|
||||
}
|
||||
|
||||
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
|
||||
struct Inc<T>(T);
|
||||
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
|
||||
struct Dec<T>(T);
|
||||
|
||||
impl<T: Encoding> Encoding for Inc<T> {
|
||||
type Err = T::Err;
|
||||
|
||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||
self.0.write_help(name, help)
|
||||
}
|
||||
|
||||
fn write_metric_value(
|
||||
&mut self,
|
||||
name: impl MetricNameEncoder,
|
||||
labels: impl LabelGroup,
|
||||
value: MetricValue,
|
||||
) -> Result<(), Self::Err> {
|
||||
self.0.write_metric_value(name, labels, value)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
|
||||
where
|
||||
CounterState: MetricEncoding<T>,
|
||||
{
|
||||
fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
|
||||
CounterState::write_type(name, &mut enc.0)
|
||||
}
|
||||
fn collect_into(
|
||||
&self,
|
||||
metadata: &(),
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut Inc<T>,
|
||||
) -> Result<(), T::Err> {
|
||||
self.inc.collect_into(metadata, labels, name, &mut enc.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Encoding> Encoding for Dec<T> {
|
||||
type Err = T::Err;
|
||||
|
||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||
self.0.write_help(name, help)
|
||||
}
|
||||
|
||||
fn write_metric_value(
|
||||
&mut self,
|
||||
name: impl MetricNameEncoder,
|
||||
labels: impl LabelGroup,
|
||||
value: MetricValue,
|
||||
) -> Result<(), Self::Err> {
|
||||
self.0.write_metric_value(name, labels, value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the dec counter to the encoder
|
||||
impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
|
||||
where
|
||||
CounterState: MetricEncoding<T>,
|
||||
{
|
||||
fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
|
||||
CounterState::write_type(name, &mut enc.0)
|
||||
}
|
||||
fn collect_into(
|
||||
&self,
|
||||
metadata: &(),
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut Dec<T>,
|
||||
) -> Result<(), T::Err> {
|
||||
self.dec.collect_into(metadata, labels, name, &mut enc.0)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,7 @@ ipnet.workspace = true
|
||||
itertools.workspace = true
|
||||
lasso = { workspace = true, features = ["multi-threaded"] }
|
||||
md5.workspace = true
|
||||
measured = { workspace = true, features = ["lasso"] }
|
||||
metrics.workspace = true
|
||||
once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
|
||||
use crate::console::{AuthSecret, NodeInfo};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED};
|
||||
use crate::metrics::Metrics;
|
||||
use crate::proxy::connect_compute::ComputeConnectBackend;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::stream::Stream;
|
||||
@@ -210,8 +210,12 @@ impl AuthenticationConfig {
|
||||
enabled = self.rate_limiter_enabled,
|
||||
"rate limiting authentication"
|
||||
);
|
||||
AUTH_RATE_LIMIT_HITS.inc();
|
||||
ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint);
|
||||
Metrics::get().proxy.requests_auth_rate_limits_total.inc();
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.endpoints_auth_rate_limits
|
||||
.get_metric()
|
||||
.measure(endpoint);
|
||||
|
||||
if self.rate_limiter_enabled {
|
||||
return Err(auth::AuthError::too_many_connections());
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::{
|
||||
auth::password_hack::parse_endpoint_param,
|
||||
context::RequestMonitoring,
|
||||
error::{ReportableError, UserFacingError},
|
||||
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI,
|
||||
metrics::{Metrics, SniKind},
|
||||
proxy::NeonOptions,
|
||||
serverless::SERVERLESS_DRIVER_SNI,
|
||||
EndpointId, RoleName,
|
||||
@@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
ctx.set_endpoint_id(ep.clone());
|
||||
}
|
||||
|
||||
let metrics = Metrics::get();
|
||||
info!(%user, "credentials");
|
||||
if sni.is_some() {
|
||||
info!("Connection with sni");
|
||||
NUM_CONNECTION_ACCEPTED_BY_SNI
|
||||
.with_label_values(&["sni"])
|
||||
.inc();
|
||||
metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni);
|
||||
} else if endpoint.is_some() {
|
||||
NUM_CONNECTION_ACCEPTED_BY_SNI
|
||||
.with_label_values(&["no_sni"])
|
||||
.inc();
|
||||
metrics
|
||||
.proxy
|
||||
.accepted_connections_by_sni
|
||||
.inc(SniKind::NoSni);
|
||||
info!("Connection without sni");
|
||||
} else {
|
||||
NUM_CONNECTION_ACCEPTED_BY_SNI
|
||||
.with_label_values(&["password_hack"])
|
||||
.inc();
|
||||
metrics
|
||||
.proxy
|
||||
.accepted_connections_by_sni
|
||||
.inc(SniKind::PasswordHack);
|
||||
info!("Connection with password hack");
|
||||
}
|
||||
|
||||
|
||||
@@ -176,7 +176,12 @@ async fn task_main(
|
||||
.context("failed to set socket option")?;
|
||||
|
||||
info!(%peer_addr, "serving");
|
||||
let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
|
||||
let ctx = RequestMonitoring::new(
|
||||
session_id,
|
||||
peer_addr.ip(),
|
||||
proxy::metrics::Protocol::SniRouter,
|
||||
"sni",
|
||||
);
|
||||
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
|
||||
@@ -18,7 +18,8 @@ use proxy::config::ProjectInfoCacheOptions;
|
||||
use proxy::console;
|
||||
use proxy::context::parquet::ParquetUploadArgs;
|
||||
use proxy::http;
|
||||
use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
|
||||
use proxy::http::health_server::AppMetrics;
|
||||
use proxy::metrics::Metrics;
|
||||
use proxy::rate_limiter::AuthRateLimiter;
|
||||
use proxy::rate_limiter::EndpointRateLimiter;
|
||||
use proxy::rate_limiter::RateBucketInfo;
|
||||
@@ -249,14 +250,18 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
info!("Version: {GIT_VERSION}");
|
||||
info!("Build_tag: {BUILD_TAG}");
|
||||
::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
|
||||
revision: GIT_VERSION,
|
||||
build_tag: BUILD_TAG,
|
||||
});
|
||||
|
||||
match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) {
|
||||
Ok(t) => {
|
||||
t.start();
|
||||
let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
|
||||
Ok(t) => Some(t),
|
||||
Err(e) => {
|
||||
tracing::error!(error = ?e, "could not start jemalloc metrics loop");
|
||||
None
|
||||
}
|
||||
Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"),
|
||||
}
|
||||
};
|
||||
|
||||
let args = ProxyCliArgs::parse();
|
||||
let config = build_config(&args)?;
|
||||
@@ -349,7 +354,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
>::new(
|
||||
cancel_map.clone(),
|
||||
redis_publisher,
|
||||
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT,
|
||||
proxy::metrics::CancellationSource::FromClient,
|
||||
));
|
||||
|
||||
// client facing tasks. these will exit on error or on cancellation
|
||||
@@ -387,7 +392,14 @@ async fn main() -> anyhow::Result<()> {
|
||||
// maintenance tasks. these never return unless there's an error
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
|
||||
maintenance_tasks.spawn(http::health_server::task_main(http_listener));
|
||||
maintenance_tasks.spawn(http::health_server::task_main(
|
||||
http_listener,
|
||||
AppMetrics {
|
||||
jemalloc,
|
||||
neon_metrics,
|
||||
proxy: proxy::metrics::Metrics::get(),
|
||||
},
|
||||
));
|
||||
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
|
||||
|
||||
if let Some(metrics_config) = &config.metric_collection {
|
||||
@@ -507,8 +519,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
} = args.wake_compute_lock.parse()?;
|
||||
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
|
||||
let locks = Box::leak(Box::new(
|
||||
console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout)
|
||||
.unwrap(),
|
||||
console::locks::ApiLocks::new(
|
||||
"wake_compute_lock",
|
||||
permits,
|
||||
shards,
|
||||
timeout,
|
||||
&Metrics::get().wake_compute_lock,
|
||||
)
|
||||
.unwrap(),
|
||||
));
|
||||
tokio::spawn(locks.garbage_collect_worker(epoch));
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
error::ReportableError,
|
||||
metrics::NUM_CANCELLATION_REQUESTS,
|
||||
metrics::{CancellationRequest, CancellationSource, Metrics},
|
||||
redis::cancellation_publisher::{
|
||||
CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
|
||||
},
|
||||
@@ -28,7 +28,7 @@ pub struct CancellationHandler<P> {
|
||||
client: P,
|
||||
/// This field used for the monitoring purposes.
|
||||
/// Represents the source of the cancellation request.
|
||||
from: &'static str,
|
||||
from: CancellationSource,
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
@@ -89,9 +89,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
// NB: we should immediately release the lock after cloning the token.
|
||||
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
|
||||
tracing::warn!("query cancellation key not found: {key}");
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[self.from, "not_found"])
|
||||
.inc();
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.cancellation_requests_total
|
||||
.inc(CancellationRequest {
|
||||
source: self.from,
|
||||
kind: crate::metrics::CancellationOutcome::NotFound,
|
||||
});
|
||||
match self.client.try_publish(key, session_id).await {
|
||||
Ok(()) => {} // do nothing
|
||||
Err(e) => {
|
||||
@@ -103,9 +107,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
}
|
||||
return Ok(());
|
||||
};
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[self.from, "found"])
|
||||
.inc();
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.cancellation_requests_total
|
||||
.inc(CancellationRequest {
|
||||
source: self.from,
|
||||
kind: crate::metrics::CancellationOutcome::Found,
|
||||
});
|
||||
info!("cancelling query per user's request using key {key}");
|
||||
cancel_closure.try_cancel_query().await
|
||||
}
|
||||
@@ -122,7 +130,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
}
|
||||
|
||||
impl CancellationHandler<()> {
|
||||
pub fn new(map: CancelMap, from: &'static str) -> Self {
|
||||
pub fn new(map: CancelMap, from: CancellationSource) -> Self {
|
||||
Self {
|
||||
map,
|
||||
client: (),
|
||||
@@ -132,7 +140,7 @@ impl CancellationHandler<()> {
|
||||
}
|
||||
|
||||
impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
|
||||
pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: &'static str) -> Self {
|
||||
pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: CancellationSource) -> Self {
|
||||
Self { map, client, from }
|
||||
}
|
||||
}
|
||||
@@ -192,15 +200,13 @@ impl<P> Drop for Session<P> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_session_drop() -> anyhow::Result<()> {
|
||||
let cancellation_handler = Arc::new(CancellationHandler::<()>::new(
|
||||
CancelMap::default(),
|
||||
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
|
||||
CancellationSource::FromRedis,
|
||||
));
|
||||
|
||||
let session = cancellation_handler.clone().get_session();
|
||||
@@ -214,7 +220,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn cancel_session_noop_regression() {
|
||||
let handler = CancellationHandler::<()>::new(Default::default(), "local");
|
||||
let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local);
|
||||
handler
|
||||
.cancel_session(
|
||||
CancelKeyData {
|
||||
|
||||
@@ -4,12 +4,11 @@ use crate::{
|
||||
console::{errors::WakeComputeError, messages::MetricsAuxInfo},
|
||||
context::RequestMonitoring,
|
||||
error::{ReportableError, UserFacingError},
|
||||
metrics::NUM_DB_CONNECTIONS_GAUGE,
|
||||
metrics::{Metrics, NumDbConnectionsGuard},
|
||||
proxy::neon_option,
|
||||
};
|
||||
use futures::{FutureExt, TryFutureExt};
|
||||
use itertools::Itertools;
|
||||
use metrics::IntCounterPairGuard;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use std::{io, net::SocketAddr, time::Duration};
|
||||
use thiserror::Error;
|
||||
@@ -249,7 +248,7 @@ pub struct PostgresConnection {
|
||||
/// Labels for proxy's metrics.
|
||||
pub aux: MetricsAuxInfo,
|
||||
|
||||
_guage: IntCounterPairGuard,
|
||||
_guage: NumDbConnectionsGuard<'static>,
|
||||
}
|
||||
|
||||
impl ConnCfg {
|
||||
@@ -295,9 +294,7 @@ impl ConnCfg {
|
||||
params,
|
||||
cancel_closure,
|
||||
aux,
|
||||
_guage: NUM_DB_CONNECTIONS_GAUGE
|
||||
.with_label_values(&[ctx.protocol])
|
||||
.guard(),
|
||||
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
|
||||
};
|
||||
|
||||
Ok(connection)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use measured::FixedCardinalityLabel;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
|
||||
@@ -102,7 +103,7 @@ pub struct MetricsAuxInfo {
|
||||
pub cold_start_info: ColdStartInfo,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
|
||||
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ColdStartInfo {
|
||||
#[default]
|
||||
@@ -110,9 +111,11 @@ pub enum ColdStartInfo {
|
||||
/// Compute was already running
|
||||
Warm,
|
||||
#[serde(rename = "pool_hit")]
|
||||
#[label(rename = "pool_hit")]
|
||||
/// Compute was not running but there was an available VM
|
||||
VmPoolHit,
|
||||
#[serde(rename = "pool_miss")]
|
||||
#[label(rename = "pool_miss")]
|
||||
/// Compute was not running and there were no VMs available
|
||||
VmPoolMiss,
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ use crate::{
|
||||
config::{CacheOptions, ProjectInfoCacheOptions},
|
||||
context::RequestMonitoring,
|
||||
intern::ProjectIdInt,
|
||||
metrics::ApiLockMetrics,
|
||||
scram, EndpointCacheKey,
|
||||
};
|
||||
use dashmap::DashMap;
|
||||
@@ -441,10 +442,7 @@ pub struct ApiLocks {
|
||||
node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
|
||||
permits: usize,
|
||||
timeout: Duration,
|
||||
registered: prometheus::IntCounter,
|
||||
unregistered: prometheus::IntCounter,
|
||||
reclamation_lag: prometheus::Histogram,
|
||||
lock_acquire_lag: prometheus::Histogram,
|
||||
metrics: &'static ApiLockMetrics,
|
||||
}
|
||||
|
||||
impl ApiLocks {
|
||||
@@ -453,54 +451,14 @@ impl ApiLocks {
|
||||
permits: usize,
|
||||
shards: usize,
|
||||
timeout: Duration,
|
||||
metrics: &'static ApiLockMetrics,
|
||||
) -> prometheus::Result<Self> {
|
||||
let registered = prometheus::IntCounter::with_opts(
|
||||
prometheus::Opts::new(
|
||||
"semaphores_registered",
|
||||
"Number of semaphores registered in this api lock",
|
||||
)
|
||||
.namespace(name),
|
||||
)?;
|
||||
prometheus::register(Box::new(registered.clone()))?;
|
||||
let unregistered = prometheus::IntCounter::with_opts(
|
||||
prometheus::Opts::new(
|
||||
"semaphores_unregistered",
|
||||
"Number of semaphores unregistered in this api lock",
|
||||
)
|
||||
.namespace(name),
|
||||
)?;
|
||||
prometheus::register(Box::new(unregistered.clone()))?;
|
||||
let reclamation_lag = prometheus::Histogram::with_opts(
|
||||
prometheus::HistogramOpts::new(
|
||||
"reclamation_lag_seconds",
|
||||
"Time it takes to reclaim unused semaphores in the api lock",
|
||||
)
|
||||
.namespace(name)
|
||||
// 1us -> 65ms
|
||||
// benchmarks on my mac indicate it's usually in the range of 256us and 512us
|
||||
.buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?),
|
||||
)?;
|
||||
prometheus::register(Box::new(reclamation_lag.clone()))?;
|
||||
let lock_acquire_lag = prometheus::Histogram::with_opts(
|
||||
prometheus::HistogramOpts::new(
|
||||
"semaphore_acquire_seconds",
|
||||
"Time it takes to reclaim unused semaphores in the api lock",
|
||||
)
|
||||
.namespace(name)
|
||||
// 0.1ms -> 6s
|
||||
.buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?),
|
||||
)?;
|
||||
prometheus::register(Box::new(lock_acquire_lag.clone()))?;
|
||||
|
||||
Ok(Self {
|
||||
name,
|
||||
node_locks: DashMap::with_shard_amount(shards),
|
||||
permits,
|
||||
timeout,
|
||||
lock_acquire_lag,
|
||||
registered,
|
||||
unregistered,
|
||||
reclamation_lag,
|
||||
metrics,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -520,7 +478,7 @@ impl ApiLocks {
|
||||
self.node_locks
|
||||
.entry(key.clone())
|
||||
.or_insert_with(|| {
|
||||
self.registered.inc();
|
||||
self.metrics.semaphores_registered.inc();
|
||||
Arc::new(Semaphore::new(self.permits))
|
||||
})
|
||||
.clone()
|
||||
@@ -528,8 +486,9 @@ impl ApiLocks {
|
||||
};
|
||||
let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
|
||||
|
||||
self.lock_acquire_lag
|
||||
.observe((Instant::now() - now).as_secs_f64());
|
||||
self.metrics
|
||||
.semaphore_acquire_seconds
|
||||
.observe(now.elapsed().as_secs_f64());
|
||||
|
||||
Ok(WakeComputePermit {
|
||||
permit: Some(permit??),
|
||||
@@ -554,13 +513,13 @@ impl ApiLocks {
|
||||
"performing epoch reclamation on api lock"
|
||||
);
|
||||
let mut lock = shard.write();
|
||||
let timer = self.reclamation_lag.start_timer();
|
||||
let timer = self.metrics.reclamation_lag_seconds.start_timer();
|
||||
let count = lock
|
||||
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
|
||||
.count();
|
||||
drop(lock);
|
||||
self.unregistered.inc_by(count as u64);
|
||||
timer.observe_duration()
|
||||
self.metrics.semaphores_unregistered.inc_by(count as u64);
|
||||
timer.observe();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,13 +7,14 @@ use super::{
|
||||
NodeInfo,
|
||||
};
|
||||
use crate::{
|
||||
auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram,
|
||||
};
|
||||
use crate::{
|
||||
cache::Cached,
|
||||
context::RequestMonitoring,
|
||||
metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
|
||||
auth::backend::ComputeUserInfo,
|
||||
compute,
|
||||
console::messages::ColdStartInfo,
|
||||
http,
|
||||
metrics::{CacheOutcome, Metrics},
|
||||
scram,
|
||||
};
|
||||
use crate::{cache::Cached, context::RequestMonitoring};
|
||||
use futures::TryFutureExt;
|
||||
use std::sync::Arc;
|
||||
use tokio::time::Instant;
|
||||
@@ -95,7 +96,10 @@ impl Api {
|
||||
Some(secret)
|
||||
};
|
||||
let allowed_ips = body.allowed_ips.unwrap_or_default();
|
||||
ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.allowed_ips_number
|
||||
.observe(allowed_ips.len() as f64);
|
||||
Ok(AuthInfo {
|
||||
secret,
|
||||
allowed_ips,
|
||||
@@ -206,14 +210,16 @@ impl super::Api for Api {
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
let ep = &user_info.endpoint;
|
||||
if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) {
|
||||
ALLOWED_IPS_BY_CACHE_OUTCOME
|
||||
.with_label_values(&["hit"])
|
||||
.inc();
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.allowed_ips_cache_misses
|
||||
.inc(CacheOutcome::Hit);
|
||||
return Ok((allowed_ips, None));
|
||||
}
|
||||
ALLOWED_IPS_BY_CACHE_OUTCOME
|
||||
.with_label_values(&["miss"])
|
||||
.inc();
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.allowed_ips_cache_misses
|
||||
.inc(CacheOutcome::Miss);
|
||||
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
|
||||
let allowed_ips = Arc::new(auth_info.allowed_ips);
|
||||
let user = &user_info.user;
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::{
|
||||
console::messages::{ColdStartInfo, MetricsAuxInfo},
|
||||
error::ErrorKind,
|
||||
intern::{BranchIdInt, ProjectIdInt},
|
||||
metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
|
||||
metrics::{LatencyTimer, Metrics, Protocol},
|
||||
DbName, EndpointId, RoleName,
|
||||
};
|
||||
|
||||
@@ -29,7 +29,7 @@ static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::ne
|
||||
pub struct RequestMonitoring {
|
||||
pub peer_addr: IpAddr,
|
||||
pub session_id: Uuid,
|
||||
pub protocol: &'static str,
|
||||
pub protocol: Protocol,
|
||||
first_packet: chrono::DateTime<Utc>,
|
||||
region: &'static str,
|
||||
pub span: Span,
|
||||
@@ -65,7 +65,7 @@ impl RequestMonitoring {
|
||||
pub fn new(
|
||||
session_id: Uuid,
|
||||
peer_addr: IpAddr,
|
||||
protocol: &'static str,
|
||||
protocol: Protocol,
|
||||
region: &'static str,
|
||||
) -> Self {
|
||||
let span = info_span!(
|
||||
@@ -102,7 +102,7 @@ impl RequestMonitoring {
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn test() -> Self {
|
||||
RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test")
|
||||
RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test")
|
||||
}
|
||||
|
||||
pub fn console_application_name(&self) -> String {
|
||||
@@ -134,9 +134,9 @@ impl RequestMonitoring {
|
||||
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
|
||||
if self.endpoint_id.is_none() {
|
||||
self.span.record("ep", display(&endpoint_id));
|
||||
crate::metrics::CONNECTING_ENDPOINTS
|
||||
.with_label_values(&[self.protocol])
|
||||
.measure(&endpoint_id);
|
||||
let metric = &Metrics::get().proxy.connecting_endpoints;
|
||||
let label = metric.with_labels(self.protocol);
|
||||
metric.get_metric(label).measure(&endpoint_id);
|
||||
self.endpoint_id = Some(endpoint_id);
|
||||
}
|
||||
}
|
||||
@@ -158,13 +158,11 @@ impl RequestMonitoring {
|
||||
}
|
||||
|
||||
pub fn set_error_kind(&mut self, kind: ErrorKind) {
|
||||
ERROR_BY_KIND
|
||||
.with_label_values(&[kind.to_metric_label()])
|
||||
.inc();
|
||||
Metrics::get().proxy.errors_total.inc(kind);
|
||||
if let Some(ep) = &self.endpoint_id {
|
||||
ENDPOINT_ERRORS_BY_KIND
|
||||
.with_label_values(&[kind.to_metric_label()])
|
||||
.measure(ep);
|
||||
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
|
||||
let label = metric.with_labels(kind);
|
||||
metric.get_metric(label).measure(ep);
|
||||
}
|
||||
self.error_kind = Some(kind);
|
||||
}
|
||||
|
||||
@@ -111,7 +111,7 @@ impl From<&RequestMonitoring> for RequestData {
|
||||
super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus",
|
||||
super::AuthMethod::Cleartext => "cleartext",
|
||||
}),
|
||||
protocol: value.protocol,
|
||||
protocol: value.protocol.as_str(),
|
||||
region: value.region,
|
||||
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
|
||||
success: value.success,
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
use std::{error::Error as StdError, fmt, io};
|
||||
|
||||
use measured::FixedCardinalityLabel;
|
||||
|
||||
/// Upcast (almost) any error into an opaque [`io::Error`].
|
||||
pub fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
@@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)]
|
||||
#[label(singleton = "type")]
|
||||
pub enum ErrorKind {
|
||||
/// Wrong password, unknown endpoint, protocol violation, etc...
|
||||
User,
|
||||
|
||||
/// Network error between user and proxy. Not necessarily user error
|
||||
#[label(rename = "clientdisconnect")]
|
||||
ClientDisconnect,
|
||||
|
||||
/// Proxy self-imposed user rate limits
|
||||
#[label(rename = "ratelimit")]
|
||||
RateLimit,
|
||||
|
||||
/// Proxy self-imposed service-wise rate limits
|
||||
#[label(rename = "serviceratelimit")]
|
||||
ServiceRateLimit,
|
||||
|
||||
/// internal errors
|
||||
Service,
|
||||
|
||||
/// Error communicating with control plane
|
||||
#[label(rename = "controlplane")]
|
||||
ControlPlane,
|
||||
|
||||
/// Postgres error
|
||||
|
||||
@@ -13,7 +13,11 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
use tokio::time::Instant;
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
|
||||
use crate::{
|
||||
metrics::{ConsoleRequest, Metrics},
|
||||
rate_limiter,
|
||||
url::ApiUrl,
|
||||
};
|
||||
use reqwest_middleware::RequestBuilder;
|
||||
|
||||
/// This is the preferred way to create new http clients,
|
||||
@@ -90,13 +94,14 @@ impl Endpoint {
|
||||
|
||||
/// Execute a [request](reqwest::Request).
|
||||
pub async fn execute(&self, request: Request) -> Result<Response, Error> {
|
||||
let path = request.url().path().to_string();
|
||||
let start = Instant::now();
|
||||
let res = self.client.execute(request).await;
|
||||
CONSOLE_REQUEST_LATENCY
|
||||
.with_label_values(&[&path])
|
||||
.observe(start.elapsed().as_secs_f64());
|
||||
res
|
||||
let _timer = Metrics::get()
|
||||
.proxy
|
||||
.console_request_latency
|
||||
.start_timer(ConsoleRequest {
|
||||
request: request.url().path(),
|
||||
});
|
||||
|
||||
self.client.execute(request).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,30 +1,49 @@
|
||||
use anyhow::{anyhow, bail};
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use std::{convert::Infallible, net::TcpListener};
|
||||
use tracing::info;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode};
|
||||
use measured::{text::BufferedTextEncoder, MetricGroup};
|
||||
use metrics::NeonMetrics;
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
net::TcpListener,
|
||||
sync::{Arc, Mutex},
|
||||
};
|
||||
use tracing::{info, info_span};
|
||||
use utils::http::{
|
||||
endpoint::{self, prometheus_metrics_handler, request_span},
|
||||
endpoint::{self, request_span},
|
||||
error::ApiError,
|
||||
json::json_response,
|
||||
RouterBuilder, RouterService,
|
||||
};
|
||||
|
||||
use crate::jemalloc;
|
||||
|
||||
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, "")
|
||||
}
|
||||
|
||||
fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
fn make_router(metrics: AppMetrics) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let state = Arc::new(Mutex::new(PrometheusHandler {
|
||||
encoder: BufferedTextEncoder::new(),
|
||||
metrics,
|
||||
}));
|
||||
|
||||
endpoint::make_router()
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/metrics", move |r| {
|
||||
let state = state.clone();
|
||||
request_span(r, move |b| prometheus_metrics_handler(b, state))
|
||||
})
|
||||
.get("/v1/status", status_handler)
|
||||
}
|
||||
|
||||
pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {
|
||||
pub async fn task_main(
|
||||
http_listener: TcpListener,
|
||||
metrics: AppMetrics,
|
||||
) -> anyhow::Result<Infallible> {
|
||||
scopeguard::defer! {
|
||||
info!("http has shut down");
|
||||
}
|
||||
|
||||
let service = || RouterService::new(make_router().build()?);
|
||||
let service = || RouterService::new(make_router(metrics).build()?);
|
||||
|
||||
hyper::Server::from_tcp(http_listener)?
|
||||
.serve(service().map_err(|e| anyhow!(e))?)
|
||||
@@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible>
|
||||
|
||||
bail!("hyper server without shutdown handling cannot shutdown successfully");
|
||||
}
|
||||
|
||||
struct PrometheusHandler {
|
||||
encoder: BufferedTextEncoder,
|
||||
metrics: AppMetrics,
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
pub struct AppMetrics {
|
||||
#[metric(namespace = "jemalloc")]
|
||||
pub jemalloc: Option<jemalloc::MetricRecorder>,
|
||||
#[metric(flatten)]
|
||||
pub neon_metrics: NeonMetrics,
|
||||
#[metric(flatten)]
|
||||
pub proxy: &'static crate::metrics::Metrics,
|
||||
}
|
||||
|
||||
async fn prometheus_metrics_handler(
|
||||
_req: Request<Body>,
|
||||
state: Arc<Mutex<PrometheusHandler>>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let span = info_span!("blocking");
|
||||
let body = tokio::task::spawn_blocking(move || {
|
||||
let _span = span.entered();
|
||||
|
||||
let mut state = state.lock().unwrap();
|
||||
let PrometheusHandler { encoder, metrics } = &mut *state;
|
||||
|
||||
metrics
|
||||
.collect_group_into(&mut *encoder)
|
||||
.unwrap_or_else(|infallible| match infallible {});
|
||||
|
||||
let body = encoder.finish();
|
||||
|
||||
tracing::info!(
|
||||
bytes = body.len(),
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"responded /metrics"
|
||||
);
|
||||
|
||||
body
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let response = Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, "text/plain; version=0.0.4")
|
||||
.body(Body::from(body))
|
||||
.unwrap();
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
@@ -1,27 +1,45 @@
|
||||
use std::time::Duration;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use metrics::IntGauge;
|
||||
use prometheus::{register_int_gauge_with_registry, Registry};
|
||||
use measured::{
|
||||
label::NoLabels,
|
||||
metric::{
|
||||
gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder,
|
||||
MetricEncoding, MetricFamilyEncoding, MetricType,
|
||||
},
|
||||
text::TextEncoder,
|
||||
LabelGroup, MetricGroup,
|
||||
};
|
||||
use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version};
|
||||
|
||||
pub struct MetricRecorder {
|
||||
epoch: epoch_mib,
|
||||
active: stats::active_mib,
|
||||
active_gauge: IntGauge,
|
||||
allocated: stats::allocated_mib,
|
||||
allocated_gauge: IntGauge,
|
||||
mapped: stats::mapped_mib,
|
||||
mapped_gauge: IntGauge,
|
||||
metadata: stats::metadata_mib,
|
||||
metadata_gauge: IntGauge,
|
||||
resident: stats::resident_mib,
|
||||
resident_gauge: IntGauge,
|
||||
retained: stats::retained_mib,
|
||||
retained_gauge: IntGauge,
|
||||
inner: Metrics,
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
struct Metrics {
|
||||
active_bytes: JemallocGaugeFamily<stats::active_mib>,
|
||||
allocated_bytes: JemallocGaugeFamily<stats::allocated_mib>,
|
||||
mapped_bytes: JemallocGaugeFamily<stats::mapped_mib>,
|
||||
metadata_bytes: JemallocGaugeFamily<stats::metadata_mib>,
|
||||
resident_bytes: JemallocGaugeFamily<stats::resident_mib>,
|
||||
retained_bytes: JemallocGaugeFamily<stats::retained_mib>,
|
||||
}
|
||||
|
||||
impl<Enc: Encoding> MetricGroup<Enc> for MetricRecorder
|
||||
where
|
||||
Metrics: MetricGroup<Enc>,
|
||||
{
|
||||
fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> {
|
||||
if self.epoch.advance().is_ok() {
|
||||
self.inner.collect_group_into(enc)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl MetricRecorder {
|
||||
pub fn new(registry: &Registry) -> Result<Self, anyhow::Error> {
|
||||
pub fn new() -> Result<Self, anyhow::Error> {
|
||||
tracing::info!(
|
||||
config = config::malloc_conf::read()?,
|
||||
version = version::read()?,
|
||||
@@ -30,71 +48,69 @@ impl MetricRecorder {
|
||||
|
||||
Ok(Self {
|
||||
epoch: epoch::mib()?,
|
||||
active: stats::active::mib()?,
|
||||
active_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_active_bytes",
|
||||
"Total number of bytes in active pages allocated by the process",
|
||||
registry
|
||||
)?,
|
||||
allocated: stats::allocated::mib()?,
|
||||
allocated_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_allocated_bytes",
|
||||
"Total number of bytes allocated by the process",
|
||||
registry
|
||||
)?,
|
||||
mapped: stats::mapped::mib()?,
|
||||
mapped_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_mapped_bytes",
|
||||
"Total number of bytes in active extents mapped by the allocator",
|
||||
registry
|
||||
)?,
|
||||
metadata: stats::metadata::mib()?,
|
||||
metadata_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_metadata_bytes",
|
||||
"Total number of bytes dedicated to jemalloc metadata",
|
||||
registry
|
||||
)?,
|
||||
resident: stats::resident::mib()?,
|
||||
resident_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_resident_bytes",
|
||||
"Total number of bytes in physically resident data pages mapped by the allocator",
|
||||
registry
|
||||
)?,
|
||||
retained: stats::retained::mib()?,
|
||||
retained_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_retained_bytes",
|
||||
"Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system",
|
||||
registry
|
||||
)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn _poll(&self) -> Result<(), anyhow::Error> {
|
||||
self.epoch.advance()?;
|
||||
self.active_gauge.set(self.active.read()? as i64);
|
||||
self.allocated_gauge.set(self.allocated.read()? as i64);
|
||||
self.mapped_gauge.set(self.mapped.read()? as i64);
|
||||
self.metadata_gauge.set(self.metadata.read()? as i64);
|
||||
self.resident_gauge.set(self.resident.read()? as i64);
|
||||
self.retained_gauge.set(self.retained.read()? as i64);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn poll(&self) {
|
||||
if let Err(error) = self._poll() {
|
||||
tracing::warn!(%error, "Failed to poll jemalloc stats");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(self) -> tokio::task::JoinHandle<()> {
|
||||
tokio::task::spawn(async move {
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(15));
|
||||
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
||||
loop {
|
||||
self.poll();
|
||||
interval.tick().await;
|
||||
}
|
||||
inner: Metrics {
|
||||
active_bytes: JemallocGaugeFamily(stats::active::mib()?),
|
||||
allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?),
|
||||
mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?),
|
||||
metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?),
|
||||
resident_bytes: JemallocGaugeFamily(stats::resident::mib()?),
|
||||
retained_bytes: JemallocGaugeFamily(stats::retained::mib()?),
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
struct JemallocGauge<T>(PhantomData<T>);
|
||||
|
||||
impl<T> Default for JemallocGauge<T> {
|
||||
fn default() -> Self {
|
||||
JemallocGauge(PhantomData)
|
||||
}
|
||||
}
|
||||
impl<T> MetricType for JemallocGauge<T> {
|
||||
type Metadata = T;
|
||||
}
|
||||
|
||||
struct JemallocGaugeFamily<T>(T);
|
||||
impl<M, T: Encoding> MetricFamilyEncoding<T> for JemallocGaugeFamily<M>
|
||||
where
|
||||
JemallocGauge<M>: MetricEncoding<T, Metadata = M>,
|
||||
{
|
||||
fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> {
|
||||
JemallocGauge::write_type(&name, enc)?;
|
||||
JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc)
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! jemalloc_gauge {
|
||||
($stat:ident, $mib:ident) => {
|
||||
impl<W: std::io::Write> MetricEncoding<TextEncoder<W>> for JemallocGauge<stats::$mib> {
|
||||
fn write_type(
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut TextEncoder<W>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
GaugeState::write_type(name, enc)
|
||||
}
|
||||
|
||||
fn collect_into(
|
||||
&self,
|
||||
mib: &stats::$mib,
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut TextEncoder<W>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
if let Ok(v) = mib.read() {
|
||||
enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
jemalloc_gauge!(active, active_mib);
|
||||
jemalloc_gauge!(allocated, allocated_mib);
|
||||
jemalloc_gauge!(mapped, mapped_mib);
|
||||
jemalloc_gauge!(metadata, metadata_mib);
|
||||
jemalloc_gauge!(resident, resident_mib);
|
||||
jemalloc_gauge!(retained, retained_mib);
|
||||
|
||||
@@ -1,176 +1,356 @@
|
||||
use ::metrics::{
|
||||
exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
|
||||
register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
|
||||
IntCounterVec, IntGauge, IntGaugeVec,
|
||||
};
|
||||
use metrics::{
|
||||
register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter,
|
||||
IntCounterPair,
|
||||
};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use lasso::ThreadedRodeo;
|
||||
use measured::{
|
||||
label::StaticLabelSet,
|
||||
metric::{histogram::Thresholds, name::MetricName},
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
|
||||
LabelGroup, MetricGroup,
|
||||
};
|
||||
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::time::{self, Instant};
|
||||
|
||||
use crate::console::messages::ColdStartInfo;
|
||||
|
||||
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
register_int_counter_pair_vec!(
|
||||
"proxy_opened_db_connections_total",
|
||||
"Number of opened connections to a database.",
|
||||
"proxy_closed_db_connections_total",
|
||||
"Number of closed connections to a database.",
|
||||
&["protocol"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
#[derive(MetricGroup)]
|
||||
pub struct Metrics {
|
||||
#[metric(namespace = "proxy")]
|
||||
pub proxy: ProxyMetrics,
|
||||
|
||||
pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
register_int_counter_pair_vec!(
|
||||
"proxy_opened_client_connections_total",
|
||||
"Number of opened connections from a client.",
|
||||
"proxy_closed_client_connections_total",
|
||||
"Number of closed connections from a client.",
|
||||
&["protocol"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
#[metric(namespace = "wake_compute_lock")]
|
||||
pub wake_compute_lock: ApiLockMetrics,
|
||||
|
||||
pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
register_int_counter_pair_vec!(
|
||||
"proxy_accepted_connections_total",
|
||||
"Number of client connections accepted.",
|
||||
"proxy_closed_connections_total",
|
||||
"Number of client connections closed.",
|
||||
&["protocol"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
// the one metric not called proxy_....
|
||||
pub semaphore_control_plane_limit: GaugeVec<StaticLabelSet<RateLimit>>,
|
||||
}
|
||||
|
||||
pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"proxy_compute_connection_latency_seconds",
|
||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
||||
// http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
|
||||
// 3 * 6 * 2 * 2 = 72 counters
|
||||
&["protocol", "cold_start_info", "outcome", "excluded"],
|
||||
// largest bucket = 2^16 * 0.5ms = 32s
|
||||
exponential_buckets(0.0005, 2.0, 16).unwrap(),
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
impl Metrics {
|
||||
pub fn get() -> &'static Self {
|
||||
static SELF: OnceLock<Metrics> = OnceLock::new();
|
||||
SELF.get_or_init(|| Metrics {
|
||||
proxy: ProxyMetrics::default(),
|
||||
wake_compute_lock: ApiLockMetrics::new(),
|
||||
semaphore_control_plane_limit: GaugeVec::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"proxy_console_request_latency",
|
||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
||||
// proxy_wake_compute/proxy_get_role_info
|
||||
&["request"],
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new())]
|
||||
pub struct ProxyMetrics {
|
||||
#[metric(flatten)]
|
||||
pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
|
||||
#[metric(flatten)]
|
||||
pub client_connections: CounterPairVec<NumClientConnectionsGauge>,
|
||||
#[metric(flatten)]
|
||||
pub connection_requests: CounterPairVec<NumConnectionRequestsGauge>,
|
||||
#[metric(flatten)]
|
||||
pub http_endpoint_pools: HttpEndpointPools,
|
||||
|
||||
/// Time it took for proxy to establish a connection to the compute endpoint.
|
||||
// largest bucket = 2^16 * 0.5ms = 32s
|
||||
#[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))]
|
||||
pub compute_connection_latency_seconds: HistogramVec<ComputeConnectionLatencySet, 16>,
|
||||
|
||||
/// Time it took for proxy to receive a response from control plane.
|
||||
#[metric(
|
||||
// largest bucket = 2^16 * 0.2ms = 13s
|
||||
exponential_buckets(0.0002, 2.0, 16).unwrap(),
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
metadata = Thresholds::exponential_buckets(0.0002, 2.0),
|
||||
)]
|
||||
pub console_request_latency: HistogramVec<ConsoleRequestSet, 16>,
|
||||
|
||||
pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_allowed_ips_cache_misses",
|
||||
"Number of cache hits/misses for allowed ips",
|
||||
// hit/miss
|
||||
&["outcome"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Time it takes to acquire a token to call console plane.
|
||||
// largest bucket = 3^16 * 0.05ms = 2.15s
|
||||
#[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))]
|
||||
pub control_plane_token_acquire_seconds: Histogram<16>,
|
||||
|
||||
pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"proxy_control_plane_token_acquire_seconds",
|
||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
||||
// largest bucket = 3^16 * 0.05ms = 2.15s
|
||||
exponential_buckets(0.00005, 3.0, 16).unwrap(),
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Size of the HTTP request body lengths.
|
||||
// smallest bucket = 16 bytes
|
||||
// largest bucket = 4^12 * 16 bytes = 256MB
|
||||
#[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))]
|
||||
pub http_conn_content_length_bytes: HistogramVec<StaticLabelSet<HttpDirection>, 12>,
|
||||
|
||||
pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"semaphore_control_plane_limit",
|
||||
"Current limit of the semaphore control plane",
|
||||
&["limit"], // 2 counters
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Time it takes to reclaim unused connection pools.
|
||||
#[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))]
|
||||
pub http_pool_reclaimation_lag_seconds: Histogram<16>,
|
||||
|
||||
pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_accepted_connections_by_sni",
|
||||
"Number of connections (per sni).",
|
||||
&["kind"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Number of opened connections to a database.
|
||||
pub http_pool_opened_connections: Gauge,
|
||||
|
||||
pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"proxy_allowed_ips_number",
|
||||
"Number of allowed ips",
|
||||
vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Number of cache hits/misses for allowed ips.
|
||||
pub allowed_ips_cache_misses: CounterVec<StaticLabelSet<CacheOutcome>>,
|
||||
|
||||
pub static HTTP_CONTENT_LENGTH: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"proxy_http_conn_content_length_bytes",
|
||||
"Number of bytes the HTTP response content consumes",
|
||||
// request/response
|
||||
&["direction"],
|
||||
// smallest bucket = 16 bytes
|
||||
// largest bucket = 4^12 * 16 bytes = 256MB
|
||||
exponential_buckets(16.0, 4.0, 12).unwrap()
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Number of allowed ips
|
||||
#[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
|
||||
pub allowed_ips_number: Histogram<10>,
|
||||
|
||||
pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"proxy_http_pool_reclaimation_lag_seconds",
|
||||
"Time it takes to reclaim unused connection pools",
|
||||
// 1us -> 65ms
|
||||
exponential_buckets(1e-6, 2.0, 16).unwrap(),
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Number of connections (per sni).
|
||||
pub accepted_connections_by_sni: CounterVec<StaticLabelSet<SniKind>>,
|
||||
|
||||
pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
|
||||
register_int_counter_pair!(
|
||||
"proxy_http_pool_endpoints_registered_total",
|
||||
"Number of endpoints we have registered pools for",
|
||||
"proxy_http_pool_endpoints_unregistered_total",
|
||||
"Number of endpoints we have unregistered pools for",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Number of connection failures (per kind).
|
||||
pub connection_failures_total: CounterVec<StaticLabelSet<ConnectionFailureKind>>,
|
||||
|
||||
pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"proxy_http_pool_opened_connections",
|
||||
"Number of opened connections to a database.",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Number of wake-up failures (per kind).
|
||||
pub connection_failures_breakdown: CounterVec<ConnectionFailuresBreakdownSet>,
|
||||
|
||||
pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_cancellation_requests_total",
|
||||
"Number of cancellation requests (per found/not_found).",
|
||||
&["source", "kind"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
/// Number of bytes sent/received between all clients and backends.
|
||||
pub io_bytes: CounterVec<StaticLabelSet<Direction>>,
|
||||
|
||||
pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client";
|
||||
pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis";
|
||||
/// Number of errors by a given classification.
|
||||
pub errors_total: CounterVec<StaticLabelSet<crate::error::ErrorKind>>,
|
||||
|
||||
/// Number of cancellation requests (per found/not_found).
|
||||
pub cancellation_requests_total: CounterVec<CancellationRequestSet>,
|
||||
|
||||
/// Number of errors by a given classification
|
||||
pub redis_errors_total: CounterVec<RedisErrorsSet>,
|
||||
|
||||
/// Number of TLS handshake failures
|
||||
pub tls_handshake_failures: Counter,
|
||||
|
||||
/// Number of connection requests affected by authentication rate limits
|
||||
pub requests_auth_rate_limits_total: Counter,
|
||||
|
||||
/// HLL approximate cardinality of endpoints that are connecting
|
||||
pub connecting_endpoints: HyperLogLogVec<StaticLabelSet<Protocol>, 32>,
|
||||
|
||||
/// Number of endpoints affected by errors of a given classification
|
||||
pub endpoints_affected_by_errors: HyperLogLogVec<StaticLabelSet<crate::error::ErrorKind>, 32>,
|
||||
|
||||
/// Number of endpoints affected by authentication rate limits
|
||||
pub endpoints_auth_rate_limits: HyperLogLog<32>,
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new())]
|
||||
pub struct ApiLockMetrics {
|
||||
/// Number of semaphores registered in this api lock
|
||||
pub semaphores_registered: Counter,
|
||||
/// Number of semaphores unregistered in this api lock
|
||||
pub semaphores_unregistered: Counter,
|
||||
/// Time it takes to reclaim unused semaphores in the api lock
|
||||
#[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))]
|
||||
pub reclamation_lag_seconds: Histogram<16>,
|
||||
/// Time it takes to acquire a semaphore lock
|
||||
#[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))]
|
||||
pub semaphore_acquire_seconds: Histogram<16>,
|
||||
}
|
||||
|
||||
impl Default for ProxyMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "direction")]
|
||||
pub enum HttpDirection {
|
||||
Request,
|
||||
Response,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "direction")]
|
||||
pub enum Direction {
|
||||
Tx,
|
||||
Rx,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
|
||||
#[label(singleton = "protocol")]
|
||||
pub enum Protocol {
|
||||
Http,
|
||||
Ws,
|
||||
Tcp,
|
||||
SniRouter,
|
||||
}
|
||||
|
||||
impl Protocol {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Protocol::Http => "http",
|
||||
Protocol::Ws => "ws",
|
||||
Protocol::Tcp => "tcp",
|
||||
Protocol::SniRouter => "sni_router",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Protocol {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
pub enum Bool {
|
||||
True,
|
||||
False,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "outcome")]
|
||||
pub enum Outcome {
|
||||
Success,
|
||||
Failed,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "outcome")]
|
||||
pub enum CacheOutcome {
|
||||
Hit,
|
||||
Miss,
|
||||
}
|
||||
|
||||
#[derive(LabelGroup)]
|
||||
#[label(set = ConsoleRequestSet)]
|
||||
pub struct ConsoleRequest<'a> {
|
||||
#[label(dynamic_with = ThreadedRodeo, default)]
|
||||
pub request: &'a str,
|
||||
}
|
||||
|
||||
#[derive(MetricGroup, Default)]
|
||||
pub struct HttpEndpointPools {
|
||||
/// Number of endpoints we have registered pools for
|
||||
pub http_pool_endpoints_registered_total: Counter,
|
||||
/// Number of endpoints we have unregistered pools for
|
||||
pub http_pool_endpoints_unregistered_total: Counter,
|
||||
}
|
||||
|
||||
pub struct HttpEndpointPoolsGuard<'a> {
|
||||
dec: &'a Counter,
|
||||
}
|
||||
|
||||
impl Drop for HttpEndpointPoolsGuard<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.dec.inc();
|
||||
}
|
||||
}
|
||||
|
||||
impl HttpEndpointPools {
|
||||
pub fn guard(&self) -> HttpEndpointPoolsGuard {
|
||||
self.http_pool_endpoints_registered_total.inc();
|
||||
HttpEndpointPoolsGuard {
|
||||
dec: &self.http_pool_endpoints_unregistered_total,
|
||||
}
|
||||
}
|
||||
}
|
||||
pub struct NumDbConnectionsGauge;
|
||||
impl CounterPairAssoc for NumDbConnectionsGauge {
|
||||
const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total");
|
||||
const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total");
|
||||
const INC_HELP: &'static str = "Number of opened connections to a database.";
|
||||
const DEC_HELP: &'static str = "Number of closed connections to a database.";
|
||||
type LabelGroupSet = StaticLabelSet<Protocol>;
|
||||
}
|
||||
pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>;
|
||||
|
||||
pub struct NumClientConnectionsGauge;
|
||||
impl CounterPairAssoc for NumClientConnectionsGauge {
|
||||
const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total");
|
||||
const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total");
|
||||
const INC_HELP: &'static str = "Number of opened connections from a client.";
|
||||
const DEC_HELP: &'static str = "Number of closed connections from a client.";
|
||||
type LabelGroupSet = StaticLabelSet<Protocol>;
|
||||
}
|
||||
pub type NumClientConnectionsGuard<'a> =
|
||||
metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>;
|
||||
|
||||
pub struct NumConnectionRequestsGauge;
|
||||
impl CounterPairAssoc for NumConnectionRequestsGauge {
|
||||
const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total");
|
||||
const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total");
|
||||
const INC_HELP: &'static str = "Number of client connections accepted.";
|
||||
const DEC_HELP: &'static str = "Number of client connections closed.";
|
||||
type LabelGroupSet = StaticLabelSet<Protocol>;
|
||||
}
|
||||
pub type NumConnectionRequestsGuard<'a> =
|
||||
metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>;
|
||||
|
||||
#[derive(LabelGroup)]
|
||||
#[label(set = ComputeConnectionLatencySet)]
|
||||
pub struct ComputeConnectionLatencyGroup {
|
||||
protocol: Protocol,
|
||||
cold_start_info: ColdStartInfo,
|
||||
outcome: ConnectOutcome,
|
||||
excluded: LatencyExclusions,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
pub enum LatencyExclusions {
|
||||
Client,
|
||||
ClientAndCplane,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "limit")]
|
||||
pub enum RateLimit {
|
||||
Actual,
|
||||
Expected,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "kind")]
|
||||
pub enum SniKind {
|
||||
Sni,
|
||||
NoSni,
|
||||
PasswordHack,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "kind")]
|
||||
pub enum ConnectionFailureKind {
|
||||
ComputeCached,
|
||||
ComputeUncached,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "kind")]
|
||||
pub enum WakeupFailureKind {
|
||||
BadComputeAddress,
|
||||
ApiTransportError,
|
||||
QuotaExceeded,
|
||||
ApiConsoleLocked,
|
||||
ApiConsoleBadRequest,
|
||||
ApiConsoleOtherServerError,
|
||||
ApiConsoleOtherError,
|
||||
TimeoutError,
|
||||
}
|
||||
|
||||
#[derive(LabelGroup)]
|
||||
#[label(set = ConnectionFailuresBreakdownSet)]
|
||||
pub struct ConnectionFailuresBreakdownGroup {
|
||||
pub kind: WakeupFailureKind,
|
||||
pub retry: Bool,
|
||||
}
|
||||
|
||||
#[derive(LabelGroup, Copy, Clone)]
|
||||
#[label(set = RedisErrorsSet)]
|
||||
pub struct RedisErrors<'a> {
|
||||
#[label(dynamic_with = ThreadedRodeo, default)]
|
||||
pub channel: &'a str,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
pub enum CancellationSource {
|
||||
FromClient,
|
||||
FromRedis,
|
||||
Local,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
pub enum CancellationOutcome {
|
||||
NotFound,
|
||||
Found,
|
||||
}
|
||||
|
||||
#[derive(LabelGroup)]
|
||||
#[label(set = CancellationRequestSet)]
|
||||
pub struct CancellationRequest {
|
||||
pub source: CancellationSource,
|
||||
pub kind: CancellationOutcome,
|
||||
}
|
||||
|
||||
pub enum Waiting {
|
||||
Cplane,
|
||||
@@ -185,20 +365,6 @@ struct Accumulated {
|
||||
compute: time::Duration,
|
||||
}
|
||||
|
||||
enum Outcome {
|
||||
Success,
|
||||
Failed,
|
||||
}
|
||||
|
||||
impl Outcome {
|
||||
fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Outcome::Success => "success",
|
||||
Outcome::Failed => "failed",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LatencyTimer {
|
||||
// time since the stopwatch was started
|
||||
start: time::Instant,
|
||||
@@ -207,9 +373,9 @@ pub struct LatencyTimer {
|
||||
// accumulated time on the stopwatch
|
||||
accumulated: Accumulated,
|
||||
// label data
|
||||
protocol: &'static str,
|
||||
protocol: Protocol,
|
||||
cold_start_info: ColdStartInfo,
|
||||
outcome: Outcome,
|
||||
outcome: ConnectOutcome,
|
||||
}
|
||||
|
||||
pub struct LatencyTimerPause<'a> {
|
||||
@@ -219,7 +385,7 @@ pub struct LatencyTimerPause<'a> {
|
||||
}
|
||||
|
||||
impl LatencyTimer {
|
||||
pub fn new(protocol: &'static str) -> Self {
|
||||
pub fn new(protocol: Protocol) -> Self {
|
||||
Self {
|
||||
start: time::Instant::now(),
|
||||
stop: None,
|
||||
@@ -227,7 +393,7 @@ impl LatencyTimer {
|
||||
protocol,
|
||||
cold_start_info: ColdStartInfo::Unknown,
|
||||
// assume failed unless otherwise specified
|
||||
outcome: Outcome::Failed,
|
||||
outcome: ConnectOutcome::Failed,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,7 +414,7 @@ impl LatencyTimer {
|
||||
self.stop = Some(time::Instant::now());
|
||||
|
||||
// success
|
||||
self.outcome = Outcome::Success;
|
||||
self.outcome = ConnectOutcome::Success;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -263,128 +429,54 @@ impl Drop for LatencyTimerPause<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
|
||||
enum ConnectOutcome {
|
||||
Success,
|
||||
Failed,
|
||||
}
|
||||
|
||||
impl Drop for LatencyTimer {
|
||||
fn drop(&mut self) {
|
||||
let duration = self
|
||||
.stop
|
||||
.unwrap_or_else(time::Instant::now)
|
||||
.duration_since(self.start);
|
||||
// Excluding cplane communication from the accumulated time.
|
||||
COMPUTE_CONNECTION_LATENCY
|
||||
.with_label_values(&[
|
||||
self.protocol,
|
||||
self.cold_start_info.as_str(),
|
||||
self.outcome.as_str(),
|
||||
"client",
|
||||
])
|
||||
.observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
|
||||
|
||||
let metric = &Metrics::get().proxy.compute_connection_latency_seconds;
|
||||
|
||||
// Excluding client communication from the accumulated time.
|
||||
metric.observe(
|
||||
ComputeConnectionLatencyGroup {
|
||||
protocol: self.protocol,
|
||||
cold_start_info: self.cold_start_info,
|
||||
outcome: self.outcome,
|
||||
excluded: LatencyExclusions::Client,
|
||||
},
|
||||
duration
|
||||
.saturating_sub(self.accumulated.client)
|
||||
.as_secs_f64(),
|
||||
);
|
||||
|
||||
// Exclude client and cplane communication from the accumulated time.
|
||||
let accumulated_total = self.accumulated.client + self.accumulated.cplane;
|
||||
COMPUTE_CONNECTION_LATENCY
|
||||
.with_label_values(&[
|
||||
self.protocol,
|
||||
self.cold_start_info.as_str(),
|
||||
self.outcome.as_str(),
|
||||
"client_and_cplane",
|
||||
])
|
||||
.observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
|
||||
metric.observe(
|
||||
ComputeConnectionLatencyGroup {
|
||||
protocol: self.protocol,
|
||||
cold_start_info: self.cold_start_info,
|
||||
outcome: self.outcome,
|
||||
excluded: LatencyExclusions::ClientAndCplane,
|
||||
},
|
||||
duration.saturating_sub(accumulated_total).as_secs_f64(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
pub static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_connection_failures_total",
|
||||
"Number of connection failures (per kind).",
|
||||
&["kind"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_connection_failures_breakdown",
|
||||
"Number of wake-up failures (per kind).",
|
||||
&["retry", "kind"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_io_bytes",
|
||||
"Number of bytes sent/received between all clients and backends.",
|
||||
&["direction"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub const fn bool_to_str(x: bool) -> &'static str {
|
||||
if x {
|
||||
"true"
|
||||
} else {
|
||||
"false"
|
||||
impl From<bool> for Bool {
|
||||
fn from(value: bool) -> Self {
|
||||
if value {
|
||||
Bool::True
|
||||
} else {
|
||||
Bool::False
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
|
||||
register_hll_vec!(
|
||||
32,
|
||||
"proxy_connecting_endpoints",
|
||||
"HLL approximate cardinality of endpoints that are connecting",
|
||||
&["protocol"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static ERROR_BY_KIND: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_errors_total",
|
||||
"Number of errors by a given classification",
|
||||
&["type"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static ENDPOINT_ERRORS_BY_KIND: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
|
||||
register_hll_vec!(
|
||||
32,
|
||||
"proxy_endpoints_affected_by_errors",
|
||||
"Number of endpoints affected by errors of a given classification",
|
||||
&["type"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static REDIS_BROKEN_MESSAGES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_redis_errors_total",
|
||||
"Number of errors by a given classification",
|
||||
&["channel"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static TLS_HANDSHAKE_FAILURES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"proxy_tls_handshake_failures",
|
||||
"Number of TLS handshake failures",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy<HyperLogLog<32>> = Lazy::new(|| {
|
||||
register_hll!(
|
||||
32,
|
||||
"proxy_endpoints_auth_rate_limits",
|
||||
"Number of endpoints affected by authentication rate limits",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static AUTH_RATE_LIMIT_HITS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"proxy_requests_auth_rate_limits_total",
|
||||
"Number of connection requests affected by authentication rate limits",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
@@ -15,7 +15,7 @@ use crate::{
|
||||
config::{ProxyConfig, TlsConfig},
|
||||
context::RequestMonitoring,
|
||||
error::ReportableError,
|
||||
metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
|
||||
metrics::{Metrics, NumClientConnectionsGuard},
|
||||
protocol2::WithClientIp,
|
||||
proxy::handshake::{handshake, HandshakeData},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
@@ -24,7 +24,6 @@ use crate::{
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
use metrics::IntCounterPairGuard;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pq_proto::{BeMessage as Be, StartupMessageParams};
|
||||
use regex::Regex;
|
||||
@@ -79,9 +78,10 @@ pub async fn task_main(
|
||||
{
|
||||
let (socket, peer_addr) = accept_result?;
|
||||
|
||||
let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE
|
||||
.with_label_values(&["tcp"])
|
||||
.guard();
|
||||
let conn_gauge = Metrics::get()
|
||||
.proxy
|
||||
.client_connections
|
||||
.guard(crate::metrics::Protocol::Tcp);
|
||||
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
let cancellation_handler = Arc::clone(&cancellation_handler);
|
||||
@@ -113,7 +113,12 @@ pub async fn task_main(
|
||||
},
|
||||
};
|
||||
|
||||
let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
|
||||
let mut ctx = RequestMonitoring::new(
|
||||
session_id,
|
||||
peer_addr,
|
||||
crate::metrics::Protocol::Tcp,
|
||||
&config.region,
|
||||
);
|
||||
let span = ctx.span.clone();
|
||||
|
||||
let res = handle_client(
|
||||
@@ -237,14 +242,17 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
stream: S,
|
||||
mode: ClientMode,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
conn_gauge: IntCounterPairGuard,
|
||||
conn_gauge: NumClientConnectionsGuard<'static>,
|
||||
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
|
||||
info!("handling interactive connection from client");
|
||||
info!(
|
||||
protocol = %ctx.protocol,
|
||||
"handling interactive connection from client"
|
||||
);
|
||||
|
||||
let metrics = &Metrics::get().proxy;
|
||||
let proto = ctx.protocol;
|
||||
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
|
||||
.with_label_values(&[proto])
|
||||
.guard();
|
||||
// let _client_gauge = metrics.client_connections.guard(proto);
|
||||
let _request_gauge = metrics.connection_requests.guard(proto);
|
||||
|
||||
let tls = config.tls_config.as_ref();
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::{
|
||||
console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
|
||||
context::RequestMonitoring,
|
||||
error::ReportableError,
|
||||
metrics::NUM_CONNECTION_FAILURES,
|
||||
metrics::{ConnectionFailureKind, Metrics},
|
||||
proxy::{
|
||||
retry::{retry_after, ShouldRetry},
|
||||
wake_compute::wake_compute,
|
||||
@@ -27,10 +27,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
|
||||
warn!("invalidating stalled compute node info cache entry");
|
||||
}
|
||||
let label = match is_cached {
|
||||
true => "compute_cached",
|
||||
false => "compute_uncached",
|
||||
true => ConnectionFailureKind::ComputeCached,
|
||||
false => ConnectionFailureKind::ComputeUncached,
|
||||
};
|
||||
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
|
||||
Metrics::get().proxy.connection_failures_total.inc(label);
|
||||
|
||||
node_info.invalidate()
|
||||
}
|
||||
|
||||
@@ -2,11 +2,10 @@ use crate::{
|
||||
cancellation,
|
||||
compute::PostgresConnection,
|
||||
console::messages::MetricsAuxInfo,
|
||||
metrics::NUM_BYTES_PROXIED_COUNTER,
|
||||
metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard},
|
||||
stream::Stream,
|
||||
usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS},
|
||||
};
|
||||
use metrics::IntCounterPairGuard;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
use utils::measured_stream::MeasuredStream;
|
||||
@@ -23,24 +22,25 @@ pub async fn proxy_pass(
|
||||
branch_id: aux.branch_id,
|
||||
});
|
||||
|
||||
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
|
||||
let metrics = &Metrics::get().proxy.io_bytes;
|
||||
let m_sent = metrics.with_labels(Direction::Tx);
|
||||
let mut client = MeasuredStream::new(
|
||||
client,
|
||||
|_| {},
|
||||
|cnt| {
|
||||
// Number of bytes we sent to the client (outbound).
|
||||
m_sent.inc_by(cnt as u64);
|
||||
metrics.get_metric(m_sent).inc_by(cnt as u64);
|
||||
usage.record_egress(cnt as u64);
|
||||
},
|
||||
);
|
||||
|
||||
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
|
||||
let m_recv = metrics.with_labels(Direction::Rx);
|
||||
let mut compute = MeasuredStream::new(
|
||||
compute,
|
||||
|_| {},
|
||||
|cnt| {
|
||||
// Number of bytes the client sent to the compute node (inbound).
|
||||
m_recv.inc_by(cnt as u64);
|
||||
metrics.get_metric(m_recv).inc_by(cnt as u64);
|
||||
},
|
||||
);
|
||||
|
||||
@@ -60,8 +60,8 @@ pub struct ProxyPassthrough<P, S> {
|
||||
pub compute: PostgresConnection,
|
||||
pub aux: MetricsAuxInfo,
|
||||
|
||||
pub req: IntCounterPairGuard,
|
||||
pub conn: IntCounterPairGuard,
|
||||
pub req: NumConnectionRequestsGuard<'static>,
|
||||
pub conn: NumClientConnectionsGuard<'static>,
|
||||
pub cancel: cancellation::Session<P>,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES};
|
||||
use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind};
|
||||
use crate::proxy::retry::retry_after;
|
||||
use hyper::StatusCode;
|
||||
use std::ops::ControlFlow;
|
||||
@@ -57,39 +57,46 @@ pub fn handle_try_wake(
|
||||
|
||||
fn report_error(e: &WakeComputeError, retry: bool) {
|
||||
use crate::console::errors::ApiError;
|
||||
let retry = bool_to_str(retry);
|
||||
let kind = match e {
|
||||
WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
|
||||
WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
|
||||
WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress,
|
||||
WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError,
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::LOCKED,
|
||||
ref text,
|
||||
}) if text.contains("written data quota exceeded")
|
||||
|| text.contains("the limit for current plan reached") =>
|
||||
{
|
||||
"quota_exceeded"
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::UNPROCESSABLE_ENTITY,
|
||||
ref text,
|
||||
}) if text.contains("compute time quota of non-primary branches is exceeded") => {
|
||||
"quota_exceeded"
|
||||
WakeupFailureKind::QuotaExceeded
|
||||
}
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::LOCKED,
|
||||
..
|
||||
}) => "api_console_locked",
|
||||
}) => WakeupFailureKind::ApiConsoleLocked,
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::BAD_REQUEST,
|
||||
..
|
||||
}) => "api_console_bad_request",
|
||||
}) => WakeupFailureKind::ApiConsoleBadRequest,
|
||||
WakeComputeError::ApiError(ApiError::Console { status, .. })
|
||||
if status.is_server_error() =>
|
||||
{
|
||||
"api_console_other_server_error"
|
||||
WakeupFailureKind::ApiConsoleOtherServerError
|
||||
}
|
||||
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
|
||||
WakeComputeError::TimeoutError => "timeout_error",
|
||||
WakeComputeError::ApiError(ApiError::Console { .. }) => {
|
||||
WakeupFailureKind::ApiConsoleOtherError
|
||||
}
|
||||
WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError,
|
||||
};
|
||||
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.connection_failures_breakdown
|
||||
.inc(ConnectionFailuresBreakdownGroup {
|
||||
kind,
|
||||
retry: retry.into(),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -17,7 +17,13 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
|
||||
use tokio::time::{timeout, Duration, Instant};
|
||||
use tracing::info;
|
||||
|
||||
use crate::{intern::EndpointIdInt, EndpointId};
|
||||
use crate::{
|
||||
intern::EndpointIdInt,
|
||||
{
|
||||
metrics::{Metrics, RateLimit},
|
||||
EndpointId,
|
||||
},
|
||||
};
|
||||
|
||||
use super::{
|
||||
limit_algorithm::{LimitAlgorithm, Sample},
|
||||
@@ -457,12 +463,9 @@ impl Limiter {
|
||||
}
|
||||
new_limit
|
||||
};
|
||||
crate::metrics::RATE_LIMITER_LIMIT
|
||||
.with_label_values(&["expected"])
|
||||
.set(new_limit as i64);
|
||||
crate::metrics::RATE_LIMITER_LIMIT
|
||||
.with_label_values(&["actual"])
|
||||
.set(actual_limit as i64);
|
||||
let metric = &Metrics::get().semaphore_control_plane_limit;
|
||||
metric.set(RateLimit::Expected, new_limit as i64);
|
||||
metric.set(RateLimit::Actual, actual_limit as i64);
|
||||
self.limits.store(new_limit, Ordering::Release);
|
||||
#[cfg(test)]
|
||||
if let Some(n) = &self.notifier {
|
||||
@@ -519,7 +522,10 @@ impl reqwest_middleware::Middleware for Limiter {
|
||||
extensions: &mut task_local_extensions::Extensions,
|
||||
next: reqwest_middleware::Next<'_>,
|
||||
) -> reqwest_middleware::Result<reqwest::Response> {
|
||||
let start = Instant::now();
|
||||
let timer = Metrics::get()
|
||||
.proxy
|
||||
.control_plane_token_acquire_seconds
|
||||
.start_timer();
|
||||
let token = self
|
||||
.acquire_timeout(self.config.timeout)
|
||||
.await
|
||||
@@ -533,8 +539,12 @@ impl reqwest_middleware::Middleware for Limiter {
|
||||
.into(),
|
||||
)
|
||||
})?;
|
||||
info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane");
|
||||
crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
|
||||
let duration = timer.observe();
|
||||
info!(
|
||||
?duration,
|
||||
"waiting for token to connect to the control plane"
|
||||
);
|
||||
|
||||
match next.run(req, extensions).await {
|
||||
Ok(response) => {
|
||||
self.release(token, Some(Outcome::from_reqwest_response(&response)))
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::{
|
||||
cache::project_info::ProjectInfoCache,
|
||||
cancellation::{CancelMap, CancellationHandler},
|
||||
intern::{ProjectIdInt, RoleNameInt},
|
||||
metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES},
|
||||
metrics::{Metrics, RedisErrors},
|
||||
};
|
||||
|
||||
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
|
||||
@@ -104,9 +104,9 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
let msg: Notification = match serde_json::from_str(&payload) {
|
||||
Ok(msg) => msg,
|
||||
Err(e) => {
|
||||
REDIS_BROKEN_MESSAGES
|
||||
.with_label_values(&[msg.get_channel_name()])
|
||||
.inc();
|
||||
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
|
||||
channel: msg.get_channel_name(),
|
||||
});
|
||||
tracing::error!("broken message: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -183,7 +183,7 @@ where
|
||||
cache,
|
||||
Arc::new(CancellationHandler::<()>::new(
|
||||
cancel_map,
|
||||
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
|
||||
crate::metrics::CancellationSource::FromRedis,
|
||||
)),
|
||||
region_id,
|
||||
);
|
||||
|
||||
@@ -32,7 +32,7 @@ use tokio_util::task::TaskTracker;
|
||||
use crate::cancellation::CancellationHandlerMain;
|
||||
use crate::config::ProxyConfig;
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::metrics::{NUM_CLIENT_CONNECTION_GAUGE, TLS_HANDSHAKE_FAILURES};
|
||||
use crate::metrics::Metrics;
|
||||
use crate::protocol2::WithClientIp;
|
||||
use crate::proxy::run_until_cancelled;
|
||||
use crate::rate_limiter::EndpointRateLimiter;
|
||||
@@ -156,9 +156,10 @@ async fn connection_handler(
|
||||
) {
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
|
||||
let _gauge = NUM_CLIENT_CONNECTION_GAUGE
|
||||
.with_label_values(&["http"])
|
||||
.guard();
|
||||
let _gauge = Metrics::get()
|
||||
.proxy
|
||||
.client_connections
|
||||
.guard(crate::metrics::Protocol::Http);
|
||||
|
||||
// handle PROXY protocol
|
||||
let mut conn = WithClientIp::new(conn);
|
||||
@@ -181,13 +182,13 @@ async fn connection_handler(
|
||||
}
|
||||
// The handshake failed
|
||||
Ok(Err(e)) => {
|
||||
TLS_HANDSHAKE_FAILURES.inc();
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
|
||||
return;
|
||||
}
|
||||
// The handshake timed out
|
||||
Err(e) => {
|
||||
TLS_HANDSHAKE_FAILURES.inc();
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
|
||||
return;
|
||||
}
|
||||
@@ -274,7 +275,13 @@ async fn request_handler(
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if hyper_tungstenite::is_upgrade_request(&request) {
|
||||
let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
|
||||
let ctx = RequestMonitoring::new(
|
||||
session_id,
|
||||
peer_addr,
|
||||
crate::metrics::Protocol::Ws,
|
||||
&config.region,
|
||||
);
|
||||
|
||||
let span = ctx.span.clone();
|
||||
info!(parent: &span, "performing websocket upgrade");
|
||||
|
||||
@@ -302,7 +309,12 @@ async fn request_handler(
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
} else if request.uri().path() == "/sql" && *request.method() == Method::POST {
|
||||
let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
|
||||
let ctx = RequestMonitoring::new(
|
||||
session_id,
|
||||
peer_addr,
|
||||
crate::metrics::Protocol::Http,
|
||||
&config.region,
|
||||
);
|
||||
let span = ctx.span.clone();
|
||||
|
||||
sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use dashmap::DashMap;
|
||||
use futures::{future::poll_fn, Future};
|
||||
use metrics::IntCounterPairGuard;
|
||||
use parking_lot::RwLock;
|
||||
use rand::Rng;
|
||||
use smallvec::SmallVec;
|
||||
@@ -18,11 +17,10 @@ use tokio_postgres::tls::NoTlsStream;
|
||||
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
|
||||
|
||||
use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
|
||||
use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL};
|
||||
use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
|
||||
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
|
||||
use crate::{
|
||||
auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE,
|
||||
DbName, EndpointCacheKey, RoleName,
|
||||
auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName,
|
||||
};
|
||||
|
||||
use tracing::{debug, error, warn, Span};
|
||||
@@ -78,7 +76,7 @@ pub struct EndpointConnPool<C: ClientInnerExt> {
|
||||
pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
|
||||
total_conns: usize,
|
||||
max_conns: usize,
|
||||
_guard: IntCounterPairGuard,
|
||||
_guard: HttpEndpointPoolsGuard<'static>,
|
||||
global_connections_count: Arc<AtomicUsize>,
|
||||
global_pool_size_max_conns: usize,
|
||||
}
|
||||
@@ -110,7 +108,11 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
|
||||
let removed = old_len - new_len;
|
||||
if removed > 0 {
|
||||
global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
|
||||
NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.http_pool_opened_connections
|
||||
.get_metric()
|
||||
.dec_by(removed as i64);
|
||||
}
|
||||
*total_conns -= removed;
|
||||
removed > 0
|
||||
@@ -156,7 +158,11 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
|
||||
pool.total_conns += 1;
|
||||
pool.global_connections_count
|
||||
.fetch_add(1, atomic::Ordering::Relaxed);
|
||||
NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc();
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.http_pool_opened_connections
|
||||
.get_metric()
|
||||
.inc();
|
||||
}
|
||||
|
||||
pool.total_conns
|
||||
@@ -176,7 +182,11 @@ impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
|
||||
if self.total_conns > 0 {
|
||||
self.global_connections_count
|
||||
.fetch_sub(self.total_conns, atomic::Ordering::Relaxed);
|
||||
NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.http_pool_opened_connections
|
||||
.get_metric()
|
||||
.dec_by(self.total_conns as i64);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -215,7 +225,11 @@ impl<C: ClientInnerExt> DbUserConnPool<C> {
|
||||
removed += 1;
|
||||
}
|
||||
global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
|
||||
NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.http_pool_opened_connections
|
||||
.get_metric()
|
||||
.dec_by(removed as i64);
|
||||
conn
|
||||
}
|
||||
}
|
||||
@@ -303,7 +317,10 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
|
||||
// acquire a random shard lock
|
||||
let mut shard = self.global_pool.shards()[shard].write();
|
||||
|
||||
let timer = GC_LATENCY.start_timer();
|
||||
let timer = Metrics::get()
|
||||
.proxy
|
||||
.http_pool_reclaimation_lag_seconds
|
||||
.start_timer();
|
||||
let current_len = shard.len();
|
||||
let mut clients_removed = 0;
|
||||
shard.retain(|endpoint, x| {
|
||||
@@ -331,7 +348,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
|
||||
|
||||
let new_len = shard.len();
|
||||
drop(shard);
|
||||
timer.observe_duration();
|
||||
timer.observe();
|
||||
|
||||
// Do logging outside of the lock.
|
||||
if clients_removed > 0 {
|
||||
@@ -339,7 +356,11 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
|
||||
.global_connections_count
|
||||
.fetch_sub(clients_removed, atomic::Ordering::Relaxed)
|
||||
- clients_removed;
|
||||
NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.http_pool_opened_connections
|
||||
.get_metric()
|
||||
.dec_by(clients_removed as i64);
|
||||
info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
|
||||
}
|
||||
let removed = current_len - new_len;
|
||||
@@ -410,7 +431,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
|
||||
pools: HashMap::new(),
|
||||
total_conns: 0,
|
||||
max_conns: self.config.pool_options.max_conns_per_endpoint,
|
||||
_guard: ENDPOINT_POOLS.guard(),
|
||||
_guard: Metrics::get().proxy.http_endpoint_pools.guard(),
|
||||
global_connections_count: self.global_connections_count.clone(),
|
||||
global_pool_size_max_conns: self.config.pool_options.max_total_conns,
|
||||
}));
|
||||
@@ -450,9 +471,7 @@ pub fn poll_client<C: ClientInnerExt>(
|
||||
conn_id: uuid::Uuid,
|
||||
aux: MetricsAuxInfo,
|
||||
) -> Client<C> {
|
||||
let conn_gauge = NUM_DB_CONNECTIONS_GAUGE
|
||||
.with_label_values(&[ctx.protocol])
|
||||
.guard();
|
||||
let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol);
|
||||
let mut session_id = ctx.session_id;
|
||||
let (tx, mut rx) = tokio::sync::watch::channel(session_id);
|
||||
|
||||
|
||||
@@ -43,8 +43,8 @@ use crate::context::RequestMonitoring;
|
||||
use crate::error::ErrorKind;
|
||||
use crate::error::ReportableError;
|
||||
use crate::error::UserFacingError;
|
||||
use crate::metrics::HTTP_CONTENT_LENGTH;
|
||||
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
|
||||
use crate::metrics::HttpDirection;
|
||||
use crate::metrics::Metrics;
|
||||
use crate::proxy::run_until_cancelled;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::serverless::backend::HttpConnError;
|
||||
@@ -494,10 +494,11 @@ async fn handle_inner(
|
||||
request: Request<Incoming>,
|
||||
backend: Arc<PoolingBackend>,
|
||||
) -> Result<Response<Full<Bytes>>, SqlOverHttpError> {
|
||||
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
|
||||
.with_label_values(&[ctx.protocol])
|
||||
.guard();
|
||||
info!("handling interactive connection from client");
|
||||
let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol);
|
||||
info!(
|
||||
protocol = %ctx.protocol,
|
||||
"handling interactive connection from client"
|
||||
);
|
||||
|
||||
//
|
||||
// Determine the destination and connection params
|
||||
@@ -520,9 +521,10 @@ async fn handle_inner(
|
||||
None => MAX_REQUEST_SIZE + 1,
|
||||
};
|
||||
info!(request_content_length, "request size in bytes");
|
||||
HTTP_CONTENT_LENGTH
|
||||
.with_label_values(&["request"])
|
||||
.observe(request_content_length as f64);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.http_conn_content_length_bytes
|
||||
.observe(HttpDirection::Request, request_content_length as f64);
|
||||
|
||||
// we don't have a streaming request support yet so this is to prevent OOM
|
||||
// from a malicious user sending an extremely large request body
|
||||
@@ -607,9 +609,10 @@ async fn handle_inner(
|
||||
// count the egress bytes - we miss the TLS and header overhead but oh well...
|
||||
// moving this later in the stack is going to be a lot of effort and ehhhh
|
||||
metrics.record_egress(len as u64);
|
||||
HTTP_CONTENT_LENGTH
|
||||
.with_label_values(&["response"])
|
||||
.observe(len as f64);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.http_conn_content_length_bytes
|
||||
.observe(HttpDirection::Response, len as f64);
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use crate::{
|
||||
config::ProxyConfig,
|
||||
context::RequestMonitoring,
|
||||
error::{io_error, ReportableError},
|
||||
metrics::NUM_CLIENT_CONNECTION_GAUGE,
|
||||
metrics::Metrics,
|
||||
proxy::{handle_client, ClientMode},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
};
|
||||
@@ -139,9 +139,10 @@ pub async fn serve_websocket(
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> anyhow::Result<()> {
|
||||
let websocket = websocket.await?;
|
||||
let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE
|
||||
.with_label_values(&["ws"])
|
||||
.guard();
|
||||
let conn_gauge = Metrics::get()
|
||||
.proxy
|
||||
.client_connections
|
||||
.guard(crate::metrics::Protocol::Ws);
|
||||
|
||||
let res = handle_client(
|
||||
config,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::config::TlsServerEndPoint;
|
||||
use crate::error::{ErrorKind, ReportableError, UserFacingError};
|
||||
use crate::metrics::TLS_HANDSHAKE_FAILURES;
|
||||
use crate::metrics::Metrics;
|
||||
use bytes::BytesMut;
|
||||
|
||||
use pq_proto::framed::{ConnectionError, Framed};
|
||||
@@ -228,7 +228,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
|
||||
Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg)
|
||||
.accept(raw)
|
||||
.await
|
||||
.inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?),
|
||||
.inspect_err(|_| Metrics::get().proxy.tls_handshake_failures.inc())?),
|
||||
Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user