more emphasis on performance

This commit is contained in:
Conrad Ludgate
2025-07-21 11:53:56 +01:00
parent b33047df7e
commit 0a34084ba5
4 changed files with 161 additions and 231 deletions

View File

@@ -2,14 +2,86 @@ use std::alloc::{GlobalAlloc, Layout, System, handle_alloc_error};
use alloc_metrics::TrackedAllocator;
use criterion::{
AxisScale, BatchSize, BenchmarkId as Id, Criterion, PlotConfiguration, Throughput,
criterion_group, criterion_main,
AxisScale, BenchmarkGroup, BenchmarkId, Criterion, PlotConfiguration, measurement::Measurement,
};
use measured::FixedCardinalityLabel;
use tikv_jemallocator::Jemalloc;
criterion_group!(benches, bench_alloc);
criterion_main!(benches);
/// Benchmark entry point.
///
/// Builds a [`Criterion`] instance configured from the command-line arguments,
/// hands it to [`bench`], and then calls `final_summary` on it.
fn main() {
    let mut criterion = Criterion::default().configure_from_args();
    bench(&mut criterion);
    criterion.final_summary();
}
/// Runs the alloc and dealloc benchmarks, first against the system allocator
/// and then against jemalloc, each alongside its tracked wrapper.
///
/// Every benchmark group is created immediately before the call that consumes
/// it, so the groups run strictly one after another.
#[rustfmt::skip]
fn bench(c: &mut Criterion) {
    let alloc_system = c.benchmark_group("alloc/system");
    bench_alloc(alloc_system, &System, &ALLOC_SYSTEM);

    let alloc_jemalloc = c.benchmark_group("alloc/jemalloc");
    bench_alloc(alloc_jemalloc, &Jemalloc, &ALLOC_JEMALLOC);

    let dealloc_system = c.benchmark_group("dealloc/system");
    bench_dealloc(dealloc_system, &System, &ALLOC_SYSTEM);

    let dealloc_jemalloc = c.benchmark_group("dealloc/jemalloc");
    bench_dealloc(dealloc_jemalloc, &Jemalloc, &ALLOC_JEMALLOC);
}
/// Label type used by these benchmarks to tag tracked allocations.
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
#[label(singleton = "memory_context")]
pub enum MemoryContext {
/// Passed to `TrackedAllocator::new` when the tracked statics are built.
Root,
/// Entered via `scope(MemoryContext::Test)` by the tracked benchmark cases.
Test,
}
/// The system allocator wrapped in a `TrackedAllocator`, constructed with `MemoryContext::Root`.
// NOTE(review): the safety contract of `TrackedAllocator::new` is not visible from this file;
// presumably it depends on `MemoryContext`'s label encoding being correct. Confirm and record
// it here as a `SAFETY:` comment.
static ALLOC_SYSTEM: TrackedAllocator<System, MemoryContext> =
unsafe { TrackedAllocator::new(System, MemoryContext::Root) };
/// jemalloc wrapped in a `TrackedAllocator`, constructed with `MemoryContext::Root`.
// NOTE(review): same unverified safety contract as `ALLOC_SYSTEM` above.
static ALLOC_JEMALLOC: TrackedAllocator<Jemalloc, MemoryContext> =
unsafe { TrackedAllocator::new(Jemalloc, MemoryContext::Root) };
/// Number of bytes in one kibibyte.
const KB: u64 = 1024;
/// Allocation sizes, in bytes, exercised by every benchmark: 64 B up to `KB * KB` (1 MiB).
const SIZES: [u64; 6] = [64, 256, KB, 4 * KB, 16 * KB, KB * KB];
/// Benchmarks allocation through a raw allocator and through its tracked wrapper.
///
/// For every size in [`SIZES`] two cases are registered in the group `g`:
///
/// * `default` allocates directly from `alloc1`;
/// * `tracked` allocates from `alloc2` while a `MemoryContext::Test` scope is held.
///
/// The measured routine returns the `Alloc` guard instead of dropping it, so
/// the routine itself only performs the allocation.
///
/// # Panics
/// Panics if a benchmark size cannot form a valid `Layout` with alignment 8.
fn bench_alloc<A: GlobalAlloc>(
    mut g: BenchmarkGroup<'_, impl Measurement>,
    alloc1: &'static A,
    alloc2: &'static TrackedAllocator<A, MemoryContext>,
) {
    g.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));

    for size in SIZES {
        let layout = Layout::from_size_align(size as usize, 8)
            .expect("benchmark sizes are non-overflowing and 8 is a power of two");
        // Identical for both cases of a given size, so build it once.
        let bs = criterion::BatchSize::NumBatches(10 + u64::from(size.ilog2()));

        g.throughput(criterion::Throughput::Bytes(size));

        g.bench_with_input(BenchmarkId::new("default", size), &layout, |b, &layout| {
            b.iter_batched(|| {}, |()| Alloc::new(alloc1, layout), bs);
        });

        g.bench_with_input(BenchmarkId::new("tracked", size), &layout, |b, &layout| {
            // Keep the scope guard alive for the whole measurement.
            let _scope = alloc2.scope(MemoryContext::Test);
            b.iter_batched(|| {}, |()| Alloc::new(alloc2, layout), bs);
        });
    }

    // Finish the group explicitly rather than relying on its drop.
    g.finish();
}
/// Benchmarks deallocation through a raw allocator and through its tracked wrapper.
///
/// For every size in [`SIZES`] two cases are registered in the group `g`:
///
/// * `default` frees memory obtained from `alloc1`;
/// * `tracked` frees memory obtained from `alloc2` while a `MemoryContext::Test`
///   scope is held.
///
/// The allocation happens in the batch setup closure; the measured routine is
/// `drop`, which runs the `Alloc` guard's destructor and thereby deallocates.
///
/// # Panics
/// Panics if a benchmark size cannot form a valid `Layout` with alignment 8.
fn bench_dealloc<A: GlobalAlloc>(
    mut g: BenchmarkGroup<'_, impl Measurement>,
    alloc1: &'static A,
    alloc2: &'static TrackedAllocator<A, MemoryContext>,
) {
    g.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));

    for size in SIZES {
        let layout = Layout::from_size_align(size as usize, 8)
            .expect("benchmark sizes are non-overflowing and 8 is a power of two");
        // Identical for both cases of a given size, so build it once.
        let bs = criterion::BatchSize::NumBatches(10 + u64::from(size.ilog2()));

        g.throughput(criterion::Throughput::Bytes(size));

        g.bench_with_input(BenchmarkId::new("default", size), &layout, |b, &layout| {
            b.iter_batched(|| Alloc::new(alloc1, layout), drop, bs);
        });

        g.bench_with_input(BenchmarkId::new("tracked", size), &layout, |b, &layout| {
            // Keep the scope guard alive for the whole measurement.
            let _scope = alloc2.scope(MemoryContext::Test);
            b.iter_batched(|| Alloc::new(alloc2, layout), drop, bs);
        });
    }

    // Finish the group explicitly rather than relying on its drop.
    g.finish();
}
struct Alloc<'a, A: GlobalAlloc> {
alloc: &'a A,
@@ -36,72 +108,3 @@ impl<'a, A: GlobalAlloc> Drop for Alloc<'a, A> {
unsafe { self.alloc.dealloc(self.ptr, self.layout) };
}
}
/// Label type used by these benchmarks to tag tracked allocations.
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
#[label(singleton = "memory_context")]
pub enum MemoryContext {
/// Passed to `TrackedAllocator::new` when the tracked statics are built.
Root,
/// Entered via `scope(MemoryContext::Test)` by the tracked benchmark cases.
Test,
}
/// The system allocator wrapped in a `TrackedAllocator`, constructed with `MemoryContext::Root`.
// NOTE(review): the safety contract of `TrackedAllocator::new` is not visible from this file;
// presumably it depends on `MemoryContext`'s label encoding being correct. Confirm and record
// it here as a `SAFETY:` comment.
static ALLOC_SYSTEM: TrackedAllocator<System, MemoryContext> =
unsafe { TrackedAllocator::new(System, MemoryContext::Root) };
/// jemalloc wrapped in a `TrackedAllocator`, constructed with `MemoryContext::Root`.
// NOTE(review): same unverified safety contract as `ALLOC_SYSTEM` above.
static ALLOC_JEMALLOC: TrackedAllocator<Jemalloc, MemoryContext> =
unsafe { TrackedAllocator::new(Jemalloc, MemoryContext::Root) };
/// Registers the `alloc` and `dealloc` benchmark groups on `c`.
///
/// Each group covers six allocation sizes (64 B up to `KB * KB`) and, per size,
/// four cases: the system allocator, its tracked wrapper, jemalloc, and its
/// tracked wrapper. Tracked cases hold a `MemoryContext::Test` scope while
/// they are measured.
fn bench_alloc(c: &mut Criterion) {
    const KB: u64 = 1024;
    let sizes = [64, 256, KB, 4 * KB, 16 * KB, KB * KB];

    // Allocation: the routine builds the `Alloc` guard and returns it.
    let mut alloc_group = c.benchmark_group("alloc");
    alloc_group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
    for size in sizes {
        alloc_group.throughput(Throughput::Bytes(size));

        let layout = Layout::from_size_align(size as usize, 8).unwrap();
        let batch = BatchSize::NumBatches(10 + size.ilog2() as u64);

        alloc_group.bench_with_input(Id::new("system", size), &layout, |bencher, &layout| {
            bencher.iter_batched(|| {}, |()| Alloc::new(&System, layout), batch);
        });
        alloc_group.bench_with_input(
            Id::new("tracked[system]", size),
            &layout,
            |bencher, &layout| {
                let _scope = ALLOC_SYSTEM.scope(MemoryContext::Test);
                bencher.iter_batched(|| {}, |()| Alloc::new(&ALLOC_SYSTEM, layout), batch);
            },
        );
        alloc_group.bench_with_input(Id::new("jemalloc", size), &layout, |bencher, &layout| {
            bencher.iter_batched(|| {}, |()| Alloc::new(&Jemalloc, layout), batch);
        });
        alloc_group.bench_with_input(
            Id::new("tracked[jemalloc]", size),
            &layout,
            |bencher, &layout| {
                let _scope = ALLOC_JEMALLOC.scope(MemoryContext::Test);
                bencher.iter_batched(|| {}, |()| Alloc::new(&ALLOC_JEMALLOC, layout), batch);
            },
        );
    }
    alloc_group.finish();

    // Deallocation: the guard is built during setup and the routine is `drop`.
    let mut dealloc_group = c.benchmark_group("dealloc");
    dealloc_group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
    for size in sizes {
        dealloc_group.throughput(Throughput::Bytes(size));

        let layout = Layout::from_size_align(size as usize, 8).unwrap();
        let batch = BatchSize::NumBatches(10 + size.ilog2() as u64);

        dealloc_group.bench_with_input(Id::new("system", size), &layout, |bencher, &layout| {
            bencher.iter_batched(|| Alloc::new(&System, layout), drop, batch)
        });
        dealloc_group.bench_with_input(
            Id::new("tracked[system]", size),
            &layout,
            |bencher, &layout| {
                let _scope = ALLOC_SYSTEM.scope(MemoryContext::Test);
                bencher.iter_batched(|| Alloc::new(&ALLOC_SYSTEM, layout), drop, batch)
            },
        );
        dealloc_group.bench_with_input(Id::new("jemalloc", size), &layout, |bencher, &layout| {
            bencher.iter_batched(|| Alloc::new(&Jemalloc, layout), drop, batch)
        });
        dealloc_group.bench_with_input(
            Id::new("tracked[jemalloc]", size),
            &layout,
            |bencher, &layout| {
                let _scope = ALLOC_JEMALLOC.scope(MemoryContext::Test);
                bencher.iter_batched(|| Alloc::new(&ALLOC_JEMALLOC, layout), drop, batch)
            },
        );
    }
    dealloc_group.finish();
}

View File

@@ -16,9 +16,9 @@ pub struct DenseCounterPairVec<
}
impl<A: CounterPairAssoc<LabelGroupSet = StaticLabelSet<L>>, L: FixedCardinalityLabel + LabelGroup>
Default for DenseCounterPairVec<A, L>
DenseCounterPairVec<A, L>
{
fn default() -> Self {
pub fn new() -> Self {
Self {
vec: DenseMetricVec::new(),
_marker: PhantomData,

View File

@@ -31,10 +31,7 @@ pub struct TrackedAllocator<A, T: 'static + Send + Sync + FixedCardinalityLabel
/// Default tag to use if this thread is not registered.
default_tag: T,
/// Current memory context for this thread.
thread_scope: OnceLock<ThreadLocal<Cell<T>>>,
/// per thread state containing low contention counters for faster allocations.
thread_state: OnceLock<ThreadLocal<ThreadState<T>>>,
thread: OnceLock<RegisteredThread<T>>,
/// where thread alloc data is eventually saved to, even if threads are shutdown.
global: OnceLock<AllocCounter<T>>,
@@ -59,8 +56,7 @@ where
count: AtomicU64::new(0),
},
},
thread_scope: OnceLock::new(),
thread_state: OnceLock::new(),
thread: OnceLock::new(),
global: OnceLock::new(),
}
}
@@ -77,73 +73,71 @@ where
}
fn register_thread_inner(&'static self) -> &'static Cell<T> {
self.thread_state
.get_or_init(ThreadLocal::new)
.get_or(|| ThreadState {
counters: AllocCounter::default(),
global: self.global.get_or_init(AllocCounter::default),
});
let thread = self.thread.get_or_init(|| RegisteredThread {
scope: ThreadLocal::new(),
state: ThreadLocal::new(),
});
self.thread_scope
.get_or_init(ThreadLocal::new)
.get_or(|| Cell::new(self.default_tag))
thread.state.get_or(|| ThreadState {
counters: AllocCounter::new(),
global: self.global.get_or_init(AllocCounter::new),
});
thread.scope.get_or(|| Cell::new(self.default_tag))
}
fn current_counters_alloc_safe(&self) -> Option<&AllocCounter<T>> {
// We are being very careful here to not allocate or panic.
self.thread_state
self.thread
.get()
.and_then(ThreadLocal::get)
.and_then(|s| s.state.get())
.map(|s| &s.counters)
.or_else(|| self.global.get())
}
fn current_tag_alloc_safe(&self) -> T {
// We are being very careful here to not allocate or panic.
self.thread_scope
self.thread
.get()
.and_then(ThreadLocal::get)
.and_then(|s| s.scope.get())
.map_or(self.default_tag, Cell::get)
}
}
impl<A, T> TrackedAllocator<A, T>
where
T: 'static + Send + Sync + FixedCardinalityLabel + LabelGroup,
{
unsafe fn alloc_inner(&self, layout: Layout, alloc: impl FnOnce(Layout) -> *mut u8) -> *mut u8 {
let Ok((tagged_layout, tag_offset)) = layout.extend(Layout::new::<T>()) else {
return std::ptr::null_mut();
};
let tagged_layout = tagged_layout.pad_to_align();
macro_rules! alloc {
($alloc_fn:ident) => {
unsafe fn $alloc_fn(&self, layout: Layout) -> *mut u8 {
let Ok((tagged_layout, tag_offset)) = layout.extend(Layout::new::<T>()) else {
return std::ptr::null_mut();
};
let tagged_layout = tagged_layout.pad_to_align();
// Safety: The layout is not zero-sized.
let ptr = alloc(tagged_layout);
// Safety: The layout is not zero-sized.
let ptr = unsafe { self.inner.$alloc_fn(tagged_layout) };
// allocation failed.
if ptr.is_null() {
return ptr;
// allocation failed.
if ptr.is_null() {
return ptr;
}
let tag = self.current_tag_alloc_safe();
// Allocation successful. Write our tag
// Safety: tag_offset is inbounds of the ptr
unsafe { ptr.add(tag_offset).cast::<T>().write(tag) }
let metric = if let Some(counters) = self.current_counters_alloc_safe() {
counters.vec.get_metric(tag)
} else {
// if tag is not default, then global would have been registered, therefore tag must be default.
&self.default_counters
};
metric.inc.count.fetch_add(layout.size() as u64, Relaxed);
ptr
}
let tag = self.current_tag_alloc_safe();
// Allocation successful. Write our tag
// Safety: tag_offset is inbounds of the ptr
unsafe { ptr.add(tag_offset).cast::<T>().write(tag) }
let metric = if let Some(counters) = self.current_counters_alloc_safe() {
// safety: caller ensured that <T as FixedCardinalitySet> is implemented correctly.
let id = unsafe { counters.vec.try_with_labels(tag).unwrap_unchecked() };
counters.vec.get_metric(id)
} else {
// if tag is not default, then global would have been registered, therefore tag must be default.
&self.default_counters
};
metric.inc_by(layout.size() as u64);
ptr
}
};
}
// We will tag our allocation by adding `T` to the end of the layout.
@@ -156,19 +150,8 @@ where
A: GlobalAlloc,
T: 'static + Send + Sync + FixedCardinalityLabel + LabelGroup,
{
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
// safety: same as caller
unsafe { self.alloc_inner(layout, |tagged_layout| self.inner.alloc(tagged_layout)) }
}
unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
// safety: same as caller
unsafe {
self.alloc_inner(layout, |tagged_layout| {
self.inner.alloc_zeroed(tagged_layout)
})
}
}
alloc!(alloc);
alloc!(alloc_zeroed);
unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
// SAFETY: the caller must ensure that the `new_size` does not overflow.
@@ -209,12 +192,8 @@ where
unsafe { new_ptr.add(new_tag_offset).cast::<T>().write(new_tag) }
let (new_metric, old_metric) = if let Some(counters) = self.current_counters_alloc_safe() {
// safety: caller ensured that <T as FixedCardinalitySet> is implemented correctly.
let new_id = unsafe { counters.vec.try_with_labels(new_tag).unwrap_unchecked() };
// safety: caller ensured that <T as FixedCardinalitySet> is implemented correctly.
let old_id = unsafe { counters.vec.try_with_labels(tag).unwrap_unchecked() };
let new_metric = counters.vec.get_metric(new_id);
let old_metric = counters.vec.get_metric(old_id);
let new_metric = counters.vec.get_metric(new_tag);
let old_metric = counters.vec.get_metric(tag);
(new_metric, old_metric)
} else {
@@ -230,8 +209,8 @@ where
(0, (layout.size() - new_layout.size()) as u64)
};
new_metric.inc.inc_by(inc);
old_metric.dec.inc_by(dec);
new_metric.inc.count.fetch_add(inc, Relaxed);
old_metric.dec.count.fetch_add(dec, Relaxed);
new_ptr
}
@@ -252,15 +231,13 @@ where
unsafe { self.inner.dealloc(ptr, tagged_layout) }
let metric = if let Some(counters) = self.current_counters_alloc_safe() {
// safety: caller ensured that <T as FixedCardinalitySet> is implemented correctly.
let id = unsafe { counters.vec.try_with_labels(tag).unwrap_unchecked() };
counters.vec.get_metric(id)
counters.vec.get_metric(tag)
} else {
// if tag is not default, then global would have been registered, therefore tag must be default.
&self.default_counters
};
metric.dec_by(layout.size() as u64);
metric.dec.count.fetch_add(layout.size() as u64, Relaxed);
}
}
@@ -287,6 +264,13 @@ impl<T: FixedCardinalityLabel + LabelGroup> CounterPairAssoc for AllocPair<T> {
type LabelGroupSet = StaticLabelSet<T>;
}
/// Thread-local bookkeeping held by the tracked allocator: one scope cell and
/// one counter state per thread.
struct RegisteredThread<T>
where
    T: 'static + Send + Sync + FixedCardinalityLabel + LabelGroup,
{
    /// The memory context currently active on each thread.
    scope: ThreadLocal<Cell<T>>,
    /// Per-thread state holding low-contention counters, used to make
    /// allocations faster.
    state: ThreadLocal<ThreadState<T>>,
}
struct ThreadState<T: 'static + FixedCardinalityLabel + LabelGroup> {
counters: AllocCounter<T>,
global: &'static AllocCounter<T>,
@@ -298,14 +282,12 @@ impl<T: 'static + FixedCardinalityLabel + LabelGroup> Drop for ThreadState<T> {
// iterate over all labels
for tag in (0..T::cardinality()).map(T::decode) {
// load and reset the counts in the thread-local counters.
let id = self.counters.vec.with_labels(tag);
let m = self.counters.vec.get_metric_mut(id);
let m = self.counters.vec.get_metric_mut(tag);
let inc = *m.inc.count.get_mut();
let dec = *m.dec.count.get_mut();
// add the counts into the global counters.
let id = self.global.vec.with_labels(tag);
let m = self.global.vec.get_metric(id);
let m = self.global.vec.get_metric(tag);
m.inc.count.fetch_add(inc, Relaxed);
m.dec.count.fetch_add(dec, Relaxed);
}
@@ -319,14 +301,13 @@ where
CounterState: MetricEncoding<Enc>,
{
fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> {
let global = self.global.get_or_init(AllocCounter::default);
let global = self.global.get_or_init(AllocCounter::new);
// iterate over all counter threads
for s in self.thread_state.get().into_iter().flat_map(|s| s.iter()) {
for s in self.thread.get().into_iter().flat_map(|s| s.state.iter()) {
// iterate over all labels
for tag in (0..T::cardinality()).map(T::decode) {
let id = s.counters.vec.with_labels(tag);
sample(global, s.counters.vec.get_metric(id), tag);
sample(global, s.counters.vec.get_metric(tag), tag);
}
}
@@ -346,8 +327,7 @@ fn sample<T: FixedCardinalityLabel + LabelGroup>(
let dec = local.dec.count.swap(0, Relaxed);
// add the counts into the global counters.
let id = global.vec.with_labels(tag);
let m = global.vec.get_metric(id);
let m = global.vec.get_metric(tag);
m.inc.count.fetch_add(inc, Relaxed);
m.dec.count.fetch_add(dec, Relaxed);
}

View File

@@ -2,20 +2,16 @@
use measured::{
FixedCardinalityLabel, LabelGroup,
label::{LabelGroupSet, StaticLabelSet},
label::StaticLabelSet,
metric::{
MetricEncoding, MetricFamilyEncoding, MetricType, group::Encoding, name::MetricNameEncoder,
},
};
pub struct DenseMetricVec<M: MetricType, L: FixedCardinalityLabel + LabelGroup> {
metrics: VecInner<M>,
metrics: Box<[M]>,
metadata: M::Metadata,
label_set: StaticLabelSet<L>,
}
enum VecInner<M: MetricType> {
Dense(Box<[M]>),
_label_set: StaticLabelSet<L>,
}
fn new_dense<M: MetricType>(c: usize) -> Box<[M]> {
@@ -34,72 +30,32 @@ where
}
}
impl<M, L> Default for DenseMetricVec<M, L>
where
    M: MetricType,
    M::Metadata: Default,
    L: FixedCardinalityLabel + LabelGroup,
{
    /// Equivalent to [`DenseMetricVec::new`]; available only when the metric's
    /// metadata type implements `Default`.
    fn default() -> Self {
        Self::new()
    }
}
impl<M: MetricType> VecInner<M> {
    /// Borrows the metric stored at index `id`.
    ///
    /// # Panics
    /// Panics if `id` is out of bounds for the dense storage.
    fn get_metric(&self, id: usize) -> &M {
        // `Dense` is the only variant, so the pattern is irrefutable.
        let VecInner::Dense(slots) = self;
        &slots[id]
    }

    /// Mutably borrows the metric stored at index `id`.
    ///
    /// # Panics
    /// Panics if `id` is out of bounds for the dense storage.
    fn get_metric_mut(&mut self, id: usize) -> &mut M {
        // `Dense` is the only variant, so the pattern is irrefutable.
        let VecInner::Dense(slots) = self;
        &mut slots[id]
    }
}
impl<M: MetricType, L: FixedCardinalityLabel + LabelGroup> DenseMetricVec<M, L> {
/// Create a new metric vec with the given label set and metric metadata
pub fn with_metadata(metadata: M::Metadata) -> Self {
let metrics = VecInner::Dense(new_dense(L::cardinality()));
Self {
metrics,
metrics: new_dense(L::cardinality()),
metadata,
label_set: StaticLabelSet::new(),
_label_set: StaticLabelSet::new(),
}
}
/// Get an identifier for the specific metric identified by this label group
/// Get the individual metric at the given identifier.
///
/// # Panics
/// Panics if the label group is not contained within the label set.
pub fn with_labels(&self, label: L) -> usize {
self.try_with_labels(label)
.expect("label group was not contained within this label set")
}
/// Get an identifier for the specific metric identified by this label group
///
/// # Errors
/// Returns None if the label group is not contained within the label set.
pub fn try_with_labels(&self, label: L) -> Option<usize> {
self.label_set.encode(label)
/// Can panic or cause strange behaviour if the label ID comes from a different metric family.
pub fn get_metric(&self, label: L) -> &M {
// safety: The caller guarantees that the label encoding is valid.
unsafe { self.metrics.get_unchecked(label.encode()) }
}
/// Get the individual metric at the given identifier.
///
/// # Panics
/// Can panic or cause strange behaviour if the label ID comes from a different metric family.
pub fn get_metric(&self, id: usize) -> &M {
self.metrics.get_metric(id)
}
/// Get the individual metric at the given identifier.
///
/// # Panics
/// Can panic or cause strange behaviour if the label ID comes from a different metric family.
pub fn get_metric_mut(&mut self, id: usize) -> &mut M {
self.metrics.get_metric_mut(id)
pub fn get_metric_mut(&mut self, label: L) -> &mut M {
// safety: The caller guarantees that the label encoding is valid.
unsafe { self.metrics.get_unchecked_mut(label.encode()) }
}
}
@@ -108,17 +64,8 @@ impl<M: MetricEncoding<T>, L: FixedCardinalityLabel + LabelGroup, T: Encoding>
{
fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> {
M::write_type(&name, enc)?;
match &self.metrics {
VecInner::Dense(m) => {
for (index, value) in m.iter().enumerate() {
value.collect_into(
&self.metadata,
self.label_set.decode_dense(index),
&name,
enc,
)?;
}
}
for (index, value) in self.metrics.iter().enumerate() {
value.collect_into(&self.metadata, L::decode(index), &name, enc)?;
}
Ok(())
}