# feat(pageserver): more log lines around frozen layers (#9697)
We saw pageserver OOMs (https://github.com/neondatabase/cloud/issues/19715) for tenants doing large writes. This change adds log lines around in-memory layers to hopefully collect some info during my on-call shift next week.

## Summary of changes

* Estimate the in-memory size of an in-memory layer.
* Log the number of frozen layers when too many of them have accumulated in memory.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
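In outline, the mechanism works like this: every write into an in-memory layer bumps a relaxed atomic counter by the estimated size of the new index entry, and the flush loop sums those counters across the frozen layers, warning when both a layer-count and a byte-size threshold are exceeded. Below is a minimal self-contained sketch of that pattern; `Layer`, the 10-layer threshold, and the per-layer sizes are illustrative stand-ins, not the pageserver's actual types or configuration.

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// Illustrative stand-in for an in-memory layer; the real InMemoryLayer keeps
// its mutable state behind an RwLock, with this counter outside the lock so
// readers never contend with writers.
struct Layer {
    estimated_in_mem_size: AtomicU64,
}

impl Layer {
    fn put(&self, entry_size: u64) {
        // Relaxed is enough: the counter only feeds a log line, so no
        // ordering with other memory operations is required.
        self.estimated_in_mem_size.fetch_add(entry_size, Ordering::Relaxed);
    }
}

fn main() {
    let frozen: Vec<Layer> = (0..12)
        .map(|_| Layer { estimated_in_mem_size: AtomicU64::new(0) })
        .collect();
    for layer in &frozen {
        layer.put(6_000_000); // pretend each layer accumulated ~6 MB of entries
    }
    let total: u64 = frozen
        .iter()
        .map(|l| l.estimated_in_mem_size.load(Ordering::Relaxed))
        .sum();
    // Mirrors the diff's condition: many frozen layers AND >= 64 MB estimated.
    if frozen.len() > 10 && total >= 64_000_000 {
        eprintln!("too many frozen layers: {} layers, ~{total} bytes", frozen.len());
    }
}
```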
The first four hunks add the counter to `InMemoryLayer`: the field, a getter, its initialization, and the per-write increment.

```diff
@@ -67,6 +67,8 @@ pub struct InMemoryLayer {
     /// The above fields never change, except for `end_lsn`, which is only set once.
     /// All other changing parts are in `inner`, and protected by a mutex.
     inner: RwLock<InMemoryLayerInner>,
+
+    estimated_in_mem_size: AtomicU64,
 }
 
 impl std::fmt::Debug for InMemoryLayer {
@@ -543,6 +545,10 @@ impl InMemoryLayer {
         Ok(inner.file.len())
     }
 
+    pub fn estimated_in_mem_size(&self) -> u64 {
+        self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
+    }
+
     /// Create a new, empty, in-memory layer
     pub async fn create(
         conf: &'static PageServerConf,
@@ -572,6 +578,7 @@ impl InMemoryLayer {
                 file,
                 resource_units: GlobalResourceUnits::new(),
             }),
+            estimated_in_mem_size: AtomicU64::new(0),
         })
     }
 
@@ -642,6 +649,12 @@ impl InMemoryLayer {
                 // because this case is unexpected, and we would like tests to fail if this happens.
                 warn!("Key {} at {} written twice at same LSN", key, lsn);
             }
+            self.estimated_in_mem_size.fetch_add(
+                (std::mem::size_of::<CompactKey>()
+                    + std::mem::size_of::<Lsn>()
+                    + std::mem::size_of::<IndexEntry>()) as u64,
+                AtomicOrdering::Relaxed,
+            );
         }
 
         inner.resource_units.maybe_publish_size(new_size);
```
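The per-put increment is a compile-time constant: the size of one key, one LSN, and one index entry, i.e. the bookkeeping overhead the write adds to the layer's in-memory index (the diff's `inner.file` suggests the value bytes themselves live in a file). Here is a sketch of that arithmetic with hypothetical stand-in types; the real `CompactKey`, `Lsn`, and `IndexEntry` are pageserver types whose sizes may differ.

```rust
use std::mem::size_of;

// Hypothetical stand-ins; the actual definitions live in the pageserver crate.
#[allow(dead_code)]
struct CompactKey(i128);
#[allow(dead_code)]
struct Lsn(u64);
#[allow(dead_code)]
struct IndexEntry(u64);

fn main() {
    // Mirrors the diff's fetch_add argument: one key + one LSN + one index
    // entry per put. With these stand-ins it prints 32 (16 + 8 + 8).
    let per_put = size_of::<CompactKey>() + size_of::<Lsn>() + size_of::<IndexEntry>();
    println!("estimated bytes added per put: {per_put}");
}
```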
The remaining two hunks are in the timeline code (note the `impl Timeline` context): a new import for the compaction-threshold default, and the flush-loop changes that compute the frozen-layer totals and emit the warning.

```diff
@@ -23,6 +23,7 @@ use handle::ShardTimelineId;
 use offload::OffloadError;
 use once_cell::sync::Lazy;
 use pageserver_api::{
+    config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
     key::{
         KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
         NON_INHERITED_SPARSE_RANGE,
@@ -3501,18 +3502,37 @@ impl Timeline {
 
         let timer = self.metrics.flush_time_histo.start_timer();
 
+        let num_frozen_layers;
+        let frozen_layer_total_size;
         let layer_to_flush = {
             let guard = self.layers.read().await;
             let Ok(lm) = guard.layer_map() else {
                 info!("dropping out of flush loop for timeline shutdown");
                 return;
             };
+            num_frozen_layers = lm.frozen_layers.len();
+            frozen_layer_total_size = lm
+                .frozen_layers
+                .iter()
+                .map(|l| l.estimated_in_mem_size())
+                .sum::<u64>();
             lm.frozen_layers.front().cloned()
             // drop 'layers' lock to allow concurrent reads and writes
         };
         let Some(layer_to_flush) = layer_to_flush else {
             break Ok(());
         };
+        if num_frozen_layers
+            > std::cmp::max(
+                self.get_compaction_threshold(),
+                DEFAULT_COMPACTION_THRESHOLD,
+            )
+            && frozen_layer_total_size >= /* 64 MB */ 64000000
+        {
+            tracing::warn!(
+                "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes",
+            );
+        }
         match self.flush_frozen_layer(layer_to_flush, ctx).await {
             Ok(this_layer_to_lsn) => {
                 flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
```
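The warning fires only when both conditions hold: the frozen backlog is longer than the effective compaction threshold, and its estimated size is at least 64 MB, so neither a run of tiny layers nor a single large one trips it on its own. Below is a sketch of that predicate, assuming a default threshold of 10; the real `DEFAULT_COMPACTION_THRESHOLD` comes from `pageserver_api`'s tenant config defaults and is not restated in the diff.

```rust
// Assumed value for illustration; the real constant is imported from
// pageserver_api::config::tenant_conf_defaults in the diff above.
const DEFAULT_COMPACTION_THRESHOLD: usize = 10;

/// Sketch of the flush loop's warn condition.
fn should_warn(num_frozen_layers: usize, frozen_layer_total_size: u64, compaction_threshold: usize) -> bool {
    // Long backlog (more frozen layers than the effective threshold)...
    num_frozen_layers > std::cmp::max(compaction_threshold, DEFAULT_COMPACTION_THRESHOLD)
        // ...AND a big one (>= 64 MB estimated, as in the diff).
        && frozen_layer_total_size >= 64_000_000
}

fn main() {
    assert!(!should_warn(5, 100_000_000, 10)); // few layers: quiet
    assert!(!should_warn(20, 1_000_000, 10));  // small total: quiet
    assert!(should_warn(20, 100_000_000, 10)); // long and large: warn
}
```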