From 373c7057cc73fd13f2873c353ced250ddb900700 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 14 Sep 2023 03:21:50 -0700 Subject: [PATCH] vm-monitor: Fix cgroup throttling (#5303) I believe this (not actual IO problems) is the cause of the "disk speed issue" that we've had for VMs recently. See e.g.: 1. https://neondb.slack.com/archives/C03H1K0PGKH/p1694287808046179?thread_ts=1694271790.580099&cid=C03H1K0PGKH 2. https://neondb.slack.com/archives/C03H1K0PGKH/p1694511932560659 The vm-informant (and now, the vm-monitor, its replacement) is supposed to gradually increase the `neon-postgres` cgroup's memory.high value, because otherwise the kernel will throttle all the processes in the cgroup. This PR fixes a bug with the vm-monitor's implementation of this behavior. --- Other references, for the vm-informant's implementation: - Original issue: neondatabase/autoscaling#44 - Original PR: neondatabase/autoscaling#223 --- libs/vm_monitor/src/cgroup.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 4f529d16fd..fe1c514f76 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -315,12 +315,8 @@ impl CgroupWatcher { where E: Stream>, { - // There are several actions might do when receiving a `memory.high`, - // such as freezing the cgroup, or increasing its `memory.high`. We don't - // want to do these things too often (because postgres needs to run, and - // we only have so much memory). These timers serve as rate limits for this. let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO)); - let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO)); + let mut last_memory_high_increase_at: Option = None; let mut events = pin!(events); // Are we waiting to be upscaled? Could be true if we request upscale due @@ -332,6 +328,8 @@ impl CgroupWatcher { upscale = upscales.recv() => { let Sequenced { seqnum, data } = upscale .context("failed to listen on upscale notification channel")?; + waiting_on_upscale = false; + last_memory_high_increase_at = None; self.last_upscale_seqnum.store(seqnum, Ordering::Release); info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale"); } @@ -396,12 +394,17 @@ impl CgroupWatcher { .send(()) .await .context("failed to request upscale")?; + waiting_on_upscale = true; continue; } // Shoot, we can't freeze or and we're still waiting on upscale, // increase memory.high to reduce throttling - if wait_to_increase_memory_high.is_elapsed() { + let can_increase_memory_high = match last_memory_high_increase_at { + None => true, + Some(t) => t.elapsed() > self.config.memory_high_increase_every, + }; + if can_increase_memory_high { info!( "received memory.high event, \ but too soon to refreeze and already requested upscale \ @@ -437,12 +440,11 @@ impl CgroupWatcher { ); self.set_high_bytes(new_high) .context("failed to set memory.high")?; - wait_to_increase_memory_high - .as_mut() - .reset(Instant::now() + self.config.memory_high_increase_every) + last_memory_high_increase_at = Some(Instant::now()); + continue; } - // we can't do anything + info!("received memory.high event, but can't do anything"); } }; }