From 1cac923af8142888570335e686fce385b959fca4 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Sun, 10 Sep 2023 10:33:53 -0700 Subject: [PATCH] vm-monitor: Rate-limit upscale requests (#5263) Some VMs, when already scaled up as much as possible, end up spamming the autoscaler-agent with upscale requests that will never be fulfilled. If postgres is using memory greater than the cgroup's memory.high, it can emit new memory.high events 1000 times per second, which... just means unnecessary load on the rest of the system. This changes the vm-monitor so that we skip sending upscale requests if we already sent one within the last second, to avoid spamming the autoscaler-agent. This matches previous behavior that the vm-informant had. --- libs/vm_monitor/src/runner.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 82055fda2e..8f904b879d 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -5,6 +5,7 @@ //! all functionality. use std::sync::Arc; +use std::time::{Duration, Instant}; use std::{fmt::Debug, mem}; use anyhow::{bail, Context}; @@ -36,6 +37,8 @@ pub struct Runner { /// by us vs the autoscaler-agent. counter: usize, + last_upscale_request_at: Option<Instant>, + /// A signal to kill the main thread produced by `self.run()`. This is triggered /// when the server receives a new connection. When the thread receives the /// signal off this channel, it will gracefully shutdown. @@ -99,6 +102,7 @@ impl Runner { cgroup: None, dispatcher, counter: 1, // NB: must be odd, see the comment about the field for more. + last_upscale_request_at: None, kill, }; @@ -397,6 +401,20 @@ impl Runner { if request.is_none() { bail!("failed to listen for upscale event from cgroup") } + + // If it's been less than 1 second since the last time we requested upscaling, + // ignore the event, to avoid spamming the agent (otherwise, this can happen + // ~1k times per second). 
+ if let Some(t) = self.last_upscale_request_at { + let elapsed = t.elapsed(); + if elapsed < Duration::from_secs(1) { + info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring"); + continue; + } + } + + self.last_upscale_request_at = Some(Instant::now()); + info!("cgroup asking for upscale; forwarding request"); self.counter += 2; // Increment, preserving parity (i.e. keep the // counter odd). See the field comment for more.