From 2cf6a47cca9adee72fd2cad4ac69414f4ba2d059 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 19 Oct 2023 11:09:37 -0700 Subject: [PATCH] vm-monitor: Deny not fail downscale if no memory stats yet (#5606) Fixes an issue we observed on staging that happens when the autoscaler-agent attempts to immediately downscale the VM after binding, which is typical for pooled computes. The issue was occurring because the autoscaler-agent was requesting downscaling before the vm-monitor had gathered sufficient cgroup memory stats to be confident in approving it. When the vm-monitor returned an internal error instead of denying downscaling, the autoscaler-agent retried the connection and immediately hit the same issue (in part because cgroup stats are collected per-connection, rather than globally). --- libs/vm_monitor/src/runner.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index e818292196..99bea6d7ca 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -253,11 +253,22 @@ impl Runner { if let Some(cgroup) = &self.cgroup { let (last_time, last_history) = *cgroup.watcher.borrow(); + // NB: The ordering of these conditions is intentional. During startup, we should deny + // downscaling until we have enough information to determine that it's safe to do so + // (i.e. enough samples have come in). But if it's been a while and we *still* haven't + // received any information, we should *fail* instead of just denying downscaling. + // + // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()` + // serves double-duty: it trips if we haven't received *any* metrics for long enough, + // OR if we haven't received metrics *recently enough*. + // // TODO: make the duration here configurable. if last_time.elapsed() > Duration::from_secs(5) { bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information"); } else if last_history.samples_count <= 1 { - bail!("haven't received enough cgroup memory stats yet"); + let status = "haven't received enough cgroup memory stats yet"; + info!(status, "discontinuing downscale"); + return Ok((false, status.to_owned())); } let new_threshold = self