diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1bbba8e3fd..65a60884da 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -595,6 +595,8 @@ jobs: defaults: run: shell: sh -eu {0} + env: + VM_INFORMANT_VERSION: 0.1.1 steps: - name: Downloading latest vm-builder @@ -606,9 +608,22 @@ jobs: run: | docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + - name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }} + run: | + curl -L https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant + chmod +x vm-informant + + - name: Adding VM informant to compute-node image + run: | + ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}) + docker cp vm-informant $ID:/bin/vm-informant + docker commit $ID temp-vm-compute-node + docker rm -f $ID + - name: Build vm image run: | - ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images + ./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - name: Pushing vm-compute-node image run: | diff --git a/compute_tools/README.md b/compute_tools/README.md index 97a7513344..305ccae5dd 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -19,6 +19,10 @@ Also `compute_ctl` spawns two separate service threads: - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the last activity requests. 
+If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM +compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates +downscaling and (eventually) will request immediate upscaling under resource pressure. + Usage example: ```sh compute_ctl -D /var/db/postgres/compute \ diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index f3b787209d..307300cfd8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -18,6 +18,10 @@ //! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the //! last activity requests. //! +//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM +//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates +//! downscaling and (eventually) will request immediate upscaling under resource pressure. +//! //! Usage example: //! ```sh //! compute_ctl -D /var/db/postgres/compute \ @@ -40,6 +44,7 @@ use log::{error, info}; use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; use compute_tools::http::api::launch_http_server; +use compute_tools::informant::spawn_vm_informant_if_present; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; @@ -114,6 +119,8 @@ fn main() -> Result<()> { // requests, while configuration is still in progress. let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); + // Also spawn the thread responsible for handling the VM informant -- if it's present + let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant"); // Run compute (Postgres) and hang waiting on it. 
match compute.prepare_and_run() {
diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs
new file mode 100644
index 0000000000..09bd5e3138
--- /dev/null
+++ b/compute_tools/src/informant.rs
@@ -0,0 +1,50 @@
+use log::{info, warn};
+use std::path::Path;
+use std::process;
+use std::thread;
+use std::time::Duration;
+
+use anyhow::{Context, Result};
+
+const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
+const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
+
+/// Launch a thread to start the VM informant if it's present (and restart, on failure)
+pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
+    let exists = Path::new(VM_INFORMANT_PATH)
+        .try_exists()
+        .context("could not check if path exists")?;
+
+    if !exists {
+        return Ok(None);
+    }
+
+    Ok(Some(
+        thread::Builder::new()
+            .name("run-vm-informant".into())
+            .spawn(move || run_informant())?,
+    ))
+}
+
+fn run_informant() -> ! {
+    let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
+
+    info!("starting VM informant");
+
+    loop {
+        let mut cmd = process::Command::new(VM_INFORMANT_PATH);
+        // Block on subprocess:
+        let result = cmd.status();
+
+        match result {
+            Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
+            Ok(status) if !status.success() => {
+                warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
+            }
+            Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
+        }
+
+        // Wait before retrying
+        thread::sleep(restart_wait);
+    }
+}
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index aee6b53e6a..a71b92f91a 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -8,6 +8,7 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
+pub mod informant;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;