From a845295cb3ba27c5d78dfcdf651c2ecf706bd8eb Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 30 Jul 2025 10:29:03 +0100 Subject: [PATCH] refactor TLS processing. Only use blocking-IO, split out the loading of certificates from the updating of certificates --- compute_tools/src/compute.rs | 28 +++++++------ compute_tools/src/config.rs | 3 +- compute_tools/src/tls.rs | 76 ++++++++++++++++++++++-------------- 3 files changed, 64 insertions(+), 43 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index cca8dd5f97..99f438173d 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -57,7 +57,6 @@ use crate::rsyslog::{ use crate::spec::*; use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; -use crate::tls::watch_cert_for_changes; use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); @@ -844,9 +843,8 @@ impl ComputeNode { // Make sure TLS certificates are properly loaded and in the right place. if self.compute_ctl_config.tls.is_some() { let this = self.clone(); - pre_tasks.spawn(async move { - this.watch_cert_for_changes().await; - + pre_tasks.spawn_blocking(|| { + this.watch_cert_for_changes(); Ok::<(), anyhow::Error>(()) }); } @@ -2187,16 +2185,20 @@ impl ComputeNode { Ok(()) } - pub async fn watch_cert_for_changes(self: Arc) { + pub fn watch_cert_for_changes(self: Arc) { // update status on cert renewal if let Some(tls_config) = &self.compute_ctl_config.tls { let tls_config = tls_config.clone(); // wait until the cert exists. - let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await; + let mut digest = crate::tls::compute_digest(&tls_config.cert_path); + info!( + cert_path = tls_config.cert_path, + key_path = tls_config.key_path, + "TLS certificates found" + ); tokio::task::spawn_blocking(move || { - let handle = tokio::runtime::Handle::current(); 'cert_update: loop { // let postgres/pgbouncer/local_proxy know the new cert/key exists. // we need to wait until it's configurable first. @@ -2233,11 +2235,13 @@ impl ComputeNode { drop(state); // wait for a new certificate update - if handle.block_on(cert_watch.changed()).is_err() { - error!("certificate renewal monitoring task closed unexpectedly"); - break; - } - info!("TLS certificates renewed"); + digest = crate::tls::wait_until_cert_changed(digest, &tls_config.cert_path); + + info!( + cert_path = tls_config.cert_path, + key_path = tls_config.key_path, + "TLS certificates renewed", + ); } }); } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e7dde5c5f5..b72953eb47 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -184,7 +184,8 @@ pub fn write_postgres_conf( // postgres requires the keyfile to be in a secure file, // currently too complicated to ensure that at the VM level, // so we just copy them to another file instead. :shrug: - tls::update_key_path_blocking(pgdata_path, tls_config); + let keys = tls::load_certs_blocking(tls_config); + tls::update_key_path_blocking(pgdata_path, &keys)?; // these are the default, but good to be explicit. writeln!(file, "ssl_cert_file = '{SERVER_CRT}'")?; diff --git a/compute_tools/src/tls.rs b/compute_tools/src/tls.rs index ab32a9459a..af4299b7aa 100644 --- a/compute_tools/src/tls.rs +++ b/compute_tools/src/tls.rs @@ -8,37 +8,39 @@ use x509_cert::Certificate; #[derive(Clone, Copy)] pub struct CertDigest(digest::Digest); -pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver { - let mut digest = compute_digest(&cert_path).await; - let (tx, rx) = tokio::sync::watch::channel(digest); - tokio::spawn(async move { - while !tx.is_closed() { - let new_digest = compute_digest(&cert_path).await; - if digest.0.as_ref() != new_digest.0.as_ref() { - digest = new_digest; - _ = tx.send(digest); - } - - tokio::time::sleep(Duration::from_secs(60)).await - } - }); - rx +impl PartialEq for CertDigest { + fn eq(&self, other: &Self) -> bool { + self.0.as_ref() == other.0.as_ref() + } } -async fn compute_digest(cert_path: &str) -> CertDigest { +pub fn wait_until_cert_changed(digest: CertDigest, cert_path: &str) -> CertDigest { loop { - match try_compute_digest(cert_path).await { + let new_digest = compute_digest(cert_path); + if digest != new_digest { + break new_digest; + } + + // Wait a while before checking the certificates. + // We renew on a daily basis, so there's no rush. + std::thread::sleep(Duration::from_secs(60)); + } +} + +pub fn compute_digest(cert_path: &str) -> CertDigest { + loop { + match try_compute_digest(cert_path) { Ok(d) => break d, Err(e) => { tracing::error!("could not read cert file {e:?}"); - tokio::time::sleep(Duration::from_secs(1)).await + std::thread::sleep(Duration::from_secs(1)) } } } } -async fn try_compute_digest(cert_path: &str) -> Result { - let data = tokio::fs::read(cert_path).await?; +fn try_compute_digest(cert_path: &str) -> Result { + let data = std::fs::read(cert_path)?; // sha256 is extremely collision resistent. can safely assume the digest to be unique Ok(CertDigest(digest::digest(&digest::SHA256, &data))) } @@ -46,28 +48,37 @@ async fn try_compute_digest(cert_path: &str) -> Result { pub const SERVER_CRT: &str = "server.crt"; pub const SERVER_KEY: &str = "server.key"; -pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) { +pub struct KeyPair { + crt: String, + key: String, +} + +pub fn load_certs_blocking(tls_config: &TlsConfig) -> KeyPair { loop { - match try_update_key_path_blocking(pg_data, tls_config) { - Ok(()) => break, + match try_load_certs_blocking(tls_config) { + Ok(key_pair) => break key_pair, Err(e) => { - tracing::error!(error = ?e, "could not create key file"); + tracing::error!(error = ?e, "could not load certs"); std::thread::sleep(Duration::from_secs(1)) } } } } -// Postgres requires the keypath be "secure". This means -// 1. Owned by the postgres user. -// 2. Have permission 600. -fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> { +fn try_load_certs_blocking(tls_config: &TlsConfig) -> Result { let key = std::fs::read_to_string(&tls_config.key_path)?; let crt = std::fs::read_to_string(&tls_config.cert_path)?; // to mitigate a race condition during renewal. verify_key_cert(&key, &crt)?; + Ok(KeyPair { key, crt }) +} + +// Postgres requires the keypath be "secure". This means +// 1. Owned by the postgres user. +// 2. Have permission 600. +pub fn update_key_path_blocking(pg_data: &Path, key_pair: &KeyPair) -> Result<()> { let mut key_file = std::fs::OpenOptions::new() .write(true) .create(true) @@ -82,8 +93,13 @@ fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Resul .mode(0o600) .open(pg_data.join(SERVER_CRT))?; - key_file.write_all(key.as_bytes())?; - crt_file.write_all(crt.as_bytes())?; + // NOTE: We currently ensure that an explicit reload does not happen during TLS renewal, but + // there's a chance that postgres/pgbouncer/local_proxy reloads implicitly halfway between + // these writes. This could allow them to reads the wrong keys to the wrong certs. + // There doesn't seem to be any way to prevent that. However, we will issue a reload shortly + // after which should at least correct it. + key_file.write_all(key_pair.key.as_bytes())?; + crt_file.write_all(key_pair.crt.as_bytes())?; Ok(()) }