refactor TLS processing. Only use blocking-IO, split out the loading of certificates from the updating of certificates

This commit is contained in:
Conrad Ludgate
2025-07-30 10:29:03 +01:00
parent e288cd2198
commit a845295cb3
3 changed files with 64 additions and 43 deletions

View File

@@ -57,7 +57,6 @@ use crate::rsyslog::{
use crate::spec::*;
use crate::swap::resize_swap;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
use crate::tls::watch_cert_for_changes;
use crate::{config, extension_server, local_proxy};
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
@@ -844,9 +843,8 @@ impl ComputeNode {
// Make sure TLS certificates are properly loaded and in the right place.
if self.compute_ctl_config.tls.is_some() {
let this = self.clone();
pre_tasks.spawn(async move {
this.watch_cert_for_changes().await;
pre_tasks.spawn_blocking(|| {
this.watch_cert_for_changes();
Ok::<(), anyhow::Error>(())
});
}
@@ -2187,16 +2185,20 @@ impl ComputeNode {
Ok(())
}
pub async fn watch_cert_for_changes(self: Arc<Self>) {
pub fn watch_cert_for_changes(self: Arc<Self>) {
// update status on cert renewal
if let Some(tls_config) = &self.compute_ctl_config.tls {
let tls_config = tls_config.clone();
// wait until the cert exists.
let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await;
let mut digest = crate::tls::compute_digest(&tls_config.cert_path);
info!(
cert_path = tls_config.cert_path,
key_path = tls_config.key_path,
"TLS certificates found"
);
tokio::task::spawn_blocking(move || {
let handle = tokio::runtime::Handle::current();
'cert_update: loop {
// let postgres/pgbouncer/local_proxy know the new cert/key exists.
// we need to wait until it's configurable first.
@@ -2233,11 +2235,13 @@ impl ComputeNode {
drop(state);
// wait for a new certificate update
if handle.block_on(cert_watch.changed()).is_err() {
error!("certificate renewal monitoring task closed unexpectedly");
break;
}
info!("TLS certificates renewed");
digest = crate::tls::wait_until_cert_changed(digest, &tls_config.cert_path);
info!(
cert_path = tls_config.cert_path,
key_path = tls_config.key_path,
"TLS certificates renewed",
);
}
});
}

View File

@@ -184,7 +184,8 @@ pub fn write_postgres_conf(
// postgres requires the keyfile to be in a secure file,
// currently too complicated to ensure that at the VM level,
// so we just copy them to another file instead. :shrug:
tls::update_key_path_blocking(pgdata_path, tls_config);
let keys = tls::load_certs_blocking(tls_config);
tls::update_key_path_blocking(pgdata_path, &keys)?;
// these are the default, but good to be explicit.
writeln!(file, "ssl_cert_file = '{SERVER_CRT}'")?;

View File

@@ -8,37 +8,39 @@ use x509_cert::Certificate;
#[derive(Clone, Copy)]
pub struct CertDigest(digest::Digest);
pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver<CertDigest> {
let mut digest = compute_digest(&cert_path).await;
let (tx, rx) = tokio::sync::watch::channel(digest);
tokio::spawn(async move {
while !tx.is_closed() {
let new_digest = compute_digest(&cert_path).await;
if digest.0.as_ref() != new_digest.0.as_ref() {
digest = new_digest;
_ = tx.send(digest);
}
tokio::time::sleep(Duration::from_secs(60)).await
}
});
rx
impl PartialEq for CertDigest {
fn eq(&self, other: &Self) -> bool {
self.0.as_ref() == other.0.as_ref()
}
}
async fn compute_digest(cert_path: &str) -> CertDigest {
pub fn wait_until_cert_changed(digest: CertDigest, cert_path: &str) -> CertDigest {
loop {
match try_compute_digest(cert_path).await {
let new_digest = compute_digest(cert_path);
if digest != new_digest {
break new_digest;
}
// Wait a while before checking the certificates.
// We renew on a daily basis, so there's no rush.
std::thread::sleep(Duration::from_secs(60));
}
}
pub fn compute_digest(cert_path: &str) -> CertDigest {
loop {
match try_compute_digest(cert_path) {
Ok(d) => break d,
Err(e) => {
tracing::error!("could not read cert file {e:?}");
tokio::time::sleep(Duration::from_secs(1)).await
std::thread::sleep(Duration::from_secs(1))
}
}
}
}
async fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
let data = tokio::fs::read(cert_path).await?;
fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
let data = std::fs::read(cert_path)?;
// sha256 is extremely collision resistent. can safely assume the digest to be unique
Ok(CertDigest(digest::digest(&digest::SHA256, &data)))
}
@@ -46,28 +48,37 @@ async fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
pub const SERVER_CRT: &str = "server.crt";
pub const SERVER_KEY: &str = "server.key";
pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) {
pub struct KeyPair {
crt: String,
key: String,
}
pub fn load_certs_blocking(tls_config: &TlsConfig) -> KeyPair {
loop {
match try_update_key_path_blocking(pg_data, tls_config) {
Ok(()) => break,
match try_load_certs_blocking(tls_config) {
Ok(key_pair) => break key_pair,
Err(e) => {
tracing::error!(error = ?e, "could not create key file");
tracing::error!(error = ?e, "could not load certs");
std::thread::sleep(Duration::from_secs(1))
}
}
}
}
// Postgres requires the keypath be "secure". This means
// 1. Owned by the postgres user.
// 2. Have permission 600.
fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> {
fn try_load_certs_blocking(tls_config: &TlsConfig) -> Result<KeyPair> {
let key = std::fs::read_to_string(&tls_config.key_path)?;
let crt = std::fs::read_to_string(&tls_config.cert_path)?;
// to mitigate a race condition during renewal.
verify_key_cert(&key, &crt)?;
Ok(KeyPair { key, crt })
}
// Postgres requires the keypath be "secure". This means
// 1. Owned by the postgres user.
// 2. Have permission 600.
pub fn update_key_path_blocking(pg_data: &Path, key_pair: &KeyPair) -> Result<()> {
let mut key_file = std::fs::OpenOptions::new()
.write(true)
.create(true)
@@ -82,8 +93,13 @@ fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Resul
.mode(0o600)
.open(pg_data.join(SERVER_CRT))?;
key_file.write_all(key.as_bytes())?;
crt_file.write_all(crt.as_bytes())?;
// NOTE: We currently ensure that an explicit reload does not happen during TLS renewal, but
// there's a chance that postgres/pgbouncer/local_proxy reloads implicitly halfway between
// these writes. This could allow them to reads the wrong keys to the wrong certs.
// There doesn't seem to be any way to prevent that. However, we will issue a reload shortly
// after which should at least correct it.
key_file.write_all(key_pair.key.as_bytes())?;
crt_file.write_all(key_pair.crt.as_bytes())?;
Ok(())
}