diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index e6707381ac..0cf72b6f74 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -145,7 +145,7 @@ merge: | COPY compute_rsyslog.conf /etc/compute_rsyslog.conf RUN chmod 0666 /etc/compute_rsyslog.conf - RUN chmod 0666 /var/log/ + RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index c89ee112dc..9deaf3ea55 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -140,7 +140,7 @@ merge: | COPY compute_rsyslog.conf /etc/compute_rsyslog.conf RUN chmod 0666 /etc/compute_rsyslog.conf - RUN chmod 0666 /var/log/ + RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a0654ea0e4..58b99dde53 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -37,7 +37,7 @@ use crate::logger::startup_context_from_env; use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; use crate::monitor::launch_monitor; use crate::pg_helpers::*; -use crate::rsyslog::configure_audit_rsyslog; +use crate::rsyslog::{configure_audit_rsyslog, launch_pgaudit_gc}; use crate::spec::*; use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -625,13 +625,11 @@ impl ComputeNode { } let log_directory_path = Path::new(&self.params.pgdata).join("log"); - // TODO: make this more robust - // now rsyslog starts once and there is no monitoring or restart if it fails - configure_audit_rsyslog( - log_directory_path.to_str().unwrap(), - "hipaa", - &remote_endpoint, - )?; + let log_directory_path = log_directory_path.to_string_lossy().to_string(); + 
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; + + // Launch a background task to clean up the audit logs + launch_pgaudit_gc(log_directory_path); } // Launch remaining service threads diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 7aa7360f9d..e4acc5471c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -167,7 +167,8 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl audit settings: begin")?; // This log level is very verbose // but this is necessary for HIPAA compliance. - writeln!(file, "pgaudit.log='all'")?; + // Exclude 'misc' category, because it doesn't contain anything relevant. + writeln!(file, "pgaudit.log='all, -misc'")?; writeln!(file, "pgaudit.log_parameter=on")?; // Disable logging of catalog queries // The catalog doesn't contain sensitive data, so we don't need to audit it. diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf index bef3c36446..1937cdc292 100644 --- a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -4,7 +4,8 @@ module(load="imfile") # Input configuration for log files in the specified directory # Replace {log_directory} with the directory containing the log files input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0") -global(workDirectory="/var/log") +# the directory to store rsyslog state files +global(workDirectory="/var/log/rsyslog") # Forward logs to remote syslog server *.* @@{remote_endpoint} \ No newline at end of file diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index dab32d5dc1..4caa48307e 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,6 +1,8 @@ -use metrics::core::Collector; +use metrics::core::{AtomicF64,
Collector, GenericGauge}; use metrics::proto::MetricFamily; -use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; +use metrics::{ + IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec, +}; use once_cell::sync::Lazy; pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { @@ -59,10 +61,20 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| .expect("failed to define a metric") }); +// Size of audit log directory in bytes +pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy> = Lazy::new(|| { + register_gauge!( + "compute_audit_log_dir_size", + "Size of audit log directory in bytes", + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { let mut metrics = INSTALLED_EXTENSIONS.collect(); metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); metrics.extend(DB_MIGRATION_FAILED.collect()); + metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics } diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index c8fba4fdcd..7537fafaa5 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -1,8 +1,11 @@ +use std::fs; +use std::path::Path; use std::process::Command; +use std::time::Duration; use std::{fs::OpenOptions, io::Write}; use anyhow::{Context, Result}; -use tracing::info; +use tracing::{error, info, instrument, warn}; fn get_rsyslog_pid() -> Option { let output = Command::new("pgrep") @@ -43,7 +46,7 @@ fn restart_rsyslog() -> Result<()> { } pub fn configure_audit_rsyslog( - log_directory: &str, + log_directory: String, tag: &str, remote_endpoint: &str, ) -> Result<()> { @@ -75,3 +78,61 @@ pub fn configure_audit_rsyslog( Ok(()) } + +#[instrument(skip_all)] +async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> { + info!("running pgaudit GC main loop"); + loop { + // Check log_directory for old pgaudit logs and delete them. 
+ // New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age + // Find files that were not modified in the last 15 minutes and delete them. + // This should be enough time for rsyslog to process the logs and for us to catch the alerts. + // + // In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age. + // + // TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog + // imfile-state files, but for now just do a simple GC to avoid filling up the disk. + let _ = Command::new("find") + .arg(&log_directory) + .arg("-name") + .arg("audit*.log") + .arg("-mmin") + .arg("+15") + .arg("-delete") + .output()?; + + // also collect the metric for the size of the log directory + async fn get_log_files_size(path: &Path) -> Result { + let mut total_size = 0; + + for entry in fs::read_dir(path)? { + let entry = entry?; + let entry_path = entry.path(); + + if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") { + total_size += entry.metadata()?.len(); + } + } + + Ok(total_size) + } + + let log_directory_size = get_log_files_size(Path::new(&log_directory)) + .await + .unwrap_or_else(|e| { + warn!("Failed to get log directory size: {}", e); + 0 + }); + crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64); + tokio::time::sleep(Duration::from_secs(60)).await; + } +} + +// launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory +pub fn launch_pgaudit_gc(log_directory: String) { + tokio::spawn(async move { + if let Err(e) = pgaudit_gc_main_loop(log_directory).await { + error!("pgaudit GC main loop failed: {}", e); + } + }); +}