feat(compute_ctl): add pgaudt log gc to compute_ctl (#11169)

- add pgaudt_gc thread to compute_ctl
to cleanup old pgaudit logs if they exist.
pgaudit can rotate files, but it doesn't delete the old files
  
- Add AUDIT_LOG_DIR_SIZE metric to compute_ctl
to track the size of the audit log directory in bytes.

- Fix permissions for rsyslog state files directory
This commit is contained in:
Anastasia Lubennikova
2025-03-14 14:08:16 +00:00
committed by GitHub
parent 7fe5a689b4
commit b7c6738524
7 changed files with 89 additions and 16 deletions

View File

@@ -145,7 +145,7 @@ merge: |
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN chmod 0666 /var/log/
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/

View File

@@ -140,7 +140,7 @@ merge: |
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
RUN chmod 0666 /etc/compute_rsyslog.conf
RUN chmod 0666 /var/log/
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/

View File

@@ -37,7 +37,7 @@ use crate::logger::startup_context_from_env;
use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
use crate::monitor::launch_monitor;
use crate::pg_helpers::*;
use crate::rsyslog::configure_audit_rsyslog;
use crate::rsyslog::{configure_audit_rsyslog, launch_pgaudit_gc};
use crate::spec::*;
use crate::swap::resize_swap;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -625,13 +625,11 @@ impl ComputeNode {
}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
// TODO: make this more robust
// now rsyslog starts once and there is no monitoring or restart if it fails
configure_audit_rsyslog(
log_directory_path.to_str().unwrap(),
"hipaa",
&remote_endpoint,
)?;
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
}
// Launch remaining service threads

View File

@@ -167,7 +167,8 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
// This log level is very verbose
// but this is necessary for HIPAA compliance.
writeln!(file, "pgaudit.log='all'")?;
// Exclude 'misc' category, because it doesn't contain anythig relevant.
writeln!(file, "pgaudit.log='all, -misc'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.

View File

@@ -4,7 +4,8 @@ module(load="imfile")
# Input configuration for log files in the specified directory
# Replace {log_directory} with the directory containing the log files
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
global(workDirectory="/var/log")
# the directory to store rsyslog state files
global(workDirectory="/var/log/rsyslog")
# Forward logs to remote syslog server
*.* @@{remote_endpoint}

View File

@@ -1,6 +1,8 @@
use metrics::core::Collector;
use metrics::core::{AtomicF64, Collector, GenericGauge};
use metrics::proto::MetricFamily;
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
use metrics::{
IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec,
};
use once_cell::sync::Lazy;
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
@@ -59,10 +61,20 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
.expect("failed to define a metric")
});
// Size of audit log directory in bytes
pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
register_gauge!(
"compute_audit_log_dir_size",
"Size of audit log directory in bytes",
)
.expect("failed to define a metric")
});
pub fn collect() -> Vec<MetricFamily> {
let mut metrics = INSTALLED_EXTENSIONS.collect();
metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
metrics.extend(DB_MIGRATION_FAILED.collect());
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
metrics
}

View File

@@ -1,8 +1,11 @@
use std::fs;
use std::path::Path;
use std::process::Command;
use std::time::Duration;
use std::{fs::OpenOptions, io::Write};
use anyhow::{Context, Result};
use tracing::info;
use tracing::{error, info, instrument, warn};
fn get_rsyslog_pid() -> Option<String> {
let output = Command::new("pgrep")
@@ -43,7 +46,7 @@ fn restart_rsyslog() -> Result<()> {
}
pub fn configure_audit_rsyslog(
log_directory: &str,
log_directory: String,
tag: &str,
remote_endpoint: &str,
) -> Result<()> {
@@ -75,3 +78,61 @@ pub fn configure_audit_rsyslog(
Ok(())
}
#[instrument(skip_all)]
async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> {
info!("running pgaudit GC main loop");
loop {
// Check log_directory for old pgaudit logs and delete them.
// New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age
// Find files that were not modified in the last 15 minutes and delete them.
// This should be enough time for rsyslog to process the logs and for us to catch the alerts.
//
// In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age.
//
// TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog
// imfile-state files, but for now just do a simple GC to avoid filling up the disk.
let _ = Command::new("find")
.arg(&log_directory)
.arg("-name")
.arg("audit*.log")
.arg("-mmin")
.arg("+15")
.arg("-delete")
.output()?;
// also collect the metric for the size of the log directory
async fn get_log_files_size(path: &Path) -> Result<u64> {
let mut total_size = 0;
for entry in fs::read_dir(path)? {
let entry = entry?;
let entry_path = entry.path();
if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") {
total_size += entry.metadata()?.len();
}
}
Ok(total_size)
}
let log_directory_size = get_log_files_size(Path::new(&log_directory))
.await
.unwrap_or_else(|e| {
warn!("Failed to get log directory size: {}", e);
0
});
crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64);
tokio::time::sleep(Duration::from_secs(60)).await;
}
}
// launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory
pub fn launch_pgaudit_gc(log_directory: String) {
tokio::spawn(async move {
if let Err(e) = pgaudit_gc_main_loop(log_directory).await {
error!("pgaudit GC main loop failed: {}", e);
}
});
}