mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 03:52:56 +00:00
feat(compute_ctl): add pgaudit log gc to compute_ctl (#11169)
- Add pgaudit_gc thread to compute_ctl to clean up old pgaudit logs if they exist. pgaudit can rotate files, but it doesn't delete the old files. - Add AUDIT_LOG_DIR_SIZE metric to compute_ctl to track the size of the audit log directory in bytes. - Fix permissions for the rsyslog state files directory.
This commit is contained in:
committed by
GitHub
parent
7fe5a689b4
commit
b7c6738524
@@ -145,7 +145,7 @@ merge: |
|
||||
|
||||
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /var/log/
|
||||
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
|
||||
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
|
||||
@@ -140,7 +140,7 @@ merge: |
|
||||
|
||||
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /var/log/
|
||||
RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
|
||||
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
|
||||
@@ -37,7 +37,7 @@ use crate::logger::startup_context_from_env;
|
||||
use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use crate::monitor::launch_monitor;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::rsyslog::configure_audit_rsyslog;
|
||||
use crate::rsyslog::{configure_audit_rsyslog, launch_pgaudit_gc};
|
||||
use crate::spec::*;
|
||||
use crate::swap::resize_swap;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
@@ -625,13 +625,11 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
let log_directory_path = Path::new(&self.params.pgdata).join("log");
|
||||
// TODO: make this more robust
|
||||
// now rsyslog starts once and there is no monitoring or restart if it fails
|
||||
configure_audit_rsyslog(
|
||||
log_directory_path.to_str().unwrap(),
|
||||
"hipaa",
|
||||
&remote_endpoint,
|
||||
)?;
|
||||
let log_directory_path = log_directory_path.to_string_lossy().to_string();
|
||||
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
|
||||
|
||||
// Launch a background task to clean up the audit logs
|
||||
launch_pgaudit_gc(log_directory_path);
|
||||
}
|
||||
|
||||
// Launch remaining service threads
|
||||
|
||||
@@ -167,7 +167,8 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
|
||||
// This log level is very verbose
|
||||
// but this is necessary for HIPAA compliance.
|
||||
writeln!(file, "pgaudit.log='all'")?;
|
||||
// Exclude 'misc' category, because it doesn't contain anything relevant.
|
||||
writeln!(file, "pgaudit.log='all, -misc'")?;
|
||||
writeln!(file, "pgaudit.log_parameter=on")?;
|
||||
// Disable logging of catalog queries
|
||||
// The catalog doesn't contain sensitive data, so we don't need to audit it.
|
||||
|
||||
@@ -4,7 +4,8 @@ module(load="imfile")
|
||||
# Input configuration for log files in the specified directory
|
||||
# Replace {log_directory} with the directory containing the log files
|
||||
input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
|
||||
global(workDirectory="/var/log")
|
||||
# the directory to store rsyslog state files
|
||||
global(workDirectory="/var/log/rsyslog")
|
||||
|
||||
# Forward logs to remote syslog server
|
||||
*.* @@{remote_endpoint}
|
||||
@@ -1,6 +1,8 @@
|
||||
use metrics::core::Collector;
|
||||
use metrics::core::{AtomicF64, Collector, GenericGauge};
|
||||
use metrics::proto::MetricFamily;
|
||||
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
|
||||
use metrics::{
|
||||
IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
@@ -59,10 +61,20 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Size of audit log directory in bytes
|
||||
pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
|
||||
register_gauge!(
|
||||
"compute_audit_log_dir_size",
|
||||
"Size of audit log directory in bytes",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub fn collect() -> Vec<MetricFamily> {
|
||||
let mut metrics = INSTALLED_EXTENSIONS.collect();
|
||||
metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
|
||||
metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
|
||||
metrics.extend(DB_MIGRATION_FAILED.collect());
|
||||
metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
|
||||
metrics
|
||||
}
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{fs::OpenOptions, io::Write};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use tracing::info;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
|
||||
fn get_rsyslog_pid() -> Option<String> {
|
||||
let output = Command::new("pgrep")
|
||||
@@ -43,7 +46,7 @@ fn restart_rsyslog() -> Result<()> {
|
||||
}
|
||||
|
||||
pub fn configure_audit_rsyslog(
|
||||
log_directory: &str,
|
||||
log_directory: String,
|
||||
tag: &str,
|
||||
remote_endpoint: &str,
|
||||
) -> Result<()> {
|
||||
@@ -75,3 +78,61 @@ pub fn configure_audit_rsyslog(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> {
|
||||
info!("running pgaudit GC main loop");
|
||||
loop {
|
||||
// Check log_directory for old pgaudit logs and delete them.
|
||||
// New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age
|
||||
// Find files that were not modified in the last 15 minutes and delete them.
|
||||
// This should be enough time for rsyslog to process the logs and for us to catch the alerts.
|
||||
//
|
||||
// In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age.
|
||||
//
|
||||
// TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog
|
||||
// imfile-state files, but for now just do a simple GC to avoid filling up the disk.
|
||||
let _ = Command::new("find")
|
||||
.arg(&log_directory)
|
||||
.arg("-name")
|
||||
.arg("audit*.log")
|
||||
.arg("-mmin")
|
||||
.arg("+15")
|
||||
.arg("-delete")
|
||||
.output()?;
|
||||
|
||||
// also collect the metric for the size of the log directory
|
||||
async fn get_log_files_size(path: &Path) -> Result<u64> {
|
||||
let mut total_size = 0;
|
||||
|
||||
for entry in fs::read_dir(path)? {
|
||||
let entry = entry?;
|
||||
let entry_path = entry.path();
|
||||
|
||||
if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") {
|
||||
total_size += entry.metadata()?.len();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(total_size)
|
||||
}
|
||||
|
||||
let log_directory_size = get_log_files_size(Path::new(&log_directory))
|
||||
.await
|
||||
.unwrap_or_else(|e| {
|
||||
warn!("Failed to get log directory size: {}", e);
|
||||
0
|
||||
});
|
||||
crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64);
|
||||
tokio::time::sleep(Duration::from_secs(60)).await;
|
||||
}
|
||||
}
|
||||
|
||||
// launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory
|
||||
pub fn launch_pgaudit_gc(log_directory: String) {
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = pgaudit_gc_main_loop(log_directory).await {
|
||||
error!("pgaudit GC main loop failed: {}", e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user