diff --git a/Cargo.lock b/Cargo.lock index ee6aa9e613..ea5a29a142 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1144,16 +1144,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "close_fds" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" -dependencies = [ - "cfg-if", - "libc", -] - [[package]] name = "colorchoice" version = "1.0.0" @@ -3418,7 +3408,6 @@ dependencies = [ "camino-tempfile", "chrono", "clap", - "close_fds", "const_format", "consumption_metrics", "crc32c", diff --git a/Cargo.toml b/Cargo.toml index 26cf604a91..d3006985ab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,6 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } -close_fds = "0.3.2" comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index e44501d1ed..95d558bb7b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -21,7 +21,6 @@ camino.workspace = true camino-tempfile.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -close_fds.workspace = true const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 793bcc1f00..5bc897b730 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -30,7 +30,6 @@ use std::io; use std::io::prelude::*; use std::ops::{Deref, DerefMut}; use std::os::unix::io::AsRawFd; -use std::os::unix::prelude::CommandExt; use std::process::Stdio; use std::process::{Child, ChildStdin, ChildStdout, Command}; use std::sync::{Arc, Mutex, MutexGuard, RwLock}; @@ -628,40 +627,6 @@ impl PostgresRedoManager { } } -/// -/// Command with ability not to give all file descriptors to child process -/// -trait CloseFileDescriptors: CommandExt { - /// - /// Close file descriptors (other than stdin, stdout, stderr) in child process - /// - fn close_fds(&mut self) -> &mut Command; -} - -impl CloseFileDescriptors for C { - fn close_fds(&mut self) -> &mut Command { - // SAFETY: Code executed inside pre_exec should have async-signal-safety, - // which means it should be safe to execute inside a signal handler. - // The precise meaning depends on platform. See `man signal-safety` - // for the linux definition. - // - // The set_fds_cloexec_threadsafe function is documented to be - // async-signal-safe. - // - // Aside from this function, the rest of the code is re-entrant and - // doesn't make any syscalls. We're just passing constants. - // - // NOTE: It's easy to indirectly cause a malloc or lock a mutex, - // which is not async-signal-safe. Be careful. - unsafe { - self.pre_exec(move || { - close_fds::set_fds_cloexec_threadsafe(3, &[]); - Ok(()) - }) - } - } -} - struct WalRedoProcess { #[allow(dead_code)] conf: &'static PageServerConf, @@ -701,16 +666,14 @@ impl WalRedoProcess { .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // The redo process is not trusted, and runs in seccomp mode that - // doesn't allow it to open any files. We have to also make sure it - // doesn't inherit any file descriptors from the pageserver, that - // would allow an attacker to read any files that happen to be open - // in the pageserver. - // - // The Rust standard library makes sure to mark any file descriptors with - // as close-on-exec by default, but that's not enough, since we use - // libraries that directly call libc open without setting that flag. - .close_fds() + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. .spawn_no_leak_child(tenant_shard_id) .context("spawn process")?; WAL_REDO_PROCESS_COUNTERS.started.inc(); diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index bdc50b0aa9..7ca4fe93df 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -140,9 +140,42 @@ static XLogReaderState *reader_state; #define TRACE DEBUG5 #ifdef HAVE_LIBSECCOMP + + +/* + * https://man7.org/linux/man-pages/man2/close_range.2.html + * + * The `close_range` syscall is available as of Linux 5.9. + * + * The `close_range` libc wrapper is only available in glibc >= 2.34. + * Debian Bullseye ships a libc package based on glibc 2.31. + * => write the wrapper ourselves, using the syscall number from the kernel headers. + * + * If the Linux uAPI headers don't define the system call number, + * fail the build deliberately rather than ifdef'ing it to ENOSYS. + * We prefer a compile time over a runtime error for walredo. + */ +#include +#include +#include +int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) { + return syscall(__NR_close_range, start_fd, count, flags); +} + static void enter_seccomp_mode(void) { + + /* + * The pageserver process relies on us to close all the file descriptors + * it potentially leaked to us, _before_ we start processing potentially dangerous + * wal records. See the comment in the Rust code that launches this process. + */ + int err; + if (err = close_range(3, ~0U, 0)) { + ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3"))); + } + PgSeccompRule syscalls[] = { /* Hard requirements */