From 4707d2df6d0557e2bf01a9b8313b8055bd92d6a0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 15 Feb 2024 18:29:52 +0000 Subject: [PATCH] revert pageserver parts of "feat(walredo): use posix_spawn by moving close_fds() work to walredo C code (#6574)" the addition of close_range() to the C code remains, it doesn't matter for the purposes of this reproducer. This reverts parts of commit 1be5e564ceac53456a4479bd8a8dc4c1af7c2b28. --- Cargo.lock | 11 +++++++ Cargo.toml | 1 + pageserver/Cargo.toml | 1 + pageserver/src/walredo/process.rs | 54 +++++++++++++++++++++++++------ 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74cd2c8d2c..ddcf29b870 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1156,6 +1156,16 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +[[package]] +name = "close_fds" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" +dependencies = [ + "cfg-if", + "libc", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -3457,6 +3467,7 @@ dependencies = [ "camino-tempfile", "chrono", "clap", + "close_fds", "const_format", "consumption_metrics", "crc32c", diff --git a/Cargo.toml b/Cargo.toml index 8952f7627f..0c1a2e3007 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,6 +66,7 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } +close_fds = "0.3.2" comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 95d558bb7b..e44501d1ed 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -21,6 +21,7 @@ camino.workspace = true camino-tempfile.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } +close_fds.workspace = true const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bcbb263663..a4aba525a3 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -9,7 +9,7 @@ use bytes::Bytes; use nix::poll::{PollFd, PollFlags}; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; +use std::os::{fd::AsRawFd, unix::process::CommandExt}; #[cfg(feature = "testing")] use std::sync::atomic::AtomicUsize; use std::{ @@ -26,6 +26,40 @@ mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; +/// +/// Command with ability not to give all file descriptors to child process +/// +trait CloseFileDescriptors: CommandExt { + /// + /// Close file descriptors (other than stdin, stdout, stderr) in child process + /// + fn close_fds(&mut self) -> &mut Command; +} + +impl CloseFileDescriptors for C { + fn close_fds(&mut self) -> &mut Command { + // SAFETY: Code executed inside pre_exec should have async-signal-safety, + // which means it should be safe to execute inside a signal handler. + // The precise meaning depends on platform. See `man signal-safety` + // for the linux definition. + // + // The set_fds_cloexec_threadsafe function is documented to be + // async-signal-safe. + // + // Aside from this function, the rest of the code is re-entrant and + // doesn't make any syscalls. We're just passing constants. + // + // NOTE: It's easy to indirectly cause a malloc or lock a mutex, + // which is not async-signal-safe. Be careful. + unsafe { + self.pre_exec(move || { + close_fds::set_fds_cloexec_threadsafe(3, &[]); + Ok(()) + }) + } + } +} + pub struct WalRedoProcess { #[allow(dead_code)] conf: &'static PageServerConf, @@ -79,14 +113,16 @@ impl WalRedoProcess { .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. + // The redo process is not trusted, and runs in seccomp mode that + // doesn't allow it to open any files. We have to also make sure it + // doesn't inherit any file descriptors from the pageserver, that + // would allow an attacker to read any files that happen to be open + // in the pageserver. + // + // The Rust standard library makes sure to mark any file descriptors with + // as close-on-exec by default, but that's not enough, since we use + // libraries that directly call libc open without setting that flag. + .close_fds() .spawn_no_leak_child(tenant_shard_id) .context("spawn process")?; WAL_REDO_PROCESS_COUNTERS.started.inc();