feat(walredo): use posix_spawn by moving close_fds() work to walredo C code (#6574)

The rust stdlib uses the efficient `posix_spawn` by default.
However, before this PR, pageserver used `pre_exec()` in our
`close_fds()` ext trait.

This PR moves the work that `close_fds()` did to the walredo C code.
I verified manually using `gdb` that we're now forking out the walredo
process using `posix_spawn`.

refs https://github.com/neondatabase/neon/issues/6565
This commit is contained in:
Christian Schwarz
2024-02-01 22:38:34 +01:00
committed by GitHub
parent 7a70ef991f
commit 1be5e564ce
5 changed files with 41 additions and 58 deletions

11
Cargo.lock generated
View File

@@ -1144,16 +1144,6 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
name = "close_fds"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed"
dependencies = [
"cfg-if",
"libc",
]
[[package]]
name = "colorchoice"
version = "1.0.0"
@@ -3418,7 +3408,6 @@ dependencies = [
"camino-tempfile",
"chrono",
"clap",
"close_fds",
"const_format",
"consumption_metrics",
"crc32c",

View File

@@ -64,7 +64,6 @@ camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
close_fds = "0.3.2"
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"

View File

@@ -21,7 +21,6 @@ camino.workspace = true
camino-tempfile.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
close_fds.workspace = true
const_format.workspace = true
consumption_metrics.workspace = true
crc32c.workspace = true

View File

@@ -30,7 +30,6 @@ use std::io;
use std::io::prelude::*;
use std::ops::{Deref, DerefMut};
use std::os::unix::io::AsRawFd;
use std::os::unix::prelude::CommandExt;
use std::process::Stdio;
use std::process::{Child, ChildStdin, ChildStdout, Command};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
@@ -628,40 +627,6 @@ impl PostgresRedoManager {
}
}
///
/// Command with ability not to give all file descriptors to child process
///
trait CloseFileDescriptors: CommandExt {
///
/// Close file descriptors (other than stdin, stdout, stderr) in child process
///
fn close_fds(&mut self) -> &mut Command;
}
impl<C: CommandExt> CloseFileDescriptors for C {
fn close_fds(&mut self) -> &mut Command {
// SAFETY: Code executed inside pre_exec should have async-signal-safety,
// which means it should be safe to execute inside a signal handler.
// The precise meaning depends on platform. See `man signal-safety`
// for the linux definition.
//
// The set_fds_cloexec_threadsafe function is documented to be
// async-signal-safe.
//
// Aside from this function, the rest of the code is re-entrant and
// doesn't make any syscalls. We're just passing constants.
//
// NOTE: It's easy to indirectly cause a malloc or lock a mutex,
// which is not async-signal-safe. Be careful.
unsafe {
self.pre_exec(move || {
close_fds::set_fds_cloexec_threadsafe(3, &[]);
Ok(())
})
}
}
}
struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
@@ -701,16 +666,14 @@ impl WalRedoProcess {
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// The redo process is not trusted, and runs in seccomp mode that
// doesn't allow it to open any files. We have to also make sure it
// doesn't inherit any file descriptors from the pageserver, that
// would allow an attacker to read any files that happen to be open
// in the pageserver.
//
// The Rust standard library makes sure to mark any file descriptors with
// as close-on-exec by default, but that's not enough, since we use
// libraries that directly call libc open without setting that flag.
.close_fds()
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();

View File

@@ -140,9 +140,42 @@ static XLogReaderState *reader_state;
#define TRACE DEBUG5
#ifdef HAVE_LIBSECCOMP
/*
* https://man7.org/linux/man-pages/man2/close_range.2.html
*
* The `close_range` syscall is available as of Linux 5.9.
*
* The `close_range` libc wrapper is only available in glibc >= 2.34.
* Debian Bullseye ships a libc package based on glibc 2.31.
* => write the wrapper ourselves, using the syscall number from the kernel headers.
*
* If the Linux uAPI headers don't define the system call number,
* fail the build deliberately rather than ifdef'ing it to ENOSYS.
* We prefer a compile time over a runtime error for walredo.
*/
#include <unistd.h>
#include <sys/syscall.h>
#include <errno.h>
int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) {
return syscall(__NR_close_range, start_fd, count, flags);
}
static void
enter_seccomp_mode(void)
{
/*
* The pageserver process relies on us to close all the file descriptors
* it potentially leaked to us, _before_ we start processing potentially dangerous
* wal records. See the comment in the Rust code that launches this process.
*/
int err;
if (err = close_range(3, ~0U, 0)) {
ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3")));
}
PgSeccompRule syscalls[] =
{
/* Hard requirements */