mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 12:02:55 +00:00
feat(walredo): use posix_spawn by moving close_fds() work to walredo C code (#6574)
The rust stdlib uses the efficient `posix_spawn` by default. However, before this PR, pageserver used `pre_exec()` in our `close_fds()` ext trait. This PR moves the work that `close_fds()` did to the walredo C code. I verified manually using `gdb` that we're now forking out the walredo process using `posix_spawn`. refs https://github.com/neondatabase/neon/issues/6565
This commit is contained in:
committed by
GitHub
parent
7a70ef991f
commit
1be5e564ce
11
Cargo.lock
generated
11
Cargo.lock
generated
@@ -1144,16 +1144,6 @@ version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
|
||||
|
||||
[[package]]
|
||||
name = "close_fds"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
version = "1.0.0"
|
||||
@@ -3418,7 +3408,6 @@ dependencies = [
|
||||
"camino-tempfile",
|
||||
"chrono",
|
||||
"clap",
|
||||
"close_fds",
|
||||
"const_format",
|
||||
"consumption_metrics",
|
||||
"crc32c",
|
||||
|
||||
@@ -64,7 +64,6 @@ camino = "1.1.6"
|
||||
cfg-if = "1.0.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
close_fds = "0.3.2"
|
||||
comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
|
||||
@@ -21,7 +21,6 @@ camino.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
close_fds.workspace = true
|
||||
const_format.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crc32c.workspace = true
|
||||
|
||||
@@ -30,7 +30,6 @@ use std::io;
|
||||
use std::io::prelude::*;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::os::unix::prelude::CommandExt;
|
||||
use std::process::Stdio;
|
||||
use std::process::{Child, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
|
||||
@@ -628,40 +627,6 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Command with ability not to give all file descriptors to child process
|
||||
///
|
||||
trait CloseFileDescriptors: CommandExt {
|
||||
///
|
||||
/// Close file descriptors (other than stdin, stdout, stderr) in child process
|
||||
///
|
||||
fn close_fds(&mut self) -> &mut Command;
|
||||
}
|
||||
|
||||
impl<C: CommandExt> CloseFileDescriptors for C {
|
||||
fn close_fds(&mut self) -> &mut Command {
|
||||
// SAFETY: Code executed inside pre_exec should have async-signal-safety,
|
||||
// which means it should be safe to execute inside a signal handler.
|
||||
// The precise meaning depends on platform. See `man signal-safety`
|
||||
// for the linux definition.
|
||||
//
|
||||
// The set_fds_cloexec_threadsafe function is documented to be
|
||||
// async-signal-safe.
|
||||
//
|
||||
// Aside from this function, the rest of the code is re-entrant and
|
||||
// doesn't make any syscalls. We're just passing constants.
|
||||
//
|
||||
// NOTE: It's easy to indirectly cause a malloc or lock a mutex,
|
||||
// which is not async-signal-safe. Be careful.
|
||||
unsafe {
|
||||
self.pre_exec(move || {
|
||||
close_fds::set_fds_cloexec_threadsafe(3, &[]);
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
@@ -701,16 +666,14 @@ impl WalRedoProcess {
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// The redo process is not trusted, and runs in seccomp mode that
|
||||
// doesn't allow it to open any files. We have to also make sure it
|
||||
// doesn't inherit any file descriptors from the pageserver, that
|
||||
// would allow an attacker to read any files that happen to be open
|
||||
// in the pageserver.
|
||||
//
|
||||
// The Rust standard library makes sure to mark any file descriptors with
|
||||
// as close-on-exec by default, but that's not enough, since we use
|
||||
// libraries that directly call libc open without setting that flag.
|
||||
.close_fds()
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
|
||||
@@ -140,9 +140,42 @@ static XLogReaderState *reader_state;
|
||||
#define TRACE DEBUG5
|
||||
|
||||
#ifdef HAVE_LIBSECCOMP
|
||||
|
||||
|
||||
/*
|
||||
* https://man7.org/linux/man-pages/man2/close_range.2.html
|
||||
*
|
||||
* The `close_range` syscall is available as of Linux 5.9.
|
||||
*
|
||||
* The `close_range` libc wrapper is only available in glibc >= 2.34.
|
||||
* Debian Bullseye ships a libc package based on glibc 2.31.
|
||||
* => write the wrapper ourselves, using the syscall number from the kernel headers.
|
||||
*
|
||||
* If the Linux uAPI headers don't define the system call number,
|
||||
* fail the build deliberately rather than ifdef'ing it to ENOSYS.
|
||||
* We prefer a compile time over a runtime error for walredo.
|
||||
*/
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <errno.h>
|
||||
int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) {
|
||||
return syscall(__NR_close_range, start_fd, count, flags);
|
||||
}
|
||||
|
||||
static void
|
||||
enter_seccomp_mode(void)
|
||||
{
|
||||
|
||||
/*
|
||||
* The pageserver process relies on us to close all the file descriptors
|
||||
* it potentially leaked to us, _before_ we start processing potentially dangerous
|
||||
* wal records. See the comment in the Rust code that launches this process.
|
||||
*/
|
||||
int err;
|
||||
if (err = close_range(3, ~0U, 0)) {
|
||||
ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3")));
|
||||
}
|
||||
|
||||
PgSeccompRule syscalls[] =
|
||||
{
|
||||
/* Hard requirements */
|
||||
|
||||
Reference in New Issue
Block a user