Compare commits

...

4 Commits

Author SHA1 Message Date
Christian Schwarz
6aefbb1aa5 [DO NOT MERGE] more debug logging to prove hypothesis that it's the fsyncs 2024-01-26 10:54:35 +00:00
Christian Schwarz
a7f8032d28 force CI run after adding run-benchmarks label 2024-01-25 20:13:13 +00:00
Christian Schwarz
dac2f40b26 re-enable the n_tenants=100 2024-01-25 20:11:17 +00:00
Christian Schwarz
eb2088264f suspicion: the fsync after creating a lot of data is the culprit; add info! line to get a timestamp before we start claiming pidfile 2024-01-25 20:10:54 +00:00
5 changed files with 31 additions and 3 deletions

View File

@@ -5,6 +5,7 @@ use std::{
};
use camino::{Utf8Path, Utf8PathBuf};
use tracing::info;
/// Similar to [`std::fs::create_dir`], except we fsync the
/// created directory and its parent.
@@ -81,6 +82,22 @@ pub fn path_with_suffix_extension(
original_path.as_ref().with_extension(new_extension)
}
/// Identical to [`fsync_file_and_parent`], except that an `info!` event is
/// emitted before each fsync (and after both complete), so the wall-clock
/// cost of each step is visible in the logs. Temporary debugging aid.
#[tracing::instrument(skip_all)]
pub fn fsync_file_and_parent_log(file_path: &Utf8Path) -> io::Result<()> {
    // A path with no parent cannot have its directory entry fsynced.
    let parent = match file_path.parent() {
        Some(dir) => dir,
        None => {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!("File {file_path:?} has no parent"),
            ))
        }
    };
    info!("fsync file");
    fsync(file_path)?;
    info!("fsync parent");
    fsync(parent)?;
    info!("done");
    Ok(())
}
pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
let parent = file_path.parent().ok_or_else(|| {
io::Error::new(
@@ -88,7 +105,6 @@ pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
format!("File {file_path:?} has no parent"),
)
})?;
fsync(file_path)?;
fsync(parent)?;
Ok(())

View File

@@ -16,6 +16,7 @@ use std::{
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use nix::{errno::Errno::EAGAIN, fcntl};
use tracing::info;
use crate::crashsafe;
@@ -41,14 +42,19 @@ impl Deref for LockFileGuard {
impl UnwrittenLockFile {
/// Replace the content of this lock file with the byte representation of `contents`.
// Consumes `self`; on success the open file handle moves into the returned
// `LockFileGuard`, on any error it is dropped with `self`.
#[tracing::instrument(skip_all)]
pub fn write_content(mut self, contents: String) -> anyhow::Result<LockFileGuard> {
// The per-step info! events below are temporary debug logging added to
// time each phase (truncate / write / fsync) of writing the lock file.
info!("truncate");
self.file
.set_len(0)
.context("Failed to truncate lockfile")?;
info!("write_all");
self.file
.write_all(contents.as_bytes())
.with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?;
// NOTE(review): the next line looks like the removed ("-") side of the diff
// left in by the page capture; the `_log` variant two lines below is the
// added replacement. As captured, both fsync calls would run — confirm
// against the real diff before treating this text as compilable source.
crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?;
info!("fsync file and parent");
crashsafe::fsync_file_and_parent_log(&self.path).context("fsync lockfile")?;
info!("done");
Ok(LockFileGuard(self.file))
}
}

View File

@@ -54,6 +54,7 @@ use std::ops::Deref;
use anyhow::Context;
use camino::Utf8Path;
use nix::unistd::Pid;
use tracing::info;
use crate::lock_file::{self, LockFileRead};
@@ -85,12 +86,16 @@ impl Deref for PidFileGuard {
/// The claim ends as soon as the returned guard object is dropped.
/// To maintain the claim for the remaining lifetime of the current process,
/// use [`std::mem::forget`] or similar.
// Temporary per-step info! logging to narrow down which phase of claiming
// the pid file is slow (lock-file creation vs. writing/fsyncing contents).
#[tracing::instrument(skip_all)]
pub fn claim_for_current_process(path: &Utf8Path) -> anyhow::Result<PidFileGuard> {
info!("create_exclusive");
let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
// if any of the next steps fail, we drop the file descriptor and thereby release the lock
info!("write_content");
// Writes this process's pid into the lock file; the guard keeps it locked.
let guard = unwritten_lock_file
.write_content(Pid::this().to_string())
.context("write pid to lock file")?;
info!("done");
Ok(PidFileGuard(guard))
}

View File

@@ -293,6 +293,7 @@ fn start_pageserver(
// Create and lock PID file. This ensures that there cannot be more than one
// pageserver process running at the same time.
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
info!("Claiming pid file at {lock_file_path:?}");
let lock_file =
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
info!("Claimed pid file at {lock_file_path:?}");

View File

@@ -29,7 +29,7 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
@pytest.mark.parametrize("duration", [30])
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
@pytest.mark.parametrize("n_tenants", [1, 10])
@pytest.mark.parametrize("n_tenants", [1, 10, 100])
@pytest.mark.timeout(
10000
) # TODO: this value is just "a really high number"; have this per instance type