Merge branch 'main' into problame/integrate-tokio-epoll-uring/benchmarking/2024-01-31-prs/async-walredo

This commit is contained in:
Christian Schwarz
2024-03-20 14:32:17 +00:00
126 changed files with 6964 additions and 1591 deletions

View File

@@ -47,9 +47,10 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
}
}
#[derive(serde::Serialize)]
#[derive(serde::Serialize, serde::Deserialize)]
struct SerdeRepr<T> {
buffer: Vec<T>,
buffer_size: usize,
drop_count: u64,
}
@@ -61,6 +62,7 @@ where
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
SerdeRepr {
buffer: buffer.iter().cloned().collect(),
buffer_size: L,
drop_count: *drop_count,
}
}
@@ -78,19 +80,52 @@ where
}
}
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
where
T: Clone + serde::Deserialize<'de>,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let SerdeRepr {
buffer: des_buffer,
drop_count,
buffer_size,
} = SerdeRepr::<T>::deserialize(deserializer)?;
if buffer_size != L {
use serde::de::Error;
return Err(D::Error::custom(format!(
"invalid buffer_size, expecting {L} got {buffer_size}"
)));
}
let mut buffer = HistoryBuffer::new();
buffer.extend(des_buffer);
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
}
}
#[cfg(test)]
mod test {
use super::HistoryBufferWithDropCounter;
#[test]
fn test_basics() {
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
b.write(1);
b.write(2);
b.write(3);
assert!(b.iter().any(|e| *e == 2));
assert!(b.iter().any(|e| *e == 3));
assert!(!b.iter().any(|e| *e == 1));
// round-trip serde
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
assert_eq!(
round_tripped.iter().cloned().collect::<Vec<_>>(),
b.iter().cloned().collect::<Vec<_>>()
);
}
#[test]

View File

@@ -87,6 +87,8 @@ pub mod failpoint_support;
pub mod yielding_loop;
pub mod zstd;
pub mod poison;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages

View File

@@ -29,12 +29,10 @@ pub struct PageserverFeedback {
// Serialize with RFC3339 format.
#[serde(with = "serde_systemtime")]
pub replytime: SystemTime,
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
pub shard_number: u32,
}
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
// Do not remove previously available fields because this might be backwards incompatible.
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
impl PageserverFeedback {
pub fn empty() -> PageserverFeedback {
PageserverFeedback {
@@ -43,6 +41,7 @@ impl PageserverFeedback {
remote_consistent_lsn: Lsn::INVALID,
disk_consistent_lsn: Lsn::INVALID,
replytime: *PG_EPOCH,
shard_number: 0,
}
}
@@ -59,17 +58,26 @@ impl PageserverFeedback {
//
// TODO: change serialized fields names once all computes migrate to rename.
pub fn serialize(&self, buf: &mut BytesMut) {
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
let buf_ptr = buf.len();
buf.put_u8(0); // # of keys, will be filled later
let mut nkeys = 0;
nkeys += 1;
buf.put_slice(b"current_timeline_size\0");
buf.put_i32(8);
buf.put_u64(self.current_timeline_size);
nkeys += 1;
buf.put_slice(b"ps_writelsn\0");
buf.put_i32(8);
buf.put_u64(self.last_received_lsn.0);
nkeys += 1;
buf.put_slice(b"ps_flushlsn\0");
buf.put_i32(8);
buf.put_u64(self.disk_consistent_lsn.0);
nkeys += 1;
buf.put_slice(b"ps_applylsn\0");
buf.put_i32(8);
buf.put_u64(self.remote_consistent_lsn.0);
@@ -80,9 +88,19 @@ impl PageserverFeedback {
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
.as_micros() as i64;
nkeys += 1;
buf.put_slice(b"ps_replytime\0");
buf.put_i32(8);
buf.put_i64(timestamp);
if self.shard_number > 0 {
nkeys += 1;
buf.put_slice(b"shard_number\0");
buf.put_i32(4);
buf.put_u32(self.shard_number);
}
buf[buf_ptr] = nkeys;
}
// Deserialize PageserverFeedback message
@@ -125,9 +143,8 @@ impl PageserverFeedback {
}
b"shard_number" => {
let len = buf.get_i32();
// TODO: this will be implemented in the next update,
// for now, we just skip the value.
buf.advance(len as usize);
assert_eq!(len, 4);
rf.shard_number = buf.get_u32();
}
_ => {
let len = buf.get_i32();
@@ -200,10 +217,7 @@ mod tests {
rf.serialize(&mut data);
// Add an extra field to the buffer and adjust number of keys
if let Some(first) = data.first_mut() {
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
}
data[0] += 1;
data.put_slice(b"new_field_one\0");
data.put_i32(8);
data.put_u64(42);

View File

@@ -110,6 +110,49 @@ impl<T> OnceCell<T> {
}
}
/// Returns a guard to an existing initialized value, or returns an unique initialization
/// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
// It looks like OnceCell::get_or_init could be implemented using this method instead of
// duplication. However, that makes the future be !Send due to possibly holding on to the
// MutexGuard over an await point.
loop {
let sem = {
let guard = self.inner.lock().unwrap();
if guard.value.is_some() {
return Ok(Guard(guard));
}
guard.init_semaphore.clone()
};
{
let permit = {
// increment the count for the duration of queued
let _guard = CountWaitingInitializers::start(self);
sem.acquire().await
};
let Ok(permit) = permit else {
let guard = self.inner.lock().unwrap();
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
// there was a take_and_deinit in between
continue;
}
assert!(
guard.value.is_some(),
"semaphore got closed, must be initialized"
);
return Ok(Guard(guard));
};
permit.forget();
}
let permit = InitPermit(sem);
return Err(permit);
}
}
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
/// to complete initializing the inner value.
///
@@ -481,4 +524,39 @@ mod tests {
assert_eq!("t1", *cell.get().unwrap());
}
#[tokio::test(start_paused = true)]
async fn detached_init_smoke() {
let target = OnceCell::default();
let Err(permit) = target.get_or_init_detached().await else {
unreachable!("it is not initialized")
};
tokio::time::timeout(
std::time::Duration::from_secs(3600 * 24 * 7 * 365),
target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
)
.await
.expect_err("should timeout since we are already holding the permit");
target.set(42, permit);
let (_answer, permit) = {
let mut guard = target
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
.await
.unwrap();
assert_eq!(*guard, 42);
guard.take_and_deinit()
};
assert!(target.get().is_none());
target.set(11, permit);
assert_eq!(*target.get().unwrap(), 11);
}
}

78
libs/utils/src/zstd.rs Normal file
View File

@@ -0,0 +1,78 @@
use std::io::SeekFrom;
use anyhow::{Context, Result};
use async_compression::{
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
zstd::CParameter,
Level,
};
use camino::Utf8Path;
use nix::NixPath;
use tokio::{
fs::{File, OpenOptions},
io::AsyncBufRead,
io::AsyncSeekExt,
io::AsyncWriteExt,
};
use tokio_tar::{Archive, Builder, HeaderMode};
use walkdir::WalkDir;
/// Creates a Zstandard tarball.
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tarball)
.await
.with_context(|| format!("tempfile creation {tarball}"))?;
let mut paths = Vec::new();
for entry in WalkDir::new(path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for p in paths {
let rel_path = p.strip_prefix(path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&p, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}
/// Creates a Zstandard tarball.
pub async fn extract_zst_tarball(
path: &Utf8Path,
tarball: impl AsyncBufRead + Unpin,
) -> Result<()> {
let decoder = Box::pin(ZstdDecoder::new(tarball));
let mut archive = Archive::new(decoder);
archive.unpack(path).await?;
Ok(())
}