mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-30 03:20:36 +00:00
Merge branch 'main' into problame/integrate-tokio-epoll-uring/benchmarking/2024-01-31-prs/async-walredo
This commit is contained in:
@@ -47,9 +47,10 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
struct SerdeRepr<T> {
|
||||
buffer: Vec<T>,
|
||||
buffer_size: usize,
|
||||
drop_count: u64,
|
||||
}
|
||||
|
||||
@@ -61,6 +62,7 @@ where
|
||||
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
|
||||
SerdeRepr {
|
||||
buffer: buffer.iter().cloned().collect(),
|
||||
buffer_size: L,
|
||||
drop_count: *drop_count,
|
||||
}
|
||||
}
|
||||
@@ -78,19 +80,52 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
|
||||
where
|
||||
T: Clone + serde::Deserialize<'de>,
|
||||
{
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let SerdeRepr {
|
||||
buffer: des_buffer,
|
||||
drop_count,
|
||||
buffer_size,
|
||||
} = SerdeRepr::<T>::deserialize(deserializer)?;
|
||||
if buffer_size != L {
|
||||
use serde::de::Error;
|
||||
return Err(D::Error::custom(format!(
|
||||
"invalid buffer_size, expecting {L} got {buffer_size}"
|
||||
)));
|
||||
}
|
||||
let mut buffer = HistoryBuffer::new();
|
||||
buffer.extend(des_buffer);
|
||||
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::HistoryBufferWithDropCounter;
|
||||
|
||||
#[test]
|
||||
fn test_basics() {
|
||||
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
|
||||
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
|
||||
b.write(1);
|
||||
b.write(2);
|
||||
b.write(3);
|
||||
assert!(b.iter().any(|e| *e == 2));
|
||||
assert!(b.iter().any(|e| *e == 3));
|
||||
assert!(!b.iter().any(|e| *e == 1));
|
||||
|
||||
// round-trip serde
|
||||
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
|
||||
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
|
||||
assert_eq!(
|
||||
round_tripped.iter().cloned().collect::<Vec<_>>(),
|
||||
b.iter().cloned().collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -87,6 +87,8 @@ pub mod failpoint_support;
|
||||
|
||||
pub mod yielding_loop;
|
||||
|
||||
pub mod zstd;
|
||||
|
||||
pub mod poison;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
|
||||
@@ -29,12 +29,10 @@ pub struct PageserverFeedback {
|
||||
// Serialize with RFC3339 format.
|
||||
#[serde(with = "serde_systemtime")]
|
||||
pub replytime: SystemTime,
|
||||
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
|
||||
pub shard_number: u32,
|
||||
}
|
||||
|
||||
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
|
||||
// Do not remove previously available fields because this might be backwards incompatible.
|
||||
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||
|
||||
impl PageserverFeedback {
|
||||
pub fn empty() -> PageserverFeedback {
|
||||
PageserverFeedback {
|
||||
@@ -43,6 +41,7 @@ impl PageserverFeedback {
|
||||
remote_consistent_lsn: Lsn::INVALID,
|
||||
disk_consistent_lsn: Lsn::INVALID,
|
||||
replytime: *PG_EPOCH,
|
||||
shard_number: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,17 +58,26 @@ impl PageserverFeedback {
|
||||
//
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn serialize(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
let buf_ptr = buf.len();
|
||||
buf.put_u8(0); // # of keys, will be filled later
|
||||
let mut nkeys = 0;
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"current_timeline_size\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.current_timeline_size);
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"ps_writelsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.last_received_lsn.0);
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"ps_flushlsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.disk_consistent_lsn.0);
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"ps_applylsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.remote_consistent_lsn.0);
|
||||
@@ -80,9 +88,19 @@ impl PageserverFeedback {
|
||||
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
||||
.as_micros() as i64;
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"ps_replytime\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_i64(timestamp);
|
||||
|
||||
if self.shard_number > 0 {
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"shard_number\0");
|
||||
buf.put_i32(4);
|
||||
buf.put_u32(self.shard_number);
|
||||
}
|
||||
|
||||
buf[buf_ptr] = nkeys;
|
||||
}
|
||||
|
||||
// Deserialize PageserverFeedback message
|
||||
@@ -125,9 +143,8 @@ impl PageserverFeedback {
|
||||
}
|
||||
b"shard_number" => {
|
||||
let len = buf.get_i32();
|
||||
// TODO: this will be implemented in the next update,
|
||||
// for now, we just skip the value.
|
||||
buf.advance(len as usize);
|
||||
assert_eq!(len, 4);
|
||||
rf.shard_number = buf.get_u32();
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
@@ -200,10 +217,7 @@ mod tests {
|
||||
rf.serialize(&mut data);
|
||||
|
||||
// Add an extra field to the buffer and adjust number of keys
|
||||
if let Some(first) = data.first_mut() {
|
||||
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
|
||||
}
|
||||
|
||||
data[0] += 1;
|
||||
data.put_slice(b"new_field_one\0");
|
||||
data.put_i32(8);
|
||||
data.put_u64(42);
|
||||
|
||||
@@ -110,6 +110,49 @@ impl<T> OnceCell<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, or returns an unique initialization
|
||||
/// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
|
||||
pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
|
||||
// It looks like OnceCell::get_or_init could be implemented using this method instead of
|
||||
// duplication. However, that makes the future be !Send due to possibly holding on to the
|
||||
// MutexGuard over an await point.
|
||||
loop {
|
||||
let sem = {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
guard.init_semaphore.clone()
|
||||
};
|
||||
|
||||
{
|
||||
let permit = {
|
||||
// increment the count for the duration of queued
|
||||
let _guard = CountWaitingInitializers::start(self);
|
||||
sem.acquire().await
|
||||
};
|
||||
|
||||
let Ok(permit) = permit else {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
|
||||
// there was a take_and_deinit in between
|
||||
continue;
|
||||
}
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
};
|
||||
|
||||
permit.forget();
|
||||
}
|
||||
|
||||
let permit = InitPermit(sem);
|
||||
return Err(permit);
|
||||
}
|
||||
}
|
||||
|
||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||
/// to complete initializing the inner value.
|
||||
///
|
||||
@@ -481,4 +524,39 @@ mod tests {
|
||||
|
||||
assert_eq!("t1", *cell.get().unwrap());
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn detached_init_smoke() {
|
||||
let target = OnceCell::default();
|
||||
|
||||
let Err(permit) = target.get_or_init_detached().await else {
|
||||
unreachable!("it is not initialized")
|
||||
};
|
||||
|
||||
tokio::time::timeout(
|
||||
std::time::Duration::from_secs(3600 * 24 * 7 * 365),
|
||||
target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
|
||||
)
|
||||
.await
|
||||
.expect_err("should timeout since we are already holding the permit");
|
||||
|
||||
target.set(42, permit);
|
||||
|
||||
let (_answer, permit) = {
|
||||
let mut guard = target
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(*guard, 42);
|
||||
|
||||
guard.take_and_deinit()
|
||||
};
|
||||
|
||||
assert!(target.get().is_none());
|
||||
|
||||
target.set(11, permit);
|
||||
|
||||
assert_eq!(*target.get().unwrap(), 11);
|
||||
}
|
||||
}
|
||||
|
||||
78
libs/utils/src/zstd.rs
Normal file
78
libs/utils/src/zstd.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
use std::io::SeekFrom;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use async_compression::{
|
||||
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
|
||||
zstd::CParameter,
|
||||
Level,
|
||||
};
|
||||
use camino::Utf8Path;
|
||||
use nix::NixPath;
|
||||
use tokio::{
|
||||
fs::{File, OpenOptions},
|
||||
io::AsyncBufRead,
|
||||
io::AsyncSeekExt,
|
||||
io::AsyncWriteExt,
|
||||
};
|
||||
use tokio_tar::{Archive, Builder, HeaderMode};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
/// Creates a Zstandard tarball.
|
||||
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(&tarball)
|
||||
.await
|
||||
.with_context(|| format!("tempfile creation {tarball}"))?;
|
||||
|
||||
let mut paths = Vec::new();
|
||||
for entry in WalkDir::new(path) {
|
||||
let entry = entry?;
|
||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||
// Also allow directories so that we also get empty directories
|
||||
if !(metadata.is_file() || metadata.is_dir()) {
|
||||
continue;
|
||||
}
|
||||
let path = entry.into_path();
|
||||
paths.push(path);
|
||||
}
|
||||
// Do a sort to get a more consistent listing
|
||||
paths.sort_unstable();
|
||||
let zstd = ZstdEncoder::with_quality_and_params(
|
||||
file,
|
||||
Level::Default,
|
||||
&[CParameter::enable_long_distance_matching(true)],
|
||||
);
|
||||
let mut builder = Builder::new(zstd);
|
||||
// Use reproducible header mode
|
||||
builder.mode(HeaderMode::Deterministic);
|
||||
for p in paths {
|
||||
let rel_path = p.strip_prefix(path)?;
|
||||
if rel_path.is_empty() {
|
||||
// The top directory should not be compressed,
|
||||
// the tar crate doesn't like that
|
||||
continue;
|
||||
}
|
||||
builder.append_path_with_name(&p, rel_path).await?;
|
||||
}
|
||||
let mut zstd = builder.into_inner().await?;
|
||||
zstd.shutdown().await?;
|
||||
let mut compressed = zstd.into_inner();
|
||||
let compressed_len = compressed.metadata().await?.len();
|
||||
compressed.seek(SeekFrom::Start(0)).await?;
|
||||
Ok((compressed, compressed_len))
|
||||
}
|
||||
|
||||
/// Creates a Zstandard tarball.
|
||||
pub async fn extract_zst_tarball(
|
||||
path: &Utf8Path,
|
||||
tarball: impl AsyncBufRead + Unpin,
|
||||
) -> Result<()> {
|
||||
let decoder = Box::pin(ZstdDecoder::new(tarball));
|
||||
let mut archive = Archive::new(decoder);
|
||||
archive.unpack(path).await?;
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user