mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 10:30:40 +00:00
fix(pageserver): make delta split layer writer finish atomic (#9048)
similar to https://github.com/neondatabase/neon/pull/8841, we make the delta layer writer atomic when finishing the layers. ## Summary of changes * `put_value` not taking discard fn anymore * `finish` decides what layers to keep --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
@@ -170,7 +170,7 @@ impl SplitImageLayerWriter {
|
||||
}
|
||||
Err(e) => {
|
||||
// ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong,
|
||||
// so we don't need to remove it by ourselves.
|
||||
// so we don't need to remove the layer we just failed to create by ourselves.
|
||||
clean_up_layers(generated_layers);
|
||||
return Err(e);
|
||||
}
|
||||
@@ -206,7 +206,7 @@ impl SplitImageLayerWriter {
|
||||
pub struct SplitDeltaLayerWriter {
|
||||
inner: Option<(Key, DeltaLayerWriter)>,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<SplitWriterResult>,
|
||||
generated_layer_writers: Vec<(DeltaLayerWriter, PersistentLayerKey)>,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -225,7 +225,7 @@ impl SplitDeltaLayerWriter {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
inner: None,
|
||||
generated_layers: Vec::new(),
|
||||
generated_layer_writers: Vec::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
@@ -234,20 +234,13 @@ impl SplitDeltaLayerWriter {
|
||||
})
|
||||
}
|
||||
|
||||
/// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end.
|
||||
pub async fn put_value_with_discard_fn<D, F>(
|
||||
pub async fn put_value(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
discard: D,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
|
||||
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
|
||||
//
|
||||
@@ -291,24 +284,8 @@ impl SplitDeltaLayerWriter {
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
};
|
||||
if discard(&layer_key).await {
|
||||
drop(prev_delta_writer);
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
// `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary
|
||||
// files for `finish_creating`.
|
||||
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
|
||||
let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) {
|
||||
Ok(layer) => layer,
|
||||
Err(e) => {
|
||||
tokio::fs::remove_file(&path).await.ok();
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Produced(delta_layer));
|
||||
}
|
||||
self.generated_layer_writers
|
||||
.push((prev_delta_writer, layer_key));
|
||||
} else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
|
||||
// We have to produce a very large file b/c a key is updated too often.
|
||||
anyhow::bail!(
|
||||
@@ -323,60 +300,68 @@ impl SplitDeltaLayerWriter {
|
||||
inner.put_value(key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub async fn put_value(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish_with_discard_fn<D, F>(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
discard: D,
|
||||
discard_fn: D,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
D: Fn(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
mut generated_layer_writers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
let Some((start_key, inner)) = inner else {
|
||||
return Ok(generated_layers);
|
||||
};
|
||||
if inner.num_keys() == 0 {
|
||||
return Ok(generated_layers);
|
||||
if let Some((start_key, writer)) = inner {
|
||||
if writer.num_keys() != 0 {
|
||||
let end_key = self.last_key_written.next();
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: start_key..end_key,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
};
|
||||
generated_layer_writers.push((writer, layer_key));
|
||||
}
|
||||
}
|
||||
let end_key = self.last_key_written.next();
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: start_key..end_key,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
};
|
||||
if discard(&layer_key).await {
|
||||
generated_layers.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
// `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary
|
||||
// files for `finish_creating`.
|
||||
let (desc, path) = inner.finish(end_key, ctx).await?;
|
||||
let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) {
|
||||
Ok(layer) => layer,
|
||||
Err(e) => {
|
||||
tokio::fs::remove_file(&path).await.ok();
|
||||
return Err(e);
|
||||
let clean_up_layers = |generated_layers: Vec<SplitWriterResult>| {
|
||||
for produced_layer in generated_layers {
|
||||
if let SplitWriterResult::Produced(delta_layer) = produced_layer {
|
||||
let layer: Layer = delta_layer.into();
|
||||
layer.delete_on_drop();
|
||||
}
|
||||
};
|
||||
generated_layers.push(SplitWriterResult::Produced(delta_layer));
|
||||
}
|
||||
};
|
||||
// BEGIN: catch every error and do the recovery in the below section
|
||||
let mut generated_layers = Vec::new();
|
||||
for (inner, layer_key) in generated_layer_writers {
|
||||
if discard_fn(&layer_key).await {
|
||||
generated_layers.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
let layer = match inner.finish(layer_key.key_range.end, ctx).await {
|
||||
Ok((desc, path)) => {
|
||||
match Layer::finish_creating(self.conf, tline, desc, &path) {
|
||||
Ok(layer) => layer,
|
||||
Err(e) => {
|
||||
tokio::fs::remove_file(&path).await.ok();
|
||||
clean_up_layers(generated_layers);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong,
|
||||
// so we don't need to remove the layer we just failed to create by ourselves.
|
||||
clean_up_layers(generated_layers);
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
generated_layers.push(SplitWriterResult::Produced(layer));
|
||||
}
|
||||
}
|
||||
// END: catch every error and do the recovery in the above section
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
@@ -389,11 +374,6 @@ impl SplitDeltaLayerWriter {
|
||||
self.finish_with_discard_fn(tline, ctx, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
/// This function will be deprecated with #8841.
|
||||
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
|
||||
Ok((self.generated_layers, self.inner.map(|x| x.1)))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -473,13 +453,7 @@ mod tests {
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::Image(get_img(0)),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
|
||||
@@ -551,14 +525,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(
|
||||
get_key(i),
|
||||
Lsn(0x20),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
|_| async { discard },
|
||||
)
|
||||
.put_value(get_key(i), Lsn(0x20), Value::Image(get_large_img()), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
@@ -664,23 +631,11 @@ mod tests {
|
||||
assert_eq!(layers.len(), 2);
|
||||
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::Image(get_img(0)),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(1),
|
||||
Lsn(0x1A),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.put_value(get_key(1), Lsn(0x1A), Value::Image(get_large_img()), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
|
||||
@@ -744,7 +699,6 @@ mod tests {
|
||||
get_key(0),
|
||||
Lsn(i as u64 * 16 + 0x10),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -121,18 +121,12 @@ impl KeyHistoryRetention {
|
||||
async fn pipe_to(
|
||||
self,
|
||||
key: Key,
|
||||
tline: &Arc<Timeline>,
|
||||
delta_writer: &mut SplitDeltaLayerWriter,
|
||||
mut image_writer: Option<&mut SplitImageLayerWriter>,
|
||||
stat: &mut CompactionStatistics,
|
||||
dry_run: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut first_batch = true;
|
||||
let discard = |key: &PersistentLayerKey| {
|
||||
let key = key.clone();
|
||||
async move { Self::discard_key(&key, tline, dry_run).await }
|
||||
};
|
||||
for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
|
||||
if first_batch {
|
||||
if logs.len() == 1 && logs[0].1.is_image() {
|
||||
@@ -144,40 +138,27 @@ impl KeyHistoryRetention {
|
||||
image_writer.put_image(key, img.clone(), ctx).await?;
|
||||
} else {
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(
|
||||
key,
|
||||
cutoff_lsn,
|
||||
Value::Image(img.clone()),
|
||||
tline,
|
||||
ctx,
|
||||
discard,
|
||||
)
|
||||
.put_value(key, cutoff_lsn, Value::Image(img.clone()), ctx)
|
||||
.await?;
|
||||
}
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
|
||||
.await?;
|
||||
delta_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
}
|
||||
first_batch = false;
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
|
||||
.await?;
|
||||
delta_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
|
||||
for (lsn, val) in above_horizon_logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
|
||||
.await?;
|
||||
delta_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1988,11 +1969,9 @@ impl Timeline {
|
||||
retention
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
self,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
dry_run,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -2019,11 +1998,9 @@ impl Timeline {
|
||||
retention
|
||||
.pipe_to(
|
||||
last_key,
|
||||
self,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
dry_run,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -2051,8 +2028,7 @@ impl Timeline {
|
||||
.finish_with_discard_fn(self, ctx, discard)
|
||||
.await?
|
||||
} else {
|
||||
let (layers, _) = delta_layer_writer.take()?;
|
||||
assert!(layers.is_empty(), "delta layers produced in dry run mode?");
|
||||
drop(delta_layer_writer);
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user