pageserver: hard-link layers during shard split

This commit is contained in:
John Spray
2024-02-11 17:54:41 +00:00
parent 0526603adb
commit 54d683ae07

View File

@@ -31,6 +31,7 @@ use crate::control_plane_client::{
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::disk_usage_eviction_task::EvictionLayer;
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
@@ -1439,8 +1440,10 @@ impl TenantManager {
}
};
// TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
// TODO: erase the dentries from the parent
// Optimization: hardlink layers from the parent into the children, so that they don't have to
// re-download & duplicate the data referenced in their initial IndexPart
self.shard_split_hardlink(parent, child_shards.clone())
.await?;
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
// child shards to reach this point.
@@ -1479,10 +1482,11 @@ impl TenantManager {
// Phase 4: wait for child chards WAL ingest to catch up to target LSN
for child_shard_id in &child_shards {
let child_shard_id = *child_shard_id;
let child_shard = {
let locked = TENANTS.read().unwrap();
let peek_slot =
tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
peek_slot.and_then(|s| s.get_attached()).cloned()
};
if let Some(t) = child_shard {
@@ -1550,6 +1554,113 @@ impl TenantManager {
Ok(child_shards)
}
/// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization
/// to avoid the children downloading them again.
///
/// For each resident layer in the parent shard, we will hard link it into all of the child shards.
async fn shard_split_hardlink(
&self,
parent_shard: &Tenant,
child_shards: Vec<TenantShardId>,
) -> anyhow::Result<()> {
let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id());
let (parent_timelines, parent_layers) = {
let mut parent_layers = Vec::new();
let timelines = parent_shard.timelines.lock().unwrap().clone();
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
for timeline in timelines.values() {
let timeline_layers = timeline.get_local_layers_for_disk_usage_eviction().await;
for layer in timeline_layers.resident_layers.into_iter().map(|r| r.layer) {
let layer = match layer {
EvictionLayer::Attached(l) => l,
EvictionLayer::Secondary(_) => {
// Unreachable, because we fetched layers from an object of type `Tenant`, it can
// only return attached layers
unreachable!();
}
};
let relative_path = layer
.local_path()
.strip_prefix(&parent_path)
.context("Removing prefix from parent layer path")?;
parent_layers.push(relative_path.to_owned());
}
}
(parent_timelines, parent_layers)
};
for child in child_shards {
let child_prefix = self.conf.tenant_path(&child);
let mut create_dirs = vec![child_prefix.clone()];
create_dirs.extend(
parent_timelines
.iter()
.map(|t| self.conf.timeline_path(&child, t)),
);
let layers_clone = parent_layers.clone();
let parent_path = parent_path.clone();
// Since we will do a large number of small filesystem metadata operations, batch them into
// spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
let jh = tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
for dir in create_dirs {
if let Err(e) = std::fs::create_dir_all(&dir) {
// Ignore AlreadyExists errors, drop out on all other errors
match e.kind() {
std::io::ErrorKind::AlreadyExists => {}
_ => {
return Err(anyhow::anyhow!(e).context(format!("Creating {dir}")));
}
}
}
}
for relative_layer in layers_clone {
let parent_path = parent_path.join(&relative_layer);
let child_path = child_prefix.join(&relative_layer);
if let Err(e) = std::fs::hard_link(&parent_path, &child_path) {
match e.kind() {
std::io::ErrorKind::AlreadyExists => {}
std::io::ErrorKind::NotFound => {
tracing::info!(
"Layer {} not found during hard-linking, evicted during split?",
relative_layer
);
}
_ => {
return Err(anyhow::anyhow!(e).context(format!(
"Hard linking {relative_layer} into {child}"
)))
}
}
}
}
Ok(())
});
match jh.await {
Ok(Ok(())) => {
tracing::info!(
"Linked {} layers into child shard {}",
parent_layers.len(),
child
);
}
Ok(Err(e)) => {
// This is an optimization, so we tolerate failure.
tracing::warn!("Error hard-linking layers, proceeding anyway: {e}")
}
Err(e) => {
// This is something totally unexpected like a panic, so bail out.
anyhow::bail!("Error joining hard linking task: {e}");
}
}
}
Ok(())
}
}
#[derive(Debug, thiserror::Error)]