safekeeper: don't un-evict timelines during snapshot API handler (#9428)

## Problem

When we use the pull_timeline API on an evicted timeline, the timeline gets
downloaded back to local disk so that the source can serve the snapshot API
request. That means that to evacuate all the timelines from a node, the node
needs enough disk space to re-download the partial segments of all its
timelines, which may not physically be available.

Closes: #8833 

## Summary of changes

- Add a "try" variant of acquiring a residence guard that returns None
  if the timeline is offloaded
- In the snapshot API handler, take a different code path if the
  timeline isn't resident: just read the checkpoint and don't try to
  read any segments (sketched below).
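
At a high level, the snapshot path now branches on residency instead of forcing a download. A minimal, self-contained toy of that control flow (placeholder types, not the real safekeeper API; the real functions also take source/destination node IDs and stream into a channel):

```rust
use std::sync::Arc;

// Toy stand-ins for the real Timeline / WalResidentTimeline types.
struct Timeline {
    offloaded: bool,
}
struct WalResidentTimeline;

impl Timeline {
    // Mirrors try_wal_residence_guard(): Some(guard) only if the timeline is
    // resident; never triggers a download of evicted WAL.
    fn try_wal_residence_guard(self: &Arc<Self>) -> Option<WalResidentTimeline> {
        if self.offloaded {
            None
        } else {
            Some(WalResidentTimeline)
        }
    }
}

fn snapshot(tli: &Arc<Timeline>) -> &'static str {
    match tli.try_wal_residence_guard() {
        Some(_guard) => "stream control file + WAL segments",
        None => "stream control file only, nothing downloaded",
    }
}

fn main() {
    let evicted = Arc::new(Timeline { offloaded: true });
    assert_eq!(snapshot(&evicted), "stream control file only, nothing downloaded");
}
```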
Author: John Spray
Committed: 2024-10-28 08:47:12 +00:00 (by GitHub)
Commit: 923974d4da (parent: e7277885b3)
6 changed files with 298 additions and 42 deletions

View File

@@ -262,14 +262,6 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
check_permission(&request, Some(ttid.tenant_id))?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
// Note: with evicted timelines it should work better than de-evicting them and
// stream; probably start_snapshot would copy partial s3 file to dest path
// and stream control file, or return WalResidentTimeline if timeline is not
// evicted.
let tli = tli
.wal_residence_guard()
.await
.map_err(ApiError::InternalServerError)?;
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
// so create the chan and write to it in another task.

View File

@@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize};
use std::{
cmp::min,
io::{self, ErrorKind},
sync::Arc,
};
use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
use tokio_tar::{Archive, Builder, Header};
@@ -25,8 +26,8 @@ use crate::{
routes::TimelineStatus,
},
safekeeper::Term,
state::TimelinePersistentState,
timeline::WalResidentTimeline,
state::{EvictionState, TimelinePersistentState},
timeline::{Timeline, WalResidentTimeline},
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
wal_backup,
wal_storage::open_wal_file,
@@ -43,18 +44,33 @@ use utils::{
/// Stream tar archive of timeline to tx.
#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
pub async fn stream_snapshot(
tli: WalResidentTimeline,
tli: Arc<Timeline>,
source: NodeId,
destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>,
) {
if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await {
// Error type/contents don't matter as they can't reach the client
// (hyper likely doesn't do anything with it), but http stream will be
// prematurely terminated. It would be nice to try to send the error in
// trailers though.
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
error!("snapshot failed: {:#}", e);
match tli.try_wal_residence_guard().await {
Err(e) => {
tx.send(Err(anyhow!("Error checking residence: {:#}", e)))
.await
.ok();
}
Ok(maybe_resident_tli) => {
if let Err(e) = match maybe_resident_tli {
Some(resident_tli) => {
stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
.await
}
None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
} {
// Error type/contents don't matter as they can't reach the client
// (hyper likely doesn't do anything with it), but http stream will be
// prematurely terminated. It would be nice to try to send the error in
// trailers though.
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
error!("snapshot failed: {:#}", e);
}
}
}
}
@@ -80,12 +96,10 @@ impl Drop for SnapshotContext {
}
}
pub async fn stream_snapshot_guts(
tli: WalResidentTimeline,
source: NodeId,
destination: NodeId,
/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel.
fn prepare_tar_stream(
tx: mpsc::Sender<Result<Bytes>>,
) -> Result<()> {
) -> tokio_tar::Builder<impl AsyncWrite + Unpin + Send> {
// tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
// use SinkWriter as a Write impl. That is,
// - create Sink from the tx. It returns PollSendError if chan is closed.
@@ -100,12 +114,38 @@ pub async fn stream_snapshot_guts(
// - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap
// into CopyToBytes. This is a data copy.
let copy_to_bytes = CopyToBytes::new(oksink);
let mut writer = SinkWriter::new(copy_to_bytes);
let pinned_writer = std::pin::pin!(writer);
let writer = SinkWriter::new(copy_to_bytes);
let pinned_writer = Box::pin(writer);
// Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer
// which is also likely suboptimal.
let mut ar = Builder::new_non_terminated(pinned_writer);
Builder::new_non_terminated(pinned_writer)
}
/// Implementation of snapshot for an offloaded timeline, only reads control file
pub(crate) async fn stream_snapshot_offloaded_guts(
tli: Arc<Timeline>,
source: NodeId,
destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>,
) -> Result<()> {
let mut ar = prepare_tar_stream(tx);
tli.snapshot_offloaded(&mut ar, source, destination).await?;
ar.finish().await?;
Ok(())
}
/// Implementation of snapshot for a timeline which is resident (includes some segment data)
pub async fn stream_snapshot_resident_guts(
tli: WalResidentTimeline,
source: NodeId,
destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>,
) -> Result<()> {
let mut ar = prepare_tar_stream(tx);
let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
pausable_failpoint!("sk-snapshot-after-list-pausable");
@@ -138,6 +178,70 @@ pub async fn stream_snapshot_guts(
Ok(())
}
impl Timeline {
/// Simple snapshot for an offloaded timeline: we will only upload a renamed partial segment and
/// pass a modified control file into the provided tar stream (nothing with data segments on disk, since
/// we are offloaded and there aren't any)
async fn snapshot_offloaded<W: AsyncWrite + Unpin + Send>(
self: &Arc<Timeline>,
ar: &mut tokio_tar::Builder<W>,
source: NodeId,
destination: NodeId,
) -> Result<()> {
// Take initial copy of control file, then release state lock
let mut control_file = {
let shared_state = self.write_shared_state().await;
let control_file = TimelinePersistentState::clone(shared_state.sk.state());
// Rare race: we got unevicted between entering function and reading control file.
// We error out and let API caller retry.
if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) {
bail!("Timeline was un-evicted during snapshot, please retry");
}
control_file
};
// Modify the partial segment of the in-memory copy for the control file to
// point to the destination safekeeper.
let replace = control_file
.partial_backup
.replace_uploaded_segment(source, destination)?;
let Some(replace) = replace else {
// In Manager::ready_for_eviction, we do not permit eviction unless the timeline
// has a partial segment, so an offloaded timeline without one is unexpected.
anyhow::bail!("Timeline has no partial segment, cannot generate snapshot");
};
tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}");
// Optimistically try to copy the partial segment to the destination's path: this
// can fail if the timeline was un-evicted and modified in the background.
let remote_timeline_path = &self.remote_path;
wal_backup::copy_partial_segment(
&replace.previous.remote_path(remote_timeline_path),
&replace.current.remote_path(remote_timeline_path),
)
.await?;
// Since the S3 copy succeeded with the path given in our control file snapshot, and
// we are sending that snapshot in our response, we are giving the caller a consistent
// snapshot even if our local Timeline was unevicted or otherwise modified in the meantime.
let buf = control_file
.write_to_buf()
.with_context(|| "failed to serialize control store")?;
let mut header = Header::new_gnu();
header.set_size(buf.len().try_into().expect("never breaches u64"));
ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
.await
.with_context(|| "failed to append to archive")?;
Ok(())
}
}
impl WalResidentTimeline {
/// Start streaming tar archive with timeline:
/// 1) stream control file under lock;

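For context on the `prepare_tar_stream` helper above: the comments describe the standard tokio_util adapters for turning an `mpsc::Sender` into an `AsyncWrite` that `tokio_tar::Builder` can write into. A self-contained sketch of that adapter chain (this is the generic pattern from tokio_util's docs, assuming the `tokio`, `tokio-util`, `bytes`, and `futures` crates; the safekeeper code additionally wraps each chunk in a `Result`):

```rust
use bytes::Bytes;
use futures::SinkExt;
use std::io::{self, ErrorKind};
use tokio::io::AsyncWriteExt;
use tokio::sync::mpsc;
use tokio_util::io::{CopyToBytes, SinkWriter};
use tokio_util::sync::PollSender;

#[tokio::main]
async fn main() -> io::Result<()> {
    let (tx, mut rx) = mpsc::channel::<Bytes>(1);

    // PollSender turns the mpsc Sender into a futures Sink<Bytes>; its
    // "channel closed" error must be mapped into io::Error for SinkWriter.
    let sink = PollSender::new(tx).sink_map_err(|_| io::Error::from(ErrorKind::BrokenPipe));
    // CopyToBytes adapts Sink<Bytes> into Sink<&[u8]> (one data copy), which is
    // the shape SinkWriter needs in order to expose an AsyncWrite.
    let mut writer = SinkWriter::new(CopyToBytes::new(sink));

    // Consumer side: whatever the tar builder would write pops out here as Bytes.
    let consumer = tokio::spawn(async move {
        while let Some(chunk) = rx.recv().await {
            println!("received {} bytes", chunk.len());
        }
    });

    writer.write_all(b"tar bytes would go here").await?;
    writer.flush().await?;
    drop(writer); // closes the channel so the consumer exits
    consumer.await.expect("consumer task panicked");
    Ok(())
}
```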
View File

@@ -797,14 +797,17 @@ impl Timeline {
state.sk.term_bump(to).await
}
/// Get the timeline guard for reading/writing WAL files.
/// If WAL files are not present on disk (evicted), they will be automatically
/// downloaded from remote storage. This is done in the manager task, which is
/// responsible for issuing all guards.
///
/// NB: don't use this function from timeline_manager, it will deadlock.
/// NB: don't use this function while holding shared_state lock.
pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
/// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`]
async fn do_wal_residence_guard(
self: &Arc<Self>,
block: bool,
) -> Result<Option<WalResidentTimeline>> {
let op_label = if block {
"wal_residence_guard"
} else {
"try_wal_residence_guard"
};
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
@@ -816,10 +819,13 @@ impl Timeline {
// Wait 30 seconds for the guard to be acquired. It can time out if someone is
// holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task
// is stuck.
let res = tokio::time::timeout_at(
started_at + Duration::from_secs(30),
self.manager_ctl.wal_residence_guard(),
)
let res = tokio::time::timeout_at(started_at + Duration::from_secs(30), async {
if block {
self.manager_ctl.wal_residence_guard().await.map(Some)
} else {
self.manager_ctl.try_wal_residence_guard().await
}
})
.await;
let guard = match res {
@@ -827,14 +833,14 @@ impl Timeline {
let finished_at = Instant::now();
let elapsed = finished_at - started_at;
MISC_OPERATION_SECONDS
.with_label_values(&["wal_residence_guard"])
.with_label_values(&[op_label])
.observe(elapsed.as_secs_f64());
guard
}
Ok(Err(e)) => {
warn!(
"error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
"error acquiring in {op_label}, statuses {:?} => {:?}",
status_before,
self.mgr_status.get()
);
@@ -842,7 +848,7 @@ impl Timeline {
}
Err(_) => {
warn!(
"timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
"timeout acquiring in {op_label} guard, statuses {:?} => {:?}",
status_before,
self.mgr_status.get()
);
@@ -850,7 +856,28 @@ impl Timeline {
}
};
Ok(WalResidentTimeline::new(self.clone(), guard))
Ok(guard.map(|g| WalResidentTimeline::new(self.clone(), g)))
}
/// Get the timeline guard for reading/writing WAL files.
/// If WAL files are not present on disk (evicted), they will be automatically
/// downloaded from remote storage. This is done in the manager task, which is
/// responsible for issuing all guards.
///
/// NB: don't use this function from timeline_manager, it will deadlock.
/// NB: don't use this function while holding shared_state lock.
pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
self.do_wal_residence_guard(true)
.await
.map(|m| m.expect("Always get Some in block=true mode"))
}
/// Get the timeline guard for reading/writing WAL files if the timeline is resident,
/// else return None
pub(crate) async fn try_wal_residence_guard(
self: &Arc<Self>,
) -> Result<Option<WalResidentTimeline>> {
self.do_wal_residence_guard(false).await
}
pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {

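Both guard variants funnel through `do_wal_residence_guard`, which bounds the wait with a 30-second deadline. A toy of that timeout-wrapped acquisition (placeholder helpers, not the real manager API):

```rust
use std::time::Duration;
use tokio::time::{timeout_at, Instant};

struct Guard; // placeholder for the real ResidenceGuard

// Placeholder acquisition paths standing in for the manager_ctl calls.
async fn acquire_blocking() -> anyhow::Result<Guard> {
    Ok(Guard)
}
async fn acquire_if_resident() -> anyhow::Result<Option<Guard>> {
    Ok(None) // pretend the timeline is offloaded
}

async fn do_guard(block: bool) -> anyhow::Result<Option<Guard>> {
    let started_at = Instant::now();
    let res = timeout_at(started_at + Duration::from_secs(30), async {
        if block {
            // Blocking mode: the manager (re)downloads evicted WAL if needed.
            acquire_blocking().await.map(Some)
        } else {
            // Non-blocking mode: None if the timeline is offloaded.
            acquire_if_resident().await
        }
    })
    .await;
    match res {
        Ok(inner) => inner,
        Err(_) => anyhow::bail!("timed out waiting for residence guard"),
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    assert!(do_guard(false).await?.is_none()); // offloaded: no guard, no download
    assert!(do_guard(true).await?.is_some()); // blocking: a guard is always produced
    Ok(())
}
```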
View File

@@ -56,6 +56,9 @@ impl Manager {
// This also works for the first segment despite last_removed_segno
// being 0 on init because this 0 triggers run of wal_removal_task
// on success of which manager updates the horizon.
//
// **Note** pull_timeline functionality assumes that evicted timelines always have
// a partial segment: if we ever change this condition, must also update that code.
&& self
.partial_backup_uploaded
.as_ref()

View File

@@ -100,6 +100,8 @@ const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
pub enum ManagerCtlMessage {
/// Request to get a guard for WalResidentTimeline, with WAL files available locally.
GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>),
/// Get a guard for WalResidentTimeline if the timeline is not currently offloaded, else None
TryGuardRequest(tokio::sync::oneshot::Sender<Option<ResidenceGuard>>),
/// Request to drop the guard.
GuardDrop(GuardId),
/// Request to reset uploaded partial backup state.
@@ -110,6 +112,7 @@ impl std::fmt::Debug for ManagerCtlMessage {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"),
ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
}
@@ -152,6 +155,19 @@ impl ManagerCtl {
.and_then(std::convert::identity)
}
/// Issue a new guard if the timeline is currently not offloaded, else return None
/// Sends a message to the manager and waits for the response.
/// Can be blocked indefinitely if the manager is stuck.
pub async fn try_wal_residence_guard(&self) -> anyhow::Result<Option<ResidenceGuard>> {
let (tx, rx) = tokio::sync::oneshot::channel();
self.manager_tx
.send(ManagerCtlMessage::TryGuardRequest(tx))?;
// wait for the manager to respond with the guard
rx.await
.map_err(|e| anyhow::anyhow!("response read fail: {:?}", e))
}
/// Request timeline manager to reset uploaded partial segment state and
/// wait for the result.
pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
@@ -674,6 +690,17 @@ impl Manager {
warn!("failed to reply with a guard, receiver dropped");
}
}
Some(ManagerCtlMessage::TryGuardRequest(tx)) => {
let result = if self.is_offloaded {
None
} else {
Some(self.access_service.create_guard())
};
if tx.send(result).is_err() {
warn!("failed to reply with a guard, receiver dropped");
}
}
Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
self.access_service.drop_guard(guard_id);
}

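The `TryGuardRequest` plumbing above is the usual mpsc-plus-oneshot request/response pattern between a caller and the manager task. A self-contained toy version (placeholder types, not the real `ManagerCtl`):

```rust
use tokio::sync::{mpsc, oneshot};

struct Guard; // placeholder for the real ResidenceGuard

enum Msg {
    // The caller ships a oneshot Sender; the manager replies through it.
    // The reply is None when the timeline is offloaded.
    TryGuardRequest(oneshot::Sender<Option<Guard>>),
}

async fn manager_loop(mut rx: mpsc::UnboundedReceiver<Msg>, offloaded: bool) {
    while let Some(msg) = rx.recv().await {
        match msg {
            Msg::TryGuardRequest(reply) => {
                let guard = if offloaded { None } else { Some(Guard) };
                // Ignore send errors: the caller may have given up waiting.
                let _ = reply.send(guard);
            }
        }
    }
}

async fn try_guard(tx: &mpsc::UnboundedSender<Msg>) -> anyhow::Result<Option<Guard>> {
    let (reply_tx, reply_rx) = oneshot::channel();
    tx.send(Msg::TryGuardRequest(reply_tx))
        .map_err(|_| anyhow::anyhow!("manager channel closed"))?;
    Ok(reply_rx.await?)
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let (tx, rx) = mpsc::unbounded_channel();
    tokio::spawn(manager_loop(rx, /* offloaded= */ true));
    assert!(try_guard(&tx).await?.is_none()); // offloaded: no guard issued
    Ok(())
}
```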
View File

@@ -1998,6 +1998,109 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder):
pt_handle.join()
def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
"""
Verify that when pull_timeline is used on an evicted timeline, it does not result in
promoting any segments to local disk on the source, and the timeline is correctly instantiated
in evicted state on the destination. This behavior is important to avoid ballooning disk
usage when doing mass migration of timelines.
"""
neon_env_builder.num_safekeepers = 4
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
# Configure safekeepers with ultra-fast eviction policy
neon_env_builder.safekeeper_extra_opts = [
"--enable-offload",
"--partial-backup-timeout",
"50ms",
"--control-file-save-interval",
"1s",
# Safekeepers usually wait a while before evicting something: for this test we want them to
# evict things as soon as they are inactive.
"--eviction-min-resident=100ms",
"--delete-offloaded-wal",
]
initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"}
env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
(src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[-1])
log.info(f"Will pull_timeline on destination {dst_sk.id} from source {src_sk.id}")
ep = env.endpoints.create("main")
ep.active_safekeepers = [s.id for s in env.safekeepers if s.id != dst_sk.id]
log.info(f"Compute writing initially to safekeepers: {ep.active_safekeepers}")
ep.active_safekeepers = [1, 2, 3] # Exclude dst_sk from set written by compute initially
ep.start()
ep.safe_psql("CREATE TABLE t(i int)")
ep.safe_psql("INSERT INTO t VALUES (0)")
ep.stop()
wait_lsn_force_checkpoint_at_sk(src_sk, tenant_id, timeline_id, env.pageserver)
src_http = src_sk.http_client()
dst_http = dst_sk.http_client()
def evicted_on_source():
# Wait for timeline to go into evicted state
assert src_http.get_eviction_state(timeline_id) != "Present"
assert (
src_http.get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "evict"}
)
or 0
) > 0
assert (src_http.get_metric_value("safekeeper_evicted_timelines") or 0) > 0
# Check that on source no segment files are present
assert src_sk.list_segments(tenant_id, timeline_id) == []
wait_until(60, 1, evicted_on_source)
# Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk,
# destination should import the control file only & go into evicted mode immediately
dst_sk.pull_timeline([src_sk], tenant_id, timeline_id)
# Check that on source and destination no segment files are present
assert src_sk.list_segments(tenant_id, timeline_id) == []
assert dst_sk.list_segments(tenant_id, timeline_id) == []
# Check that the timeline on the destination is in the expected evicted state.
evicted_on_source() # It should still be evicted on the source
def evicted_on_destination():
assert dst_http.get_eviction_state(timeline_id) != "Present"
assert (dst_http.get_metric_value("safekeeper_evicted_timelines") or 0) > 0
# This should be fast, it is a wait_until because eviction state is updated
# in the background wrt pull_timeline.
wait_until(10, 0.1, evicted_on_destination)
# Delete the timeline on the source, to prove that deletion works on an
# evicted timeline _and_ that the final compute test is really not using
# the original location
src_sk.http_client().timeline_delete(tenant_id, timeline_id, only_local=True)
# Check that using the timeline correctly un-evicts it on the new location
ep.active_safekeepers = [2, 3, 4]
ep.start()
ep.safe_psql("INSERT INTO t VALUES (0)")
ep.stop()
def unevicted_on_dest():
assert (
dst_http.get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "restore"}
)
or 0
) > 0
n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines")
assert n_evicted == 0
wait_until(10, 1, unevicted_on_dest)
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
# when compute is active, but there are no writes to the timeline. In that case
# pageserver should maintain a single connection to safekeeper and don't attempt