mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-03 19:42:55 +00:00
## Problem With safekeeper migration in mind, we can now pull/exclude the timeline multiple times within the same safekeeper. To avoid races between out of order requests, we need to ignore the pull/exclude requests if we have already seen a higher generation. - Closes: https://github.com/neondatabase/neon/issues/12186 - Closes: [LKB-949](https://databricks.atlassian.net/browse/LKB-949) ## Summary of changes - Annotate timeline tombstones in safekeeper with request generation. - Replace `ignore_tombstone` option with `mconf` in `PullTimelineRequest` - Switch membership in `pull_timeline` if the existing/pulled timeline has an older generation. - Refuse to switch membership if the timeline is being deleted (`is_canceled`). - Refuse to switch membership in compute greeting request if the safekeeper is not a member of `mconf`. - Pass `mconf` in `PullTimelineRequest` in safekeeper_service --------- Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
765 lines
30 KiB
Rust
765 lines
30 KiB
Rust
use std::cmp::min;
|
|
use std::io::{self, ErrorKind};
|
|
use std::ops::RangeInclusive;
|
|
use std::sync::Arc;
|
|
|
|
use anyhow::{Context, Result, anyhow, bail};
|
|
use bytes::Bytes;
|
|
use camino::Utf8PathBuf;
|
|
use chrono::{DateTime, Utc};
|
|
use futures::{SinkExt, StreamExt, TryStreamExt};
|
|
use http::StatusCode;
|
|
use http_utils::error::ApiError;
|
|
use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
|
|
use remote_storage::GenericRemoteStorage;
|
|
use reqwest::Certificate;
|
|
use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus};
|
|
use safekeeper_api::{Term, membership};
|
|
use safekeeper_client::mgmt_api;
|
|
use safekeeper_client::mgmt_api::Client;
|
|
use serde::Deserialize;
|
|
use tokio::fs::OpenOptions;
|
|
use tokio::io::AsyncWrite;
|
|
use tokio::sync::mpsc;
|
|
use tokio::task;
|
|
use tokio::time::sleep;
|
|
use tokio_tar::{Archive, Builder, Header};
|
|
use tokio_util::io::{CopyToBytes, SinkWriter};
|
|
use tokio_util::sync::PollSender;
|
|
use tracing::{error, info, instrument, warn};
|
|
use utils::crashsafe::fsync_async_opt;
|
|
use utils::id::{NodeId, TenantTimelineId};
|
|
use utils::logging::SecretString;
|
|
use utils::lsn::Lsn;
|
|
use utils::pausable_failpoint;
|
|
|
|
use crate::control_file::CONTROL_FILE_NAME;
|
|
use crate::state::{EvictionState, TimelinePersistentState};
|
|
use crate::timeline::{Timeline, TimelineError, WalResidentTimeline};
|
|
use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline};
|
|
use crate::wal_storage::{open_wal_file, wal_file_paths};
|
|
use crate::{GlobalTimelines, debug_dump, wal_backup};
|
|
|
|
/// Stream tar archive of timeline to tx.
|
|
#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
|
|
pub async fn stream_snapshot(
|
|
tli: Arc<Timeline>,
|
|
source: NodeId,
|
|
destination: NodeId,
|
|
tx: mpsc::Sender<Result<Bytes>>,
|
|
storage: Option<Arc<GenericRemoteStorage>>,
|
|
) {
|
|
match tli.try_wal_residence_guard().await {
|
|
Err(e) => {
|
|
tx.send(Err(anyhow!("Error checking residence: {:#}", e)))
|
|
.await
|
|
.ok();
|
|
}
|
|
Ok(maybe_resident_tli) => {
|
|
if let Err(e) = match maybe_resident_tli {
|
|
Some(resident_tli) => {
|
|
stream_snapshot_resident_guts(
|
|
resident_tli,
|
|
source,
|
|
destination,
|
|
tx.clone(),
|
|
storage,
|
|
)
|
|
.await
|
|
}
|
|
None => {
|
|
if let Some(storage) = storage {
|
|
stream_snapshot_offloaded_guts(
|
|
tli,
|
|
source,
|
|
destination,
|
|
tx.clone(),
|
|
&storage,
|
|
)
|
|
.await
|
|
} else {
|
|
tx.send(Err(anyhow!("remote storage not configured")))
|
|
.await
|
|
.ok();
|
|
return;
|
|
}
|
|
}
|
|
} {
|
|
// Error type/contents don't matter as they won't can't reach the client
|
|
// (hyper likely doesn't do anything with it), but http stream will be
|
|
// prematurely terminated. It would be nice to try to send the error in
|
|
// trailers though.
|
|
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
|
|
error!("snapshot failed: {:#}", e);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// State needed while streaming the snapshot.
|
|
pub struct SnapshotContext {
|
|
/// The interval of segment numbers. If None, the timeline hasn't had writes yet, so only send the control file
|
|
pub from_to_segno: Option<RangeInclusive<XLogSegNo>>,
|
|
pub term: Term,
|
|
pub last_log_term: Term,
|
|
pub flush_lsn: Lsn,
|
|
pub wal_seg_size: usize,
|
|
// used to remove WAL hold off in Drop.
|
|
pub tli: WalResidentTimeline,
|
|
}
|
|
|
|
impl Drop for SnapshotContext {
|
|
fn drop(&mut self) {
|
|
let tli = self.tli.clone();
|
|
task::spawn(async move {
|
|
let mut shared_state = tli.write_shared_state().await;
|
|
shared_state.wal_removal_on_hold = false;
|
|
});
|
|
}
|
|
}
|
|
|
|
/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel.
|
|
fn prepare_tar_stream(
|
|
tx: mpsc::Sender<Result<Bytes>>,
|
|
) -> tokio_tar::Builder<impl AsyncWrite + Unpin + Send> {
|
|
// tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
|
|
// use SinkWriter as a Write impl. That is,
|
|
// - create Sink from the tx. It returns PollSendError if chan is closed.
|
|
let sink = PollSender::new(tx);
|
|
// - SinkWriter needs sink error to be io one, map it.
|
|
let sink_io_err = sink.sink_map_err(|_| io::Error::from(ErrorKind::BrokenPipe));
|
|
// - SinkWriter wants sink type to be just Bytes, not Result<Bytes>, so map
|
|
// it with with(). Note that with() accepts async function which we don't
|
|
// need and allows the map to fail, which we don't need either, but hence
|
|
// two Oks.
|
|
let oksink = sink_io_err.with(|b: Bytes| async { io::Result::Ok(Result::Ok(b)) });
|
|
// - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap
|
|
// into CopyToBytes. This is a data copy.
|
|
let copy_to_bytes = CopyToBytes::new(oksink);
|
|
let writer = SinkWriter::new(copy_to_bytes);
|
|
let pinned_writer = Box::pin(writer);
|
|
|
|
// Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer
|
|
// which is also likely suboptimal.
|
|
Builder::new_non_terminated(pinned_writer)
|
|
}
|
|
|
|
/// Implementation of snapshot for an offloaded timeline, only reads control file
|
|
pub(crate) async fn stream_snapshot_offloaded_guts(
|
|
tli: Arc<Timeline>,
|
|
source: NodeId,
|
|
destination: NodeId,
|
|
tx: mpsc::Sender<Result<Bytes>>,
|
|
storage: &GenericRemoteStorage,
|
|
) -> Result<()> {
|
|
let mut ar = prepare_tar_stream(tx);
|
|
|
|
tli.snapshot_offloaded(&mut ar, source, destination, storage)
|
|
.await?;
|
|
|
|
ar.finish().await?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Implementation of snapshot for a timeline which is resident (includes some segment data)
|
|
pub async fn stream_snapshot_resident_guts(
|
|
tli: WalResidentTimeline,
|
|
source: NodeId,
|
|
destination: NodeId,
|
|
tx: mpsc::Sender<Result<Bytes>>,
|
|
storage: Option<Arc<GenericRemoteStorage>>,
|
|
) -> Result<()> {
|
|
let mut ar = prepare_tar_stream(tx);
|
|
|
|
let bctx = tli
|
|
.start_snapshot(&mut ar, source, destination, storage)
|
|
.await?;
|
|
pausable_failpoint!("sk-snapshot-after-list-pausable");
|
|
|
|
if let Some(from_to_segno) = &bctx.from_to_segno {
|
|
let tli_dir = tli.get_timeline_dir();
|
|
info!(
|
|
"sending {} segments [{:#X}-{:#X}], term={}, last_log_term={}, flush_lsn={}",
|
|
from_to_segno.end() - from_to_segno.start() + 1,
|
|
from_to_segno.start(),
|
|
from_to_segno.end(),
|
|
bctx.term,
|
|
bctx.last_log_term,
|
|
bctx.flush_lsn,
|
|
);
|
|
for segno in from_to_segno.clone() {
|
|
let Some((mut sf, is_partial)) =
|
|
open_wal_file(&tli_dir, segno, bctx.wal_seg_size).await?
|
|
else {
|
|
// File is not found
|
|
let (wal_file_path, _wal_file_partial_path) =
|
|
wal_file_paths(&tli_dir, segno, bctx.wal_seg_size);
|
|
tracing::warn!("couldn't find WAL segment file {wal_file_path}");
|
|
bail!("couldn't find WAL segment file {wal_file_path}")
|
|
};
|
|
let mut wal_file_name = XLogFileName(PG_TLI, segno, bctx.wal_seg_size);
|
|
if is_partial {
|
|
wal_file_name.push_str(".partial");
|
|
}
|
|
ar.append_file(&wal_file_name, &mut sf).await?;
|
|
}
|
|
} else {
|
|
info!("Not including any segments into the snapshot");
|
|
}
|
|
|
|
// Do the term check before ar.finish to make archive corrupted in case of
|
|
// term change. Client shouldn't ignore abrupt stream end, but to be sure.
|
|
tli.finish_snapshot(&bctx).await?;
|
|
|
|
ar.finish().await?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
impl Timeline {
|
|
/// Simple snapshot for an offloaded timeline: we will only upload a renamed partial segment and
|
|
/// pass a modified control file into the provided tar stream (nothing with data segments on disk, since
|
|
/// we are offloaded and there aren't any)
|
|
async fn snapshot_offloaded<W: AsyncWrite + Unpin + Send>(
|
|
self: &Arc<Timeline>,
|
|
ar: &mut tokio_tar::Builder<W>,
|
|
source: NodeId,
|
|
destination: NodeId,
|
|
storage: &GenericRemoteStorage,
|
|
) -> Result<()> {
|
|
// Take initial copy of control file, then release state lock
|
|
let mut control_file = {
|
|
let shared_state = self.write_shared_state().await;
|
|
|
|
let control_file = TimelinePersistentState::clone(shared_state.sk.state());
|
|
|
|
// Rare race: we got unevicted between entering function and reading control file.
|
|
// We error out and let API caller retry.
|
|
if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) {
|
|
bail!("Timeline was un-evicted during snapshot, please retry");
|
|
}
|
|
|
|
control_file
|
|
};
|
|
|
|
// Modify the partial segment of the in-memory copy for the control file to
|
|
// point to the destination safekeeper.
|
|
let replace = control_file
|
|
.partial_backup
|
|
.replace_uploaded_segment(source, destination)?;
|
|
|
|
let Some(replace) = replace else {
|
|
// In Manager:: ready_for_eviction, we do not permit eviction unless the timeline
|
|
// has a partial segment. It is unexpected that
|
|
anyhow::bail!("Timeline has no partial segment, cannot generate snapshot");
|
|
};
|
|
|
|
tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}");
|
|
|
|
// Optimistically try to copy the partial segment to the destination's path: this
|
|
// can fail if the timeline was un-evicted and modified in the background.
|
|
let remote_timeline_path = &self.remote_path;
|
|
wal_backup::copy_partial_segment(
|
|
storage,
|
|
&replace.previous.remote_path(remote_timeline_path),
|
|
&replace.current.remote_path(remote_timeline_path),
|
|
)
|
|
.await?;
|
|
|
|
// Since the S3 copy succeeded with the path given in our control file snapshot, and
|
|
// we are sending that snapshot in our response, we are giving the caller a consistent
|
|
// snapshot even if our local Timeline was unevicted or otherwise modified in the meantime.
|
|
let buf = control_file
|
|
.write_to_buf()
|
|
.with_context(|| "failed to serialize control store")?;
|
|
let mut header = Header::new_gnu();
|
|
header.set_size(buf.len().try_into().expect("never breaches u64"));
|
|
ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
|
|
.await
|
|
.with_context(|| "failed to append to archive")?;
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
impl WalResidentTimeline {
|
|
/// Start streaming tar archive with timeline:
|
|
/// 1) stream control file under lock;
|
|
/// 2) hold off WAL removal;
|
|
/// 3) collect SnapshotContext to understand which WAL segments should be
|
|
/// streamed.
|
|
///
|
|
/// Snapshot streams data up to flush_lsn. To make this safe, we must check
|
|
/// that term doesn't change during the procedure, or we risk sending mix of
|
|
/// WAL from different histories. Term is remembered in the SnapshotContext
|
|
/// and checked in finish_snapshot. Note that in the last segment some WAL
|
|
/// higher than flush_lsn set here might be streamed; that's fine as long as
|
|
/// terms doesn't change.
|
|
///
|
|
/// Alternatively we could send only up to commit_lsn to get some valid
|
|
/// state which later will be recovered by compute, in this case term check
|
|
/// is not needed, but we likely don't want that as there might be no
|
|
/// compute which could perform the recovery.
|
|
///
|
|
/// When returned SnapshotContext is dropped WAL hold is removed.
|
|
async fn start_snapshot<W: AsyncWrite + Unpin + Send>(
|
|
&self,
|
|
ar: &mut tokio_tar::Builder<W>,
|
|
source: NodeId,
|
|
destination: NodeId,
|
|
storage: Option<Arc<GenericRemoteStorage>>,
|
|
) -> Result<SnapshotContext> {
|
|
let mut shared_state = self.write_shared_state().await;
|
|
let wal_seg_size = shared_state.get_wal_seg_size();
|
|
|
|
let mut control_store = TimelinePersistentState::clone(shared_state.sk.state());
|
|
// Modify the partial segment of the in-memory copy for the control file to
|
|
// point to the destination safekeeper.
|
|
let replace = control_store
|
|
.partial_backup
|
|
.replace_uploaded_segment(source, destination)?;
|
|
|
|
if let Some(replace) = replace {
|
|
// The deserialized control file has an uploaded partial. We upload a copy
|
|
// of it to object storage for the destination safekeeper and send an updated
|
|
// control file in the snapshot.
|
|
tracing::info!(
|
|
"Replacing uploaded partial segment in in-mem control file: {replace:?}"
|
|
);
|
|
|
|
let remote_timeline_path = &self.tli.remote_path;
|
|
wal_backup::copy_partial_segment(
|
|
&*storage.context("remote storage not configured")?,
|
|
&replace.previous.remote_path(remote_timeline_path),
|
|
&replace.current.remote_path(remote_timeline_path),
|
|
)
|
|
.await?;
|
|
}
|
|
|
|
let buf = control_store
|
|
.write_to_buf()
|
|
.with_context(|| "failed to serialize control store")?;
|
|
let mut header = Header::new_gnu();
|
|
header.set_size(buf.len().try_into().expect("never breaches u64"));
|
|
ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
|
|
.await
|
|
.with_context(|| "failed to append to archive")?;
|
|
|
|
// We need to stream since the oldest segment someone (s3 or pageserver)
|
|
// still needs. This duplicates calc_horizon_lsn logic.
|
|
//
|
|
// We know that WAL wasn't removed up to this point because it cannot be
|
|
// removed further than `backup_lsn`. Since we're holding shared_state
|
|
// lock and setting `wal_removal_on_hold` later, it guarantees that WAL
|
|
// won't be removed until we're done.
|
|
let timeline_state = shared_state.sk.state();
|
|
let from_lsn = min(
|
|
timeline_state.remote_consistent_lsn,
|
|
timeline_state.backup_lsn,
|
|
);
|
|
let flush_lsn = shared_state.sk.flush_lsn();
|
|
let (send_segments, msg) = if from_lsn == Lsn::INVALID {
|
|
(false, "snapshot is called on uninitialized timeline")
|
|
} else {
|
|
(true, "timeline is initialized")
|
|
};
|
|
tracing::info!(
|
|
remote_consistent_lsn=%timeline_state.remote_consistent_lsn,
|
|
backup_lsn=%timeline_state.backup_lsn,
|
|
%flush_lsn,
|
|
"{msg}"
|
|
);
|
|
let from_segno = from_lsn.segment_number(wal_seg_size);
|
|
let term = shared_state.sk.state().acceptor_state.term;
|
|
let last_log_term = shared_state.sk.last_log_term();
|
|
let upto_segno = flush_lsn.segment_number(wal_seg_size);
|
|
// have some limit on max number of segments as a sanity check
|
|
const MAX_ALLOWED_SEGS: u64 = 1000;
|
|
let num_segs = upto_segno - from_segno + 1;
|
|
if num_segs > MAX_ALLOWED_SEGS {
|
|
bail!(
|
|
"snapshot is called on timeline with {} segments, but the limit is {}",
|
|
num_segs,
|
|
MAX_ALLOWED_SEGS
|
|
);
|
|
}
|
|
|
|
// Prevent WAL removal while we're streaming data.
|
|
//
|
|
// Since this a flag, not a counter just bail out if already set; we
|
|
// shouldn't need concurrent snapshotting.
|
|
if shared_state.wal_removal_on_hold {
|
|
bail!("wal_removal_on_hold is already true");
|
|
}
|
|
shared_state.wal_removal_on_hold = true;
|
|
|
|
// Drop shared_state to release the lock, before calling wal_residence_guard().
|
|
drop(shared_state);
|
|
|
|
let tli_copy = self.wal_residence_guard().await?;
|
|
let from_to_segno = send_segments.then_some(from_segno..=upto_segno);
|
|
let bctx = SnapshotContext {
|
|
from_to_segno,
|
|
term,
|
|
last_log_term,
|
|
flush_lsn,
|
|
wal_seg_size,
|
|
tli: tli_copy,
|
|
};
|
|
|
|
Ok(bctx)
|
|
}
|
|
|
|
/// Finish snapshotting: check that term(s) hasn't changed.
|
|
///
|
|
/// Note that WAL gc hold off is removed in Drop of SnapshotContext to not
|
|
/// forget this if snapshotting fails mid the way.
|
|
pub async fn finish_snapshot(&self, bctx: &SnapshotContext) -> Result<()> {
|
|
let shared_state = self.read_shared_state().await;
|
|
let term = shared_state.sk.state().acceptor_state.term;
|
|
let last_log_term = shared_state.sk.last_log_term();
|
|
// There are some cases to relax this check (e.g. last_log_term might
|
|
// change, but as long as older history is strictly part of new that's
|
|
// fine), but there is no need to do it.
|
|
if bctx.term != term || bctx.last_log_term != last_log_term {
|
|
bail!(
|
|
"term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}",
|
|
bctx.term,
|
|
bctx.last_log_term,
|
|
term,
|
|
last_log_term
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Response for debug dump request.
|
|
#[derive(Debug, Deserialize)]
|
|
pub struct DebugDumpResponse {
|
|
pub start_time: DateTime<Utc>,
|
|
pub finish_time: DateTime<Utc>,
|
|
pub timelines: Vec<debug_dump::Timeline>,
|
|
pub timelines_count: usize,
|
|
pub config: debug_dump::Config,
|
|
}
|
|
|
|
/// Find the most advanced safekeeper and pull timeline from it.
|
|
pub async fn handle_request(
|
|
request: PullTimelineRequest,
|
|
sk_auth_token: Option<SecretString>,
|
|
ssl_ca_certs: Vec<Certificate>,
|
|
global_timelines: Arc<GlobalTimelines>,
|
|
wait_for_peer_timeline_status: bool,
|
|
) -> Result<PullTimelineResponse, ApiError> {
|
|
if let Some(mconf) = &request.mconf {
|
|
let sk_id = global_timelines.get_sk_id();
|
|
if !mconf.contains(sk_id) {
|
|
return Err(ApiError::BadRequest(anyhow!(
|
|
"refused to pull timeline with {mconf}, node {sk_id} is not member of it",
|
|
)));
|
|
}
|
|
}
|
|
|
|
let existing_tli = global_timelines.get(TenantTimelineId::new(
|
|
request.tenant_id,
|
|
request.timeline_id,
|
|
));
|
|
if let Ok(timeline) = existing_tli {
|
|
let cur_generation = timeline
|
|
.read_shared_state()
|
|
.await
|
|
.sk
|
|
.state()
|
|
.mconf
|
|
.generation;
|
|
|
|
info!(
|
|
"Timeline {} already exists with generation {cur_generation}",
|
|
request.timeline_id,
|
|
);
|
|
|
|
if let Some(mconf) = request.mconf {
|
|
timeline
|
|
.membership_switch(mconf)
|
|
.await
|
|
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
|
}
|
|
|
|
return Ok(PullTimelineResponse {
|
|
safekeeper_host: None,
|
|
});
|
|
}
|
|
|
|
let mut http_client = reqwest::Client::builder();
|
|
for ssl_ca_cert in ssl_ca_certs {
|
|
http_client = http_client.add_root_certificate(ssl_ca_cert);
|
|
}
|
|
let http_client = http_client
|
|
.build()
|
|
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
|
|
|
let http_hosts = request.http_hosts.clone();
|
|
|
|
// Figure out statuses of potential donors.
|
|
let mut statuses = Vec::new();
|
|
if !wait_for_peer_timeline_status {
|
|
let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
|
|
futures::future::join_all(http_hosts.iter().map(|url| async {
|
|
let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
|
|
let resp = cclient
|
|
.timeline_status(request.tenant_id, request.timeline_id)
|
|
.await?;
|
|
let info: TimelineStatus = resp
|
|
.json()
|
|
.await
|
|
.context("Failed to deserialize timeline status")
|
|
.map_err(|e| mgmt_api::Error::ReceiveErrorBody(e.to_string()))?;
|
|
Ok(info)
|
|
}))
|
|
.await;
|
|
|
|
for (i, response) in responses.into_iter().enumerate() {
|
|
match response {
|
|
Ok(status) => {
|
|
if let Some(mconf) = &request.mconf {
|
|
if status.mconf.generation > mconf.generation {
|
|
// We probably raced with another timeline membership change with higher generation.
|
|
// Ignore this request.
|
|
return Err(ApiError::Conflict(format!(
|
|
"cannot pull timeline with generation {}: timeline {} already exists with generation {} on {}",
|
|
mconf.generation,
|
|
request.timeline_id,
|
|
status.mconf.generation,
|
|
http_hosts[i],
|
|
)));
|
|
}
|
|
}
|
|
statuses.push((status, i));
|
|
}
|
|
Err(e) => {
|
|
info!("error fetching status from {}: {e}", http_hosts[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Allow missing responses from up to one safekeeper (say due to downtime)
|
|
// e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
|
|
// offline and C comes online. Then we want a pull on C with A and B as hosts to work.
|
|
let min_required_successful = (http_hosts.len() - 1).max(1);
|
|
if statuses.len() < min_required_successful {
|
|
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
|
"only got {} successful status responses. required: {min_required_successful}",
|
|
statuses.len()
|
|
)));
|
|
}
|
|
} else {
|
|
let mut retry = true;
|
|
// We must get status from all other peers.
|
|
// Otherwise, we may run into split-brain scenario.
|
|
while retry {
|
|
statuses.clear();
|
|
retry = false;
|
|
for (i, url) in http_hosts.iter().enumerate() {
|
|
let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
|
|
match cclient
|
|
.timeline_status(request.tenant_id, request.timeline_id)
|
|
.await
|
|
{
|
|
Ok(resp) => {
|
|
if resp.status() == StatusCode::NOT_FOUND {
|
|
warn!(
|
|
"Timeline {} not found on peer SK {}, no need to pull it",
|
|
TenantTimelineId::new(request.tenant_id, request.timeline_id),
|
|
url
|
|
);
|
|
return Ok(PullTimelineResponse {
|
|
safekeeper_host: None,
|
|
});
|
|
}
|
|
let info: TimelineStatus = resp
|
|
.json()
|
|
.await
|
|
.context("Failed to deserialize timeline status")
|
|
.map_err(ApiError::InternalServerError)?;
|
|
statuses.push((info, i));
|
|
}
|
|
Err(e) => {
|
|
match e {
|
|
// If we get a 404, it means the timeline doesn't exist on this safekeeper.
|
|
// We can ignore this error.
|
|
mgmt_api::Error::ApiError(status, _)
|
|
if status == StatusCode::NOT_FOUND =>
|
|
{
|
|
warn!(
|
|
"Timeline {} not found on peer SK {}, no need to pull it",
|
|
TenantTimelineId::new(request.tenant_id, request.timeline_id),
|
|
url
|
|
);
|
|
return Ok(PullTimelineResponse {
|
|
safekeeper_host: None,
|
|
});
|
|
}
|
|
_ => {}
|
|
}
|
|
retry = true;
|
|
error!("Failed to get timeline status from {}: {:#}", url, e);
|
|
}
|
|
}
|
|
}
|
|
sleep(std::time::Duration::from_millis(100)).await;
|
|
}
|
|
}
|
|
|
|
// Find the most advanced safekeeper
|
|
let (status, i) = statuses
|
|
.into_iter()
|
|
.max_by_key(|(status, _)| {
|
|
(
|
|
status.acceptor_state.epoch,
|
|
/* BEGIN_HADRON */
|
|
// We need to pull from the SK with the highest term.
|
|
// This is because another compute may come online and vote the same highest term again on the other two SKs.
|
|
// Then, there will be 2 computes running on the same term.
|
|
status.acceptor_state.term,
|
|
/* END_HADRON */
|
|
status.flush_lsn,
|
|
status.commit_lsn,
|
|
)
|
|
})
|
|
.unwrap();
|
|
let safekeeper_host = http_hosts[i].clone();
|
|
|
|
assert!(status.tenant_id == request.tenant_id);
|
|
assert!(status.timeline_id == request.timeline_id);
|
|
|
|
match pull_timeline(
|
|
status,
|
|
safekeeper_host,
|
|
sk_auth_token,
|
|
http_client,
|
|
global_timelines,
|
|
request.mconf,
|
|
)
|
|
.await
|
|
{
|
|
Ok(resp) => Ok(resp),
|
|
Err(e) => {
|
|
match e.downcast_ref::<TimelineError>() {
|
|
Some(TimelineError::AlreadyExists(_)) => Ok(PullTimelineResponse {
|
|
safekeeper_host: None,
|
|
}),
|
|
Some(TimelineError::Deleted(_)) => Err(ApiError::Conflict(format!(
|
|
"Timeline {}/{} deleted",
|
|
request.tenant_id, request.timeline_id
|
|
))),
|
|
Some(TimelineError::CreationInProgress(_)) => {
|
|
// We don't return success here because creation might still fail.
|
|
Err(ApiError::Conflict("Creation in progress".to_owned()))
|
|
}
|
|
_ => Err(ApiError::InternalServerError(e)),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn pull_timeline(
|
|
status: TimelineStatus,
|
|
host: String,
|
|
sk_auth_token: Option<SecretString>,
|
|
http_client: reqwest::Client,
|
|
global_timelines: Arc<GlobalTimelines>,
|
|
mconf: Option<membership::Configuration>,
|
|
) -> Result<PullTimelineResponse> {
|
|
let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
|
|
info!(
|
|
"pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
|
|
ttid,
|
|
host,
|
|
status.commit_lsn,
|
|
status.flush_lsn,
|
|
status.acceptor_state.term,
|
|
status.acceptor_state.epoch
|
|
);
|
|
|
|
let conf = &global_timelines.get_global_config();
|
|
|
|
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
|
|
let client = Client::new(http_client, host.clone(), sk_auth_token.clone());
|
|
// Request stream with basebackup archive.
|
|
let bb_resp = client
|
|
.snapshot(status.tenant_id, status.timeline_id, conf.my_id)
|
|
.await?;
|
|
|
|
// Make Stream of Bytes from it...
|
|
let bb_stream = bb_resp.bytes_stream().map_err(std::io::Error::other);
|
|
// and turn it into StreamReader implementing AsyncRead.
|
|
let bb_reader = tokio_util::io::StreamReader::new(bb_stream);
|
|
|
|
// Extract it on the fly to the disk. We don't use simple unpack() to fsync
|
|
// files.
|
|
let mut entries = Archive::new(bb_reader).entries()?;
|
|
while let Some(base_tar_entry) = entries.next().await {
|
|
let mut entry = base_tar_entry?;
|
|
let header = entry.header();
|
|
let file_path = header.path()?.into_owned();
|
|
match header.entry_type() {
|
|
tokio_tar::EntryType::Regular => {
|
|
let utf8_file_path =
|
|
Utf8PathBuf::from_path_buf(file_path).expect("non-Unicode path");
|
|
let dst_path = tli_dir_path.join(utf8_file_path);
|
|
let mut f = OpenOptions::new()
|
|
.create(true)
|
|
.truncate(true)
|
|
.write(true)
|
|
.open(&dst_path)
|
|
.await?;
|
|
tokio::io::copy(&mut entry, &mut f).await?;
|
|
// fsync the file
|
|
f.sync_all().await?;
|
|
}
|
|
_ => {
|
|
bail!(
|
|
"entry {} in backup tar archive is of unexpected type: {:?}",
|
|
file_path.display(),
|
|
header.entry_type()
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
// fsync temp timeline directory to remember its contents.
|
|
fsync_async_opt(&tli_dir_path, !conf.no_sync).await?;
|
|
|
|
let generation = mconf.as_ref().map(|c| c.generation);
|
|
|
|
// Let's create timeline from temp directory and verify that it's correct
|
|
let (commit_lsn, flush_lsn) =
|
|
validate_temp_timeline(conf, ttid, &tli_dir_path, generation).await?;
|
|
info!(
|
|
"finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
|
|
ttid, commit_lsn, flush_lsn
|
|
);
|
|
assert!(status.commit_lsn <= status.flush_lsn);
|
|
|
|
// Finally, load the timeline.
|
|
let timeline = global_timelines
|
|
.load_temp_timeline(ttid, &tli_dir_path, generation)
|
|
.await?;
|
|
|
|
if let Some(mconf) = mconf {
|
|
// Switch to provided mconf to guarantee that the timeline will not
|
|
// be deleted by request with older generation.
|
|
// The generation might already be higer than the one in mconf, e.g.
|
|
// if another membership_switch request was executed between `load_temp_timeline`
|
|
// and `membership_switch`, but that's totaly fine. `membership_switch` will
|
|
// ignore switch to older generation.
|
|
timeline.membership_switch(mconf).await?;
|
|
}
|
|
|
|
Ok(PullTimelineResponse {
|
|
safekeeper_host: Some(host),
|
|
})
|
|
}
|