diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 654dde8da6..714d8ac403 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -841,6 +841,10 @@ impl PostgresBackend { let expected_end = match &end { ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true, + // The timeline doesn't exist and we have been requested to not auto-create it. + // Compute requests for timelines that haven't been created yet + // might reach us before the storcon request to create those timelines. + TimelineNoCreate => true, CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { @@ -1059,6 +1063,8 @@ pub enum CopyStreamHandlerEnd { Terminate, #[error("EOF on COPY stream")] EOF, + #[error("timeline not found, and allow_timeline_creation is false")] + TimelineNoCreate, /// The connection was lost #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 51f88625da..cc31b38fe7 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -303,7 +303,8 @@ pub struct PullTimelineRequest { #[derive(Debug, Serialize, Deserialize)] pub struct PullTimelineResponse { - // Donor safekeeper host - pub safekeeper_host: String, + /// Donor safekeeper host. + /// None if no pull happened because the timeline already exists. + pub safekeeper_host: Option, // TODO: add more fields? } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 653b084ad8..1510a51019 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -401,7 +401,10 @@ pub async fn handle_request( request.timeline_id, )); if existing_tli.is_ok() { - bail!("Timeline {} already exists", request.timeline_id); + info!("Timeline {} already exists", request.timeline_id); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); } let mut http_client = reqwest::Client::builder(); @@ -425,8 +428,25 @@ pub async fn handle_request( let mut statuses = Vec::new(); for (i, response) in responses.into_iter().enumerate() { - let status = response.context(format!("fetching status from {}", http_hosts[i]))?; - statuses.push((status, i)); + match response { + Ok(status) => { + statuses.push((status, i)); + } + Err(e) => { + info!("error fetching status from {}: {e}", http_hosts[i]); + } + } + } + + // Allow missing responses from up to one safekeeper (say due to downtime) + // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes + // offline and C comes online. Then we want a pull on C with A and B as hosts to work. + let min_required_successful = (http_hosts.len() - 1).max(1); + if statuses.len() < min_required_successful { + bail!( + "only got {} successful status responses. required: {min_required_successful}", + statuses.len() + ) } // Find the most advanced safekeeper @@ -536,6 +556,6 @@ async fn pull_timeline( .await?; Ok(PullTimelineResponse { - safekeeper_host: host, + safekeeper_host: Some(host), }) } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 9975153f6c..eb8eee6ab8 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -32,7 +32,7 @@ use crate::metrics::{ WAL_RECEIVERS, }; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; -use crate::timeline::WalResidentTimeline; +use crate::timeline::{TimelineError, WalResidentTimeline}; const DEFAULT_FEEDBACK_CAPACITY: usize = 8; @@ -357,9 +357,14 @@ impl NetworkReader<'_, IO> { .await .context("create timeline")? } else { - self.global_timelines - .get(self.ttid) - .context("get timeline")? + let timeline_res = self.global_timelines.get(self.ttid); + match timeline_res { + Ok(tl) => tl, + Err(TimelineError::NotFound(_)) => { + return Err(CopyStreamHandlerEnd::TimelineNoCreate); + } + other => other.context("get_timeline")?, + } }; tli.wal_residence_guard().await? } diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 74308cabff..71c73a0112 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -306,10 +306,11 @@ impl SafekeeperReconcilerInner { req, async |client| client.pull_timeline(&pull_req).await, |resp| { - tracing::info!( - "pulled timeline from {} onto {req_host}", - resp.safekeeper_host, - ); + if let Some(host) = resp.safekeeper_host { + tracing::info!("pulled timeline from {host} onto {req_host}"); + } else { + tracing::info!("timeline already present on safekeeper on {req_host}"); + } }, req_cancel, )