diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 377b625be6..64dc07212e 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -15,6 +15,7 @@ use anyhow::{bail, Context};
 use bytes::Bytes;
 use futures::Stream;
 use pageserver_api::models::TimelineState;
+use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use tokio::sync::watch;
 use tokio_util::io::StreamReader;
@@ -360,6 +361,44 @@ impl Drop for TimelineUninitMark {
     }
 }
 
+// We should not blindly overwrite local metadata with the remote copy.
+// For example, consider the following case:
+// a checkpoint arrives, we update the local metadata and start the upload task,
+// but then the pageserver crashes. During startup we would load the new metadata
+// and then reset it to the state of the remote copy, yet the current layer map
+// would still contain layers from the old metadata, which is inconsistent.
+// With the current logic those layers won't be discarded during load, because
+// layer map loading sees the local disk_consistent_lsn, which is ahead of the layer LSNs.
+// If we treated the remote as the source of truth, we would have to sync with it
+// completely, i.e. delete local files that are missing on the remote. That adds
+// extra work; the WAL for those layers would need to be reingested, for example.
+//
+// So the solution is to take the remote metadata only when it has a greater disk_consistent_lsn.
+pub fn merge_local_remote_metadata<'a>(
+    local: Option<&'a TimelineMetadata>,
+    remote: Option<&'a TimelineMetadata>,
+) -> anyhow::Result<(&'a TimelineMetadata, bool)> {
+    match (local, remote) {
+        (None, None) => anyhow::bail!("we should have either local metadata or remote"),
+        (None, Some(remote)) => Ok((remote, false)),
+        (Some(local), None) => Ok((local, true)),
+        (Some(local), Some(remote)) => {
+            // Take the metadata with the highest disk_consistent_lsn.
+            // FIXME Do we need to merge something?
+            if local.disk_consistent_lsn() < remote.disk_consistent_lsn() {
+                Ok((remote, false))
+            } else {
+                Ok((local, true))
+            }
+        }
+    }
+}
+
+struct RemoteStartupData {
+    index_part: IndexPart,
+    remote_metadata: TimelineMetadata,
+}
+
 /// A repository corresponds to one .neon directory. One repository holds multiple
 /// timelines, forked off from the same initial call to 'initdb'.
 impl Tenant {
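The `bool` in the returned pair records whether the local copy won ("picked local"); the caller uses it below to decide whether the merged metadata still has to be written back to local disk. As a quick illustration of the rule, here is a minimal, self-contained sketch with a stand-in `Meta` type (hypothetical; the real `TimelineMetadata` carries much more state than an LSN):

```rust
// Stand-in for TimelineMetadata: only the field the merge rule looks at.
#[derive(Debug, PartialEq)]
struct Meta {
    disk_consistent_lsn: u64,
}

fn merge<'a>(
    local: Option<&'a Meta>,
    remote: Option<&'a Meta>,
) -> Result<(&'a Meta, bool), &'static str> {
    match (local, remote) {
        (None, None) => Err("need local or remote metadata"),
        (None, Some(r)) => Ok((r, false)),
        (Some(l), None) => Ok((l, true)),
        (Some(l), Some(r)) => {
            if l.disk_consistent_lsn < r.disk_consistent_lsn {
                Ok((r, false)) // remote is ahead: take it, persist it locally later
            } else {
                Ok((l, true)) // local is ahead or equal: nothing to write back
            }
        }
    }
}

fn main() {
    let local = Meta { disk_consistent_lsn: 100 };
    let remote = Meta { disk_consistent_lsn: 200 };
    // The upload finished after the crash point: remote wins, picked_local = false.
    assert_eq!(merge(Some(&local), Some(&remote)), Ok((&remote, false)));
    // Equal LSNs: prefer local, so no local write is needed.
    assert_eq!(merge(Some(&local), Some(&local)), Ok((&local, true)));
}
```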
@@ -369,13 +408,20 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         remote_client: Option<RemoteTimelineClient>,
-        index_part: Option<&IndexPart>,
-        metadata: TimelineMetadata,
+        remote_startup_data: Option<RemoteStartupData>,
+        local_metadata: Option<TimelineMetadata>,
         ancestor: Option<Arc<Timeline>>,
         first_save: bool,
     ) -> anyhow::Result<()> {
         let tenant_id = self.tenant_id;
 
+        let (up_to_date_metadata, picked_local) = merge_local_remote_metadata(
+            local_metadata.as_ref(),
+            remote_startup_data.as_ref().map(|r| &r.remote_metadata),
+        )
+        .context("merge_local_remote_metadata")?
+        .to_owned();
+
         let timeline = {
             // avoiding holding it across awaits
             let mut timelines_accessor = self.timelines.lock().unwrap();
@@ -387,7 +433,7 @@ impl Tenant {
 
         let dummy_timeline = self.create_timeline_data(
             timeline_id,
-            metadata.clone(),
+            up_to_date_metadata.clone(),
             ancestor.clone(),
             remote_client,
         )?;
@@ -413,7 +459,12 @@ impl Tenant {
                 // can be called directly on Uninitielized timeline
                 // also leades to redundant .clone
                 let broken_timeline = self
-                    .create_timeline_data(timeline_id, metadata.clone(), ancestor, None)
+                    .create_timeline_data(
+                        timeline_id,
+                        up_to_date_metadata.clone(),
+                        ancestor,
+                        None,
+                    )
                     .with_context(|| {
                         format!(
                             "Failed to crate broken timeline data for {tenant_id}/{timeline_id}"
@@ -431,11 +482,26 @@ impl Tenant {
             // missing locally, and scheduling uploads for anything that's missing
             // in remote storage.
             timeline
-                .reconcile_with_remote(metadata, index_part, first_save)
+                .reconcile_with_remote(
+                    up_to_date_metadata,
+                    remote_startup_data.as_ref().map(|r| &r.index_part),
+                )
                 .await
                 .context("failed to reconcile with remote")?
         }
 
+        // Save the metadata file to local disk.
+        if !picked_local {
+            save_metadata(
+                self.conf,
+                timeline_id,
+                tenant_id,
+                up_to_date_metadata,
+                first_save,
+            )
+            .context("save_metadata")?;
+        }
+
         // Finally launch walreceiver
         timeline.launch_wal_receiver();
@@ -536,27 +602,28 @@ impl Tenant {
         info!("found {} timelines", remote_timelines.len());
 
         let mut timeline_ancestors: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
-        let mut index_parts: HashMap<TimelineId, &IndexPart> = HashMap::new();
-        for (timeline_id, index_part) in remote_timelines.iter() {
+        let mut index_parts: HashMap<TimelineId, IndexPart> = HashMap::new();
+        for (timeline_id, index_part) in remote_timelines {
             let remote_metadata = index_part.parse_metadata().with_context(|| {
                 format!(
                     "Failed to parse metadata file from remote storage for tenant {} timeline {}",
                     self.tenant_id, timeline_id
                 )
             })?;
-            timeline_ancestors.insert(*timeline_id, remote_metadata);
-            index_parts.insert(*timeline_id, index_part);
+            timeline_ancestors.insert(timeline_id, remote_metadata);
+            index_parts.insert(timeline_id, index_part);
         }
 
         // For every timeline, download the metadata file, scan the local directory,
         // and build a layer map that contains an entry for each remote and local
         // layer file.
         let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
-        for (timeline_id, _metadata) in sorted_timelines {
+        for (timeline_id, remote_metadata) in sorted_timelines {
            // TODO again handle early failure
             self.load_remote_timeline(
                 timeline_id,
-                index_parts[&timeline_id],
+                index_parts.remove(&timeline_id).unwrap(),
+                remote_metadata,
                 remote_storage.clone(),
             )
             .await
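`tree_sort_timelines` (its implementation is unchanged and not shown in this diff) has to yield each timeline after its ancestor, so that `load_remote_timeline` can look up an already-loaded parent. A minimal sketch of that ordering guarantee, with `u32` standing in for `TimelineId`:

```rust
use std::collections::HashMap;

// Repeatedly emit timelines whose ancestor has already been emitted;
// roots (ancestor = None) are always ready.
fn tree_sort(mut pending: HashMap<u32, Option<u32>>) -> Vec<u32> {
    let mut sorted = Vec::new();
    while !pending.is_empty() {
        let ready: Vec<u32> = pending
            .iter()
            .filter(|(_, ancestor)| ancestor.map_or(true, |a| sorted.contains(&a)))
            .map(|(id, _)| *id)
            .collect();
        assert!(!ready.is_empty(), "ancestor cycle or missing ancestor");
        for id in ready {
            pending.remove(&id);
            sorted.push(id);
        }
    }
    sorted
}

fn main() {
    // 1 is the root; 2 branches off 1; 3 branches off 2.
    let ancestors = HashMap::from([(3, Some(2)), (2, Some(1)), (1, None)]);
    let order = tree_sort(ancestors);
    let pos = |t| order.iter().position(|&x| x == t);
    assert!(pos(1) < pos(2) && pos(2) < pos(3));
}
```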
@@ -584,11 +651,12 @@ impl Tenant {
         Ok(())
     }
 
-    #[instrument(skip(self, index_part, remote_storage), fields(timeline_id=%timeline_id))]
+    #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))]
     async fn load_remote_timeline(
         &self,
         timeline_id: TimelineId,
-        index_part: &IndexPart,
+        index_part: IndexPart,
+        remote_metadata: TimelineMetadata,
         remote_storage: GenericRemoteStorage,
     ) -> anyhow::Result<()> {
         info!("downloading index file for timeline {}", timeline_id);
@@ -596,13 +664,6 @@ impl Tenant {
             .await
             .context("Failed to create new timeline directory")?;
 
-        let remote_metadata = index_part.parse_metadata().with_context(|| {
-            format!(
-                "Failed to parse metadata file from remote storage for tenant {}",
-                self.tenant_id
-            )
-        })?;
-
         let remote_client =
             create_remote_timeline_client(remote_storage, self.conf, self.tenant_id, timeline_id)?;
@@ -619,11 +680,19 @@ impl Tenant {
             None
         };
 
+        // Even if there is local metadata, it cannot be ahead of the remote copy,
+        // since we're attaching. Even if we resume an interrupted attach, the
+        // remote copy cannot be older than the local one.
+        let local_metadata = None;
+
         self.setup_timeline(
             timeline_id,
             Some(remote_client),
-            Some(index_part),
-            remote_metadata,
+            Some(RemoteStartupData {
+                index_part,
+                remote_metadata,
+            }),
+            local_metadata,
             ancestor,
             true,
         )
@@ -810,12 +879,13 @@ impl Tenant {
         // 1. "Timeline has no ancestor and no layer files"
 
         // XXX get rid of enable_background_jobs
-        let enable_background_jobs = sorted_timelines.len() > 0;
+        let enable_background_jobs = !sorted_timelines.is_empty();
 
-        for (timeline_id, metadata) in sorted_timelines {
+        for (timeline_id, local_metadata) in sorted_timelines {
             // FIXME should we fail load of whole tenant if one timeline failed?
             // consider branch hierarchy. Maybe set one to broken and others to Paused or something
-            self.load_local_timeline(timeline_id, metadata).await?;
+            self.load_local_timeline(timeline_id, local_metadata)
+                .await?;
         }
 
         // We're ready for business.
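Two details above are worth a note: `load_remote_timeline` no longer re-parses the index part (the caller already parsed it while building `timeline_ancestors`), and during attach `local_metadata` is pinned to `None`. A toy sketch of the "parse once, carry both pieces together" shape behind `RemoteStartupData`, with stand-in types (the real `parse_metadata` decodes a binary `TimelineMetadata`, not an integer):

```rust
// Stand-ins: the real IndexPart holds serialized TimelineMetadata bytes.
struct IndexPart {
    metadata_bytes: Vec<u8>,
}

struct RemoteStartupData {
    index_part: IndexPart,
    remote_metadata: u64, // stand-in for the parsed TimelineMetadata
}

fn parse_metadata(bytes: &[u8]) -> Result<u64, String> {
    std::str::from_utf8(bytes)
        .map_err(|e| e.to_string())?
        .parse()
        .map_err(|e: std::num::ParseIntError| e.to_string())
}

fn main() -> Result<(), String> {
    let index_part = IndexPart { metadata_bytes: b"100".to_vec() };
    // Parse exactly once, then move both pieces around together.
    let remote_metadata = parse_metadata(&index_part.metadata_bytes)?;
    let startup = RemoteStartupData { index_part, remote_metadata };
    assert_eq!(startup.remote_metadata, 100);
    Ok(())
}
```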
@@ -831,13 +901,13 @@ impl Tenant {
     /// Subroutine of `load_tenant`, to load an individual timeline
     ///
     /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, metadata), fields(timeline_id=%timeline_id))]
+    #[instrument(skip(self, local_metadata), fields(timeline_id=%timeline_id))]
     async fn load_local_timeline(
         &self,
         timeline_id: TimelineId,
-        metadata: TimelineMetadata,
+        local_metadata: TimelineMetadata,
     ) -> anyhow::Result<()> {
-        let ancestor = if let Some(ancestor_timeline_id) = metadata.ancestor_timeline() {
+        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
             let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
                 .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
             Some(ancestor_timeline)
@@ -858,8 +928,33 @@ impl Tenant {
             })
             .transpose()?;
 
-        self.setup_timeline(timeline_id, remote_client, None, metadata, ancestor, false)
-            .await
+        let remote_startup_data = match &remote_client {
+            Some(remote_client) => match remote_client.download_index_file().await {
+                Ok(index_part) => {
+                    let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
+                    Some(RemoteStartupData {
+                        index_part,
+                        remote_metadata,
+                    })
+                }
+                Err(DownloadError::NotFound) => {
+                    info!("no index file was found on the remote");
+                    None
+                }
+                Err(e) => return Err(anyhow::anyhow!(e)),
+            },
+            None => None,
+        };
+
+        self.setup_timeline(
+            timeline_id,
+            remote_client,
+            remote_startup_data,
+            Some(local_metadata),
+            ancestor,
+            false,
+        )
+        .await
     }
 
     pub fn tenant_id(&self) -> TenantId {
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 6f22fbd5d2..f3a0a5171a 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -242,7 +242,8 @@ pub fn save_metadata(
     let mut file = VirtualFile::open_with_options(
         &path,
         OpenOptions::new().write(true).create_new(first_save),
-    )?;
+    )
+    .context("open_with_options")?;
 
     let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
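A missing index file is the one download error that is expected during local load: it simply means nothing was ever uploaded for this timeline. The match above collapses that case to `None` and propagates everything else. A self-contained sketch of the same shape (the enum is a stand-in for `remote_storage::DownloadError`, the `String` for `IndexPart`):

```rust
#[derive(Debug)]
enum DownloadError {
    NotFound,
    Other(String),
}

fn index_or_none(result: Result<String, DownloadError>) -> Result<Option<String>, String> {
    match result {
        Ok(index_part) => Ok(Some(index_part)),
        // Expected state: the remote simply has nothing for this timeline yet.
        Err(DownloadError::NotFound) => Ok(None),
        // Anything else (network, permissions, ...) is a real error.
        Err(e) => Err(format!("download failed: {e:?}")),
    }
}

fn main() {
    assert_eq!(index_or_none(Err(DownloadError::NotFound)), Ok(None));
    assert!(index_or_none(Err(DownloadError::Other("timeout".into()))).is_err());
}
```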
{}", - imgfilename, self.timeline_id, remote_consistent_lsn + imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn ); continue; } @@ -1078,10 +1066,10 @@ impl Timeline { // OK for a delta layer to have end LSN 101, but if the end LSN // is 102, then it might not have been fully flushed to disk // before crash. - if deltafilename.lsn_range.end > remote_consistent_lsn + 1 { + if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { warn!( "found future delta layer {} on timeline {} remote_consistent_lsn is {}", - deltafilename, self.timeline_id, remote_consistent_lsn + deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn ); continue; } @@ -1106,50 +1094,12 @@ impl Timeline { } } - // Save the metadata file to local disk. - // Do this last, so that if we crash in-between, we won't think that the - // local state is valid. - // - // We should not blindly overwrite local metadata with remote one - // consider the case when checkpoint came, we updated local metadata and started upload task - // but then pageserver crashes, then on start we'll load new metadata, and then reset it to the state of - // remote one. But current layermap will have layers from the old metadata which is inconsistent. - // And with current logic it wont disgard them during load because during layermap load it sees local - // disk consistent lsn which is ahead of layer lsns. - // If we treat remote as source of truth we need to completely syc with it, i e delete local files which are - // missing on the remote. This will add extra work, wal for these layers needs to be reingested for example - // - // So the solution is to update local metadata only when remote one has greater disk_consistent_lsn - - // We should merge local metadata with remote one - // 1) take metadata with highest disk_consistent_lsn. - // 2) Update disk_consistent_lsn inside timeline itself - // FIXME Something else we need to merge? - let up_to_date_metadata = - if local_metadata.disk_consistent_lsn() < remote_metadata.disk_consistent_lsn() { - save_metadata( - self.conf, - self.timeline_id, - self.tenant_id, - &remote_metadata, - first_save, - )?; - // NOTE: walreceiver strictly shouldnt be running at this point. - // because it will intersect with this update - // it will start replication from wrong point - self.disk_consistent_lsn - .store(remote_metadata.disk_consistent_lsn()); - remote_metadata - } else { - local_metadata - }; - // now these are local only filenames let local_only_filenames = local_filenames .difference(&remote_filenames) .cloned() .collect(); - Ok((local_only_filenames, up_to_date_metadata)) + Ok(local_only_filenames) } /// @@ -1164,15 +1114,15 @@ impl Timeline { /// This is used during tenant attach. The layer map must have been loaded /// with local filesystem contents already. /// - /// The caller can provide IndexPart if it has it already. If it's None, - /// this function will download it. + /// The caller should provide IndexPart if it exists on the remote storage. 
@@ -1164,15 +1114,15 @@ impl Timeline {
     /// This is used during tenant attach. The layer map must have been loaded
     /// with local filesystem contents already.
     ///
-    /// The caller can provide IndexPart if it has it already. If it's None,
-    /// this function will download it.
+    /// The caller should provide IndexPart if it exists in remote storage. If it's
+    /// None, we assume it is missing from remote storage, which means the timeline
+    /// was initialized and the pageserver restarted before a successful upload.
     ///
-    #[instrument(skip(self, index_part, local_metadata, first_save))]
+    #[instrument(skip(self, index_part, up_to_date_metadata))]
     pub async fn reconcile_with_remote(
         &self,
-        local_metadata: TimelineMetadata,
+        up_to_date_metadata: &TimelineMetadata,
         index_part: Option<&IndexPart>,
-        first_save: bool,
     ) -> anyhow::Result<()> {
         info!("starting");
         let remote_client = self
@@ -1180,42 +1130,33 @@ impl Timeline {
             .as_ref()
             .ok_or_else(|| anyhow!("cannot download without remote storage"))?;
 
+        let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
+
         // Build a map of local layers for quick lookups
         let mut local_filenames: HashSet<PathBuf> = HashSet::new();
         for layer in self.layers.read().unwrap().iter_historic_layers() {
             local_filenames.insert(layer.filename());
         }
 
-        // If the caller supplied an IndexPart, use it. Otherwise download it from remote storage.
-        // In case there is no such file on remote storage (which is possible if pageserver
-        // was interrupted before index file was uploaded).
-        // FIXME One way around it would be not considering timeline as initialized before upload finishes.
-        // So if there is no index file on the remote we consider all locally existing files as local only
-        // and schedule their upload
-        let index_part: Result<Cow<IndexPart>, DownloadError> = match index_part {
-            Some(ip) => Ok(Cow::Borrowed(ip)),
-            None => remote_client.download_index_file().await.map(Cow::Owned),
-        };
-        let (local_only_filenames, up_to_date_metadata) = match index_part {
-            Ok(index_part) => {
-                remote_client.init_upload_queue(&index_part)?;
-                let (local_only_filenames, up_to_date_metadata) = self
+        let local_only_filenames = match index_part {
+            Some(index_part) => {
+                info!("initializing upload queue from index");
+                let local_only_filenames = self
                     .download_missing(
-                        &index_part,
+                        index_part,
                         remote_client,
                         local_filenames,
-                        local_metadata,
-                        first_save,
+                        disk_consistent_lsn,
                     )
                     .await?;
-                (local_only_filenames, up_to_date_metadata)
+                remote_client.init_upload_queue(index_part)?;
+                local_only_filenames
             }
-            Err(DownloadError::NotFound) => {
-                info!("no index file was found on the remote, assuming initial upload");
-                remote_client.init_upload_queue_for_empty_remote(&local_metadata)?;
-                (local_filenames, local_metadata)
+            None => {
+                info!("initializing upload queue as empty");
+                remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
+                local_filenames
             }
-            Err(e) => return Err(anyhow::anyhow!(e)),
         };
 
         // TODO what to do with physical size?
@@ -1232,9 +1173,7 @@ impl Timeline {
             remote_client.schedule_layer_file_upload(&absolute, &LayerFileMetadata::new(sz))?;
         }
         if !local_only_filenames.is_empty() {
-            // FIXME this we should merge local and remote metadata, at least remote_consistent_lsn
-            // see comment in download_missing
-            remote_client.schedule_index_upload(&up_to_date_metadata)?;
+            remote_client.schedule_index_upload(up_to_date_metadata)?;
         }
 
         info!("Done");
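After `download_missing` returns, reconciliation reduces to set arithmetic: whatever exists locally but is absent from the remote index gets scheduled for upload, and if anything was scheduled, a fresh index upload follows. A minimal sketch of that difference with stand-in file names:

```rust
use std::collections::HashSet;
use std::path::PathBuf;

fn main() {
    let local: HashSet<PathBuf> = ["a.layer", "b.layer", "c.layer"]
        .into_iter()
        .map(PathBuf::from)
        .collect();
    let remote: HashSet<PathBuf> = ["a.layer"].into_iter().map(PathBuf::from).collect();

    // Present locally, absent from the remote index: schedule these for upload.
    let local_only: HashSet<PathBuf> = local.difference(&remote).cloned().collect();
    assert_eq!(local_only.len(), 2); // b.layer and c.layer
}
```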
@@ -1746,7 +1685,8 @@ impl Timeline {
         // After crash, we will restart WAL streaming and processing from that point.
         if disk_consistent_lsn != old_disk_consistent_lsn {
             assert!(disk_consistent_lsn > old_disk_consistent_lsn);
-            self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)?;
+            self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)
+                .context("update_metadata_file")?;
 
             // Also update the in-memory copy
             self.disk_consistent_lsn.store(disk_consistent_lsn);
         }
@@ -1801,13 +1741,18 @@ impl Timeline {
             self.tenant_id,
             &metadata,
             false,
-        )?;
+        )
+        .context("save_metadata")?;
 
         if let Some(remote_client) = &self.remote_client {
             for (path, layer_metadata) in layer_paths_to_upload {
-                remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
+                remote_client
+                    .schedule_layer_file_upload(&path, &layer_metadata)
+                    .context("schedule_layer_file_upload")?;
             }
-            remote_client.schedule_index_upload(&metadata)?;
+            remote_client
+                .schedule_index_upload(&metadata)
+                .context("schedule_index_upload")?;
         }
 
         Ok(())
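The `.context(...)` calls sprinkled through this patch all follow one pattern: each layer names the operation that failed, so the rendered error reads like a compact call trace. A toy example of the chaining, assuming the `anyhow` crate is available:

```rust
use anyhow::{Context, Result};

fn read_config() -> Result<String> {
    std::fs::read_to_string("/nonexistent/config")
        .context("read_to_string") // innermost: names the failing call
        .context("read_config") // outer: names the operation in progress
}

fn main() {
    if let Err(e) = read_config() {
        // Alternate formatting prints the whole chain, e.g.
        // "read_config: read_to_string: No such file or directory (os error 2)"
        println!("{e:#}");
    }
}
```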