diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 60cd5c9695..2e3ce45c2b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3395,6 +3395,12 @@ impl Tenant {
         let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
         let unfinished_timeline = raw_timeline.raw_timeline()?;
 
+        // Flush the new layer files to disk, before we make the timeline available to
+        // the outside world.
+        //
+        // The flush loop needs to be spawned in order to be able to flush.
+        unfinished_timeline.maybe_spawn_flush_loop();
+
         import_datadir::import_timeline_from_postgres_datadir(
             unfinished_timeline,
             &pgdata_path,
@@ -3406,12 +3412,6 @@ impl Tenant {
             format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
         })?;
 
-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        unfinished_timeline.maybe_spawn_flush_loop();
-
         fail::fail_point!("before-checkpoint-new-timeline", |_| {
             anyhow::bail!("failpoint before-checkpoint-new-timeline");
         });
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 58bdd84906..6da0f9d91c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1571,7 +1571,15 @@ impl Timeline {
     // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
     // polluting the span hierarchy.
     pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
-        let to_lsn = self.freeze_inmem_layer(false).await;
+        let to_lsn = {
+            // Freeze the current open in-memory layer. It will be written to disk on the next
+            // iteration.
+            let mut g = self.write_lock.lock().await;
+
+            let to_lsn = self.get_last_record_lsn();
+            self.freeze_inmem_layer_at(to_lsn, &mut g).await;
+            to_lsn
+        };
         self.flush_frozen_layers_and_wait(to_lsn).await
     }
 
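Note on the two hunks above: `maybe_spawn_flush_loop()` now runs before the datadir import, so flush requests issued during the import have a flush loop to serve them, and `freeze_and_flush0` inlines what used to be `freeze_inmem_layer(false)`, making the write-lock acquisition explicit. Below is a minimal compilable sketch of the resulting shape — `Tl`, `WriterState`, and plain `u64` LSNs are hypothetical stand-ins for the pageserver types, not the real API. The point it illustrates: the LSN to flush up to is read while the write lock is held, so no ingest can slip in between reading `last_record_lsn` and freezing at it.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use tokio::sync::{Mutex, MutexGuard};

// Hypothetical stand-ins; only the locking shape matters here.
struct WriterState; // stands in for TimelineWriterState

struct Tl {
    write_lock: Mutex<Option<WriterState>>,
    last_record_lsn: AtomicU64,
}

impl Tl {
    // Freezing clears the writer's reference to the open layer; the real
    // method also goes through LayerManager::try_freeze_in_memory_layer.
    async fn freeze_inmem_layer_at(
        &self,
        _at: u64,
        write_lock: &mut MutexGuard<'_, Option<WriterState>>,
    ) {
        write_lock.take();
    }

    // Shape of the new freeze_and_flush0 prologue: the flush LSN is read
    // under the write lock, then the layer is frozen at exactly that LSN.
    async fn freeze_and_flush0(&self) -> u64 {
        let to_lsn = {
            let mut g = self.write_lock.lock().await;
            let to_lsn = self.last_record_lsn.load(Ordering::Acquire);
            self.freeze_inmem_layer_at(to_lsn, &mut g).await;
            to_lsn
        };
        // ...followed by flush_frozen_layers_and_wait(to_lsn) in the real code.
        to_lsn
    }
}

#[tokio::main]
async fn main() {
    let tl = Tl {
        write_lock: Mutex::new(Some(WriterState)),
        last_record_lsn: AtomicU64::new(42),
    };
    assert_eq!(tl.freeze_and_flush0().await, 42);
}
```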
@@ -1657,25 +1665,35 @@ impl Timeline {
                 self.last_freeze_at.load(),
                 open_layer.get_opened_at(),
             ) {
-                match open_layer.info() {
+                let at_lsn = match open_layer.info() {
                     InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
                         // We may reach this point if the layer was already frozen but not yet flushed: flushing
                         // happens asynchronously in the background.
                         tracing::debug!(
                             "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
                         );
+                        None
                     }
                     InMemoryLayerInfo::Open { .. } => {
                         // Upgrade to a write lock and freeze the layer
                         drop(layers_guard);
                         let mut layers_guard = self.layers.write().await;
-                        layers_guard
-                            .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
+                        let froze = layers_guard
+                            .try_freeze_in_memory_layer(
+                                current_lsn,
+                                &self.last_freeze_at,
+                                &mut write_guard,
+                            )
                             .await;
+                        Some(current_lsn).filter(|_| froze)
+                    }
+                };
+                if let Some(lsn) = at_lsn {
+                    let res: Result<u64, _> = self.flush_frozen_layers(lsn);
+                    if let Err(e) = res {
+                        tracing::info!("failed to flush frozen layer after background freeze: {e:#}");
                     }
                 }
-                write_guard.take();
-                self.flush_frozen_layers();
             }
         }
 
@@ -2384,7 +2402,7 @@ impl Timeline {
                     let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
                     self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
                     let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
-                    assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
+                    assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
                     *flush_loop_state = FlushLoopState::Exited;
                     Ok(())
                 }
@@ -3647,31 +3665,21 @@ impl Timeline {
         self.last_record_lsn.advance(new_lsn);
     }
 
-    /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
-    /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
-    async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
-        // Freeze the current open in-memory layer. It will be written to disk on next
-        // iteration.
-
-        let _write_guard = if write_lock_held {
-            None
-        } else {
-            let mut g = self.write_lock.lock().await;
-            // remove the reference to an open layer
-            g.take();
-            Some(g)
+    async fn freeze_inmem_layer_at(
+        &self,
+        at: Lsn,
+        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
+    ) {
+        let frozen = {
+            let mut guard = self.layers.write().await;
+            guard
+                .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
+                .await
         };
-
-        let to_lsn = self.get_last_record_lsn();
-        self.freeze_inmem_layer_at(to_lsn).await;
-        to_lsn
-    }
-
-    async fn freeze_inmem_layer_at(&self, at: Lsn) {
-        let mut guard = self.layers.write().await;
-        guard
-            .try_freeze_in_memory_layer(at, &self.last_freeze_at)
-            .await;
+        if frozen {
+            let now = Instant::now();
+            *(self.last_freeze_ts.write().unwrap()) = now;
+        }
     }
 
     /// Layer flusher task's main loop.
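Note: `freeze_inmem_layer_at` now reports (via the layer manager) whether anything was actually frozen, updates `last_freeze_ts` only in that case, and the background freezer schedules the flush itself, logging rather than propagating a failure. Below is a compilable miniature of that flow; `Tl`, `LayerMap`, `u64` LSNs, and the stub `flush_frozen_layers` are hypothetical stand-ins for the real types.

```rust
use tokio::sync::RwLock;

struct LayerMap {
    open_layer: Option<u64>, // start LSN of the open layer, if any
}

struct Tl {
    layers: RwLock<LayerMap>,
}

impl Tl {
    // Stand-in for Timeline::flush_frozen_layers: enqueue a flush request
    // and hand back its token.
    fn flush_frozen_layers(&self, _at_lsn: u64) -> Result<u64, &'static str> {
        Ok(1)
    }

    async fn maybe_freeze(&self, current_lsn: u64) {
        let layers_guard = self.layers.read().await;
        let is_open = layers_guard.open_layer.is_some();
        // tokio's RwLock has no upgradable read guard, so drop and re-take,
        // exactly as the Open arm does with `drop(layers_guard)`. The freeze
        // itself must therefore tolerate racing with another freezer.
        drop(layers_guard);
        let at_lsn = if is_open {
            let mut layers_guard = self.layers.write().await;
            // Stand-in for try_freeze_in_memory_layer: a racing freezer may
            // have emptied the slot already, hence the `froze` result.
            let froze = layers_guard.open_layer.take().is_some();
            Some(current_lsn).filter(|_| froze)
        } else {
            None
        };
        if let Some(lsn) = at_lsn {
            // Best effort: log and let the next periodic pass retry.
            if let Err(e) = self.flush_frozen_layers(lsn) {
                eprintln!("failed to flush frozen layer after background freeze: {e}");
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let tl = Tl { layers: RwLock::new(LayerMap { open_layer: Some(0) }) };
    tl.maybe_freeze(10).await;
    assert!(tl.layers.read().await.open_layer.is_none());
}
```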
@@ -3765,18 +3773,14 @@ impl Timeline {
         }
     }
 
-    /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
-    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
+    /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk.
+    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`].
     ///
-    /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
-    /// it means no data will be written between the top of the highest frozen layer and to_lsn,
-    /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
-    async fn flush_frozen_layers_and_wait(
-        &self,
-        last_record_lsn: Lsn,
-    ) -> Result<(), FlushLayerError> {
-        let mut rx = self.layer_flush_done_tx.subscribe();
-
+    /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the
+    /// case, it means no data will be written between the top of the highest frozen layer and
+    /// `at_lsn`, e.g. because this tenant shard has ingested up to `at_lsn` and not written any data
+    /// locally for that part of the WAL.
+    fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result<u64, FlushLayerError> {
         // Increment the flush cycle counter and wake up the flush task.
         // Remember the new value, so that when we listen for the flush
         // to finish, we know when the flush that we initiated has
@@ -3791,13 +3795,18 @@ impl Timeline {
         self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
             my_flush_request = *counter + 1;
             *counter = my_flush_request;
-            *lsn = std::cmp::max(last_record_lsn, *lsn);
+            *lsn = std::cmp::max(at_lsn, *lsn);
         });
+        Ok(my_flush_request)
+    }
+
+    async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
+        let mut rx = self.layer_flush_done_tx.subscribe();
 
         loop {
             {
                 let (last_result_counter, last_result) = &*rx.borrow();
-                if *last_result_counter >= my_flush_request {
+                if *last_result_counter >= request {
                     if let Err(err) = last_result {
                         // We already logged the original error in
                         // flush_loop. We cannot propagate it to the caller
@@ -3824,12 +3833,9 @@ impl Timeline {
         }
     }
 
-    fn flush_frozen_layers(&self) {
-        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
-            *counter += 1;
-
-            *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
-        });
+    async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> {
+        let token = self.flush_frozen_layers(at_lsn)?;
+        self.wait_flush_completion(token).await
     }
 
     /// Flush one frozen in-memory layer to disk, as a new delta layer.
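Note: the old fire-and-forget `flush_frozen_layers()` and the blocking `flush_frozen_layers_and_wait` are refactored into a request half that returns a token and a `wait_flush_completion(token)` half, so callers can choose whether to wait. The sketch below mirrors that protocol with tokio watch channels; plain `u64`s stand in for `Lsn` and the error types, and names like `start_tx`/`done_tx` are illustrative rather than the pageserver's own.

```rust
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // (request counter, flush-up-to LSN), as carried by `layer_flush_start_tx`.
    let (start_tx, mut start_rx) = watch::channel((0u64, 0u64));
    // Counter of the last completed request, as in `layer_flush_done_tx`.
    let (done_tx, mut done_rx) = watch::channel(0u64);

    // The flush loop: drains requests and acknowledges the counter it saw.
    tokio::spawn(async move {
        while start_rx.changed().await.is_ok() {
            let (counter, _lsn) = *start_rx.borrow();
            // ... write frozen layers up to `_lsn` to disk here ...
            let _ = done_tx.send(counter);
        }
    });

    // flush_frozen_layers: request a flush up to LSN 42, get a token back.
    let mut my_flush_request = 0;
    start_tx.send_modify(|(counter, lsn)| {
        *counter += 1;
        my_flush_request = *counter;
        *lsn = (*lsn).max(42);
    });

    // wait_flush_completion: block until the loop has acknowledged our token.
    while *done_rx.borrow_and_update() < my_flush_request {
        done_rx.changed().await.expect("flush loop exited");
    }
}
```

The design point: splitting request from wait lets the background freezer request a flush without blocking on it, while `freeze_and_flush0` still gets the old blocking behavior by composing the two halves.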
@@ -5672,16 +5678,15 @@ impl<'a> TimelineWriter<'a> {
     }
 
     async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> {
-        assert!(self.write_guard.is_some());
-
-        self.tl.freeze_inmem_layer_at(freeze_at).await;
-
-        let now = Instant::now();
-        *(self.last_freeze_ts.write().unwrap()) = now;
-
-        self.tl.flush_frozen_layers();
-
         let current_size = self.write_guard.as_ref().unwrap().current_size;
+
+        // self.write_guard will be taken by the freezing
+        self.tl
+            .freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
+            .await;
+
+        self.tl.flush_frozen_layers(freeze_at)?;
+
         if current_size >= self.get_checkpoint_distance() * 2 {
             warn!("Flushed oversized open layer with size {}", current_size)
         }
@@ -5695,20 +5700,27 @@ impl<'a> TimelineWriter<'a> {
             return OpenLayerAction::Open;
         };
 
+        #[cfg(feature = "testing")]
         if state.cached_last_freeze_at < self.tl.last_freeze_at.load() {
-            // TODO(#7993): branch is needed before refactoring the many places of freezing for the
-            // possibility `state` having a "dangling" reference to an already frozen in-memory
-            // layer.
+            // This check and assertion are not really needed, because
+            // LayerManager::try_freeze_in_memory_layer will always clear out the
+            // TimelineWriterState if something is frozen. However, last_freeze_at can
+            // advance when there is no TimelineWriterState.
             assert!(
                 state.open_layer.end_lsn.get().is_some(),
                 "our open_layer must be outdated"
             );
-            return OpenLayerAction::Open;
+
+            // This would be a memory leak waiting to happen, because the in-memory
+            // layer always has an index.
+            panic!("BUG: TimelineWriterState held on to frozen in-memory layer.");
         }
 
         if state.prev_lsn == Some(lsn) {
-            // Rolling mid LSN is not supported by downstream code.
+            // Rolling mid LSN is not supported by [downstream code].
             // Hence, only roll at LSN boundaries.
+            //
+            // [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422
             return OpenLayerAction::None;
         }
 
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 0e82dedecb..21e64d562a 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -21,6 +21,8 @@ use crate::{
     },
 };
 
+use super::TimelineWriterState;
+
 /// Provides semantic APIs to manipulate the layer map.
 #[derive(Default)]
 pub(crate) struct LayerManager {
@@ -120,18 +122,20 @@ impl LayerManager {
         Ok(layer)
     }
 
-    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub(crate) async fn try_freeze_in_memory_layer(
+    /// Tries to freeze an open layer, and also manages clearing the TimelineWriterState.
+    ///
+    /// Returns true if anything was frozen.
+    pub(super) async fn try_freeze_in_memory_layer(
         &mut self,
         lsn: Lsn,
         last_freeze_at: &AtomicLsn,
-    ) {
+        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
+    ) -> bool {
         let Lsn(last_record_lsn) = lsn;
         let end_lsn = Lsn(last_record_lsn + 1);
 
-        if let Some(open_layer) = &self.layer_map.open_layer {
+        let froze = if let Some(open_layer) = &self.layer_map.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
-            // Does this layer need freezing?
            open_layer.freeze(end_lsn).await;
 
            // The layer is no longer open, update the layer map to reflect this.
@@ -139,11 +143,25 @@ impl LayerManager {
             self.layer_map.frozen_layers.push_back(open_layer_rc);
             self.layer_map.open_layer = None;
             self.layer_map.next_open_layer_at = Some(end_lsn);
-        }
+
+            true
+        } else {
+            false
+        };
 
         // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
         // accounts for regions in the LSN range where we might have ingested no data due to sharding.
         last_freeze_at.store(end_lsn);
+
+        // The writer state must no longer have a reference to the frozen layer.
+        let taken = write_lock.take();
+        assert_eq!(
+            froze,
+            taken.is_some(),
+            "should only have frozen a layer when TimelineWriterState existed"
+        );
+
+        froze
     }
 
     /// Add image layers to the layer map, called from `create_image_layers`.
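Note: the assertion at the end of `try_freeze_in_memory_layer` ties two facts together: a layer was frozen if and only if there was a `TimelineWriterState` to clear. If they ever diverged, the writer would keep an index into a frozen layer — the leak that the new `panic!` in `get_open_layer_action` guards against. A self-contained miniature of the invariant, with `Option<String>` layers and a `WriterState` marker as hypothetical stand-ins for the real structures:

```rust
struct WriterState; // stand-in for TimelineWriterState

fn try_freeze(
    open_layer: &mut Option<String>,
    frozen: &mut Vec<String>,
    write_lock: &mut Option<WriterState>,
) -> bool {
    // Freeze the open layer, if any, by moving it into the frozen queue.
    let froze = if let Some(layer) = open_layer.take() {
        frozen.push(layer);
        true
    } else {
        false
    };
    // Clearing the writer state must coincide exactly with having frozen a layer.
    let taken = write_lock.take();
    assert_eq!(
        froze,
        taken.is_some(),
        "should only have frozen a layer when TimelineWriterState existed"
    );
    froze
}

fn main() {
    let mut open = Some("inmem-0/10".to_string());
    let mut frozen = Vec::new();
    let mut writer = Some(WriterState);
    assert!(try_freeze(&mut open, &mut frozen, &mut writer));
    // Nothing open and no writer state: freezing is a no-op and must not trip the assert.
    assert!(!try_freeze(&mut open, &mut frozen, &mut writer));
}
```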