Separate flushing in-memory layer to disk from checkpoints.

When 'checkpoint_distance' is reached, freeze the current in-memory
layer directly in the WAL receiver thread, and launch a separate
"layer flushing thread" to flush the frozen layer to disk. This leaves
the checkpoint thread with only the compaction duty.
Author: Heikki Linnakangas
Date:   2022-03-14 11:37:22 +02:00
parent c559c72ede
commit 2d8587f67d
9 changed files with 175 additions and 135 deletions
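In outline: the WAL receiver freezes the open layer and kicks off a background flush, the layer flushing thread drains the frozen-layer queue, and the checkpoint thread is left with compaction only. Below is a minimal, self-contained Rust sketch of that flow, using hypothetical stand-in types and plain std::thread in place of the pageserver's LayerMap, thread_mgr, WAL-distance check, and error handling; the real code is in the diff that follows.

    use std::collections::VecDeque;
    use std::sync::{Arc, Mutex, TryLockError};
    use std::thread;

    // Stand-in for the real InMemoryLayer; just enough to make the sketch compile.
    struct InMemoryLayer;

    struct Timeline {
        open_layer: Mutex<Option<Arc<InMemoryLayer>>>,
        frozen_layers: Mutex<VecDeque<Arc<InMemoryLayer>>>,
        layer_flush_lock: Mutex<()>,
    }

    impl Timeline {
        // WAL receiver thread: freeze the open layer and hand flushing off to
        // a background "layer flushing thread". (The real version only freezes
        // once 'checkpoint_distance' bytes of WAL have accumulated.)
        fn check_checkpoint_distance(self: &Arc<Self>) -> thread::JoinHandle<()> {
            self.freeze_inmem_layer();
            let this = Arc::clone(self);
            thread::spawn(move || this.flush_frozen_layers(false))
        }

        fn freeze_inmem_layer(&self) {
            let mut frozen = self.frozen_layers.lock().unwrap();
            if let Some(open) = self.open_layer.lock().unwrap().take() {
                frozen.push_back(open);
            }
        }

        // Layer flushing thread: drain the frozen-layer queue. With
        // wait == false, back off immediately if another thread is flushing.
        fn flush_frozen_layers(&self, wait: bool) {
            let _guard = if wait {
                self.layer_flush_lock.lock().unwrap()
            } else {
                match self.layer_flush_lock.try_lock() {
                    Ok(guard) => guard,
                    Err(TryLockError::WouldBlock) => return,
                    Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
                }
            };
            loop {
                // Pop under the lock, but write to disk with no locks held.
                let next = self.frozen_layers.lock().unwrap().pop_front();
                if let Some(_layer) = next {
                    // ... write the frozen layer out as a delta file ...
                } else {
                    break;
                }
            }
        }
    }

    fn main() {
        let tl = Arc::new(Timeline {
            open_layer: Mutex::new(Some(Arc::new(InMemoryLayer))),
            frozen_layers: Mutex::new(VecDeque::new()),
            layer_flush_lock: Mutex::new(()),
        });
        tl.check_checkpoint_distance().join().unwrap();
        assert!(tl.frozen_layers.lock().unwrap().is_empty());
    }

The try_lock back-off mirrors flush_frozen_layers(false) in the diff: if a flusher is already running, it is guaranteed to pick up the newly frozen layer, so spawning another one is unnecessary.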

View File

@@ -29,7 +29,7 @@ use std::io::Write;
use std::ops::{Bound::Included, Deref, Range};
use std::path::{Path, PathBuf};
use std::sync::atomic::{self, AtomicBool};
-use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard};
+use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError};
use std::time::Instant;
use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME};
@@ -720,6 +720,8 @@ pub struct LayeredTimeline {
layers: Mutex<LayerMap>,
+last_freeze_at: AtomicLsn,
// WAL redo manager
walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
@@ -768,6 +770,9 @@ pub struct LayeredTimeline {
/// to avoid deadlock.
write_lock: Mutex<()>,
+/// Used to ensure that there is only one thread
+layer_flush_lock: Mutex<()>,
// Prevent concurrent checkpoints.
// Checkpoints are normally performed by one thread. But checkpoint can also be manually requested by admin
// (that's used in tests), and shutdown also forces a checkpoint. These forced checkpoints run in a different thread
@@ -854,15 +859,24 @@ impl Timeline for LayeredTimeline {
/// metrics collection.
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> {
match cconf {
-CheckpointConfig::Flush => self
-.flush_checkpoint_time_histo
-.observe_closure_duration(|| self.checkpoint_internal(0, false)),
-CheckpointConfig::Forced => self
-.forced_checkpoint_time_histo
-.observe_closure_duration(|| self.checkpoint_internal(0, true)),
-CheckpointConfig::Distance(distance) => self
+CheckpointConfig::Flush => {
+self.flush_checkpoint_time_histo
+.observe_closure_duration(|| {
+self.freeze_inmem_layer(false);
+self.flush_frozen_layers(true)
+})
+}
+CheckpointConfig::Forced => {
+self.forced_checkpoint_time_histo
+.observe_closure_duration(|| {
+self.freeze_inmem_layer(false);
+self.flush_frozen_layers(true)?;
+self.checkpoint_internal()
+})
+}
+CheckpointConfig::Distance => self
.checkpoint_time_histo
-.observe_closure_duration(|| self.checkpoint_internal(distance, true)),
+.observe_closure_duration(|| self.checkpoint_internal()),
}
}
@@ -969,6 +983,8 @@ impl LayeredTimeline {
}),
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),
+last_freeze_at: AtomicLsn::new(0),
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),
@@ -980,6 +996,7 @@ impl LayeredTimeline {
upload_relishes: AtomicBool::new(upload_relishes),
write_lock: Mutex::new(()),
+layer_flush_lock: Mutex::new(()),
checkpoint_cs: Mutex::new(()),
gc_info: RwLock::new(GcInfo {
@@ -1100,7 +1117,7 @@ impl LayeredTimeline {
let mut result = ValueReconstructResult::Continue;
let mut cont_lsn = Lsn(request_lsn.0 + 1);
-loop {
+'outer: loop {
// The function should have updated 'state'
//info!("CALLED for {} at {}: {:?} with {} records", reconstruct_state.key, reconstruct_state.lsn, result, reconstruct_state.records.len());
match result {
@@ -1169,7 +1186,7 @@ impl LayeredTimeline {
continue;
}
}
-if let Some(frozen_layer) = &layers.frozen_layer {
+for frozen_layer in layers.frozen_layers.iter() {
let start_lsn = frozen_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
@@ -1180,7 +1197,7 @@ impl LayeredTimeline {
)?;
cont_lsn = start_lsn;
path.push((result, cont_lsn, frozen_layer.clone()));
-continue;
+continue 'outer;
}
}
@@ -1258,7 +1275,7 @@ impl LayeredTimeline {
lsn
);
let new_layer =
-InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn, lsn)?;
+InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn)?;
let layer_rc = Arc::new(new_layer);
layers.open_layer = Some(Arc::clone(&layer_rc));
@@ -1273,7 +1290,6 @@ impl LayeredTimeline {
//info!("PUT: key {} at {}", key, lsn);
let layer = self.get_layer_for_write(lsn)?;
layer.put_value(key, lsn, val)?;
Ok(())
}
@@ -1284,80 +1300,53 @@ impl LayeredTimeline {
Ok(())
}
+fn finish_write(&self, new_lsn: Lsn) {
+assert!(new_lsn.is_aligned());
+self.last_record_lsn.advance(new_lsn);
+}
+fn freeze_inmem_layer(&self, write_lock_held: bool) {
+// Freeze the current open in-memory layer. It will be written to disk on next
+// iteration.
+let _write_guard = if write_lock_held {
+None
+} else {
+Some(self.write_lock.lock().unwrap())
+};
+let mut layers = self.layers.lock().unwrap();
+if let Some(open_layer) = &layers.open_layer {
+let open_layer_rc = Arc::clone(open_layer);
+// Does this layer need freezing?
+let end_lsn = Lsn(self.get_last_record_lsn().0 + 1);
+open_layer.freeze(end_lsn);
+// The layer is no longer open, update the layer map to reflect this.
+// We will replace it with on-disk historics below.
+layers.frozen_layers.push_back(open_layer_rc);
+layers.open_layer = None;
+layers.next_open_layer_at = Some(end_lsn);
+self.last_freeze_at.store(end_lsn);
+}
+drop(layers);
+}
///
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL.
-fn checkpoint_internal(&self, checkpoint_distance: u64, reconstruct_pages: bool) -> Result<()> {
+fn checkpoint_internal(&self) -> Result<()> {
info!("checkpoint starting");
// Prevent concurrent checkpoints
+// FIXME: This does compaction now, not the flushing of layers.
+// Is this lock still needed?
let _checkpoint_cs = self.checkpoint_cs.lock().unwrap();
-// If the in-memory layer is larger than 'checkpoint_distance', write it
-// to a delta file. That's necessary to limit the amount of WAL that
-// needs to be kept in the safekeepers, and that needs to be reprocessed
-// on page server crash.
-//
-// TODO: It's not a great policy for keeping memory usage in check,
-// though. We should also aim at flushing layers that consume a lot of
-// memory and/or aren't receiving much updates anymore.
-loop {
-// Do we have a frozen in-memory layer that we need to write out?
-// If we do, write it out now. Otherwise, check if the current
-// in-memory layer is old enough that we should freeze and write it out.
-let write_guard = self.write_lock.lock().unwrap();
-let mut layers = self.layers.lock().unwrap();
-if let Some(frozen_layer) = &layers.frozen_layer {
-// Write out the frozen in-memory layer to disk, as a delta file
-let frozen_layer = Arc::clone(frozen_layer);
-drop(write_guard);
-drop(layers);
-self.flush_frozen_layer(frozen_layer)?;
-} else {
-// Freeze the current open in-memory layer, if it's larger than
-// 'checkpoint_distance'. It will be written to disk on next
-// iteration.
-if let Some(open_layer) = &layers.open_layer {
-// Does this layer need freezing?
-let RecordLsn {
-last: last_record_lsn,
-prev: _prev_record_lsn,
-} = self.last_record_lsn.load();
-let oldest_lsn = open_layer.get_oldest_lsn();
-let distance = last_record_lsn.widening_sub(oldest_lsn);
-if distance < 0 || distance < checkpoint_distance.into() {
-info!(
-"the oldest layer is now {} which is {} bytes behind last_record_lsn",
-open_layer.filename().display(),
-distance
-);
-break;
-}
-let end_lsn = Lsn(self.get_last_record_lsn().0 + 1);
-open_layer.freeze(end_lsn);
-// The layer is no longer open, update the layer map to reflect this.
-// We will replace it with on-disk historics below.
-layers.frozen_layer = Some(Arc::clone(open_layer));
-layers.open_layer = None;
-layers.next_open_layer_at = Some(end_lsn);
-} else {
-break;
-}
-// We will write the now-frozen layer to disk on next iteration.
-// That could take a while, so release the lock while do it
-drop(layers);
-drop(write_guard);
-}
-}
// Create new image layers to allow GC and to reduce read latency
-if reconstruct_pages {
-// TODO: the threshold for how often we create image layers is
-// currently hard-coded at 3. It means, write out a new image layer,
-// if there are at least three delta layers on top of it.
-self.compact(TARGET_FILE_SIZE_BYTES as usize)?;
-}
+// TODO: the threshold for how often we create image layers is
+// currently hard-coded at 3. It means, write out a new image layer,
+// if there are at least three delta layers on top of it.
+self.compact(TARGET_FILE_SIZE_BYTES as usize)?;
// TODO: We should also compact existing delta layers here.
@@ -1373,20 +1362,86 @@ impl LayeredTimeline {
Ok(())
}
+pub fn check_checkpoint_distance(self: &Arc<LayeredTimeline>) -> Result<()> {
+let last_lsn = self.get_last_record_lsn();
+let distance = last_lsn.widening_sub(self.last_freeze_at.load());
+if distance >= self.conf.checkpoint_distance.into() {
+self.freeze_inmem_layer(true);
+self.last_freeze_at.store(last_lsn);
+}
+if let Ok(guard) = self.layer_flush_lock.try_lock() {
+drop(guard);
+let self_clone = Arc::clone(self);
+thread_mgr::spawn(
+thread_mgr::ThreadKind::LayerFlushThread,
+Some(self.tenantid),
+Some(self.timelineid),
+"layer flush thread",
+move || self_clone.flush_frozen_layers(false),
+)?;
+}
+Ok(())
+}
+/// Flush all frozen layers to disk.
+///
+/// Only one thread at a time can be doing layer-flushing for a
+/// given timeline. If 'wait' is true, and another thread is
+/// currently doing the flushing, this function will wait for it
+/// to finish. If 'wait' is false, this function will return
+/// immediately instead.
+fn flush_frozen_layers(&self, wait: bool) -> Result<()> {
+let flush_lock_guard = if wait {
+self.layer_flush_lock.lock().unwrap()
+} else {
+match self.layer_flush_lock.try_lock() {
+Ok(guard) => guard,
+Err(TryLockError::WouldBlock) => return Ok(()),
+Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
+}
+};
+loop {
+let layers = self.layers.lock().unwrap();
+if let Some(frozen_layer) = layers.frozen_layers.front() {
+let frozen_layer = Arc::clone(frozen_layer);
+drop(layers); // to allow concurrent reads and writes
+self.flush_frozen_layer(frozen_layer)?;
+} else {
+// Drop the 'layer_flush_lock' *before* 'layers'. That
+// way, if you freeze a layer, and then call
+// flush_frozen_layers(false), it is guaranteed that
+// if another thread was busy flushing layers and the
+// call therefore returns immediately, the other
+// thread will have seen the newly-frozen layer and
+// will flush that too (assuming no errors).
+drop(flush_lock_guard);
+drop(layers);
+break;
+}
+}
+Ok(())
+}
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
// Do we have a frozen in-memory layer that we need to write out?
let new_delta = frozen_layer.write_to_disk()?;
// Finally, replace the frozen in-memory layer with the new on-disk layers
let write_guard = self.write_lock.lock().unwrap();
let mut layers = self.layers.lock().unwrap();
-layers.frozen_layer = None;
+let l = layers.frozen_layers.pop_front();
+// Only one thread may call this function at a time (for this
+// timeline). If two threads tried to flush the same frozen
+// layer to disk at the same time, that would not work.
+assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer));
// Add the new delta layer to the LayerMap
let mut layer_paths = vec![new_delta.path()];
layers.insert_historic(Arc::new(new_delta));
drop(write_guard);
drop(layers);
// Sync layers
@@ -1929,10 +1984,8 @@ impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> {
///
/// Remember the (end of) last valid WAL record remembered in the timeline.
///
-fn advance_last_record_lsn(&self, new_lsn: Lsn) {
-assert!(new_lsn.is_aligned());
-self.tl.last_record_lsn.advance(new_lsn);
+fn finish_write(&self, new_lsn: Lsn) {
+self.tl.finish_write(new_lsn);
}
}
@@ -2031,7 +2084,7 @@ mod tests {
let writer = tline.writer();
writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?;
-writer.advance_last_record_lsn(Lsn(0x10));
+writer.finish_write(Lsn(0x10));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced)?;
@@ -2039,7 +2092,7 @@ mod tests {
let writer = tline.writer();
writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?;
-writer.advance_last_record_lsn(Lsn(0x20));
+writer.finish_write(Lsn(0x20));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced)?;
@@ -2047,7 +2100,7 @@ mod tests {
let writer = tline.writer();
writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?;
-writer.advance_last_record_lsn(Lsn(0x30));
+writer.finish_write(Lsn(0x30));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced)?;
@@ -2055,7 +2108,7 @@ mod tests {
let writer = tline.writer();
writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?;
-writer.advance_last_record_lsn(Lsn(0x40));
+writer.finish_write(Lsn(0x40));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced)?;
@@ -2094,7 +2147,7 @@ mod tests {
lsn,
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
)?;
-writer.advance_last_record_lsn(lsn);
+writer.finish_write(lsn);
drop(writer);
keyspace.add_key(test_key);
@@ -2145,7 +2198,7 @@ mod tests {
lsn,
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
)?;
-writer.advance_last_record_lsn(lsn);
+writer.finish_write(lsn);
updated[blknum] = lsn;
drop(writer);
@@ -2167,7 +2220,7 @@ mod tests {
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
)?;
println!("updating {} at {}", blknum, lsn);
-writer.advance_last_record_lsn(lsn);
+writer.finish_write(lsn);
drop(writer);
updated[blknum] = lsn;
}
@@ -2219,7 +2272,7 @@ mod tests {
lsn,
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
)?;
-writer.advance_last_record_lsn(lsn);
+writer.finish_write(lsn);
updated[blknum] = lsn;
drop(writer);
@@ -2253,7 +2306,7 @@ mod tests {
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
)?;
println!("updating {} at {}", blknum, lsn);
-writer.advance_last_record_lsn(lsn);
+writer.finish_write(lsn);
drop(writer);
updated[blknum] = lsn;
}

View File

@@ -34,21 +34,6 @@ pub struct InMemoryLayer {
///
start_lsn: Lsn,
-///
-/// LSN of the oldest value stored in this layer.
-///
-/// This is different from 'start_lsn' in that we enforce that the 'start_lsn'
-/// of a layer always matches the 'end_lsn' of its predecessor, even if there
-/// are no page versions until at a later LSN. That way you can detect any
-/// missing layer files more easily. 'oldest_lsn' is the first page version
-/// actually stored in this layer. In the range between 'start_lsn' and
-/// 'oldest_lsn', there are no changes to the segment.
-/// 'oldest_lsn' is used to adjust 'disk_consistent_lsn' and that is why it should
-/// point to the beginning of WAL record. This is the other difference with 'start_lsn'
-/// which points to end of WAL record. This is why 'oldest_lsn' can be smaller than 'start_lsn'.
-///
-oldest_lsn: Lsn,
/// The above fields never change. The parts that do change are in 'inner',
/// and protected by mutex.
inner: RwLock<InMemoryLayerInner>,
@@ -236,11 +221,6 @@ impl Layer for InMemoryLayer {
}
impl InMemoryLayer {
-/// Return the oldest page version that's stored in this layer
-pub fn get_oldest_lsn(&self) -> Lsn {
-self.oldest_lsn
-}
///
/// Create a new, empty, in-memory layer
///
@@ -249,7 +229,6 @@ impl InMemoryLayer {
timelineid: ZTimelineId,
tenantid: ZTenantId,
start_lsn: Lsn,
-oldest_lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
"initializing new empty InMemoryLayer for writing on timeline {} at {}",
@@ -264,7 +243,6 @@ impl InMemoryLayer {
timelineid,
tenantid,
start_lsn,
-oldest_lsn,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
index: HashMap::new(),

View File

@@ -16,6 +16,7 @@ use crate::layered_repository::InMemoryLayer;
use crate::repository::Key;
use anyhow::Result;
use lazy_static::lazy_static;
+use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use tracing::*;
@@ -47,7 +48,7 @@ pub struct LayerMap {
/// layer is during checkpointing, when an InMemoryLayer is being written out
/// to disk.
///
-pub frozen_layer: Option<Arc<InMemoryLayer>>,
+pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
/// All the historic layers are kept here

View File

@@ -42,7 +42,7 @@ pub const LOG_FILE_NAME: &str = "pageserver.log";
#[derive(Debug, Clone, Copy)]
pub enum CheckpointConfig {
// Flush in-memory data that is older than this
-Distance(u64),
+Distance,
// Flush all in-memory data
Flush,
// Flush all in-memory data and reconstruct all page images
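Taken together with the new match in checkpoint() above, the three variants now divide up as follows (illustrative calls against the Timeline trait):

    // Flush: freeze the open layer and flush all frozen layers to disk; no compaction.
    timeline.checkpoint(CheckpointConfig::Flush)?;
    // Forced: freeze and flush, then compact (checkpoint_internal).
    timeline.checkpoint(CheckpointConfig::Forced)?;
    // Distance: compaction only. Flushing on 'checkpoint_distance' is now
    // driven from the WAL receiver via check_checkpoint_distance(), which is
    // why the variant no longer carries a distance argument.
    timeline.checkpoint(CheckpointConfig::Distance)?;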

View File

@@ -798,7 +798,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
writer.delete(key_range.clone(), self.lsn)?;
}
-writer.advance_last_record_lsn(self.lsn);
+writer.finish_write(self.lsn);
if last_partitioning == Lsn(0)
|| self.lsn.0 - last_partitioning.0 > TARGET_FILE_SIZE_BYTES / 8

View File

@@ -401,11 +401,14 @@ pub trait TimelineWriter<'a> {
fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()>;
-/// Track end of the latest digested WAL record.
+/// Track the end of the latest digested WAL record.
///
-/// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
-/// Previous last record LSN is stored alongside the latest and can be read.
-fn advance_last_record_lsn(&self, lsn: Lsn);
+/// Call this after you have finished writing all the WAL up to 'lsn'.
+///
+/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
+/// the 'lsn' or anything older. The previous last record LSN is stored alongside
+/// the latest and can be read.
+fn finish_write(&self, lsn: Lsn);
}
#[cfg(test)]
@@ -554,12 +557,12 @@ mod tests {
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?;
-writer.advance_last_record_lsn(Lsn(0x10));
+writer.finish_write(Lsn(0x10));
drop(writer);
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?;
-writer.advance_last_record_lsn(Lsn(0x20));
+writer.finish_write(Lsn(0x20));
drop(writer);
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
@@ -594,12 +597,12 @@ mod tests {
// Insert a value on the timeline
writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?;
writer.put(TEST_KEY_B, Lsn(0x20), test_value("foobar at 0x20"))?;
-writer.advance_last_record_lsn(Lsn(0x20));
+writer.finish_write(Lsn(0x20));
writer.put(TEST_KEY_A, Lsn(0x30), test_value("foo at 0x30"))?;
-writer.advance_last_record_lsn(Lsn(0x30));
+writer.finish_write(Lsn(0x30));
writer.put(TEST_KEY_A, Lsn(0x40), test_value("foo at 0x40"))?;
-writer.advance_last_record_lsn(Lsn(0x40));
+writer.finish_write(Lsn(0x40));
//assert_current_logical_size(&tline, Lsn(0x40));
@@ -611,7 +614,7 @@ mod tests {
};
let new_writer = newtline.writer();
new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?;
-new_writer.advance_last_record_lsn(Lsn(0x40));
+new_writer.finish_write(Lsn(0x40));
// Check page contents on both branches
assert_eq!(
@@ -643,14 +646,14 @@ mod tests {
lsn,
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
-writer.advance_last_record_lsn(lsn);
+writer.finish_write(lsn);
lsn += 0x10;
writer.put(
*TEST_KEY,
lsn,
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
-writer.advance_last_record_lsn(lsn);
+writer.finish_write(lsn);
lsn += 0x10;
}
tline.checkpoint(CheckpointConfig::Forced)?;
@@ -661,14 +664,14 @@ mod tests {
lsn,
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
-writer.advance_last_record_lsn(lsn);
+writer.finish_write(lsn);
lsn += 0x10;
writer.put(
*TEST_KEY,
lsn,
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
-writer.advance_last_record_lsn(lsn);
+writer.finish_write(lsn);
}
tline.checkpoint(CheckpointConfig::Forced)
}

View File

@@ -34,7 +34,7 @@ fn checkpoint_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Re
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
// bytes of WAL since last checkpoint.
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
+repo.checkpoint_iteration(CheckpointConfig::Distance)?;
}
trace!(

View File

@@ -98,6 +98,9 @@ pub enum ThreadKind {
// Thread that handles GC of a tenant
GarbageCollector,
+// FIXME
+LayerFlushThread,
// Thread for synchronizing pageserver relish data with the remote storage.
// Shared by all tenants.
StorageSync,

View File

@@ -266,6 +266,8 @@ fn walreceiver_main(
caught_up = true;
}
+timeline.tline.check_checkpoint_distance()?;
Some(endlsn)
}