Compare commits

...

5 Commits

Author SHA1 Message Date
anastasia
5e91d15f06 REVERT ME. Add unlink test and a bunch of debug printlns 2021-09-02 17:29:29 +03:00
anastasia
8c46178256 Attempt to fix list_rels & list_nonrels for dropped segments 2021-09-02 16:46:11 +03:00
anastasia
2a4d405f96 Rename put_unlink() to drop_relish() in Timeline trait.
Rename put_unlink() to drop_segment() in Layer trait.
2021-09-02 16:37:05 +03:00
anastasia
66174c0342 Improve comments for Layer trait. 2021-09-02 16:33:47 +03:00
anastasia
cee366ac17 Don't use term 'snapshot' to describe layers 2021-09-02 16:33:30 +03:00
13 changed files with 249 additions and 112 deletions

View File

@@ -26,7 +26,7 @@ A checkpoint record in the WAL marks a point in the WAL sequence at which it is
NOTE: This is an overloaded term.
Whenever enough WAL has been accumulated in memory, the page server []
writes out the changes in memory into new layer files[]. This process
writes out the changes from in-memory layers into new layer files[]. This process
is called "checkpointing". The page server only creates layer files for
relations that have been modified since the last checkpoint.
@@ -41,17 +41,28 @@ Stateless Postgres node that stores data in pageserver.
Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
Each PostgreSQL fork is considered a separate relish.
### Layer file
### Layer
Each layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
There are two kinds of layers, in-memory and on-disk layers. In-memory
layers are used to ingest incoming WAL, and provide fast access
to the recent page versions. On-disk layers are stored as files on disk, and
are immutable.
### Layer file (on-disk layer)
Layered repository on-disk format is based on immutable files. The
files are called "layer files". Each file corresponds to one 10 MB
files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE
segment of a PostgreSQL relation fork. There are two kinds of layer
files: image files and delta files. An image file contains a
"snapshot" of the segment at a particular LSN, and a delta file
contains WAL records applicable to the segment, in a range of LSNs.
### Layer map
The layer map tracks what layers exist for all the relishes in a timeline.
### Layered repository
Zenith repository implementation that keeps data in layers.
### LSN
@@ -121,7 +132,7 @@ Each SLRU segment is considered a separate relish[].
### Tenant (Multitenancy)
Tenant represents a single customer, interacting with Zenith.
Wal redo[] activity, timelines[], snapshots[] are managed for each tenant independently.
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
One pageserver[] can serve multiple tenants at once.
One safekeeper

View File

@@ -37,7 +37,7 @@ On the page server tenants introduce one level of indirection, so data directory
├── de182bc61fb11a5a6b390a8aed3a804a
└── ee6016ec31116c1b7c33dfdfca38891f
```
Wal redo activity, timelines, snapshots are managed for each tenant independently.
Wal redo activity and timelines are managed for each tenant independently.
For local environment used for example in tests there also new level of indirection for tenants. It touches `pgdatadirs` directory. Now it contains `tenants` subdirectory so the structure looks the following way:

View File

@@ -583,7 +583,7 @@ impl Timeline for LayeredTimeline {
}
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>> {
trace!("list_rels called at {}", lsn);
println!("list_rels called at {}", lsn);
// List all rels in this timeline, and all its ancestors.
let mut all_rels = HashSet::new();
@@ -675,14 +675,14 @@ impl Timeline for LayeredTimeline {
(relsize - 1) / RELISH_SEG_SIZE
};
// Unlink segments beyond the last remaining segment.
// Drop segments beyond the last remaining segment.
for remove_segno in (last_remain_seg + 1)..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
// Truncate the last remaining segment to the specified size
@@ -698,8 +698,8 @@ impl Timeline for LayeredTimeline {
Ok(())
}
fn put_unlink(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("put_unlink: {} at {}", rel, lsn);
fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("drop_segment: {} at {}", rel, lsn);
if rel.is_blocky() {
let oldsize_opt = self.get_relish_size(rel, self.get_last_record_lsn())?;
@@ -710,25 +710,25 @@ impl Timeline for LayeredTimeline {
(oldsize - 1) / RELISH_SEG_SIZE
};
// Unlink all segments
// Drop all segments of the relish
for remove_segno in 0..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
} else {
warn!(
"put_unlink called on non-existent relish {} at {}",
"drop_segment called on non-existent relish {} at {}",
rel, lsn
);
}
} else {
let seg = SegmentTag::from_blknum(rel, 0);
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
Ok(())
@@ -927,7 +927,6 @@ impl LayeredTimeline {
assert!(layer.get_start_lsn() <= lsn);
if layer.is_dropped() && layer.get_end_lsn() <= lsn {
// The segment was unlinked
return Ok(None);
}
@@ -988,7 +987,7 @@ impl LayeredTimeline {
if prev_layer.get_timeline_id() != self.timelineid {
// First modification on this timeline
start_lsn = self.ancestor_lsn;
trace!(
println!(
"creating file for write for {} at branch point {}/{}",
seg,
self.timelineid,
@@ -996,14 +995,14 @@ impl LayeredTimeline {
);
} else {
start_lsn = prev_layer.get_end_lsn();
trace!(
println!(
"creating file for write for {} after previous layer {}/{}",
seg,
self.timelineid,
start_lsn
);
}
trace!(
println!(
"prev layer is at {}/{} - {}",
prev_layer.get_timeline_id(),
prev_layer.get_start_lsn(),
@@ -1019,7 +1018,7 @@ impl LayeredTimeline {
)?;
} else {
// New relation.
trace!(
println!(
"creating layer for write for new rel {} at {}/{}",
seg,
self.timelineid,
@@ -1031,6 +1030,7 @@ impl LayeredTimeline {
}
let mut layers = self.layers.lock().unwrap();
println!("before layers.insert_open {:?}", layer.filename());
let layer_rc: Arc<InMemoryLayer> = Arc::new(layer);
layers.insert_open(Arc::clone(&layer_rc));
@@ -1091,7 +1091,7 @@ impl LayeredTimeline {
prev: prev_record_lsn,
} = self.last_record_lsn.load();
trace!(
println!(
"checkpointing timeline {} at {}",
self.timelineid,
last_record_lsn
@@ -1227,7 +1227,7 @@ impl LayeredTimeline {
//
// Determine for each file if it needs to be retained
// FIXME: also scan open in-memory layers. Normally we cannot remove the
// latest layer of any seg, but if it was unlinked it's possible
// latest layer of any seg, but if it was dropped it's possible
let mut layers = self.layers.lock().unwrap();
'outer: for l in layers.iter_historic_layers() {
let seg = l.get_seg_tag();

View File

@@ -6,13 +6,14 @@ which pages they apply to, and accumulates the incoming changes in
memory. Every now and then, the accumulated changes are written out to
new files.
The files are called "snapshot files". Each snapshot file corresponds
to one 10 MB slice of a PostgreSQL relation fork. The snapshot files
The files are called "layer files". Each layer file corresponds
to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or
non-rel file in a range of LSNs. The layer files
for each timeline are stored in the timeline's subdirectory under
.zenith/tenants/<tenantid>/timelines.
There are two kind of snapshot file: base images, and deltas. A base
image file contains a snapshot of a segment as it was at one LSN,
There are two kinds of layer files: base images, and deltas. A base
image file contains an image of a segment as it was at one LSN,
whereas a delta file contains modifications to a segment - mostly in
the form of WAL records - in a range of LSN
@@ -44,7 +45,7 @@ managed, except that the first part of file names is different.
Internally, the relations and non-relation files that are managed in
the versioned store are together called "relishes".
If a file has been dropped, the last snapshot file for it is created
If a file has been dropped, the last layer file for it is created
with the _DROPPED suffix, e.g.
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED
@@ -67,7 +68,7 @@ for 'orders' table on 'main' branch is represented like this:
main/orders_100_200
# Creating snapshot files
# Creating layer files
Let's start with a simple example with a system that contains one
branch called 'main' and two tables, 'orders' and 'customers'. The end
@@ -86,10 +87,10 @@ end of WAL at 250 are kept in memory. If the page server crashes, the
latest records between 200-250 need to be re-read from the WAL.
Whenever enough WAL has been accumulated in memory, the page server
writes out the changes in memory into new snapshot files. This process
writes out the changes in memory into new layer files. This process
is called "checkpointing" (not to be confused with the PostgreSQL
checkpoints, that's a different thing). The page server only creates
snapshot files for relations that have been modified since the last
layer files for relations that have been modified since the last
checkpoint. For example, if the current end of WAL is at LSN 450, and
the last checkpoint happened at LSN 400 but there hasn't been any
recent changes to 'customers' table, you would have these files on
@@ -108,7 +109,7 @@ disk:
If the customers table is modified later, a new file is created for it
at the next checkpoint. The new file will cover the "gap" from the
last snapshot file, so the LSN ranges are always contiguous:
last layer file, so the LSN ranges are always contiguous:
main/orders_100
main/orders_100_200
@@ -130,13 +131,13 @@ page server needs to reconstruct the requested page, as it was at the
requested LSN. To do that, the page server first checks the recent
in-memory layer; if the requested page version is found there, it can
be returned immediately without looking at the files on
disk. Otherwise the page server needs to locate the snapshot file that
disk. Otherwise the page server needs to locate the layer file that
contains the requested page version.
For example, if a request comes in for table 'orders' at LSN 250, the
page server would load the 'main/orders_200_300' file into memory, and
reconstruct and return the requested page from it, as it was at
LSN 250. Because the snapshot file consists of a full image of the
LSN 250. Because the layer file consists of a full image of the
relation at the start LSN and the WAL, reconstructing the page
involves replaying any WAL records applicable to the page between LSNs
200-250, starting from the base image at LSN 200.
@@ -171,7 +172,7 @@ Then, the 'orders' table is updated differently on the 'main' and
Because the 'customers' table hasn't been modified on the child
branch, there is no file for it there. If you request a page for it on
the 'child' branch, the page server will not find any snapshot file
the 'child' branch, the page server will not find any layer file
for it in the 'child' directory, so it will recurse to look into the
parent 'main' branch instead.
@@ -217,7 +218,7 @@ branch at a historic LSN, is how we support PITR in Zenith.
# Garbage collection
In this scheme, we keep creating new snapshot files over time. We also
In this scheme, we keep creating new layer files over time. We also
need a mechanism to remove old files that are no longer needed,
because disk space isn't infinite.
@@ -245,7 +246,7 @@ of the branch is LSN 525, so that the GC horizon is currently at
main/customers_200
We can remove the following files because the end LSNs of those files are
older than GC horizon 375, and there are more recent snapshot files for the
older than GC horizon 375, and there are more recent layer files for the
table:
main/orders_100 DELETE
@@ -262,7 +263,7 @@ table:
main/customers_200 KEEP, NO NEWER VERSION
'main/customers_100_200' is old enough, but it cannot be
removed because there is no newer snapshot file for the table.
removed because there is no newer layer file for the table.
Things get slightly more complicated with multiple branches. All of
the above still holds, but in addition to recent files we must also
@@ -308,7 +309,7 @@ new base image and delta file for it on the child:
After this, the 'main/orders_100' and 'main/orders_100_200' file could
be removed. It is no longer needed by the child branch, because there
is a newer snapshot file there. TODO: This optimization hasn't been
is a newer layer file there. TODO: This optimization hasn't been
implemented! The GC algorithm will currently keep the file on the
'main' branch anyway, for as long as the child branch exists.
@@ -346,7 +347,7 @@ It would also be OK to have overlapping LSN ranges for the same relation:
main/orders_300_400
main/orders_400
The code that reads the snapshot files should cope with this, but this
The code that reads the layer files should cope with this, but this
situation doesn't arise either, because the checkpointing code never
does that. It could be useful, however, as a transient state when
garbage collecting around branch points, or explicit recovery
@@ -360,6 +361,6 @@ points. For example, if we start with this:
And there is a branch or explicit recovery point at LSN 150, we could
replace 'main/orders_100_200' with 'main/orders_150' to keep a
snapshot only at that exact point that's still needed, removing the
layer only at that exact point that's still needed, removing the
other page versions around it. But such compaction has not been
implemented yet.

View File

@@ -174,7 +174,7 @@ impl Layer for DeltaLayer {
{
// Open the file and lock the metadata in memory
// TODO: avoid opening the snapshot file for each read
// TODO: avoid opening the file for each read
let (_path, book) = self.open_book()?;
let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
let inner = self.load()?;
@@ -251,6 +251,8 @@ impl Layer for DeltaLayer {
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
assert!(lsn >= self.start_lsn);
// Is the requested LSN after the rel was dropped?
if self.dropped && lsn >= self.end_lsn {
return Ok(false);

View File

@@ -2,8 +2,9 @@
//! It is stored in a file on disk.
//!
//! On disk, the image files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each snapshot file is named like this:
//! Currently, there are no subdirectories, and each image layer file is named like this:
//!
//! Note that segno is
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
//!
//! For example:
@@ -15,10 +16,10 @@
//! Only metadata is loaded into memory by the load function.
//! When images are needed, they are read directly from disk.
//!
//! For blocky segments, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! All the images are required to be BLOCK_SIZE, which allows for random access.
//!
//! For non-blocky segments, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//!
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
@@ -156,7 +157,9 @@ impl Layer for ImageLayer {
}
/// Get size of the segment
fn get_seg_size(&self, _lsn: Lsn) -> Result<u32> {
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
assert!(lsn >= self.lsn);
let inner = self.load()?;
match inner.image_type {
ImageType::Blocky { num_blocks } => Ok(num_blocks),
@@ -165,7 +168,8 @@ impl Layer for ImageLayer {
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
assert!(lsn >= self.lsn);
Ok(true)
}

View File

@@ -198,6 +198,8 @@ impl Layer for InMemoryLayer {
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
assert!(lsn >= self.start_lsn);
let inner = self.inner.lock().unwrap();
// Is the requested LSN after the segment was dropped?
@@ -239,8 +241,8 @@ impl Layer for InMemoryLayer {
.unwrap_or_default();
println!(
"----- in-memory layer for {} {}-{} ----",
self.seg, self.start_lsn, end_str
"----- in-memory layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, end_str
);
for (k, v) in inner.segsizes.iter() {
@@ -271,7 +273,7 @@ impl InMemoryLayer {
start_lsn: Lsn,
oldest_pending_lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
println!(
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
seg,
timelineid,
@@ -325,7 +327,7 @@ impl InMemoryLayer {
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
assert!(self.seg.blknum_in_seg(blknum));
trace!(
println!(
"put_page_version blk {} of {} at {}/{}",
blknum,
self.seg.rel,
@@ -352,7 +354,7 @@ impl InMemoryLayer {
// which we've just acquired above
let oldsize = inner.get_seg_size(lsn);
if newsize > oldsize {
trace!(
println!(
"enlarging segment {} from {} to {} blocks at {}",
self.seg,
oldsize,
@@ -380,13 +382,41 @@ impl InMemoryLayer {
}
/// Remember that the segment was dropped at given LSN
pub fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
pub fn drop_segment(&self, lsn: Lsn) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
assert!(inner.drop_lsn.is_none());
inner.drop_lsn = Some(lsn);
info!("dropped segment {} at {}", self.seg, lsn);
println!("dropped segment {} at {}", self.seg, lsn);
let end_str = inner
.drop_lsn
.as_ref()
.map(|drop_lsn| drop_lsn.to_string())
.unwrap_or_default();
{
let mut result = format!(
"----- inmemory layer tli {} for {} {}-{} ----\n",
self.timelineid, self.seg, self.start_lsn, end_str
);
for (k, v) in inner.segsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
for (k, v) in inner.page_versions.iter() {
result += &format!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
);
}
println!("{}", result);
}
Ok(())
}
@@ -405,7 +435,7 @@ impl InMemoryLayer {
) -> Result<InMemoryLayer> {
let seg = src.get_seg_tag();
trace!(
println!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
seg,
timelineid,
@@ -436,7 +466,7 @@ impl InMemoryLayer {
}
///
/// Write the this in-memory layer to disk, as a snapshot layer.
/// Write this in-memory layer to disk.
///
/// The cutoff point for the layer that's written to disk is 'end_lsn'.
///
@@ -455,7 +485,7 @@ impl InMemoryLayer {
// This is needed just to call materialize_page()
timeline: &LayeredTimeline,
) -> Result<(Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>)> {
info!(
println!(
"freezing in memory layer for {} on timeline {} at {}",
self.seg, self.timelineid, cutoff_lsn
);
@@ -531,7 +561,7 @@ impl InMemoryLayer {
)?;
let delta_layer_rc: Arc<dyn Layer> = Arc::new(delta_layer);
frozen_layers.push(delta_layer_rc);
trace!(
println!(
"freeze: created delta layer {} {}-{}",
self.seg,
self.start_lsn,
@@ -547,7 +577,7 @@ impl InMemoryLayer {
let imgfile = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
let imgfile_rc: Arc<dyn Layer> = Arc::new(imgfile);
frozen_layers.push(Arc::clone(&imgfile_rc));
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
println!("freeze: created image layer {} at {}", self.seg, end_lsn);
// If there were any page versions newer than the cutoff, initialize a new in-memory
// layer to hold them
@@ -564,7 +594,7 @@ impl InMemoryLayer {
new_inner.page_versions.append(&mut after_page_versions);
new_inner.segsizes.append(&mut after_segsizes);
drop(new_inner);
trace!("freeze: created new in-mem layer {} {}-", self.seg, end_lsn);
println!("freeze: created new in-mem layer {} {}-", self.seg, end_lsn);
new_open_rc = Some(Arc::new(new_open))
}
@@ -576,13 +606,22 @@ impl InMemoryLayer {
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- inmemory layer for {} {}-> ----\n",
self.seg, self.start_lsn
);
let inner = self.inner.lock().unwrap();
let end_str = inner
.drop_lsn
.as_ref()
.map(|drop_lsn| drop_lsn.to_string())
.unwrap_or_default();
let mut result = format!(
"----- dump inmemory layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, end_str
);
for (k, v) in inner.segsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
@@ -596,6 +635,8 @@ impl InMemoryLayer {
);
}
println!("inner println {}", result);
result
}
}

View File

@@ -1,5 +1,5 @@
//!
//! The layer map tracks what layers exist for all the relations in a timeline.
//! The layer map tracks what layers exist for all the relishes in a timeline.
//!
//! When the timeline is first accessed, the server lists of all layer files
//! in the timelines/<timelineid> directory, and populates this map with
@@ -149,6 +149,7 @@ impl LayerMap {
if let Some(segentry) = self.segs.get_mut(&tag) {
if let Some(_old) = &segentry.open {
// FIXME: shouldn't exist, but check
println!("insert_open() // FIXME: shouldn't exist, but check");
}
segentry.open = Some(Arc::clone(&layer));
} else {
@@ -220,6 +221,8 @@ impl LayerMap {
pub fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>> {
let mut rels: HashSet<RelTag> = HashSet::new();
println!("layer list_rels()");
for (seg, segentry) in self.segs.iter() {
if let RelishTag::Relation(reltag) = seg.rel {
if (spcnode == 0 || reltag.spcnode == spcnode)
@@ -227,15 +230,33 @@ impl LayerMap {
{
// Add only if it exists at the requested LSN.
if let Some(open) = &segentry.open {
if open.get_end_lsn() > lsn {
println!("explore segentry.open {:?}", open.filename());
if open.get_end_lsn() > lsn && open.get_start_lsn() <= lsn {
rels.insert(reltag);
println!("found rel {} at lsn {} in open layer start_lsn {} end lsn {}",
reltag, lsn, open.get_start_lsn(), open.get_end_lsn());
println!("{}", open.dump());
}
} else if let Some((_k, _v)) = segentry
else
{
println!("found DROPPED rel {} at lsn {} in open layer start_lsn {} end lsn {}",
reltag, lsn, open.get_start_lsn(), open.get_end_lsn());
println!("{}", open.dump());
}
} else if let Some((l_start_lsn, layer)) = segentry
.historic
.range((Included(Lsn(0)), Included(lsn)))
.next_back()
{
rels.insert(reltag);
println!("explore historic segment {:?}", layer.filename());
if !layer.is_dropped() && l_start_lsn <= &lsn {
rels.insert(reltag);
println!("found rel {} in historic layer", reltag);
layer.dump()?;
}
}
}
}
@@ -253,15 +274,17 @@ impl LayerMap {
} else {
// Add only if it exists at the requested LSN.
if let Some(open) = &segentry.open {
if open.get_end_lsn() > lsn {
if open.get_end_lsn() > lsn && open.get_start_lsn() <= lsn {
rels.insert(seg.rel);
}
} else if let Some((_k, _v)) = segentry
} else if let Some((l_start_lsn, layer)) = segentry
.historic
.range((Included(Lsn(0)), Included(lsn)))
.next_back()
{
rels.insert(seg.rel);
if !layer.is_dropped() && l_start_lsn <= &lsn {
rels.insert(seg.rel);
}
}
}
}
@@ -271,15 +294,16 @@ impl LayerMap {
/// Is there a newer image layer for given segment?
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted. We ignore in-memory layers because they are not durable
/// on disk, and delta layers because they depend on an older layer.
/// be deleted.
pub fn newer_image_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
if let Some(segentry) = self.segs.get(&seg) {
// We only check on-disk layers, because
// in-memory layers are not durable
for (newer_lsn, layer) in segentry
.historic
.range((Included(lsn), Included(Lsn(u64::MAX))))
{
// Ignore delta layers.
// Ignore layers that depend on an older layer.
if layer.is_incremental() {
continue;
}

View File

@@ -97,24 +97,32 @@ pub enum PageReconstructResult {
}
///
/// A Layer holds all page versions for one segment of a relish, in a range of LSNs.
/// There are two kinds of layers, in-memory and snapshot layers. In-memory
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access
/// to the recent page versions. Snaphot layers are stored on disk, and
/// to the recent page versions. On-disk layers are stored as files on disk, and
/// are immutable. This trait presents the common functionality of
/// in-memory and snapshot layers.
///
/// Each layer contains a full snapshot of the segment at the start
/// LSN. In addition to that, it contains WAL (or more page images)
/// needed to recontruct any page version up to the end LSN.
/// in-memory and on-disk layers.
///
pub trait Layer: Send + Sync {
// These functions identify the relish segment and the LSN range
// that this Layer holds.
/// Identify the timeline this relish belongs to
fn get_timeline_id(&self) -> ZTimelineId;
/// Identify the relish segment
fn get_seg_tag(&self) -> SegmentTag;
/// Inclusive start bound of the LSN range that this layer hold
fn get_start_lsn(&self) -> Lsn;
/// 'end_lsn' meaning depends on the layer kind:
/// - in-memory layer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
/// - image layer represents snapshot at one LSN, so end_lsn = lsn
/// - delta layer has end_lsn
///
/// TODO Is end_lsn always exclusive for all layer kinds?
fn get_end_lsn(&self) -> Lsn;
/// Is the segment represented by this layer dropped by PostgreSQL?
fn is_dropped(&self) -> bool;
/// Filename used to store this layer on disk. (Even in-memory layers

View File

@@ -24,7 +24,7 @@ pub fn init_logging(
if record.level().is_at_least(slog::Level::Info) {
return true;
}
false
true
});
let drain = std::sync::Mutex::new(drain).fuse();
let logger = slog::Logger::root(

View File

@@ -346,7 +346,7 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyOutResponse)?;
info!("sent CopyOut");
/* Send a tarball of the latest snapshot on the timeline */
/* Send a tarball of the latest layer on the timeline */
{
let mut writer = CopyDataSink { pgb };
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn);
@@ -582,18 +582,18 @@ impl postgres_backend::Handler for PageServerHandler {
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"layer_relfiles_total"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[

View File

@@ -133,9 +133,8 @@ pub trait Timeline: Send + Sync {
/// Truncate relation
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;
/// Unlink relish.
/// This method is used for marking dropped relations and truncated SLRU segments
fn put_unlink(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
/// This method is used for marking dropped relations and truncated SLRU files
fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
/// Track end of the latest digested WAL record.
///
@@ -229,6 +228,8 @@ mod tests {
forknum: 0,
});
const TESTDB: u32 = 111;
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
fn TEST_IMG(s: &str) -> Bytes {
@@ -417,6 +418,53 @@ mod tests {
Ok(())
}
///
/// Test list_rels() function, with branches and unlinking
///
#[test]
fn test_list_rels_and_unlink() -> Result<()> {
let repo = get_test_repo("test_unlink")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0x00))?;
// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_PAGE.clone())?;
// Create a relation on the timeline
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
// Check that list_rels() lists it after LSN 2, but no before it.
let reltag = match TESTREL_A {
RelishTag::Relation(reltag) => reltag,
_ => panic!("unexpected relish")
};
assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&reltag));
assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&reltag));
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
// Create a branch, check that the relation is visible there
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
let newtline = repo.get_timeline(newtimelineid)?;
assert!(newtline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
// Unlink it on the branch
newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
// Check that it's no longer listed on the branch after the point where it was unlinked
assert!(newtline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&reltag));
assert!(!newtline.list_rels(0, TESTDB, Lsn(0x40))?.contains(&reltag));
// Run checkpoint and garbage collection and check that it's still not visible
newtline.checkpoint()?;
repo.gc_iteration(Some(newtimelineid), 0, true)?;
assert!(!newtline.list_rels(0, TESTDB, Lsn(0x40))?.contains(&reltag));
Ok(())
}
///
/// Test branch creation
///

View File

@@ -45,7 +45,6 @@ pub fn import_timeline_from_postgres_datadir(
match direntry.file_name().to_str() {
None => continue,
// These special files appear in the snapshot, but are not needed by the page server
Some("pg_control") => {
import_nonrel_file(timeline, lsn, RelishTag::ControlFile, &direntry.path())?;
// Extract checkpoint record from pg_control and store is as separate object
@@ -93,7 +92,6 @@ pub fn import_timeline_from_postgres_datadir(
match direntry.file_name().to_str() {
None => continue,
// These special files appear in the snapshot, but are not needed by the page server
Some("PG_VERSION") => continue,
Some("pg_filenode.map") => import_nonrel_file(
timeline,
@@ -153,7 +151,7 @@ fn import_relfile(
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
return Err(e.into());
}
let (relnode, forknum, segno) = p.unwrap();
@@ -397,15 +395,15 @@ pub fn save_decoded_record(
for tablespace_id in dropdb.tablespace_ids {
let rels = timeline.list_rels(tablespace_id, dropdb.db_id, lsn)?;
for rel in rels {
timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
}
trace!(
"Unlink FileNodeMap {}, {} at lsn {}",
"Drop FileNodeMap {}, {} at lsn {}",
tablespace_id,
dropdb.db_id,
lsn
);
timeline.put_unlink(
timeline.drop_relish(
RelishTag::FileNodeMap {
spcnode: tablespace_id,
dbnode: dropdb.db_id,
@@ -448,12 +446,12 @@ pub fn save_decoded_record(
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
trace!(
"unlink twophaseFile for xid {} parsed_xact.xid {} here at {}",
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
decoded.xl_xid,
parsed_xact.xid,
lsn
);
timeline.put_unlink(
timeline.drop_relish(
RelishTag::TwoPhase {
xid: parsed_xact.xid,
},
@@ -733,7 +731,7 @@ fn save_xact_record(
dbnode: xnode.dbnode,
relnode: xnode.relnode,
};
timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
}
}
Ok(())
@@ -775,7 +773,7 @@ fn save_clog_truncate_record(
return Ok(());
}
// Iterate via SLRU CLOG segments and unlink segments that we're ready to truncate
// Iterate via SLRU CLOG segments and drop segments that we're ready to truncate
// TODO This implementation is very inefficient -
// it scans all non-rels only to find Clog
//
@@ -790,8 +788,8 @@ fn save_clog_truncate_record(
if slru == SlruKind::Clog {
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
timeline.put_unlink(RelishTag::Slru { slru, segno }, lsn)?;
trace!("unlink CLOG segment {:>04X} at lsn {}", segno, lsn);
timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?;
trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn);
}
}
}
@@ -894,7 +892,7 @@ fn save_multixact_truncate_record(
// Delete all the segments except the last one. The last segment can still
// contain, possibly partially, valid data.
while segment != endsegment {
timeline.put_unlink(
timeline.drop_relish(
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: segment as u32,