Compare commits

...

22 Commits

Author SHA1 Message Date
Konstantin Knizhnik
59ea3973a4 Set hint bits in pageserver 2021-09-10 18:27:34 +03:00
Konstantin Knizhnik
08bc808043 Create branch just to run tests 2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
ba563ee93e Revert "Bump postgres version"
This reverts commit 511873aaed.
2021-09-07 15:12:39 +03:00
anastasia
194b33ac3b print diff for mismatching files in check_restored_datadir_content() 2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
a190c0eb88 Transaction commit redo handler should set TRANSACTION_STATUS_COMMITTED status for subtransactions, not TRANSACTION_STATUS_SUB_COMMITTED
Closes #535
2021-09-07 15:12:39 +03:00
anastasia
2b5405ac6e Add test function to compare files in compute nodes to catch bugs in SLRU replay.
Compare files in existing compute node's pgdata with fresh basebackup at the same lsn. We expect that content is identical, except tmp files
Use it after some tests.
2021-09-07 15:12:39 +03:00
Arseny Sher
1d75c827a0 Adapt safekeepers to --sync-safekeepers walproposer mode.
1) Do epoch switch without record from new epoch, immediately after recovery --
--sync-safekeepers mode doesn't generate new records.
2) Fix commit_lsn advancement by taking into account wal we have locally --
   setting it further is incorrect.
3) Report it back to walproposer so he knows when sync is done.
4) Remove system id check as it is unknown in sync mode.

And make logging slightly better.

ref #439
2021-09-07 15:12:39 +03:00
Stas Kelvich
e1e43f13df Make use of postgres --sync-safekeepers in tests and CLI.
Change control plane code to call `postgres --sync-safekeepers` before
compute node start when safekeepers are enabled. Now `pg create` will
create an empty data directory with the proper config file. Subsequent
`pg start` will run `sync-safekeepers` and will call basebackup with
the resulting LSN. Also change few tests to accommodate this new behavior.
2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
b2e0490d5e Add description of Zenith changes in Postgres core (#533)
* Add description of Zenith changes in Postgres core

* Update README.md
2021-09-07 15:12:39 +03:00
Kirill Bulatov
1d3c86e17a Check rusage return code 2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
e8c22488b9 Set proper xl_prev in basebackup, when possible.
In passing, fix two minor issues with basebackup:
* check that we can't create branches with pre-initdb LSN's
* normalize branch LSN's that are pointing to the segment boundary

patch by @knizhnik
closes #506
2021-09-07 15:12:39 +03:00
anastasia
9c1dbe3783 Add LayerMap.dump() function for debugging.
Print timelineid in layer dumps
2021-09-07 15:12:39 +03:00
anastasia
1365f8c703 Rename put_unlink() to drop_relish() in Timeline trait.
Rename put_unlink() to drop_segment() in Layer trait.
2021-09-07 15:12:39 +03:00
anastasia
df4ce15456 Improve comments for Layer trait. 2021-09-07 15:12:39 +03:00
anastasia
9ed4db273d Don't use term 'snapshot' to describe layers 2021-09-07 15:12:39 +03:00
Heikki Linnakangas
21cf4a3e11 Include # of bytes written in pgbench benchmark result
Now that the page server collects this metric (since commit 212920e47e),
let's include it in the performance test results

The new metric looks like this:

    performance/test_perf_pgbench.py .         [100%]
    --------------- Benchmark results ----------------
    test_pgbench.init: 6.784 s
    test_pgbench.pageserver_writes: 466 MB    <---- THIS IS NEW
    test_pgbench.5000_xacts: 8.196 s
    test_pgbench.size: 163 MB

    =============== 1 passed in 21.00s ===============
2021-09-07 15:12:39 +03:00
Heikki Linnakangas
2c10224c9a Partial fix for issue with extending relation with a gap.
This should fix the sporadic regression test failures we've been seeing
lately with "no base img found" errors.

This fixes the common case, but one corner case is still not handled:
If a relation is extended across a segment boundary, leaving a gap block
in the segment preceding the segment containing the target block, the
preceding segment will not be padded with zeros correctly. This adds
a test case for that, but it's commented out.

See github issue https://github.com/zenithdb/zenith/issues/500
2021-09-07 15:12:39 +03:00
Patrick Insinger
c33faf98d1 zenith_utils - box BidiStream::Tls variant
Clippy warns that one variant is 40 bytes and the other is 568 bytes.
Box the larger variant to avoid this warning
2021-09-07 15:12:39 +03:00
Dmitry Rodionov
95453bc4af fix clippy warnings 2021-09-07 15:12:39 +03:00
Kirill Bulatov
3a37877edc Fix some typos 2021-09-07 15:12:39 +03:00
Heikki Linnakangas
2145ec5fe8 Fix infinite loop with forced repository checkpoint.
To fix, break out of the loop when you reach an in-memory layer that was
created after the checkpoint started. To do that, add a "generation"
counter into the layer map.

Fixes https://github.com/zenithdb/zenith/issues/494
2021-09-07 15:12:39 +03:00
Konstantin Knizhnik
49d14cbde7 Create branch just to run tests 2021-09-07 13:32:45 +03:00
53 changed files with 1331 additions and 612 deletions

View File

@@ -4,14 +4,17 @@ use std::net::SocketAddr;
use std::net::TcpStream;
use std::os::unix::fs::PermissionsExt;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::{collections::BTreeMap, path::PathBuf};
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use postgres_ffi::pg_constants;
use regex::Regex;
use zenith_utils::connstring::connection_host_port;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTimelineId;
@@ -86,7 +89,6 @@ impl ComputeControlPlane {
&mut self,
tenantid: ZTenantId,
branch_name: &str,
config_only: bool,
) -> Result<Arc<PostgresNode>> {
let timeline_id = self
.pageserver
@@ -101,25 +103,15 @@ impl ComputeControlPlane {
is_test: false,
timelineid: timeline_id,
tenantid,
uses_wal_proposer: false,
});
node.init_from_page_server(self.env.auth_type, config_only)?;
node.create_pgdata()?;
node.setup_pg_conf(self.env.auth_type)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
// Configure the node to stream WAL directly to the pageserver
node.append_conf(
"postgresql.conf",
format!(
concat!(
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
),
node.connstr(),
)
.as_str(),
)?;
Ok(node)
}
}
@@ -135,6 +127,7 @@ pub struct PostgresNode {
is_test: bool,
pub timelineid: ZTimelineId,
pub tenantid: ZTenantId,
uses_wal_proposer: bool,
}
impl PostgresNode {
@@ -219,6 +212,8 @@ impl PostgresNode {
.parse()
.with_context(|| err_msg)?;
let uses_wal_proposer = config.contains("wal_acceptors");
// ok now
Ok(PostgresNode {
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
@@ -228,15 +223,48 @@ impl PostgresNode {
is_test: false,
timelineid,
tenantid,
uses_wal_proposer,
})
}
fn sync_walkeepers(&self) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir().join("postgres");
let sync_output = Command::new(pg_path)
.arg("--sync-safekeepers")
.env_clear()
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("PGDATA", self.pgdata().to_str().unwrap())
.output()
.with_context(|| "sync-walkeepers failed")?;
if !sync_output.status.success() {
anyhow::bail!(
"sync-walkeepers failed: '{}'",
String::from_utf8_lossy(&sync_output.stderr)
);
}
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
println!("Walkeepers synced on {}", lsn);
Ok(lsn)
}
/// Get basebackup from the pageserver as a tar archive and extract it
/// to the `self.pgdata()` directory.
pub fn do_basebackup(&self) -> Result<()> {
let pgdata = self.pgdata();
fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
println!(
"Extracting base backup to create postgres instance: path={} port={}",
self.pgdata().display(),
self.address.port()
);
let sql = if let Some(lsn) = lsn {
format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn)
} else {
format!("basebackup {} {}", self.tenantid, self.timelineid)
};
let sql = format!("basebackup {} {}", self.tenantid, self.timelineid);
let mut client = self
.pageserver
.page_server_psql_client()
@@ -248,47 +276,32 @@ impl PostgresNode {
// Read the archive directly from the `CopyOutReader`
tar::Archive::new(copyreader)
.unpack(&pgdata)
.unpack(&self.pgdata())
.with_context(|| "extracting page backup failed")?;
Ok(())
}
/// Connect to a pageserver, get basebackup, and untar it to initialize a
/// new data directory
pub fn init_from_page_server(&self, auth_type: AuthType, config_only: bool) -> Result<()> {
let pgdata = self.pgdata();
println!(
"Extracting base backup to create postgres instance: path={} port={}",
pgdata.display(),
self.address.port()
);
// initialize data directory
if self.is_test {
fs::remove_dir_all(&pgdata).ok();
}
fs::create_dir_all(&pgdata)
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|| {
fn create_pgdata(&self) -> Result<()> {
fs::create_dir_all(&self.pgdata()).with_context(|| {
format!(
"could not create data directory {}",
self.pgdata().display()
)
})?;
fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
.with_context(|| {
format!(
"could not set permissions in data directory {}",
pgdata.display()
self.pgdata().display()
)
},
)?;
})
}
if config_only {
//Just create an empty config file
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
} else {
self.do_basebackup()?;
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
}
// Connect to a page server, get base backup, and untar it to initialize a
// new data directory
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
@@ -342,6 +355,40 @@ impl PostgresNode {
.as_str(),
)?;
// Configure the node to stream WAL directly to the pageserver
self.append_conf(
"postgresql.conf",
format!(
concat!(
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
),
self.connstr(),
)
.as_str(),
)?;
Ok(())
}
fn load_basebackup(&self) -> Result<()> {
let lsn = if self.uses_wal_proposer {
// LSN WAL_SEGMENT_SIZE means that it is bootstrap and we need to download just
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
// procedure evolves quite actively right now, so let's think about it again
// when things would be more stable (TODO).
let lsn = self.sync_walkeepers()?;
if lsn == Lsn(pg_constants::WAL_SEGMENT_SIZE as u64) {
None
} else {
Some(lsn)
}
} else {
None
};
self.do_basebackup(lsn)?;
Ok(())
}
@@ -408,38 +455,22 @@ impl PostgresNode {
}
// 1. We always start compute node from scratch, so
// if old dir exists, preserve config files and drop the directory
// XXX Now we only use 'postgresql.conf'.
// If we will need 'pg_hba.conf', support it here too
// if old dir exists, preserve 'postgresql.conf' and drop the directory
let postgresql_conf_path = self.pgdata().join("postgresql.conf");
let postgresql_conf = fs::read(postgresql_conf_path.clone()).with_context(|| {
let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)
})?;
println!(
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
fs::remove_dir_all(&self.pgdata())?;
self.create_pgdata()?;
// 2. Create new node
self.init_from_page_server(self.env.auth_type, false)?;
// 2. Bring back config files
fs::write(&postgresql_conf_path, postgresql_conf)?;
// 3. Bring back config files
if let Ok(mut file) = OpenOptions::new()
.append(false)
.write(true)
.open(&postgresql_conf_path)
{
file.write_all(&postgresql_conf)?;
file.sync_all()?;
}
// 3. Load basebackup
self.load_basebackup()?;
// 4. Finally start the compute node postgres
println!("Starting postgres node at '{}'", self.connstr());

View File

@@ -74,7 +74,10 @@ impl PageServerNode {
args.extend(&["--auth-type", "ZenithJWT"]);
}
create_tenant.map(|tenantid| args.extend(&["--create-tenant", tenantid]));
if let Some(tenantid) = create_tenant {
args.extend(&["--create-tenant", tenantid])
}
let status = cmd
.args(args)
.env_clear()

View File

@@ -11,3 +11,4 @@
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core

202
docs/core_changes.md Normal file
View File

@@ -0,0 +1,202 @@
1. Add t_cid to XLOG record
- Why?
The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax.
To recap, the problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore.
- Alternatives?
I don't know
2. Add PD_WAL_LOGGED.
- Why?
Postgres sometimes writes data to the page before it is wal-logged. If such a page is swapped out, we will lose this change. The problem is currently solved by setting the PD_WAL_LOGGED bit in the page header. When a page without this bit set is written to the SMGR, it is forced to be written to the WAL as an FPI using the log_newpage_copy() function.
There was a wrong assumption that it can happen only during construction of some exotic indexes (like gist). It is not true. The same situation can happen with COPY, VACUUM and when record hint bits are set.
- Discussion:
https://discord.com/channels/869525774699462656/882681420986851359
- Alternatives:
Do not store this flag in the page header, but associate this bit with the shared buffer. Logically it is more correct, but in practice we would get no advantages: neither in space nor in CPU overhead.
3. XLogReadBufferForRedo not always loads and pins requested buffer. So we need to add extra checks that buffer is really pinned. Also do not use BufferGetBlockNumber for buffer returned by XLogReadBufferForRedo.
- Why?
XLogReadBufferForRedo is not pinning pages which are not requested by wal-redo. It is specific only for wal-redo Postgres.
- Alternatives?
No
4. Eliminate reporting of some warnings related with hint bits, for example
"page is not marked all-visible but visibility map bit is set in relation".
- Why?
Hint bits may not be WAL logged.
- Alternative?
Always wal log any page changes.
5. Maintain last written LSN.
- Why?
When the compute node requests a page from the page server, we need to specify an LSN. Ideally it should be the LSN
of the WAL record performing the last update of this page. But we do not know it, because we do not have the page.
We could use the current WAL flush position, but in this case there is a high probability that the page server
would be blocked until this piece of WAL is delivered.
As a better approximation we can keep the max LSN of written pages. It would be better to take into account LSNs only of evicted pages,
but the SMGR API doesn't provide such knowledge.
- Alternatives?
Maintain map of LSNs of evicted pages.
6. Launching Postgres without WAL.
- Why?
According to the Zenith architecture, the compute node is stateless. So when we are launching a
compute node, we need to provide some dummy PG_DATADIR. Relation pages
can be requested on demand from the page server. But Postgres still needs some non-relational data:
control and configuration files, SLRUs, ...
It is currently implemented using basebackup (do not mix with pg_basebackup) which is created
by pageserver. It includes in this tarball config/control files, SLRUs and required directories.
Since the pageserver does not have the original (non-scattered) WAL segments, it includes in
this tarball a dummy WAL segment which contains only a SHUTDOWN_CHECKPOINT record at the beginning of the segment,
whose redo field points to the end of WAL. This allows loading the checkpoint record in a more or less
standard way with minimal changes of Postgres, but then some special handling is needed,
including restoring previous record position from zenith.signal file.
Also we have to correctly initialize header of last WAL page (pointed by checkpoint.redo)
to pass checks performed by XLogReader.
- Alternatives?
We may not include fake WAL segment in tarball at all and modify xlog.c to load checkpoint record
in special way. But it may only increase number of changes in xlog.c
7. Add redo_read_buffer_filter callback to XLogReadBufferForRedoExtended
- Why?
We need a way in wal-redo Postgres to ignore pages which are not requested by pageserver.
So wal-redo Postgres reconstructs only requested page and for all other returns BLK_DONE
which means that recovery for them is not needed.
- Alternatives?
No
8. Enforce WAL logging of sequence updates.
- Why?
Due to performance reasons Postgres doesn't want to log each fetching of a value from a sequence,
so we pre-log a few fetches in advance. In the event of a crash we can lose
(skip over) as many values as we pre-logged.
But this doesn't work with Zenith, because a page with sequence values can be evicted from the buffer cache
and we will get a gap in sequence values even without a crash.
- Alternatives:
Do not try to preserve sequential order but avoid performance penalty.
9. Treat unlogged tables as normal (permanent) tables.
- Why?
Unlogged tables are not transient, so they have to survive node restart (unlike temporary tables).
But as far as compute node is stateless, we need to persist their data to storage node.
And it can only be done through the WAL.
- Alternatives?
* Store unlogged tables locally (violates requirement of stateless compute nodes).
* Prohibit unlogged tables at all.
10. Support start Postgres in wal-redo mode
- Why?
To be able to apply WAL record and reconstruct pages at page server.
- Alternatives?
* Rewrite redo handlers in Rust
* Do not reconstruct pages at page server at all and do it at compute node.
11. WAL proposer
- Why?
WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes.
It is currently implemented as patch to standard WAL sender.
- Alternatives?
Can be moved to extension if some extra callbacks will be added to wal sender code.
12. Secure Computing BPF API wrapper.
- Why?
Pageserver delegates complex WAL decoding duties to Postgres,
which means that the latter might fall victim to carefully designed
malicious WAL records and start doing harmful things to the system.
To prevent this, it has been decided to limit possible interactions
with the outside world using the Secure Computing BPF mode.
- Alternatives:
* Rewrite redo handlers in Rust.
* Add more checks to guarantee correctness of WAL records.
* Move seccomp.c to extension
* Many other discussed approaches to neutralize incorrect WAL records vulnerabilities.
13. Callbacks for replica feedbacks
- Why?
Allowing the walproposer to interact with walsender code.
- Alternatives
Copy walsender code to walproposer.
14. Support multiple SMGR implementations.
- Why?
Postgres provides abstract API for storage manager but it has only one implementation
and provides no way to replace it with custom storage manager.
- Alternatives?
None.
15. Calculate database size as sum of all database relations.
- Why?
Postgres calculates the database size by traversing the data directory,
but since the Zenith compute node is stateless we cannot do that.
- Alternatives?
Send this request directly to pageserver and calculate real (physical) size
of Zenith representation of database/timeline, rather than sum logical size of all relations.
-----------------------------------------------
Not currently committed but proposed:
1. Disable ring buffer buffer manager strategies
- Why?
Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum, ...).
Even if there is free space in the buffer cache, pages may be evicted.
Negative effect of it can be somehow compensated by file system cache, but in case of Zenith
cost of requesting page from page server is much higher.
- Alternatives?
Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy,
for example copy evicted page from ring buffer to some other buffer if there is free space
in buffer cache.
2. Disable marking page as dirty when hint bits are set.
- Why?
Postgres has to modify the page twice: first when some tuple is updated and second when
hint bits are set. WAL logging of hint bit updates requires an FPI, which significantly increases the size of the WAL.
- Alternatives?
Add special WAL record for setting page hints.
3. Prefetching
- Why?
As far as pages in Zenith are loaded on demand, to reduce node startup time
and also speed up some massive queries we need some mechanism for bulk loading to
reduce page request round-trip overhead.
Currently Postgres is supporting prefetching only for bitmap scan.
In Zenith we also use prefetch for sequential and index scan. For sequential scan we prefetch
some number of following pages. For index scan we prefetch pages of heap relation addressed by TIDs.
4. Prewarming.
- Why?
Short downtime (or, in other words, fast compute node restart time) is one of the key features of Zenith.
But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow.
We can capture the state of the compute node buffer cache and send a bulk request for these pages at startup.

View File

@@ -26,7 +26,7 @@ A checkpoint record in the WAL marks a point in the WAL sequence at which it is
NOTE: This is an overloaded term.
Whenever enough WAL has been accumulated in memory, the page server []
writes out the changes in memory into new layer files[]. This process
writes out the changes from in-memory layers into new layer files[]. This process
is called "checkpointing". The page server only creates layer files for
relations that have been modified since the last checkpoint.
@@ -41,17 +41,28 @@ Stateless Postgres node that stores data in pageserver.
Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
Each PostgreSQL fork is considered a separate relish.
### Layer file
### Layer
Each layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
There are two kinds of layers, in-memory and on-disk layers. In-memory
layers are used to ingest incoming WAL, and provide fast access
to the recent page versions. On-disk layers are stored as files on disk, and
are immutable.
### Layer file (on-disk layer)
Layered repository on-disk format is based on immutable files. The
files are called "layer files". Each file corresponds to one 10 MB
files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE
segment of a PostgreSQL relation fork. There are two kinds of layer
files: image files and delta files. An image file contains a
"snapshot" of the segment at a particular LSN, and a delta file
contains WAL records applicable to the segment, in a range of LSNs.
### Layer map
The layer map tracks what layers exist for all the relishes in a timeline.
### Layered repository
Zenith repository implementation that keeps data in layers.
### LSN
@@ -121,7 +132,7 @@ Each SLRU segment is considered a separate relish[].
### Tenant (Multitenancy)
Tenant represents a single customer, interacting with Zenith.
Wal redo[] activity, timelines[], snapshots[] are managed for each tenant independently.
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
One pageserver[] can serve multiple tenants at once.
One safekeeper

View File

@@ -37,7 +37,7 @@ On the page server tenants introduce one level of indirection, so data directory
├── de182bc61fb11a5a6b390a8aed3a804a
└── ee6016ec31116c1b7c33dfdfca38891f
```
Wal redo activity, timelines, snapshots are managed for each tenant independently.
Wal redo activity and timelines are managed for each tenant independently.
For the local environment used for example in tests, there is also a new level of indirection for tenants. It touches the `pgdatadirs` directory. Now it contains a `tenants` subdirectory, so the structure looks the following way:

View File

@@ -47,28 +47,45 @@ impl<'a> Basebackup<'a> {
timeline: &'a Arc<dyn Timeline>,
req_lsn: Option<Lsn>,
) -> Basebackup<'a> {
// current_prev may be zero if we are at the start of timeline branched from old lsn
let RecordLsn {
last: lsn,
prev: prev_record_lsn,
} = if let Some(lsn) = req_lsn {
// FIXME: that wouldn't work since we don't know prev for old LSN's.
// Probably it is better to avoid using prev in compute node start
// at all and acept the fact that first WAL record in the timeline would
// have zero as prev. https://github.com/zenithdb/zenith/issues/506
RecordLsn {
last: lsn,
prev: lsn,
last: current_last,
prev: current_prev,
} = timeline.get_last_record_rlsn();
// Compute postgres doesn't have any previous WAL files, but the first record that this
// postgres is going to write need to have LSN of previous record (xl_prev). So we are
// writing prev_lsn to "zenith.signal" file so that postgres can read it during the start.
// In some cases we don't know prev_lsn (branch or basebackup @old_lsn) so pass Lsn(0)
// instead and embrace the wrong xl_prev in this situations.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
if req_lsn > current_last {
// FIXME: now wait_lsn() is inside of list_nonrels() so we don't have a way
// to get it from there. It is better to wait just here.
(Lsn(0), req_lsn)
} else if req_lsn < current_last {
// we don't know prev already. We don't currently use basebackup@old_lsn
// but may use it for read only replicas in future
(Lsn(0), req_lsn)
} else {
// we are exactly at req_lsn and know prev
(current_prev, req_lsn)
}
} else {
// Atomically get last and prev LSN's
timeline.get_last_record_rlsn()
// None in req_lsn means that we are branching from the latest LSN
(current_prev, current_last)
};
info!(
"taking basebackup lsn={}, prev_lsn={}",
backup_prev, backup_lsn
);
Basebackup {
ar: Builder::new(write),
timeline,
lsn,
prev_record_lsn,
lsn: backup_lsn,
prev_record_lsn: backup_prev,
}
}
@@ -84,10 +101,10 @@ impl<'a> Basebackup<'a> {
for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() {
if *filepath == "pg_hba.conf" {
let data = pg_constants::PG_HBA.as_bytes();
let header = new_tar_header(&filepath, data.len() as u64)?;
self.ar.append(&header, &data[..])?;
let header = new_tar_header(filepath, data.len() as u64)?;
self.ar.append(&header, data)?;
} else {
let header = new_tar_header(&filepath, 0)?;
let header = new_tar_header(filepath, 0)?;
self.ar.append(&header, &mut io::empty())?;
}
}
@@ -166,14 +183,12 @@ impl<'a> Basebackup<'a> {
self.lsn,
)?;
let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID {
let dst_path = "PG_VERSION";
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
self.ar.append(&header, &version_bytes[..])?;
let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
self.ar.append(&header, version_bytes)?;
let dst_path = format!("global/PG_VERSION");
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
self.ar.append(&header, &version_bytes[..])?;
let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?;
self.ar.append(&header, version_bytes)?;
String::from("global/pg_filenode.map") // filenode map for global tablespace
} else {
@@ -188,7 +203,7 @@ impl<'a> Basebackup<'a> {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
self.ar.append(&header, &version_bytes[..])?;
self.ar.append(&header, version_bytes)?;
format!("base/{}/pg_filenode.map", dbnode)
};
@@ -238,7 +253,7 @@ impl<'a> Basebackup<'a> {
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
pg_constants::WAL_SEGMENT_SIZE,
);
checkpoint.redo = self.lsn.0 + self.lsn.calc_padding(8u32);
checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;
//reset some fields we don't want to preserve
//TODO Check this.
@@ -251,9 +266,14 @@ impl<'a> Basebackup<'a> {
pg_control.state = pg_constants::DB_SHUTDOWNED;
// add zenith.signal file
let xl_prev = if self.prev_record_lsn == Lsn(0) {
0xBAD0 // magic value to indicate that we don't know prev_lsn
} else {
self.prev_record_lsn.0
};
self.ar.append(
&new_tar_header("zenith.signal", 8)?,
&self.prev_record_lsn.0.to_le_bytes()[..],
&xl_prev.to_le_bytes()[..],
)?;
//send pg_control

View File

@@ -113,7 +113,7 @@ impl CfgFileParams {
.auth_type
.as_ref()
.map_or(Ok(AuthType::Trust), |auth_type| {
AuthType::from_str(&auth_type)
AuthType::from_str(auth_type)
})?;
if !pg_distrib_dir.join("bin/postgres").exists() {
@@ -273,7 +273,7 @@ fn main() -> Result<()> {
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
// Initialize logger
let (_scope_guard, log_file) = logger::init_logging(&conf, "pageserver.log")?;
let (_scope_guard, log_file) = logger::init_logging(conf, "pageserver.log")?;
let _log_guard = slog_stdlog::init()?;
// Note: this `info!(...)` macro comes from `log` crate
@@ -284,7 +284,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
if conf.daemonize {
info!("daemonizing...");
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
let stdout = log_file.try_clone().unwrap();
let stderr = log_file;

View File

@@ -43,7 +43,7 @@ pub struct PointInTime {
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
// Initialize logger
let (_scope_guard, _log_file) = logger::init_logging(&conf, "pageserver.log")?;
let (_scope_guard, _log_file) = logger::init_logging(conf, "pageserver.log")?;
let _log_guard = slog_stdlog::init()?;
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
@@ -264,15 +264,22 @@ pub(crate) fn create_branch(
}
let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
let timeline = repo.get_timeline(startpoint.timelineid)?;
if startpoint.lsn == Lsn(0) {
// Find end of WAL on the old timeline
let end_of_wal = repo
.get_timeline(startpoint.timelineid)?
.get_last_record_lsn();
let end_of_wal = timeline.get_last_record_lsn();
info!("branching at end of WAL: {}", end_of_wal);
startpoint.lsn = end_of_wal;
}
startpoint.lsn = startpoint.lsn.align();
if timeline.get_start_lsn() > startpoint.lsn {
anyhow::bail!(
"invalid startpoint {} for the branch {}: less than timeline start {}",
startpoint.lsn,
branchname,
timeline.get_start_lsn()
);
}
// create a new timeline directory for it
let newtli = create_timeline(conf, Some(startpoint), tenantid)?;
@@ -284,7 +291,7 @@ pub(crate) fn create_branch(
// FIXME: there's a race condition, if you create a branch with the same
// name concurrently.
let data = newtli.to_string();
fs::write(conf.branch_path(&branchname, tenantid), data)?;
fs::write(conf.branch_path(branchname, tenantid), data)?;
Ok(BranchInfo {
name: branchname.to_string(),
@@ -333,21 +340,21 @@ fn parse_point_in_time(
// Check if it's a tag
if lsn.is_none() {
let tagpath = conf.tag_path(name, &tenantid);
let tagpath = conf.tag_path(name, tenantid);
if tagpath.exists() {
let pointstr = fs::read_to_string(tagpath)?;
return parse_point_in_time(conf, &pointstr, &tenantid);
return parse_point_in_time(conf, &pointstr, tenantid);
}
}
// Check if it's a branch
// Check if it's branch @ LSN
let branchpath = conf.branch_path(name, &tenantid);
let branchpath = conf.branch_path(name, tenantid);
if branchpath.exists() {
let pointstr = fs::read_to_string(branchpath)?;
let mut result = parse_point_in_time(conf, &pointstr, &tenantid)?;
let mut result = parse_point_in_time(conf, &pointstr, tenantid)?;
result.lsn = lsn.unwrap_or(Lsn(0));
return Ok(result);
@@ -356,7 +363,7 @@ fn parse_point_in_time(
// Check if it's a timelineid
// Check if it's timelineid @ LSN
if let Ok(timelineid) = ZTimelineId::from_str(name) {
let tlipath = conf.timeline_path(&timelineid, &tenantid);
let tlipath = conf.timeline_path(&timelineid, tenantid);
if tlipath.exists() {
return Ok(PointInTime {
timelineid,

View File

@@ -71,7 +71,7 @@ static TIMEOUT: Duration = Duration::from_secs(60);
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
// would be more appropriate. But a low value forces the code to be exercised more,
// which is good for now to trigger bugs.
static OLDEST_INMEM_DISTANCE: u64 = 16 * 1024 * 1024;
static OLDEST_INMEM_DISTANCE: i128 = 16 * 1024 * 1024;
// Metrics collected on operations on the storage repository.
lazy_static! {
@@ -150,12 +150,24 @@ impl Repository for LayeredRepository {
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> {
let src_timeline = self.get_timeline(src)?;
let RecordLsn {
last: src_last,
prev: src_prev,
} = src_timeline.get_last_record_rlsn();
// Use src_prev from the source timeline only if we branched at the last record.
let dst_prev = if src_last == start_lsn {
Some(src_prev)
} else {
None
};
// Create the metadata file, noting the ancestor of the new timeline.
// There is initially no data in it, but all the read-calls know to look
// into the ancestor.
let metadata = TimelineMetadata {
disk_consistent_lsn: start_lsn,
prev_record_lsn: Some(src_timeline.get_prev_record_lsn()), // FIXME not atomic with start_lsn
prev_record_lsn: dst_prev,
ancestor_timeline: Some(src),
ancestor_lsn: start_lsn,
};
@@ -246,8 +258,8 @@ impl LayeredRepository {
tenantid: ZTenantId,
) -> LayeredRepository {
LayeredRepository {
tenantid: tenantid,
conf: conf,
tenantid,
conf,
timelines: Mutex::new(HashMap::new()),
walredo_mgr,
}
@@ -675,14 +687,14 @@ impl Timeline for LayeredTimeline {
(relsize - 1) / RELISH_SEG_SIZE
};
// Unlink segments beyond the last remaining segment.
// Drop segments beyond the last remaining segment.
for remove_segno in (last_remain_seg + 1)..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
// Truncate the last remaining segment to the specified size
@@ -698,8 +710,8 @@ impl Timeline for LayeredTimeline {
Ok(())
}
fn put_unlink(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("put_unlink: {} at {}", rel, lsn);
fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("drop_segment: {} at {}", rel, lsn);
if rel.is_blocky() {
let oldsize_opt = self.get_relish_size(rel, self.get_last_record_lsn())?;
@@ -710,25 +722,25 @@ impl Timeline for LayeredTimeline {
(oldsize - 1) / RELISH_SEG_SIZE
};
// Unlink all segments
// Drop all segments of the relish
for remove_segno in 0..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
} else {
warn!(
"put_unlink called on non-existent relish {} at {}",
"drop_segment called on non-existent relish {} at {}",
rel, lsn
);
}
} else {
let seg = SegmentTag::from_blknum(rel, 0);
let layer = self.get_layer_for_write(seg, lsn)?;
layer.put_unlink(lsn)?;
layer.drop_segment(lsn)?;
}
Ok(())
@@ -782,6 +794,14 @@ impl Timeline for LayeredTimeline {
fn get_last_record_rlsn(&self) -> RecordLsn {
self.last_record_lsn.load()
}
fn get_start_lsn(&self) -> Lsn {
if let Some(ancestor) = self.ancestor_timeline.as_ref() {
ancestor.get_start_lsn()
} else {
self.ancestor_lsn
}
}
}
impl LayeredTimeline {
@@ -902,7 +922,7 @@ impl LayeredTimeline {
while lsn < timeline.ancestor_lsn {
trace!("going into ancestor {} ", timeline.ancestor_lsn);
timeline = &timeline.ancestor_timeline.as_ref().unwrap();
timeline = timeline.ancestor_timeline.as_ref().unwrap();
}
// Now we have the right starting timeline for our search.
@@ -927,7 +947,6 @@ impl LayeredTimeline {
assert!(layer.get_start_lsn() <= lsn);
if layer.is_dropped() && layer.get_end_lsn() <= lsn {
// The segment was unlinked
return Ok(None);
}
@@ -937,7 +956,7 @@ impl LayeredTimeline {
// If not, check if there's a layer on the ancestor timeline
if let Some(ancestor) = &timeline.ancestor_timeline {
lsn = timeline.ancestor_lsn;
timeline = &ancestor.as_ref();
timeline = ancestor.as_ref();
trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn);
continue;
}
@@ -1052,7 +1071,7 @@ impl LayeredTimeline {
// FIXME: we can deadlock if we call wait_lsn() from WAL receiver. And we actually
// it a lot from there. Only deadlock that I caught was while trying to add wait_lsn()
// in list_rels(). But it makes sense to make all functions in timeline non-waiting;
// assert that arg_lsn <= current_record_lsn; call wait_lsn explicetly where it is
// assert that arg_lsn <= current_record_lsn; call wait_lsn explicitly where it is
// needed (page_service and basebackup); uncomment this check:
// assert_ne!(thread::current().name(), Some("WAL receiver thread"));
@@ -1074,6 +1093,18 @@ impl LayeredTimeline {
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
fn checkpoint_internal(&self, force: bool) -> Result<()> {
// Grab lock on the layer map.
//
// TODO: We hold it locked throughout the checkpoint operation. That's bad,
// the checkpointing could take many seconds, and any incoming get_page_at_lsn()
// requests will block.
let mut layers = self.layers.lock().unwrap();
// Bump the generation number in the layer map, so that we can distinguish
// entries inserted after the checkpoint started
let current_generation = layers.increment_generation();
// Read 'last_record_lsn'. That becomes the cutoff LSN for frozen layers.
let RecordLsn {
last: last_record_lsn,
prev: prev_record_lsn,
@@ -1085,13 +1116,6 @@ impl LayeredTimeline {
last_record_lsn
);
// Grab lock on the layer map.
//
// TODO: We hold it locked throughout the checkpoint operation. That's bad,
// the checkpointing could take many seconds, and any incoming get_page_at_lsn()
// requests will block.
let mut layers = self.layers.lock().unwrap();
// Take the in-memory layer with the oldest WAL record. If it's older
// than the threshold, write it out to disk as a new image and delta file.
// Repeat until all remaining in-memory layers are within the threshold.
@@ -1102,14 +1126,26 @@ impl LayeredTimeline {
// check, though. We should also aim at flushing layers that consume
// a lot of memory and/or aren't receiving much updates anymore.
let mut disk_consistent_lsn = last_record_lsn;
while let Some(oldest_layer) = layers.peek_oldest_open() {
// Does this layer need freezing?
while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() {
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();
let distance = last_record_lsn.0 - oldest_pending_lsn.0;
if !force && distance < OLDEST_INMEM_DISTANCE {
// Does this layer need freezing?
//
// Write out all in-memory layers that contain WAL older than OLDEST_INMEM_DISTANCE.
// Or if 'force' is true, write out all of them. If we reach a layer with the same
// generation number, we know that we have cycled through all layers that were open
// when we started. We don't want to process layers inserted after we started, to
// avoid getting into an infinite loop trying to process again entries that we
// inserted ourselves.
let distance = last_record_lsn.widening_sub(oldest_pending_lsn);
if distance < 0
|| (!force && distance < OLDEST_INMEM_DISTANCE)
|| oldest_generation == current_generation
{
info!(
"the oldest layer is now {} which is {} bytes behind last_record_lsn",
oldest_layer.get_seg_tag(),
oldest_layer.filename().display(),
distance
);
disk_consistent_lsn = oldest_pending_lsn;
@@ -1117,7 +1153,7 @@ impl LayeredTimeline {
}
// freeze it
let (new_historics, new_open) = oldest_layer.freeze(last_record_lsn, &self)?;
let (new_historics, new_open) = oldest_layer.freeze(last_record_lsn, self)?;
// replace this layer with the new layers that 'freeze' returned
layers.pop_oldest_open();
@@ -1159,7 +1195,7 @@ impl LayeredTimeline {
let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid);
let metadata = TimelineMetadata {
disk_consistent_lsn: disk_consistent_lsn,
disk_consistent_lsn,
prev_record_lsn: ondisk_prev_record_lsn,
ancestor_timeline: ancestor_timelineid,
ancestor_lsn: self.ancestor_lsn,
@@ -1210,7 +1246,7 @@ impl LayeredTimeline {
//
// Determine for each file if it needs to be retained
// FIXME: also scan open in-memory layers. Normally we cannot remove the
// latest layer of any seg, but if it was unlinked it's possible
// latest layer of any seg, but if it was dropped it's possible
let mut layers = self.layers.lock().unwrap();
'outer: for l in layers.iter_historic_layers() {
let seg = l.get_seg_tag();
@@ -1287,18 +1323,14 @@ impl LayeredTimeline {
doomed_layer.delete()?;
layers.remove_historic(&*doomed_layer);
if doomed_layer.is_dropped() {
if doomed_layer.get_seg_tag().rel.is_relation() {
result.ondisk_relfiles_dropped += 1;
} else {
result.ondisk_nonrelfiles_dropped += 1;
}
} else {
if doomed_layer.get_seg_tag().rel.is_relation() {
result.ondisk_relfiles_removed += 1;
} else {
result.ondisk_nonrelfiles_removed += 1;
}
match (
doomed_layer.is_dropped(),
doomed_layer.get_seg_tag().rel.is_relation(),
) {
(true, true) => result.ondisk_relfiles_dropped += 1,
(true, false) => result.ondisk_nonrelfiles_dropped += 1,
(false, true) => result.ondisk_relfiles_removed += 1,
(false, false) => result.ondisk_nonrelfiles_removed += 1,
}
}
@@ -1414,6 +1446,7 @@ impl LayeredTimeline {
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn);
}
let img = self.walredo_mgr.request_redo(
self,
rel,
blknum,
request_lsn,

View File

@@ -6,13 +6,14 @@ which pages they apply to, and accumulates the incoming changes in
memory. Every now and then, the accumulated changes are written out to
new files.
The files are called "snapshot files". Each snapshot file corresponds
to one 10 MB slice of a PostgreSQL relation fork. The snapshot files
The files are called "layer files". Each layer file corresponds
to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or
non-rel file in a range of LSNs. The layer files
for each timeline are stored in the timeline's subdirectory under
.zenith/tenants/<tenantid>/timelines.
There are two kind of snapshot file: base images, and deltas. A base
image file contains a snapshot of a segment as it was at one LSN,
There are two kinds of layer file: base images, and deltas. A base
image file contains an image of a segment as it was at one LSN,
whereas a delta file contains modifications to a segment - mostly in
the form of WAL records - in a range of LSN
@@ -44,7 +45,7 @@ managed, except that the first part of file names is different.
Internally, the relations and non-relation files that are managed in
the versioned store are together called "relishes".
If a file has been dropped, the last snapshot file for it is created
If a file has been dropped, the last layer file for it is created
with the _DROPPED suffix, e.g.
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED
@@ -67,7 +68,7 @@ for 'orders' table on 'main' branch is represented like this:
main/orders_100_200
# Creating snapshot files
# Creating layer files
Let's start with a simple example with a system that contains one
branch called 'main' and two tables, 'orders' and 'customers'. The end
@@ -86,10 +87,10 @@ end of WAL at 250 are kept in memory. If the page server crashes, the
latest records between 200-250 need to be re-read from the WAL.
Whenever enough WAL has been accumulated in memory, the page server
writes out the changes in memory into new snapshot files. This process
writes out the changes in memory into new layer files. This process
is called "checkpointing" (not to be confused with the PostgreSQL
checkpoints, that's a different thing). The page server only creates
snapshot files for relations that have been modified since the last
layer files for relations that have been modified since the last
checkpoint. For example, if the current end of WAL is at LSN 450, and
the last checkpoint happened at LSN 400 but there hasn't been any
recent changes to 'customers' table, you would have these files on
@@ -108,7 +109,7 @@ disk:
If the customers table is modified later, a new file is created for it
at the next checkpoint. The new file will cover the "gap" from the
last snapshot file, so the LSN ranges are always contiguous:
last layer file, so the LSN ranges are always contiguous:
main/orders_100
main/orders_100_200
@@ -130,13 +131,13 @@ page server needs to reconstruct the requested page, as it was at the
requested LSN. To do that, the page server first checks the recent
in-memory layer; if the requested page version is found there, it can
be returned immediately without looking at the files on
disk. Otherwise the page server needs to locate the snapshot file that
disk. Otherwise the page server needs to locate the layer file that
contains the requested page version.
For example, if a request comes in for table 'orders' at LSN 250, the
page server would load the 'main/orders_200_300' file into memory, and
reconstruct and return the requested page from it, as it was at
LSN 250. Because the snapshot file consists of a full image of the
LSN 250. Because the layer file consists of a full image of the
relation at the start LSN and the WAL, reconstructing the page
involves replaying any WAL records applicable to the page between LSNs
200-250, starting from the base image at LSN 200.
@@ -171,7 +172,7 @@ Then, the 'orders' table is updated differently on the 'main' and
Because the 'customers' table hasn't been modified on the child
branch, there is no file for it there. If you request a page for it on
the 'child' branch, the page server will not find any snapshot file
the 'child' branch, the page server will not find any layer file
for it in the 'child' directory, so it will recurse to look into the
parent 'main' branch instead.
@@ -217,7 +218,7 @@ branch at a historic LSN, is how we support PITR in Zenith.
# Garbage collection
In this scheme, we keep creating new snapshot files over time. We also
In this scheme, we keep creating new layer files over time. We also
need a mechanism to remove old files that are no longer needed,
because disk space isn't infinite.
@@ -245,7 +246,7 @@ of the branch is LSN 525, so that the GC horizon is currently at
main/customers_200
We can remove the following files because the end LSNs of those files are
older than GC horizon 375, and there are more recent snapshot files for the
older than GC horizon 375, and there are more recent layer files for the
table:
main/orders_100 DELETE
@@ -262,7 +263,7 @@ table:
main/customers_200 KEEP, NO NEWER VERSION
'main/customers_100_200' is old enough, but it cannot be
removed because there is no newer snapshot file for the table.
removed because there is no newer layer file for the table.
Things get slightly more complicated with multiple branches. All of
the above still holds, but in addition to recent files we must also
@@ -308,7 +309,7 @@ new base image and delta file for it on the child:
After this, the 'main/orders_100' and 'main/orders_100_200' file could
be removed. It is no longer needed by the child branch, because there
is a newer snapshot file there. TODO: This optimization hasn't been
is a newer layer file there. TODO: This optimization hasn't been
implemented! The GC algorithm will currently keep the file on the
'main' branch anyway, for as long as the child branch exists.
@@ -346,7 +347,7 @@ It would also be OK to have overlapping LSN ranges for the same relation:
main/orders_300_400
main/orders_400
The code that reads the snapshot files should cope with this, but this
The code that reads the layer files should cope with this, but this
situation doesn't arise either, because the checkpointing code never
does that. It could be useful, however, as a transient state when
garbage collecting around branch points, or explicit recovery
@@ -360,6 +361,6 @@ points. For example, if we start with this:
And there is a branch or explicit recovery point at LSN 150, we could
replace 'main/orders_100_200' with 'main/orders_150' to keep a
snapshot only at that exact point that's still needed, removing the
layer only at that exact point that's still needed, removing the
other page versions around it. But such compaction has not been
implemented yet.

View File

@@ -130,23 +130,23 @@ pub struct DeltaLayerInner {
impl Layer for DeltaLayer {
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
self.seg
}
fn is_dropped(&self) -> bool {
return self.dropped;
self.dropped
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
self.start_lsn
}
fn get_end_lsn(&self) -> Lsn {
return self.end_lsn;
self.end_lsn
}
fn filename(&self) -> PathBuf {
@@ -174,7 +174,7 @@ impl Layer for DeltaLayer {
{
// Open the file and lock the metadata in memory
// TODO: avoid opening the snapshot file for each read
// TODO: avoid opening the file for each read
let (_path, book) = self.open_book()?;
let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
let inner = self.load()?;
@@ -285,8 +285,8 @@ impl Layer for DeltaLayer {
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
println!(
"----- delta layer for {} {}-{} ----",
self.seg, self.start_lsn, self.end_lsn
"----- delta layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, self.end_lsn
);
println!("--- relsizes ---");
@@ -358,6 +358,7 @@ impl DeltaLayer {
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
/// expedient.
#[allow(clippy::too_many_arguments)]
pub fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
@@ -372,16 +373,16 @@ impl DeltaLayer {
) -> Result<DeltaLayer> {
let delta_layer = DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
timelineid: timelineid,
tenantid: tenantid,
seg: seg,
start_lsn: start_lsn,
timelineid,
tenantid,
seg,
start_lsn,
end_lsn,
dropped,
inner: Mutex::new(DeltaLayerInner {
loaded: true,
page_version_metas: BTreeMap::new(),
relsizes: relsizes,
relsizes,
}),
predecessor,
};

View File

@@ -111,8 +111,10 @@ impl DeltaFileName {
dropped,
})
}
}
fn to_string(&self) -> String {
impl fmt::Display for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
@@ -134,11 +136,12 @@ impl DeltaFileName {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
RelishTag::ControlFile => format!("pg_control"),
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
RelishTag::ControlFile => "pg_control".to_string(),
};
format!(
write!(
f,
"{}_{}_{:016X}_{:016X}{}",
basename,
self.seg.segno,
@@ -149,12 +152,6 @@ impl DeltaFileName {
}
}
impl fmt::Display for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.to_string())
}
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct ImageFileName {
pub seg: SegmentTag,
@@ -233,8 +230,10 @@ impl ImageFileName {
Some(ImageFileName { seg, lsn })
}
}
fn to_string(&self) -> String {
impl fmt::Display for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
@@ -256,11 +255,12 @@ impl ImageFileName {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
RelishTag::ControlFile => format!("pg_control"),
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
RelishTag::ControlFile => "pg_control".to_string(),
};
format!(
write!(
f,
"{}_{}_{:016X}",
basename,
self.seg.segno,
@@ -269,12 +269,6 @@ impl ImageFileName {
}
}
impl fmt::Display for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.to_string())
}
}
/// Scan timeline directory and create ImageFileName and DeltaFilename
/// structs representing all files on disk
///
@@ -302,7 +296,7 @@ pub fn list_files(
warn!("unrecognized filename in timeline dir: {}", fname);
}
}
return Ok((imgfiles, deltafiles));
Ok((imgfiles, deltafiles))
}
/// Helper enum to hold a PageServerConf, or a path

View File

@@ -2,8 +2,9 @@
//! It is stored in a file on disk.
//!
//! On disk, the image files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each snapshot file is named like this:
//! Currently, there are no subdirectories, and each image layer file is named like this:
//!
//! Note that segno is
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
//!
//! For example:
@@ -15,10 +16,10 @@
//! Only metadata is loaded into memory by the load function.
//! When images are needed, they are read directly from disk.
//!
//! For blocky segments, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! All the images are required to be BLOCK_SIZE, which allows for random access.
//!
//! For non-blocky segments, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//!
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
@@ -97,23 +98,23 @@ impl Layer for ImageLayer {
}
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
self.seg
}
fn is_dropped(&self) -> bool {
return false;
false
}
fn get_start_lsn(&self) -> Lsn {
return self.lsn;
self.lsn
}
fn get_end_lsn(&self) -> Lsn {
return self.lsn;
self.lsn
}
/// Look up given page in the file
@@ -192,7 +193,10 @@ impl Layer for ImageLayer {
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
println!("----- image layer for {} at {} ----", self.seg, self.lsn);
println!(
"----- image layer for tli {} seg {} at {} ----",
self.timelineid, self.seg, self.lsn
);
let inner = self.load()?;
@@ -255,10 +259,10 @@ impl ImageLayer {
let layer = ImageLayer {
path_or_conf: PathOrConf::Conf(conf),
timelineid: timelineid,
tenantid: tenantid,
seg: seg,
lsn: lsn,
timelineid,
tenantid,
seg,
lsn,
inner: Mutex::new(ImageLayerInner {
loaded: true,
image_type: image_type.clone(),

View File

@@ -7,6 +7,7 @@ use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
};
use crate::layered_repository::LayeredTimeline;
use crate::layered_repository::ZERO_PAGE;
use crate::layered_repository::{DeltaLayer, ImageLayer};
use crate::repository::WALRecord;
use crate::PageServerConf;
@@ -14,6 +15,7 @@ use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use bytes::Bytes;
use log::*;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::ops::Bound::Included;
use std::path::PathBuf;
@@ -93,8 +95,8 @@ impl Layer for InMemoryLayer {
let delta_filename = DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: end_lsn,
dropped: dropped,
end_lsn,
dropped,
}
.to_string();
@@ -102,15 +104,15 @@ impl Layer for InMemoryLayer {
}
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
self.seg
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
self.start_lsn
}
fn get_end_lsn(&self) -> Lsn {
@@ -239,21 +241,32 @@ impl Layer for InMemoryLayer {
.unwrap_or_default();
println!(
"----- in-memory layer for {} {}-{} ----",
self.seg, self.start_lsn, end_str
"----- in-memory layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, end_str
);
for (k, v) in inner.segsizes.iter() {
println!("{}: {}", k, v);
println!("segsizes {}: {}", k, v);
}
for (k, v) in inner.page_versions.iter() {
println!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
);
}
//for (k, v) in inner.page_versions.iter() {
// println!("blk {} at {}: {}/{}", k.0, k.1, v.page_image.is_some(), v.record.is_some());
//}
Ok(())
}
}
// Type alias to simplify InMemoryLayer::freeze signature
//
type SuccessorLayers = (Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>);
impl InMemoryLayer {
/// Return the oldest page version that's stored in this layer
pub fn get_oldest_pending_lsn(&self) -> Lsn {
@@ -359,6 +372,36 @@ impl InMemoryLayer {
newsize,
lsn
);
// If we are extending the relation by more than one page, initialize the "gap"
// with zeros
//
// XXX: What if the caller initializes the gap with subsequent call with same LSN?
// I don't think that can happen currently, but that is highly dependent on how
// PostgreSQL writes its WAL records and there's no guarantee of it. If it does
// happen, we would hit the "page version already exists" warning above on the
// subsequent call to initialize the gap page.
let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize;
for gapblknum in gapstart..blknum {
let zeropv = PageVersion {
page_image: Some(ZERO_PAGE.clone()),
record: None,
};
println!(
"filling gap blk {} with zeros for write of {}",
gapblknum, blknum
);
let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
// We already had an entry for this LSN. That's odd..
if old.is_some() {
warn!(
"Page version of rel {} blk {} at {} already exists",
self.seg.rel, blknum, lsn
);
}
}
inner.segsizes.insert(lsn, newsize);
}
}
@@ -380,7 +423,7 @@ impl InMemoryLayer {
}
/// Remember that the segment was dropped at given LSN
pub fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
pub fn drop_segment(&self, lsn: Lsn) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
assert!(inner.drop_lsn.is_none());
@@ -429,14 +472,14 @@ impl InMemoryLayer {
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: BTreeMap::new(),
segsizes: segsizes,
segsizes,
}),
predecessor: Some(src),
})
}
///
/// Write the this in-memory layer to disk, as a snapshot layer.
/// Write this in-memory layer to disk.
///
/// The cutoff point for the layer that's written to disk is 'end_lsn'.
///
@@ -454,7 +497,7 @@ impl InMemoryLayer {
cutoff_lsn: Lsn,
// This is needed just to call materialize_page()
timeline: &LayeredTimeline,
) -> Result<(Vec<Arc<dyn Layer>>, Option<Arc<InMemoryLayer>>)> {
) -> Result<SuccessorLayers> {
info!(
"freezing in memory layer for {} on timeline {} at {}",
self.seg, self.timelineid, cutoff_lsn
@@ -494,13 +537,17 @@ impl InMemoryLayer {
before_page_versions = BTreeMap::new();
after_page_versions = BTreeMap::new();
for ((blknum, lsn), pv) in inner.page_versions.iter() {
if *lsn == end_lsn {
// Page versions at the cutoff LSN will be stored in the
// materialized image layer.
} else if *lsn > end_lsn {
after_page_versions.insert((*blknum, *lsn), pv.clone());
} else {
before_page_versions.insert((*blknum, *lsn), pv.clone());
match lsn.cmp(&end_lsn) {
Ordering::Less => {
before_page_versions.insert((*blknum, *lsn), pv.clone());
}
Ordering::Equal => {
// Page versions at the cutoff LSN will be stored in the
// materialized image layer.
}
Ordering::Greater => {
after_page_versions.insert((*blknum, *lsn), pv.clone());
}
}
}
} else {
@@ -572,30 +619,4 @@ impl InMemoryLayer {
Ok((frozen_layers, new_open_rc))
}
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- inmemory layer for {} {}-> ----\n",
self.seg, self.start_lsn
);
let inner = self.inner.lock().unwrap();
for (k, v) in inner.segsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
for (k, v) in inner.page_versions.iter() {
result += &format!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
);
}
result
}
}

View File

@@ -1,5 +1,5 @@
//!
//! The layer map tracks what layers exist for all the relations in a timeline.
//! The layer map tracks what layers exist for all the relishes in a timeline.
//!
//! When the timeline is first accessed, the server lists of all layer files
//! in the timelines/<timelineid> directory, and populates this map with
@@ -43,6 +43,10 @@ pub struct LayerMap {
/// This allows easy access to the in-memory layer that contains the
/// oldest WAL record.
open_segs: BinaryHeap<OpenSegEntry>,
/// Generation number, used to distinguish newly inserted entries in the
/// binary heap from older entries during checkpoint.
current_generation: u64,
}
///
@@ -59,9 +63,13 @@ struct SegEntry {
/// Entry held LayerMap.open_segs, with boilerplate comparison
/// routines to implement a min-heap ordered by 'oldest_pending_lsn'
///
/// Each entry also carries a generation number. It can be used to distinguish
/// entries with the same 'oldest_pending_lsn'.
struct OpenSegEntry {
pub oldest_pending_lsn: Lsn,
pub layer: Arc<InMemoryLayer>,
pub generation: u64,
}
impl Ord for OpenSegEntry {
fn cmp(&self, other: &Self) -> Ordering {
@@ -73,10 +81,13 @@ impl Ord for OpenSegEntry {
impl PartialOrd for OpenSegEntry {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that.
other
.oldest_pending_lsn
.partial_cmp(&self.oldest_pending_lsn)
// to get that. Entries with identical oldest_pending_lsn are ordered by generation
Some(
other
.oldest_pending_lsn
.cmp(&self.oldest_pending_lsn)
.then_with(|| other.generation.cmp(&self.generation)),
)
}
}
impl PartialEq for OpenSegEntry {
@@ -98,7 +109,7 @@ impl LayerMap {
if let Some(open) = &segentry.open {
if open.get_start_lsn() <= lsn {
let x: Arc<dyn Layer> = Arc::clone(&open) as _;
let x: Arc<dyn Layer> = Arc::clone(open) as _;
return Some(x);
}
}
@@ -108,7 +119,7 @@ impl LayerMap {
.range((Included(Lsn(0)), Included(lsn)))
.next_back()
{
let x: Arc<dyn Layer> = Arc::clone(&v) as _;
let x: Arc<dyn Layer> = Arc::clone(v) as _;
Some(x)
} else {
None
@@ -121,12 +132,7 @@ impl LayerMap {
///
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
let segentry = self.segs.get(tag)?;
if let Some(open) = &segentry.open {
Some(Arc::clone(open))
} else {
None
}
segentry.open.as_ref().map(Arc::clone)
}
///
@@ -150,7 +156,8 @@ impl LayerMap {
let opensegentry = OpenSegEntry {
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
layer: layer,
layer,
generation: self.current_generation,
};
self.open_segs.push(opensegentry);
@@ -259,15 +266,16 @@ impl LayerMap {
/// Is there a newer image layer for given segment?
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted. We ignore in-memory layers because they are not durable
/// on disk, and delta layers because they depend on an older layer.
/// be deleted.
pub fn newer_image_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
if let Some(segentry) = self.segs.get(&seg) {
// We only check on-disk layers, because
// in-memory layers are not durable
for (newer_lsn, layer) in segentry
.historic
.range((Included(lsn), Included(Lsn(u64::MAX))))
{
// Ignore delta layers.
// Ignore layers that depend on an older layer.
if layer.is_incremental() {
continue;
}
@@ -290,21 +298,46 @@ impl LayerMap {
false
}
/// Return the oldest in-memory layer.
pub fn peek_oldest_open(&self) -> Option<Arc<InMemoryLayer>> {
/// Return the oldest in-memory layer, along with its generation number.
pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
if let Some(opensegentry) = self.open_segs.peek() {
Some(Arc::clone(&opensegentry.layer))
Some((Arc::clone(&opensegentry.layer), opensegentry.generation))
} else {
None
}
}
/// Increment the generation number used to stamp open in-memory layers. Layers
/// added with `insert_open` after this call will be associated with the new
/// generation. Returns the new generation number.
pub fn increment_generation(&mut self) -> u64 {
self.current_generation += 1;
self.current_generation
}
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
HistoricLayerIter {
segiter: self.segs.iter(),
iter: None,
}
}
/// debugging function to print out the contents of the layer map
#[allow(unused)]
pub fn dump(&self) -> Result<()> {
println!("Begin dump LayerMap");
for (seg, segentry) in self.segs.iter() {
if let Some(open) = &segentry.open {
open.dump()?;
}
for (_, layer) in segentry.historic.iter() {
layer.dump()?;
}
}
println!("End dump LayerMap");
Ok(())
}
}
impl Default for LayerMap {
@@ -312,6 +345,7 @@ impl Default for LayerMap {
LayerMap {
segs: HashMap::new(),
open_segs: BinaryHeap::new(),
current_generation: 0,
}
}
}

View File

@@ -97,24 +97,32 @@ pub enum PageReconstructResult {
}
///
/// A Layer holds all page versions for one segment of a relish, in a range of LSNs.
/// There are two kinds of layers, in-memory and snapshot layers. In-memory
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access
/// to the recent page versions. Snaphot layers are stored on disk, and
/// to the recent page versions. On-disk layers are stored as files on disk, and
/// are immutable. This trait presents the common functionality of
/// in-memory and snapshot layers.
///
/// Each layer contains a full snapshot of the segment at the start
/// LSN. In addition to that, it contains WAL (or more page images)
/// needed to recontruct any page version up to the end LSN.
/// in-memory and on-disk layers.
///
pub trait Layer: Send + Sync {
// These functions identify the relish segment and the LSN range
// that this Layer holds.
/// Identify the timeline this relish belongs to
fn get_timeline_id(&self) -> ZTimelineId;
/// Identify the relish segment
fn get_seg_tag(&self) -> SegmentTag;
/// Inclusive start bound of the LSN range that this layer hold
fn get_start_lsn(&self) -> Lsn;
/// 'end_lsn' meaning depends on the layer kind:
/// - in-memory layer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
/// - image layer represents snapshot at one LSN, so end_lsn = lsn
/// - delta layer has end_lsn
///
/// TODO Is end_lsn always exclusive for all layer kinds?
fn get_end_lsn(&self) -> Lsn;
/// Is the segment represented by this layer dropped by PostgreSQL?
fn is_dropped(&self) -> bool;
/// Filename used to store this layer on disk. (Even in-memory layers

View File

@@ -346,7 +346,7 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyOutResponse)?;
info!("sent CopyOut");
/* Send a tarball of the latest snapshot on the timeline */
/* Send a tarball of the latest layer on the timeline */
{
let mut writer = CopyDataSink { pgb };
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn);
@@ -372,7 +372,7 @@ impl PageServerHandler {
.claims
.as_ref()
.expect("claims presence already checked");
Ok(auth::check_permission(claims, tenantid)?)
auth::check_permission(claims, tenantid)
}
}
@@ -389,7 +389,7 @@ impl postgres_backend::Handler for PageServerHandler {
.as_ref()
.as_ref()
.unwrap()
.decode(&str::from_utf8(jwt_response)?)?;
.decode(str::from_utf8(jwt_response)?)?;
if matches!(data.claims.scope, Scope::Tenant) {
ensure!(
@@ -425,7 +425,7 @@ impl postgres_backend::Handler for PageServerHandler {
self.handle_controlfile(pgb)?;
} else if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(" ").collect::<Vec<_>>();
let params = params_raw.split(' ').collect::<Vec<_>>();
ensure!(
params.len() == 2,
"invalid param number for pagestream command"
@@ -484,7 +484,7 @@ impl postgres_backend::Handler for PageServerHandler {
.get_timeline(timelineid)
.context(format!("error fetching timeline {}", timelineid))?;
walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr, tenantid.to_owned());
walreceiver::launch_wal_receiver(self.conf, timelineid, &connstr, tenantid.to_owned());
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("branch_create ") {
@@ -492,10 +492,10 @@ impl postgres_backend::Handler for PageServerHandler {
// branch_create <tenantid> <branchname> <startpoint>
// TODO lazy static
// TOOD: escaping, to allow branch names with spaces
// TODO: escaping, to allow branch names with spaces
let re = Regex::new(r"^branch_create ([[:xdigit:]]+) (\S+) ([^\r\n\s;]+)[\r\n\s;]*;?$")
.unwrap();
let caps = re.captures(&query_string).ok_or_else(err)?;
let caps = re.captures(query_string).ok_or_else(err)?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let branchname = caps.get(2).ok_or_else(err)?.as_str().to_owned();
@@ -504,7 +504,7 @@ impl postgres_backend::Handler for PageServerHandler {
self.check_permission(Some(tenantid))?;
let branch =
branches::create_branch(&self.conf, &branchname, &startpoint_str, &tenantid)?;
branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
let branch = serde_json::to_vec(&branch)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
@@ -519,14 +519,14 @@ impl postgres_backend::Handler for PageServerHandler {
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let branches = crate::branches::get_branches(&self.conf, &tenantid)?;
let branches = crate::branches::get_branches(self.conf, &tenantid)?;
let branches_buf = serde_json::to_vec(&branches)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("tenant_list") {
let tenants = crate::branches::get_tenants(&self.conf)?;
let tenants = crate::branches::get_tenants(self.conf)?;
let tenants_buf = serde_json::to_vec(&tenants)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
@@ -537,13 +537,13 @@ impl postgres_backend::Handler for PageServerHandler {
// tenant_create <tenantid>
let re = Regex::new(r"^tenant_create ([[:xdigit:]]+)$").unwrap();
let caps = re.captures(&query_string).ok_or_else(err)?;
let caps = re.captures(query_string).ok_or_else(err)?;
self.check_permission(None)?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
tenant_mgr::create_repository_for_tenant(&self.conf, tenantid)?;
tenant_mgr::create_repository_for_tenant(self.conf, tenantid)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -582,54 +582,54 @@ impl postgres_backend::Handler for PageServerHandler {
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"layer_relfiles_total"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(&result.ondisk_relfiles_total.to_string().as_bytes()),
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
Some(
&result
result
.ondisk_relfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
result
.ondisk_relfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(&result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(&result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(&result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(&result.ondisk_nonrelfiles_total.to_string().as_bytes()),
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
Some(
&result
result
.ondisk_nonrelfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
result
.ondisk_nonrelfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(&result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(&result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(&result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(&result.elapsed.as_millis().to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {

View File

@@ -125,11 +125,7 @@ impl RelishTag {
// convenience function to check if this relish is a normal relation.
pub const fn is_relation(&self) -> bool {
if let RelishTag::Relation(_) = self {
true
} else {
false
}
matches!(self, RelishTag::Relation(_))
}
}

View File

@@ -113,7 +113,7 @@ pub trait Timeline: Send + Sync {
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
/// Get a list of non-relational objects
fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
//------------------------------------------------------------------------------
// Public PUT functions, to update the repository with new page versions.
@@ -133,9 +133,8 @@ pub trait Timeline: Send + Sync {
/// Truncate relation
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;
/// Unlink relish.
/// This method is used for marking dropped relations and truncated SLRU segments
fn put_unlink(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
/// This method is used for marking dropped relations and truncated SLRU files
fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
/// Track end of the latest digested WAL record.
///
@@ -147,6 +146,7 @@ pub trait Timeline: Send + Sync {
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
///
/// Flush to disk all data that was written with the put_* functions
@@ -201,6 +201,7 @@ impl WALRecord {
///
/// Tests that should work the same with any Repository/Timeline implementation.
///
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
use super::*;
@@ -357,6 +358,37 @@ mod tests {
TEST_IMG("foo blk 2 at 5")
);
// Truncate to zero length
tline.put_truncation(TESTREL_A, Lsn(0x60), 0)?;
tline.advance_last_record_lsn(Lsn(0x60));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 0);
// Extend from 0 to 2 blocks, leaving a gap
tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
tline.advance_last_record_lsn(Lsn(0x70));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?,
TEST_IMG("foo blk 1")
);
// Extend a lot more, leaving a big gap that spans across segments
// FIXME: This is currently broken, see https://github.com/zenithdb/zenith/issues/500
/*
tline.put_page_image(TESTREL_A, 1500, Lsn(0x80), TEST_IMG("foo blk 1500"))?;
tline.advance_last_record_lsn(Lsn(0x80));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), 1501);
for blk in 2..1500 {
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?,
ZERO_PAGE);
}
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?,
TEST_IMG("foo blk 1500"));
*/
Ok(())
}
@@ -475,6 +507,7 @@ mod tests {
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
timeline: &dyn Timeline,
rel: RelishTag,
blknum: u32,
lsn: Lsn,

View File

@@ -29,7 +29,7 @@ use zenith_utils::lsn::Lsn;
const MAX_MBR_BLKNO: u32 =
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
const ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
///
/// Import all relation data pages from local disk into the repository.
@@ -45,7 +45,6 @@ pub fn import_timeline_from_postgres_datadir(
match direntry.file_name().to_str() {
None => continue,
// These special files appear in the snapshot, but are not needed by the page server
Some("pg_control") => {
import_nonrel_file(timeline, lsn, RelishTag::ControlFile, &direntry.path())?;
// Extract checkpoint record from pg_control and store is as separate object
@@ -93,7 +92,6 @@ pub fn import_timeline_from_postgres_datadir(
match direntry.file_name().to_str() {
None => continue,
// These special files appear in the snapshot, but are not needed by the page server
Some("PG_VERSION") => continue,
Some("pg_filenode.map") => import_nonrel_file(
timeline,
@@ -130,7 +128,7 @@ pub fn import_timeline_from_postgres_datadir(
}
for entry in fs::read_dir(path.join("pg_twophase"))? {
let entry = entry?;
let xid = u32::from_str_radix(&entry.path().to_str().unwrap(), 16)?;
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
}
// TODO: Scan pg_tblspc
@@ -153,7 +151,7 @@ fn import_relfile(
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
return Err(e.into());
}
let (relnode, forknum, segno) = p.unwrap();
@@ -397,15 +395,15 @@ pub fn save_decoded_record(
for tablespace_id in dropdb.tablespace_ids {
let rels = timeline.list_rels(tablespace_id, dropdb.db_id, lsn)?;
for rel in rels {
timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
}
trace!(
"Unlink FileNodeMap {}, {} at lsn {}",
"Drop FileNodeMap {}, {} at lsn {}",
tablespace_id,
dropdb.db_id,
lsn
);
timeline.put_unlink(
timeline.drop_relish(
RelishTag::FileNodeMap {
spcnode: tablespace_id,
dbnode: dropdb.db_id,
@@ -429,7 +427,7 @@ pub fn save_decoded_record(
},
rpageno,
lsn,
ZERO_PAGE,
ZERO_PAGE.clone(),
)?;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
@@ -448,12 +446,12 @@ pub fn save_decoded_record(
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
trace!(
"unlink twophaseFile for xid {} parsed_xact.xid {} here at {}",
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
decoded.xl_xid,
parsed_xact.xid,
lsn
);
timeline.put_unlink(
timeline.drop_relish(
RelishTag::TwoPhase {
xid: parsed_xact.xid,
},
@@ -486,7 +484,7 @@ pub fn save_decoded_record(
},
rpageno,
lsn,
ZERO_PAGE,
ZERO_PAGE.clone(),
)?;
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
let pageno = buf.get_u32_le();
@@ -499,7 +497,7 @@ pub fn save_decoded_record(
},
rpageno,
lsn,
ZERO_PAGE,
ZERO_PAGE.clone(),
)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
@@ -597,19 +595,16 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
// TODO This implementation is very inefficient -
// it scans all non-rels only to find FileNodeMaps
for tag in timeline.list_nonrels(req_lsn)? {
match tag {
RelishTag::FileNodeMap { spcnode, dbnode } => {
if spcnode == src_tablespace_id && dbnode == src_db_id {
let img = timeline.get_page_at_lsn_nowait(tag, 0, req_lsn)?;
let new_tag = RelishTag::FileNodeMap {
spcnode: tablespace_id,
dbnode: db_id,
};
timeline.put_page_image(new_tag, 0, lsn, img)?;
break;
}
if let RelishTag::FileNodeMap { spcnode, dbnode } = tag {
if spcnode == src_tablespace_id && dbnode == src_db_id {
let img = timeline.get_page_at_lsn_nowait(tag, 0, req_lsn)?;
let new_tag = RelishTag::FileNodeMap {
spcnode: tablespace_id,
dbnode: db_id,
};
timeline.put_page_image(new_tag, 0, lsn, img)?;
break;
}
_ => {} // do nothing
}
}
info!(
@@ -733,7 +728,7 @@ fn save_xact_record(
dbnode: xnode.dbnode,
relnode: xnode.relnode,
};
timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
timeline.drop_relish(RelishTag::Relation(rel), lsn)?;
}
}
Ok(())
@@ -775,7 +770,7 @@ fn save_clog_truncate_record(
return Ok(());
}
// Iterate via SLRU CLOG segments and unlink segments that we're ready to truncate
// Iterate via SLRU CLOG segments and drop segments that we're ready to truncate
// TODO This implementation is very inefficient -
// it scans all non-rels only to find Clog
//
@@ -785,17 +780,14 @@ fn save_clog_truncate_record(
// instead.
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
for obj in timeline.list_nonrels(req_lsn)? {
match obj {
RelishTag::Slru { slru, segno } => {
if slru == SlruKind::Clog {
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
timeline.put_unlink(RelishTag::Slru { slru, segno }, lsn)?;
trace!("unlink CLOG segment {:>04X} at lsn {}", segno, lsn);
}
if let RelishTag::Slru { slru, segno } = obj {
if slru == SlruKind::Clog {
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?;
trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn);
}
}
_ => {}
}
}
@@ -894,7 +886,7 @@ fn save_multixact_truncate_record(
// Delete all the segments except the last one. The last segment can still
// contain, possibly partially, valid data.
while segment != endsegment {
timeline.put_unlink(
timeline.drop_relish(
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: segment as u32,

View File

@@ -197,6 +197,7 @@ impl WalStreamDecoder {
}
#[allow(dead_code)]
#[derive(Default)]
pub struct DecodedBkpBlock {
/* Is this block ref in use? */
//in_use: bool,
@@ -229,25 +230,7 @@ pub struct DecodedBkpBlock {
impl DecodedBkpBlock {
pub fn new() -> DecodedBkpBlock {
DecodedBkpBlock {
rnode_spcnode: 0,
rnode_dbnode: 0,
rnode_relnode: 0,
forknum: 0,
blkno: 0,
flags: 0,
has_image: false,
apply_image: false,
will_init: false,
hole_offset: 0,
hole_length: 0,
bimg_len: 0,
bimg_info: 0,
has_data: false,
data_len: 0,
}
Default::default()
}
}

View File

@@ -164,7 +164,7 @@ fn walreceiver_main(
// There might be some padding after the last full record, skip it.
startpoint += startpoint.calc_padding(8u32);
debug!(
info!(
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
last_rec_lsn, startpoint, timelineid, end_of_wal
);
@@ -457,7 +457,7 @@ fn write_wal_file(
{
Ok(mut file) => {
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
file.write_all(&ZERO_BLOCK)?;
file.write_all(ZERO_BLOCK)?;
}
wal_file = file;
}

View File

@@ -43,7 +43,7 @@ use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
use crate::relish::*;
use crate::repository::WALRecord;
use crate::repository::{Timeline, WALRecord};
use crate::waldecoder::XlMultiXactCreate;
use crate::waldecoder::XlXactParsedRecord;
use crate::PageServerConf;
@@ -79,6 +79,7 @@ pub trait WalRedoManager: Send + Sync {
/// the reords.
fn request_redo(
&self,
timeline: &dyn Timeline,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
@@ -96,6 +97,7 @@ pub struct DummyRedoManager {}
impl crate::walredo::WalRedoManager for DummyRedoManager {
fn request_redo(
&self,
_timeline: &dyn Timeline,
_rel: RelishTag,
_blknum: u32,
_lsn: Lsn,
@@ -176,6 +178,7 @@ impl WalRedoManager for PostgresRedoManager {
///
fn request_redo(
&self,
timeline: &dyn Timeline,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
@@ -209,13 +212,20 @@ impl WalRedoManager for PostgresRedoManager {
let process = (*process_guard).as_ref().unwrap();
self.runtime
.block_on(self.handle_apply_request(&process, &request))
.block_on(self.handle_apply_request(process, &request))
};
end_time = Instant::now();
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
if let Ok(page) = result {
let mut buf = BytesMut::new();
buf.extend_from_slice(&page);
self.set_hint_bits(timeline, &mut buf, lsn, &request.records);
return Ok(buf.freeze());
}
result
}
}
@@ -242,6 +252,117 @@ impl PostgresRedoManager {
}
}
fn xid_status(&self, timeline: &dyn Timeline, xid: u32, lsn: Lsn) -> u8 {
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if let Ok(clog_page) = timeline.get_page_at_lsn_nowait(
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
},
rpageno,
lsn,
) {
postgres_ffi::nonrelfile_utils::transaction_id_get_status(xid, &clog_page[..])
} else {
pg_constants::TRANSACTION_STATUS_IN_PROGRESS
}
}
fn set_hint_bits(
&self,
timeline: &dyn Timeline,
page: &mut BytesMut,
lsn: Lsn,
records: &Vec<WALRecord>,
) {
let mut flags = LittleEndian::read_u16(
&page[pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2],
);
if (flags & (pg_constants::PD_HEAP_RELATION | pg_constants::PD_NONHEAP_RELATION)) == 0 {
// If type of relation was not determined yet,
// then do it now
for r in records {
let xl_rmid = r.rec[pg_constants::XL_RMID_OFFS];
if xl_rmid == pg_constants::RM_HEAP_ID || xl_rmid == pg_constants::RM_HEAP2_ID {
flags |= pg_constants::PD_HEAP_RELATION;
break;
}
}
if (flags & pg_constants::PD_HEAP_RELATION) == 0 {
flags |= pg_constants::PD_NONHEAP_RELATION;
}
LittleEndian::write_u16(
&mut page[pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2],
flags,
);
}
if (flags & pg_constants::PD_HEAP_RELATION) != 0 {
// Set hint bits for heap relation page
let pd_lower = LittleEndian::read_u16(
&page[pg_constants::PD_LOWER_OFFSET..pg_constants::PD_LOWER_OFFSET + 2],
) as usize;
let mut tid_offs = pg_constants::SIZE_OF_PAGE_HEADER_DATA;
while tid_offs < pd_lower {
let tid = LittleEndian::read_u32(&page[tid_offs..tid_offs + 4]);
let lp_off = (tid & 0x7FFF) as usize;
if ((tid >> 15) & 3) == pg_constants::LP_NORMAL {
// normal item pointer
let t_xmin = LittleEndian::read_u32(
&page[lp_off + pg_constants::T_XMIN_OFFS
..lp_off + pg_constants::T_XMIN_OFFS + 4],
);
let t_xmax = LittleEndian::read_u32(
&page[lp_off + pg_constants::T_XMAX_OFFS
..lp_off + pg_constants::T_XMAX_OFFS + 4],
);
let mut t_infomask = LittleEndian::read_u16(
&page[lp_off + pg_constants::T_INFOMASK_OFFS
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
);
if (t_infomask
& (pg_constants::HEAP_XMIN_COMMITTED | pg_constants::HEAP_XMIN_INVALID))
== 0
&& t_xmin != 0
{
let status = self.xid_status(timeline, t_xmin, lsn);
if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
t_infomask |= pg_constants::HEAP_XMIN_COMMITTED;
} else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
t_infomask |= pg_constants::HEAP_XMIN_INVALID;
}
LittleEndian::write_u16(
&mut page[lp_off + pg_constants::T_INFOMASK_OFFS
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
t_infomask,
);
}
if (t_infomask
& (pg_constants::HEAP_XMAX_COMMITTED
| pg_constants::HEAP_XMAX_INVALID
| pg_constants::HEAP_XMAX_IS_MULTI))
== 0
&& t_xmax != 0
{
let status = self.xid_status(timeline, t_xmax, lsn);
if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
t_infomask |= pg_constants::HEAP_XMAX_COMMITTED;
} else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
t_infomask |= pg_constants::HEAP_XMAX_INVALID;
}
LittleEndian::write_u16(
&mut page[lp_off + pg_constants::T_INFOMASK_OFFS
..lp_off + pg_constants::T_INFOMASK_OFFS + 2],
t_infomask,
);
}
}
tid_offs += 4;
}
}
}
///
/// Process one request for WAL redo.
///
@@ -324,7 +445,7 @@ impl PostgresRedoManager {
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
}
@@ -453,7 +574,7 @@ impl PostgresRedoProcess {
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
// just create one with constant name. That fails if you try to launch more than
// one WAL redo manager concurrently.
let datadir = conf.tenant_path(&tenantid).join("wal-redo-datadir");
let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir");
// Create empty data directory for wal-redo postgres, deleting old one first.
if datadir.exists() {

View File

@@ -11,7 +11,7 @@
//! data directory is compatible with a postgres binary. That includes
//! a version number, configuration options that can be set at
//! compilation time like the block size, and the platform's alignment
//! and endianess information. (The PostgreSQL on-disk file format is
//! and endianness information. (The PostgreSQL on-disk file format is
//! not portable across platforms.)
//!
//! The control file is stored in the PostgreSQL data directory, as

View File

@@ -46,6 +46,7 @@ pub const SIZE_OF_PAGE_HEADER: u16 = 24;
pub const BITS_PER_HEAPBLOCK: u16 = 2;
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
@@ -189,11 +190,36 @@ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
pub const XLP_LONG_HEADER: u16 = 0x0002;
pub const PG_MAJORVERSION: &'static str = "14";
pub const PG_MAJORVERSION: &str = "14";
// Zenith specific page flags used to distinguish heap and non-heap relations
pub const PD_HEAP_RELATION: u16 = 0x10;
pub const PD_NONHEAP_RELATION: u16 = 0x20;
// bufpage.h
pub const PD_FLAGS_OFFSET: usize = 10; // PageHeaderData.pd_flags
pub const PD_LOWER_OFFSET: usize = 12; // PageHeaderData.pd_lower
// itemid.h
pub const LP_NORMAL: u32 = 1;
// htup_details.h
pub const T_XMIN_OFFS: usize = 0;
pub const T_XMAX_OFFS: usize = 4;
pub const T_INFOMASK_OFFS: usize = 4 * 3 + 2 * 3 + 2;
pub const HEAP_XMIN_COMMITTED: u16 = 0x0100; /* t_xmin committed */
pub const HEAP_XMIN_INVALID: u16 = 0x0200; /* t_xmin invalid/aborted */
pub const HEAP_XMAX_COMMITTED: u16 = 0x0400; /* t_xmax committed */
pub const HEAP_XMAX_INVALID: u16 = 0x0800; /* t_xmax invalid/aborted */
pub const HEAP_XMAX_IS_MULTI: u16 = 0x1000; /* t_xmax is a MultiXactId */
pub const SIZE_OF_PAGE_HEADER_DATA: usize = 24;
// xlogrecord.h
pub const XL_RMID_OFFS: usize = 17;
// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&'static str; 22] = [
pub const PGDATA_SUBDIRS: [&str; 22] = [
"global",
"pg_wal/archive_status",
"pg_commit_ts",
@@ -218,11 +244,11 @@ pub const PGDATA_SUBDIRS: [&'static str; 22] = [
"pg_logical/mappings",
];
pub const PGDATA_SPECIAL_FILES: [&'static str; 4] = [
"pg_hba.conf",
"pg_ident.conf",
"postgresql.conf",
"postgresql.auto.conf",
];
// Don't include postgresql.conf as it is inconvenient on node start:
// we need postgresql.conf before basebackup to synchronize safekeepers
// so no point in overwriting it during backup restore. Rest of the files
// here are not needed before backup so it is okay to edit them after.
pub const PGDATA_SPECIAL_FILES: [&str; 3] =
["pg_hba.conf", "pg_ident.conf", "postgresql.auto.conf"];
pub static PG_HBA: &'static str = include_str!("../samples/pg_hba.conf");
pub static PG_HBA: &str = include_str!("../samples/pg_hba.conf");

View File

@@ -26,6 +26,7 @@ use std::fs::{self, File};
use std::io::prelude::*;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use zenith_utils::lsn::Lsn;
pub const XLOG_FNAME_LEN: usize = 24;
pub const XLOG_BLCKSZ: usize = 8192;
@@ -37,6 +38,7 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
#[allow(clippy::identity_op)]
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
pub type XLogRecPtr = u64;
@@ -88,6 +90,21 @@ pub fn IsPartialXLogFileName(fname: &str) -> bool {
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
}
/// If LSN points to the beginning of the page, then shift it to first record,
/// otherwise align on 8-bytes boundary (required for WAL records)
pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
if lsn.0 % XLOG_BLCKSZ as u64 == 0 {
let hdr_size = if lsn.0 % seg_sz as u64 == 0 {
XLOG_SIZE_OF_XLOG_LONG_PHD
} else {
XLOG_SIZE_OF_XLOG_SHORT_PHD
};
lsn + hdr_size as u64
} else {
lsn.align()
}
}
pub fn get_current_timestamp() -> TimestampTz {
const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
@@ -173,12 +190,11 @@ fn find_end_of_wal_segment(
let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS;
wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]);
crc = !crc;
} else {
crc ^= 0xFFFFFFFFu32;
crc = crc32c_append(crc, &buf[page_offs..page_offs + n]);
crc = !crc;
}
crc = !crc;
rec_offs += n;
offs += n;
contlen -= n;
@@ -416,7 +432,6 @@ mod tests {
use super::*;
use regex::Regex;
use std::{env, process::Command, str::FromStr};
use zenith_utils::lsn::Lsn;
// Run find_end_of_wal against file in test_wal dir
// Ensure that it finds last record correctly
@@ -465,7 +480,7 @@ mod tests {
let waldump_output = std::str::from_utf8(&waldump_output.stderr).unwrap();
println!("waldump_output = '{}'", &waldump_output);
let re = Regex::new(r"invalid record length at (.+):").unwrap();
let caps = re.captures(&waldump_output).unwrap();
let caps = re.captures(waldump_output).unwrap();
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
// 5. Rename file to partial to actually find last valid lsn

View File

@@ -56,7 +56,7 @@ impl CPlaneApi {
md5::compute([stored_hash.as_bytes(), salt].concat())
);
let received_hash = std::str::from_utf8(&md5_response)?;
let received_hash = std::str::from_utf8(md5_response)?;
println!(
"auth: {} rh={} sh={} ssh={} {:?}",

View File

@@ -143,10 +143,10 @@ fn main() -> anyhow::Result<()> {
// for each connection.
thread::Builder::new()
.name("Proxy thread".into())
.spawn(move || proxy::thread_main(&state, pageserver_listener))?,
.spawn(move || proxy::thread_main(state, pageserver_listener))?,
thread::Builder::new()
.name("Mgmt thread".into())
.spawn(move || mgmt::thread_main(&state, mgmt_listener))?,
.spawn(move || mgmt::thread_main(state, mgmt_listener))?,
];
for t in threads.into_iter() {

View File

@@ -1,3 +1,4 @@
import subprocess
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
@@ -74,3 +75,18 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
# All the rows are visible on the main branch
main_cur.execute('SELECT count(*) FROM foo')
assert main_cur.fetchone() == (200100, )
# Check bad lsn's for branching
# branch at segment boundary
zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = postgres.create_start("test_branch_segment_boundary")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
# branch at pre-initdb lsn
try:
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
except subprocess.CalledProcessError:
print("Branch creation with pre-initdb LSN failed (as expected)")

View File

@@ -3,7 +3,7 @@ import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")

View File

@@ -2,7 +2,7 @@ import os
import pathlib
from contextlib import closing
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -69,6 +69,8 @@ def test_dropdb(
with conn.cursor() as cur:
cur.execute('DROP DATABASE foodb')
cur.execute('CHECKPOINT')
cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_after_drop = cur.fetchone()[0]
@@ -94,3 +96,6 @@ def test_dropdb(
print(dbpath)
assert os.path.isdir(dbpath) == False
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, pg, lsn_after_drop, postgres)

View File

@@ -1,4 +1,4 @@
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -63,3 +63,6 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_b
# Check that we restored pg_controlfile correctly
assert next_multixact_id_new == next_multixact_id
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)

View File

@@ -6,19 +6,19 @@ pytest_plugins = ("fixtures.zenith_fixtures")
def print_gc_result(row):
print("GC duration {elapsed} ms".format_map(row));
print(" REL total: {snapshot_relfiles_total}, needed_by_cutoff {snapshot_relfiles_needed_by_cutoff}, needed_by_branches: {snapshot_relfiles_needed_by_branches}, not_updated: {snapshot_relfiles_not_updated}, removed: {snapshot_relfiles_removed}, dropped: {snapshot_relfiles_dropped}".format_map(row))
print(" NONREL total: {snapshot_nonrelfiles_total}, needed_by_cutoff {snapshot_nonrelfiles_needed_by_cutoff}, needed_by_branches: {snapshot_nonrelfiles_needed_by_branches}, not_updated: {snapshot_nonrelfiles_not_updated}, removed: {snapshot_nonrelfiles_removed}, dropped: {snapshot_nonrelfiles_dropped}".format_map(row))
print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
#
# Test Garbage Collection of old snapshot files
# Test Garbage Collection of old layer files
#
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_snapfiles_gc", "empty"])
pg = postgres.create_start('test_snapfiles_gc')
def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_layerfiles_gc", "empty"])
pg = postgres.create_start('test_layerfiles_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -55,8 +55,8 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
row = pscur.fetchone()
print_gc_result(row);
# remember the number of files
snapshot_relfiles_remain = row['snapshot_relfiles_total'] - row['snapshot_relfiles_removed']
assert snapshot_relfiles_remain > 0
layer_relfiles_remain = row['layer_relfiles_total'] - row['layer_relfiles_removed']
assert layer_relfiles_remain > 0
# Insert a row.
print("Inserting one row and running GC")
@@ -64,12 +64,12 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
assert row['layer_relfiles_removed'] == 1
assert row['layer_relfiles_dropped'] == 0
# Insert two more rows and run GC.
# This should create a new snapshot file with the new contents, and
# This should create a new layer file with the new contents, and
# remove the old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
@@ -78,11 +78,11 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
assert row['layer_relfiles_removed'] == 1
assert row['layer_relfiles_dropped'] == 0
# Do it again. Should again create a new snapshot file and remove old one.
# Do it again. Should again create a new layer file and remove old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
@@ -90,18 +90,18 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
assert row['layer_relfiles_removed'] == 1
assert row['layer_relfiles_dropped'] == 0
# Run GC again, with no changes in the database. Should not remove anything.
print("Run GC again, with nothing to do")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
assert row['snapshot_relfiles_removed'] == 0
assert row['snapshot_relfiles_dropped'] == 0
assert row['layer_relfiles_total'] == layer_relfiles_remain
assert row['layer_relfiles_removed'] == 0
assert row['layer_relfiles_dropped'] == 0
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
@@ -114,11 +114,11 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
print_gc_result(row);
# Each relation fork is counted separately, hence 3.
assert row['snapshot_relfiles_dropped'] == 3
assert row['layer_relfiles_dropped'] == 3
# The catalog updates also create new snapshot files of the catalogs, which
# The catalog updates also create new layer files of the catalogs, which
# are counted as 'removed'
assert row['snapshot_relfiles_removed'] > 0
assert row['layer_relfiles_removed'] > 0
# TODO: perhaps we should count catalog and user relations separately,
# to make this kind of testing more robust

View File

@@ -59,9 +59,8 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
# Create a branch with the transaction in prepared state
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
# Create compute node, but don't start.
# We want to observe pgdata before postgres starts
pg2 = postgres.create(
# Start compute on the new branch
pg2 = postgres.create_start(
'test_twophase_prepared',
config_lines=['max_prepared_transactions=5'],
)
@@ -71,7 +70,6 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
print(twophase_files2)
assert twophase_files2.sort() == twophase_files.sort()
pg2 = pg2.start()
conn2 = pg2.connect()
cur2 = conn2.cursor()

View File

@@ -1,7 +1,7 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -49,3 +49,10 @@ def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)

View File

@@ -1,7 +1,7 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import PostgresFactory
from fixtures.zenith_fixtures import PostgresFactory, check_restored_datadir_content
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -50,3 +50,10 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, pg, lsn, postgres)

View File

@@ -1,6 +1,7 @@
from pprint import pprint
import os
import re
import timeit
import pathlib
import uuid
@@ -78,7 +79,7 @@ class ZenithBenchmarkResults:
self.results.append((test_name, metric_name, metric_value, unit))
# Sesssion scope fixture that initializes the results object
# Session scope fixture that initializes the results object
@pytest.fixture(autouse=True, scope='session')
def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
"""
@@ -120,6 +121,35 @@ class ZenithBenchmarker:
self.results.record(self.request.node.name, metric_name, end - start, 's')
def get_io_writes(self, pageserver) -> int:
"""
Fetch the "cumulative # of bytes written" metric from the pageserver
"""
# Fetch all the exposed prometheus metrics from page server
all_metrics = pageserver.http_client().get_metrics()
# Use a regular expression to extract the one we're interested in
#
# TODO: If we start to collect more of the prometheus metrics in the
# performance test suite like this, we should refactor this to load and
# parse all the metrics into a more convenient structure in one go.
#
# The metric should be an integer, as it's a number of bytes. But in general
# all prometheus metrics are floats. So to be pedantic, read it as a float
# and round to integer.
matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
return int(round(float(matches.group(1))))
@contextmanager
def record_pageserver_writes(self, pageserver, metric_name):
"""
Record bytes written by the pageserver during a test.
"""
before = self.get_io_writes(pageserver)
yield
after = self.get_io_writes(pageserver)
self.results.record(self.request.node.name, metric_name, round((after - before) / (1024 * 1024)), 'MB')
@pytest.fixture(scope='function')
def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
"""

View File

@@ -10,6 +10,8 @@ import shutil
import signal
import subprocess
import time
import filecmp
import difflib
from contextlib import closing
from pathlib import Path
@@ -169,12 +171,23 @@ class ZenithCli:
args = [self.bin_zenith] + arguments
print('Running command "{}"'.format(' '.join(args)))
return subprocess.run(args,
env=self.env,
check=True,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
# Intercept CalledProcessError and print more info
try:
res = subprocess.run(args,
env=self.env,
check=True,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
except subprocess.CalledProcessError as err:
print(f"Run failed: {err}")
print(f" stdout: {err.stdout}")
print(f" stderr: {err.stderr}")
raise err
return res
@zenfixture
@@ -226,6 +239,11 @@ class ZenithPageserverHttpClient(requests.Session):
res.raise_for_status()
return res.json()
def get_metrics(self) -> str:
res = self.get(f"http://localhost:{self.port}/metrics")
res.raise_for_status()
return res.text
@dataclass
class AuthKeys:
@@ -434,7 +452,6 @@ class Postgres(PgProtocol):
branch: str,
wal_acceptors: Optional[str] = None,
config_lines: Optional[List[str]] = None,
config_only: bool = False,
) -> 'Postgres':
"""
Create the pg data directory.
@@ -446,10 +463,7 @@ class Postgres(PgProtocol):
if not config_lines:
config_lines = []
if config_only:
self.zenith_cli.run(['pg', 'create', '--config-only', branch, f'--tenantid={self.tenant_id}'])
else:
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
self.branch = branch
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch
self.pgdata_dir = os.path.join(self.repo_dir, path)
@@ -470,11 +484,13 @@ class Postgres(PgProtocol):
assert self.branch is not None
print(f"Starting postgres on brach {self.branch}")
print(f"Starting postgres on branch {self.branch}")
self.zenith_cli.run(['pg', 'start', self.branch, f'--tenantid={self.tenant_id}'])
run_result = self.zenith_cli.run(['pg', 'start', self.branch, f'--tenantid={self.tenant_id}'])
self.running = True
print(f"stdout: {run_result.stdout}")
self.pg_bin.run(['pg_controldata', self.pg_data_dir_path()])
return self
@@ -572,7 +588,6 @@ class Postgres(PgProtocol):
branch=branch,
wal_acceptors=wal_acceptors,
config_lines=config_lines,
config_only=True,
).start()
return self
@@ -584,6 +599,23 @@ class Postgres(PgProtocol):
self.stop()
def list_files_to_compare(self):
pgdata_files = []
for root, _file, filenames in os.walk(self.pgdata_dir):
for filename in filenames:
rel_dir = os.path.relpath(root, self.pgdata_dir)
# Skip some dirs and files we don't want to compare
skip_dirs = ['pg_wal', 'pg_stat', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical']
skip_files = ['pg_internal.init', 'pg.log', 'zenith.signal', 'postgresql.conf',
'postmaster.opts', 'postmaster.pid', 'pg_control']
if rel_dir not in skip_dirs and filename not in skip_files:
rel_file = os.path.join(rel_dir, filename)
pgdata_files.append(rel_file)
pgdata_files.sort()
print(pgdata_files)
return pgdata_files
class PostgresFactory:
""" An object representing multiple running postgres daemons. """
def __init__(self, zenith_cli: ZenithCli, repo_dir: str, pg_bin: PgBin, initial_tenant: str, base_port: int = 55431):
@@ -911,3 +943,55 @@ class TenantFactory:
@zenfixture
def tenant_factory(zenith_cli: ZenithCli):
return TenantFactory(zenith_cli)
# pg is the existing compute node we want to compare our basebackup to
# lsn is the latest lsn of this node
def check_restored_datadir_content(zenith_cli, pg, lsn, postgres: PostgresFactory):
    """
    Compare the data directory of the (stopped) compute node `pg` with a
    fresh basebackup of the same timeline taken at `lsn`, to catch bugs
    where the pageserver restores page or SLRU content incorrectly.

    Asserts that both directories contain the same set of comparable files
    and that each file's content is byte-identical. For every mismatching
    file, a binary diff is written next to the restored copy for inspection.
    """
    # stop postgres to ensure that files won't change
    pg.stop()

    # list files we're going to compare
    pgdata_files = pg.list_files_to_compare()

    # create new branch, but don't start postgres
    # We only need 'basebackup' result here.
    zenith_cli.run(
        ["branch", "check_restored_datadir", pg.branch + "@" + lsn])
    pg2 = postgres.create('check_restored_datadir')

    print('postgres is created on check_restored_datadir branch')
    print('files in a basebackup')
    # list files we're going to compare
    pgdata_files2 = pg2.list_files_to_compare()

    # check that file sets are equal
    assert pgdata_files == pgdata_files2

    # compare content of the files
    # filecmp returns (match, mismatch, error) lists
    # We've already filtered all mismatching files in list_files_to_compare(),
    # so here expect that the content is identical
    (match, mismatch, error) = filecmp.cmpfiles(pg.pgdata_dir,
                                                pg2.pgdata_dir,
                                                pgdata_files,
                                                shallow=False)
    print('filecmp result mismatch and error lists:')
    print(mismatch)
    print(error)

    # For each mismatching file, dump a human-readable bit-level diff to
    # <restored-file>.diff so the failure can be inspected after the test.
    for f in mismatch:
        f1 = os.path.join(pg.pgdata_dir, f)
        f2 = os.path.join(pg2.pgdata_dir, f)
        stdout_filename = "{}.diff".format(f2)

        with open(stdout_filename, 'w') as stdout_f:
            subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True)
            subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True)
            cmd = ['diff {}.hex {}.hex'.format(f1, f2)]
            subprocess.run(cmd, stdout=stdout_f, shell=True)

    assert (mismatch, error) == ([], [])

View File

@@ -46,13 +46,14 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
connstr = pg.connstr()
# Initialize pgbench database
with zenbenchmark.record_duration('init'):
pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
# Initialize pgbench database, recording the time and I/O it takes
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('init'):
pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
# Flush the layers from memory to disk. The time to do that is included in the
# reported init time.
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
# Run pgbench for 5000 transactions
with zenbenchmark.record_duration('5000_xacts'):

View File

@@ -148,7 +148,7 @@ impl ReplicationConn {
}
});
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(&cmd)?;
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(cmd)?;
let mut wal_seg_size: usize;
loop {
@@ -229,7 +229,7 @@ impl ReplicationConn {
start_pos += send_size as u64;
debug!("Sent WAL to page server up to {}", end_pos);
debug!("sent WAL up to {}", end_pos);
// Decide whether to reuse this file. If we don't set wal_file here
// a new file will be opened next time.

View File

@@ -10,6 +10,7 @@ use log::*;
use postgres_ffi::xlog_utils::TimeLineID;
use serde::{Deserialize, Serialize};
use std::cmp::max;
use std::cmp::min;
use std::io;
use std::io::Read;
@@ -47,6 +48,7 @@ pub struct ServerInfo {
/// Postgres server version
pub pg_version: u32,
pub system_id: SystemId,
pub tenant_id: ZTenantId,
/// Zenith timelineid
pub ztli: ZTimelineId,
pub tli: TimeLineID,
@@ -65,10 +67,9 @@ pub struct SafeKeeperState {
/// information about server
pub server: ServerInfo,
/// Unique id of the last *elected* proposer we dealt with. Not needed
/// correctness, exists for monitoring purposes.
/// for correctness, exists for monitoring purposes.
pub proposer_uuid: PgUuid,
/// part of WAL acknowledged by quorum (note that we might not have wal to
/// up this point locally)
/// part of WAL acknowledged by quorum and available locally
pub commit_lsn: Lsn,
/// minimal LSN which may be needed for recovery of some safekeeper (end lsn
/// + 1 of last record streamed to everyone)
@@ -84,6 +85,7 @@ impl SafeKeeperState {
server: ServerInfo {
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
system_id: 0, /* Postgres system identifier */
tenant_id: ZTenantId::from([0u8; 16]),
ztli: ZTimelineId::from([0u8; 16]),
tli: 0,
wal_seg_size: 0,
@@ -95,6 +97,12 @@ impl SafeKeeperState {
}
}
impl Default for SafeKeeperState {
fn default() -> Self {
Self::new()
}
}
// protocol messages
/// Initial Proposer -> Acceptor message
@@ -155,7 +163,7 @@ pub struct AppendRequestHeader {
end_lsn: Lsn,
/// LSN committed by quorum of safekeepers
commit_lsn: Lsn,
/// restart LSN position (minimal LSN which may be needed by proposer to perform recovery)
/// restart LSN position (minimal LSN which may be needed by proposer to perform recovery)
restart_lsn: Lsn,
// only for logging/debugging
proposer_uuid: PgUuid,
@@ -172,6 +180,9 @@ pub struct AppendResponse {
// make much sense without taking epoch into account, as history can be
// diverged.
pub flush_lsn: Lsn,
// We report back our awareness about which WAL is committed, as this is
// a criterion for walproposer --sync mode exit
pub commit_lsn: Lsn,
pub hs_feedback: HotStandbyFeedback,
}
@@ -205,7 +216,7 @@ impl ProposerAcceptorMessage {
let rec_size = hdr
.end_lsn
.checked_sub(hdr.begin_lsn)
.ok_or(anyhow!("begin_lsn > end_lsn in AppendRequest"))?
.ok_or_else(|| anyhow!("begin_lsn > end_lsn in AppendRequest"))?
.0 as usize;
if rec_size > MAX_SEND_SIZE {
bail!(
@@ -217,10 +228,7 @@ impl ProposerAcceptorMessage {
let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
stream.read_exact(&mut wal_data_vec)?;
let wal_data = Bytes::from(wal_data_vec);
let msg = AppendRequest {
h: hdr,
wal_data: wal_data,
};
let msg = AppendRequest { h: hdr, wal_data };
Ok(ProposerAcceptorMessage::AppendRequest(msg))
}
@@ -274,7 +282,9 @@ pub struct SafeKeeper<ST: Storage> {
/// reading wal.
pub flush_lsn: Lsn,
pub tli: u32,
pub flushed_truncate_lsn: Lsn,
/// not-yet-flushed pairs of same named fields in s.*
pub commit_lsn: Lsn,
pub truncate_lsn: Lsn,
pub storage: ST,
pub s: SafeKeeperState, // persistent part
pub elected_proposer_term: Term, // for monitoring/debugging
@@ -289,7 +299,8 @@ where
SafeKeeper {
flush_lsn,
tli,
flushed_truncate_lsn: Lsn(0),
commit_lsn: state.commit_lsn,
truncate_lsn: state.truncate_lsn,
storage,
s: state,
elected_proposer_term: 0,
@@ -320,13 +331,6 @@ where
SK_PROTOCOL_VERSION
);
}
if self.s.server.system_id != 0 && self.s.server.system_id != msg.system_id {
bail!(
"system identifier changed: got {}, expected {}",
msg.system_id,
self.s.server.system_id,
);
}
/* Postgres upgrade is not treated as fatal error */
if msg.pg_version != self.s.server.pg_version
&& self.s.server.pg_version != UNKNOWN_SERVER_VERSION
@@ -339,6 +343,7 @@ where
// set basic info about server, if not yet
self.s.server.system_id = msg.system_id;
self.s.server.tenant_id = msg.tenant_id;
self.s.server.ztli = msg.ztli;
self.s.server.tli = msg.tli;
self.s.server.wal_seg_size = msg.wal_seg_size;
@@ -378,12 +383,13 @@ where
}
/// Handle request to append WAL.
#[allow(clippy::comparison_chain)]
fn handle_append_request(&mut self, msg: &AppendRequest) -> Result<AcceptorProposerMessage> {
// log first AppendRequest from this proposer
if self.elected_proposer_term < msg.h.term {
info!(
"start receiving WAL from timeline {} term {}",
self.s.server.ztli, msg.h.term,
"start accepting WAL from timeline {}, tenant {}, term {}, epochStartLsn {:?}",
self.s.server.ztli, self.s.server.tenant_id, msg.h.term, msg.h.epoch_start_lsn,
);
self.elected_proposer_term = msg.h.term;
}
@@ -398,6 +404,7 @@ where
let resp = AppendResponse {
term: self.s.acceptor_state.term,
epoch: self.s.acceptor_state.epoch,
commit_lsn: Lsn(0),
flush_lsn: Lsn(0),
hs_feedback: HotStandbyFeedback::empty(),
};
@@ -414,9 +421,13 @@ where
* maximum (vcl) determined by WAL proposer during handshake.
* Switching epoch means that node completes recovery and start writing in the WAL new data.
* XXX: this is wrong, we must actively truncate not matching part of log.
*
* The non-strict inequality is important for us, as proposer in --sync mode doesn't
* generate new records, but to advance commit_lsn epoch switch must happen on majority.
* We can regard this as commit of empty entry in new epoch, this should be safe.
*/
if self.s.acceptor_state.epoch < msg.h.term
&& msg.h.end_lsn > max(self.flush_lsn, msg.h.epoch_start_lsn)
&& msg.h.end_lsn >= max(self.flush_lsn, msg.h.epoch_start_lsn)
{
info!("switched to new epoch {}", msg.h.term);
self.s.acceptor_state.epoch = msg.h.term; /* bump epoch */
@@ -427,8 +438,20 @@ where
}
self.s.proposer_uuid = msg.h.proposer_uuid;
self.s.commit_lsn = msg.h.commit_lsn;
self.s.truncate_lsn = msg.h.restart_lsn;
// Advance commit_lsn taking into account what we have locally.
// xxx this is wrapped into epoch check because we overwrite wal
// instead of truncating it, so without it commit_lsn might include
// wrong part. Anyway, nobody is much interested in our commit_lsn while
// epoch switch hasn't happened, right?
if self.s.acceptor_state.epoch == msg.h.term {
let commit_lsn = min(msg.h.commit_lsn, self.flush_lsn);
// If new commit_lsn reached epoch switch, force sync of control file:
// walproposer in sync mode is very interested when this happens.
sync_control_file |=
commit_lsn >= msg.h.epoch_start_lsn && self.s.commit_lsn < msg.h.epoch_start_lsn;
self.commit_lsn = commit_lsn;
}
self.truncate_lsn = msg.h.restart_lsn;
/*
* Update restart LSN in control file.
@@ -436,24 +459,26 @@ where
* when restart_lsn delta exceeds WAL segment size.
*/
sync_control_file |=
self.flushed_truncate_lsn + (self.s.server.wal_seg_size as u64) < self.s.truncate_lsn;
self.storage.persist(&self.s, sync_control_file)?;
self.s.truncate_lsn + (self.s.server.wal_seg_size as u64) < self.truncate_lsn;
if sync_control_file {
self.flushed_truncate_lsn = self.s.truncate_lsn;
self.s.commit_lsn = self.commit_lsn;
self.s.truncate_lsn = self.truncate_lsn;
}
self.storage.persist(&self.s, sync_control_file)?;
let resp = AppendResponse {
term: self.s.acceptor_state.term,
epoch: self.s.acceptor_state.epoch,
flush_lsn: self.flush_lsn,
commit_lsn: self.s.commit_lsn,
// will be filled by caller code to avoid bothering safekeeper
hs_feedback: HotStandbyFeedback::empty(),
};
trace!(
"processed AppendRequest of len {}, flush_lsn={:X}/{:>08X}, resp {:?}",
debug!(
"processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, resp {:?}",
msg.wal_data.len(),
(self.flush_lsn.0 >> 32) as u32,
self.flush_lsn.0 as u32,
msg.h.end_lsn,
msg.h.commit_lsn,
&resp,
);
Ok(AcceptorProposerMessage::AppendResponse(resp))
@@ -492,7 +517,7 @@ mod tests {
let mut vote_resp = sk.process_msg(&vote_request);
match vote_resp.unwrap() {
AcceptorProposerMessage::VoteResponse(resp) => assert!(resp.vote_given != 0),
_ => assert!(false),
r => panic!("unexpected response: {:?}", r),
}
// reboot...
@@ -506,7 +531,7 @@ mod tests {
vote_resp = sk.process_msg(&vote_request);
match vote_resp.unwrap() {
AcceptorProposerMessage::VoteResponse(resp) => assert!(resp.vote_given == 0),
_ => assert!(false),
r => panic!("unexpected response: {:?}", r),
}
}
@@ -519,7 +544,7 @@ mod tests {
let mut ar_hdr = AppendRequestHeader {
term: 1,
epoch_start_lsn: Lsn(2),
epoch_start_lsn: Lsn(3),
begin_lsn: Lsn(1),
end_lsn: Lsn(2),
commit_lsn: Lsn(0),
@@ -531,20 +556,20 @@ mod tests {
wal_data: Bytes::from_static(b"b"),
};
// check that AppendRequest before VCL doesn't switch epoch
// check that AppendRequest before epochStartLsn doesn't switch epoch
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
assert!(resp.is_ok());
assert!(sk.storage.persisted_state.acceptor_state.epoch == 0);
assert_eq!(sk.storage.persisted_state.acceptor_state.epoch, 0);
// but record after VCL does the switch
// but record at epochStartLsn does the switch
ar_hdr.begin_lsn = Lsn(2);
ar_hdr.end_lsn = Lsn(3);
append_request = AppendRequest {
h: ar_hdr.clone(),
h: ar_hdr,
wal_data: Bytes::from_static(b"b"),
};
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
assert!(resp.is_ok());
assert!(sk.storage.persisted_state.acceptor_state.epoch == 1);
assert_eq!(sk.storage.persisted_state.acceptor_state.epoch, 1);
}
}

View File

@@ -127,7 +127,7 @@ impl SharedState {
if let CreateControlFile::False = create {
bail!("control file is empty");
}
return Ok((file, SafeKeeperState::new()));
Ok((file, SafeKeeperState::new()))
} else {
match SafeKeeperState::des_from(&mut file) {
Err(e) => {
@@ -144,7 +144,7 @@ impl SharedState {
SK_FORMAT_VERSION
);
}
return Ok((file, s));
Ok((file, s))
}
}
}
@@ -217,14 +217,11 @@ impl Timeline {
rmsg = shared_state.sk.process_msg(msg)?;
// locally available commit lsn. flush_lsn can be smaller than
// commit_lsn if we are catching up safekeeper.
commit_lsn = min(shared_state.sk.flush_lsn, shared_state.sk.s.commit_lsn);
commit_lsn = shared_state.sk.commit_lsn;
// if this is AppendResponse, fill in proper hot standby feedback
match rmsg {
AcceptorProposerMessage::AppendResponse(ref mut resp) => {
resp.hs_feedback = shared_state.hs_feedback.clone();
}
_ => (),
if let AcceptorProposerMessage::AppendResponse(ref mut resp) = rmsg {
resp.hs_feedback = shared_state.hs_feedback.clone();
}
}
// Ping wal sender that new data might be available.
@@ -401,7 +398,7 @@ impl Storage for FileStorage {
{
Ok(mut file) => {
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
file.write_all(&ZERO_BLOCK)?;
file.write_all(ZERO_BLOCK)?;
}
wal_file = file;
}

View File

@@ -95,7 +95,7 @@ fn main() -> Result<()> {
.required(false)
))
.subcommand(SubCommand::with_name("start")
.about("Start a postrges compute node.\n This command actually creates new node from scrath, but preserves existing config files")
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
.arg(timeline_arg.clone()).arg(tenantid_arg.clone()))
.subcommand(
SubCommand::with_name("stop")
@@ -359,7 +359,7 @@ fn get_branch_infos(
}
fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(&env);
let pageserver = PageServerNode::from_env(env);
match tenant_match.subcommand() {
("list", Some(_)) => {
for tenant in pageserver.tenant_list()? {
@@ -381,12 +381,12 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
}
fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(&env);
let pageserver = PageServerNode::from_env(env);
if let Some(branchname) = branch_match.value_of("branchname") {
let startpoint_str = branch_match
.value_of("start-point")
.ok_or(anyhow!("Missing start-point"))?;
.ok_or_else(|| anyhow!("Missing start-point"))?;
let tenantid: ZTenantId = branch_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
@@ -447,9 +447,8 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let timeline_name = create_match.value_of("timeline").unwrap_or("main");
let config_only = create_match.is_present("config-only");
cplane.new_node(tenantid, timeline_name, config_only)?;
cplane.new_node(tenantid, timeline_name)?;
}
("start", Some(start_match)) => {
let tenantid: ZTenantId = start_match
@@ -466,11 +465,15 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
None
};
println!("Starting postgres on timeline {}...", timeline_name);
println!(
"Starting {} postgres on timeline {}...",
if node.is_some() { "existing" } else { "new" },
timeline_name
);
if let Some(node) = node {
node.start(&auth_token)?;
} else {
let node = cplane.new_node(tenantid, timeline_name, false)?;
let node = cplane.new_node(tenantid, timeline_name)?;
node.start(&auth_token)?;
}
}

View File

@@ -13,7 +13,6 @@ pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder};
mod wrappers;
use libc::{c_long, getrusage, rusage, suseconds_t, time_t, timeval, RUSAGE_SELF};
pub use wrappers::{CountedReader, CountedWriter};
/// Gathers all Prometheus metrics and records the I/O stats just before that.
@@ -42,40 +41,26 @@ lazy_static! {
// performed by the process.
// We know the the size of the block, so we can determine the I/O bytes out of it.
// The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
#[allow(clippy::unnecessary_cast)]
fn update_io_metrics() {
let mut usage = rusage {
ru_utime: timeval {
tv_sec: 0 as time_t,
tv_usec: 0 as suseconds_t,
},
ru_stime: timeval {
tv_sec: 0 as time_t,
tv_usec: 0 as suseconds_t,
},
ru_maxrss: 0 as c_long,
ru_ixrss: 0 as c_long,
ru_idrss: 0 as c_long,
ru_isrss: 0 as c_long,
ru_minflt: 0 as c_long,
ru_majflt: 0 as c_long,
ru_nswap: 0 as c_long,
ru_inblock: 0 as c_long,
ru_oublock: 0 as c_long,
ru_msgsnd: 0 as c_long,
ru_msgrcv: 0 as c_long,
ru_nsignals: 0 as c_long,
ru_nvcsw: 0 as c_long,
ru_nivcsw: 0 as c_long,
};
unsafe {
getrusage(RUSAGE_SELF, (&mut usage) as *mut rusage);
}
let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES
.with_label_values(&["read"])
.set(usage.ru_inblock * BYTES_IN_BLOCK);
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
DISK_IO_BYTES
.with_label_values(&["write"])
.set(usage.ru_oublock * BYTES_IN_BLOCK);
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
}
fn get_rusage_stats() -> libc::rusage {
let mut rusage = std::mem::MaybeUninit::uninit();
// SAFETY: kernel will initialize the struct for us
unsafe {
let ret = libc::getrusage(libc::RUSAGE_SELF, rusage.as_mut_ptr());
assert!(ret == 0, "getrusage failed: bad args");
rusage.assume_init()
}
}

View File

@@ -186,7 +186,7 @@ mod tests {
assert_eq!(total, stream.len());
}
// This mimicks the constraints of std::thread::spawn
// This mimics the constraints of std::thread::spawn
fn assert_send_sync(_x: impl Sync + Send + 'static) {}
#[test]

View File

@@ -1,6 +1,6 @@
// For details about authentication see docs/authentication.md
// TODO there are two issues for our use case in jsonwebtoken library which will be resolved in next release
// The fisrt one is that there is no way to disable expiration claim, but it can be excluded from validation, so use this as a workaround for now.
// The first one is that there is no way to disable expiration claim, but it can be excluded from validation, so use this as a workaround for now.
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/190
// The second one is that we wanted to use ed25519 keys, but they are also not supported until next version. So we go with RSA keys for now.
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162
@@ -8,7 +8,8 @@
use hex::{self, FromHex};
use serde::de::Error;
use serde::{self, Deserializer, Serializer};
use std::{fs, path::PathBuf};
use std::fs;
use std::path::Path;
use anyhow::{bail, Result};
use jsonwebtoken::{
@@ -43,8 +44,8 @@ where
{
let opt: Option<String> = Option::deserialize(deserializer)?;
match opt {
Some(tid) => return Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)),
None => return Ok(None),
Some(tid) => Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)),
None => Ok(None),
}
}
@@ -91,7 +92,7 @@ pub struct JwtAuth {
}
impl JwtAuth {
pub fn new<'a>(decoding_key: DecodingKey<'a>) -> Self {
pub fn new(decoding_key: DecodingKey<'_>) -> Self {
Self {
decoding_key: decoding_key.into_static(),
validation: Validation {
@@ -102,7 +103,7 @@ impl JwtAuth {
}
}
pub fn from_key_path(key_path: &PathBuf) -> Result<Self> {
pub fn from_key_path(key_path: &Path) -> Result<Self> {
let public_key = fs::read_to_string(key_path)?;
Ok(Self::new(DecodingKey::from_rsa_pem(public_key.as_bytes())?))
}
@@ -113,8 +114,8 @@ impl JwtAuth {
}
// this function is used only for testing purposes in CLI e g generate tokens during init
pub fn encode_from_key_path(claims: &Claims, key_path: &PathBuf) -> Result<String> {
pub fn encode_from_key_path(claims: &Claims, key_path: &Path) -> Result<String> {
let key_data = fs::read_to_string(key_path)?;
let key = EncodingKey::from_rsa_pem(&key_data.as_bytes())?;
let key = EncodingKey::from_rsa_pem(key_data.as_bytes())?;
Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?)
}

View File

@@ -6,7 +6,7 @@
//!
//! The [`LeSer`] trait does the same thing, in little-endian form.
//!
//! Note: you will get a compile error if you try to `use` both trais
//! Note: you will get a compile error if you try to `use` both traits
//! in the same module or scope. This is intended to be a safety
//! mechanism: mixing big-endian and little-endian encoding in the same file
//! is error-prone.

View File

@@ -95,13 +95,13 @@ pub fn attach_openapi_ui(
fn parse_token(header_value: &str) -> Result<&str, ApiError> {
// header must be in form Bearer <token>
let (prefix, token) = header_value.split_once(' ').ok_or(ApiError::Unauthorized(
"malformed authorization header".to_string(),
))?;
let (prefix, token) = header_value
.split_once(' ')
.ok_or_else(|| ApiError::Unauthorized("malformed authorization header".to_string()))?;
if prefix != "Bearer" {
Err(ApiError::Unauthorized(
return Err(ApiError::Unauthorized(
"malformed authorization header".to_string(),
))?
));
}
Ok(token)
}
@@ -123,9 +123,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
.map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
req.set_context(data.claims);
}
None => Err(ApiError::Unauthorized(
"missing authorization header".to_string(),
))?,
None => {
return Err(ApiError::Unauthorized(
"missing authorization header".to_string(),
))
}
}
}
Ok(req)
@@ -145,7 +147,7 @@ pub fn serve_thread_main(
addr: String,
) -> anyhow::Result<()> {
let addr = addr.parse()?;
log::info!("Starting a http endoint at {}", addr);
log::info!("Starting an http endpoint at {}", addr);
// Create a Service from the router above to handle incoming requests.
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();

View File

@@ -1,6 +1,8 @@
//! zenith_utils is intended to be a place to put code that is shared
//! between other crates in this repository.
#![allow(clippy::manual_range_contains)]
/// `Lsn` type implements common tasks on Log Sequence Numbers
pub mod lsn;
/// SeqWait allows waiting for a future sequence number to arrive

View File

@@ -32,6 +32,12 @@ impl Lsn {
self.0.checked_sub(other).map(Lsn)
}
/// Subtract a number, returning the difference as i128 to avoid overflow.
pub fn widening_sub<T: Into<u64>>(self, other: T) -> i128 {
let other: u64 = other.into();
i128::from(self.0) - i128::from(other)
}
/// Parse an LSN from a filename in the form `0000000000000000`
pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
where
@@ -264,6 +270,11 @@ mod tests {
assert_eq!(Lsn(1234).checked_sub(1233u64), Some(Lsn(1)));
assert_eq!(Lsn(1234).checked_sub(1235u64), None);
assert_eq!(Lsn(1235).widening_sub(1234u64), 1);
assert_eq!(Lsn(1234).widening_sub(1235u64), -1);
assert_eq!(Lsn(u64::MAX).widening_sub(0u64), i128::from(u64::MAX));
assert_eq!(Lsn(0).widening_sub(u64::MAX), -i128::from(u64::MAX));
let seg_sz: usize = 16 * 1024 * 1024;
assert_eq!(Lsn(0x1000007).segment_offset(seg_sz), 7);
assert_eq!(Lsn(0x1000007).segment_number(seg_sz), 1u64);

View File

@@ -107,12 +107,21 @@ impl io::Write for WriteStream {
}
}
pub struct TlsBoxed {
stream: BufStream,
session: rustls::ServerSession,
}
impl TlsBoxed {
fn rustls_stream(&mut self) -> rustls::Stream<rustls::ServerSession, BufStream> {
rustls::Stream::new(&mut self.session, &mut self.stream)
}
}
pub enum BidiStream {
Tcp(BufStream),
Tls {
stream: BufStream,
session: rustls::ServerSession,
},
/// This variant is boxed, because [`rustls::ServerSession`] is quite larger than [`BufStream`].
Tls(Box<TlsBoxed>),
}
impl BidiStream {
@@ -123,17 +132,13 @@ impl BidiStream {
pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
match self {
Self::Tcp(stream) => stream.get_ref().shutdown(how),
Self::Tls {
stream: reader,
session,
} => {
Self::Tls(tls_boxed) => {
if how == Shutdown::Read {
reader.get_ref().shutdown(how)
tls_boxed.stream.get_ref().shutdown(how)
} else {
session.send_close_notify();
let mut stream = rustls::Stream::new(session, reader);
let res = stream.flush();
reader.get_ref().shutdown(how)?;
tls_boxed.session.send_close_notify();
let res = tls_boxed.rustls_stream().flush();
tls_boxed.stream.get_ref().shutdown(how)?;
res
}
}
@@ -149,8 +154,8 @@ impl BidiStream {
(ReadStream::Tcp(reader), WriteStream::Tcp(stream))
}
Self::Tls { stream, session } => {
let reader = stream.into_reader();
Self::Tls(tls_boxed) => {
let reader = tls_boxed.stream.into_reader();
let buffer_data = reader.buffer().to_owned();
let read_buf_cfg = rustls_split::BufCfg::with_data(buffer_data, 8192);
let write_buf_cfg = rustls_split::BufCfg::with_capacity(8192);
@@ -159,7 +164,7 @@ impl BidiStream {
let socket = Arc::try_unwrap(reader.into_inner().0).unwrap();
let (read_half, write_half) =
rustls_split::split(socket, session, read_buf_cfg, write_buf_cfg);
rustls_split::split(socket, tls_boxed.session, read_buf_cfg, write_buf_cfg);
(ReadStream::Tls(read_half), WriteStream::Tls(write_half))
}
}
@@ -170,7 +175,7 @@ impl BidiStream {
Self::Tcp(mut stream) => {
session.complete_io(&mut stream)?;
assert!(!session.is_handshaking());
Ok(Self::Tls { stream, session })
Ok(Self::Tls(Box::new(TlsBoxed { stream, session })))
}
Self::Tls { .. } => Err(io::Error::new(
io::ErrorKind::InvalidInput,
@@ -184,7 +189,7 @@ impl io::Read for BidiStream {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
match self {
Self::Tcp(stream) => stream.read(buf),
Self::Tls { stream, session } => rustls::Stream::new(session, stream).read(buf),
Self::Tls(tls_boxed) => tls_boxed.rustls_stream().read(buf),
}
}
}
@@ -193,14 +198,14 @@ impl io::Write for BidiStream {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
match self {
Self::Tcp(stream) => stream.write(buf),
Self::Tls { stream, session } => rustls::Stream::new(session, stream).write(buf),
Self::Tls(tls_boxed) => tls_boxed.rustls_stream().write(buf),
}
}
fn flush(&mut self) -> io::Result<()> {
match self {
Self::Tcp(stream) => stream.flush(),
Self::Tls { stream, session } => rustls::Stream::new(session, stream).flush(),
Self::Tls(tls_boxed) => tls_boxed.rustls_stream().flush(),
}
}
}