Get rid of snapshot directory + related code cleanup and refactoring.
- Add new subdir postgres_ffi/samples/ for config file samples.
- Don't copy WAL to the new branch on zenith init or zenith branch.
- Run import_timeline_wal on zenith init.
@@ -20,6 +20,7 @@ use crate::local_env::LocalEnv;
use pageserver::{ZTenantId, ZTimelineId};

use crate::storage::PageServerNode;
use postgres_ffi::pg_constants;

//
// ComputeControlPlane
@@ -278,10 +279,19 @@ impl PostgresNode {
            },
        )?;

        // FIXME: The compute node should be able to stream the WAL it needs from the WAL safekeepers or archive.
        // But that's not implemented yet. For now, 'pg_wal' is included in the base backup tarball that
        // we receive from the Page Server, so we don't need to create the empty 'pg_wal' directory here.
        //fs::create_dir_all(pgdata.join("pg_wal"))?;

        // Create pgdata subdirs structure
        for dir in pg_constants::PGDATA_SUBDIRS.iter() {
            let path = pgdata.as_path().join(*dir);
            fs::create_dir_all(path.clone())?;
            fs::set_permissions(path, fs::Permissions::from_mode(0o700)).with_context(|| {
                format!(
                    "could not set permissions in data directory {}",
                    pgdata.display()
                )
            })?;
        }

        let mut copyreader = client
            .copy_out(sql.as_str())
@@ -322,7 +332,7 @@ impl PostgresNode {
        // Never clean up old WAL. TODO: We should use a replication
        // slot or something proper, to prevent the compute node
        // from removing WAL that hasn't been streamed to the safekeepr or
        // page server yet. But this will do for now.
        // page server yet. (gh issue #349)
        self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;

        // Connect it to the page server.
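Taken together, the compute node now builds an empty PGDATA skeleton itself and relies on the base backup tarball for pg_wal and the config files. A minimal standalone sketch of that step, assuming postgres_ffi::pg_constants is on the crate path (the function name and error handling are illustrative):

    use std::fs;
    use std::io;
    use std::os::unix::fs::PermissionsExt;
    use std::path::Path;

    use postgres_ffi::pg_constants;

    // Create every pgdata subdirectory with owner-only access, the way
    // initdb does; config files and pg_wal arrive via the basebackup tarball.
    fn make_pgdata_skeleton(pgdata: &Path) -> io::Result<()> {
        for dir in pg_constants::PGDATA_SUBDIRS.iter() {
            let path = pgdata.join(*dir);
            fs::create_dir_all(&path)?;
            fs::set_permissions(&path, fs::Permissions::from_mode(0o700))?;
        }
        Ok(())
    }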
@@ -4,23 +4,22 @@
//! TODO: this module has nothing to do with PostgreSQL pg_basebackup.
//! It could use a better name.
//!
//! Stateless Postgres compute node is launched by sending tarball which contains non-relational data (multixacts, clog, filenodemaps, twophase files)
//! and generate pg_control and dummy segment of WAL. This module is responsible for creation of such tarball from snapshot directory and
//! data stored in object storage.
//! Stateless Postgres compute node is launched by sending a tarball
//! which contains non-relational data (multixacts, clog, filenodemaps, twophase files),
//! generated pg_control and dummy segment of WAL.
//! This module is responsible for creation of such tarball
//! from data stored in object storage.
//!
use crate::{PageServerConf, ZTenantId, ZTimelineId};
use bytes::{BufMut, BytesMut};
use log::*;
use std::io;
use std::io::Write;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::SystemTime;
use tar::{Builder, Header};
use walkdir::WalkDir;
use tar::{Builder, EntryType, Header};

use crate::object_key::*;
use crate::object_key::{DatabaseTag, ObjectTag};
use crate::repository::Timeline;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;
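Every tarball entry that doesn't come from a real file is written through a hand-built GNU tar header, as the new_tar_header()/new_tar_header_dir() helpers further down do. A sketch of that pattern with the tar crate (the payload and mode here are illustrative):

    use std::time::SystemTime;

    use tar::{Builder, Header};

    // Append an in-memory, generated file to the tar stream.
    fn append_generated_file(
        ar: &mut Builder<Vec<u8>>,
        path: &str,
        data: &[u8],
    ) -> anyhow::Result<()> {
        let mut header = Header::new_gnu();
        header.set_size(data.len() as u64);
        header.set_path(path)?;
        header.set_mode(0o600);
        // use current time as last modified time
        let mtime = SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)?
            .as_secs();
        header.set_mtime(mtime);
        header.set_cksum();
        ar.append(&header, data)?;
        Ok(())
    }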
@@ -33,7 +32,6 @@ pub struct Basebackup<'a> {
    timeline: &'a Arc<dyn Timeline>,
    lsn: Lsn,
    prev_record_lsn: Lsn,
    snappath: PathBuf,
    slru_buf: [u8; pg_constants::SLRU_SEG_SIZE],
    slru_segno: u32,
    slru_path: &'static str,
@@ -41,23 +39,16 @@ pub struct Basebackup<'a> {
impl<'a> Basebackup<'a> {
    pub fn new(
        conf: &PageServerConf,
        write: &'a mut dyn Write,
        tenantid: ZTenantId,
        timelineid: ZTimelineId,
        timeline: &'a Arc<dyn Timeline>,
        lsn: Lsn,
        prev_record_lsn: Lsn,
        snapshot_lsn: Lsn,
    ) -> Basebackup<'a> {
        Basebackup {
            ar: Builder::new(write),
            timeline,
            lsn,
            prev_record_lsn,
            snappath: conf
                .snapshots_path(&timelineid, &tenantid)
                .join(format!("{:016X}", snapshot_lsn.0)),
            slru_path: "",
            slru_segno: u32::MAX,
            slru_buf: [0u8; pg_constants::SLRU_SEG_SIZE],
@@ -65,47 +56,22 @@ impl<'a> Basebackup<'a> {
    }

    pub fn send_tarball(&mut self) -> anyhow::Result<()> {
        debug!("sending tarball of snapshot in {}", self.snappath.display());
        for entry in WalkDir::new(&self.snappath) {
            let entry = entry?;
            let fullpath = entry.path();
            let relpath = entry.path().strip_prefix(&self.snappath).unwrap();

            if relpath.to_str().unwrap() == "" {
                continue;
            }

            if entry.file_type().is_dir() {
                trace!(
                    "sending dir {} as {}",
                    fullpath.display(),
                    relpath.display()
                );
                self.ar.append_dir(relpath, fullpath)?;
            } else if entry.file_type().is_symlink() {
                error!("ignoring symlink in snapshot dir");
            } else if entry.file_type().is_file() {
                if !is_rel_file_path(relpath.to_str().unwrap()) {
                    if entry.file_name() != "pg_filenode.map" // this files will be generated from object storage
                        && !relpath.starts_with("pg_xact/")
                        && !relpath.starts_with("pg_multixact/")
                    {
                        trace!("sending {}", relpath.display());
                        self.ar.append_path_with_name(fullpath, relpath)?;
                    }
                } else {
                    // relation pages are loaded on demand and should not be included in tarball
                    trace!("not sending {}", relpath.display());
                }
        // Send empty config files.
        for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() {
            if *filepath == "pg_hba.conf" {
                let data = pg_constants::PG_HBA.as_bytes();
                let header = new_tar_header(&filepath, data.len() as u64)?;
                self.ar.append(&header, &data[..])?;
            } else {
                error!("unknown file type: {}", fullpath.display());
                let header = new_tar_header(&filepath, 0)?;
                self.ar.append(&header, &mut io::empty())?;
            }
        }

        // Generate non-relational files.
        // Iteration is sorted order: all objects of the same time are grouped and traversed
        // Gather non-relational files from object storage pages.
        // Iteration is sorted order: all objects of the same type are grouped and traversed
        // in key ascending order. For example all pg_xact records precede pg_multixact records and are sorted by block number.
        // It allows to easily construct SLRU segments (32 blocks).
        // It allows to easily construct SLRU segments.
        for obj in self.timeline.list_nonrels(self.lsn)? {
            match obj {
                ObjectTag::Clog(slru) => self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
@@ -120,7 +86,10 @@ impl<'a> Basebackup<'a> {
                _ => {}
            }
        }
        self.finish_slru_segment()?; // write last non-completed SLRU segment (if any)

        // write last non-completed SLRU segment (if any)
        self.finish_slru_segment()?;
        // Generate pg_control and bootstrap WAL segment.
        self.add_pgcontrol_file()?;
        self.ar.finish()?;
        debug!("all tarred up!");
@@ -129,19 +98,18 @@ impl<'a> Basebackup<'a> {

    //
    // Generate SLRU segment files from repository. Path identifies SLRU kind (pg_xact, pg_multixact/members, ...).
    // Intially pass an empty string.
    //
    fn add_slru_segment(
        &mut self,
        path: &'static str,
        tag: &ObjectTag,
        page: u32,
        blknum: u32,
    ) -> anyhow::Result<()> {
        let img = self.timeline.get_page_at_lsn_nowait(*tag, self.lsn)?;
        // Zero length image indicates truncated segment: just skip it
        if !img.is_empty() {
            assert!(img.len() == pg_constants::BLCKSZ as usize);
            let segno = page / pg_constants::SLRU_PAGES_PER_SEGMENT;
            let segno = blknum / pg_constants::SLRU_PAGES_PER_SEGMENT;
            if self.slru_path != "" && (self.slru_segno != segno || self.slru_path != path) {
                // Switch to new segment: save old one
                let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
@@ -151,7 +119,7 @@ impl<'a> Basebackup<'a> {
            }
            self.slru_segno = segno;
            self.slru_path = path;
            let offs_start = (page % pg_constants::SLRU_PAGES_PER_SEGMENT) as usize
            let offs_start = (blknum % pg_constants::SLRU_PAGES_PER_SEGMENT) as usize
                * pg_constants::BLCKSZ as usize;
            let offs_end = offs_start + pg_constants::BLCKSZ as usize;
            self.slru_buf[offs_start..offs_end].copy_from_slice(&img);
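The segment bookkeeping above is plain integer arithmetic: an SLRU block number determines both the segment file it belongs to and the byte range the page occupies inside the in-memory segment buffer. A compact restatement of that mapping (the helper name is illustrative; constants come from pg_constants):

    use postgres_ffi::pg_constants;

    // Map an SLRU block number to its segment file name and the byte range
    // of the page within the segment buffer, as add_slru_segment() does.
    fn slru_block_location(kind: &str, blknum: u32) -> (String, std::ops::Range<usize>) {
        let segno = blknum / pg_constants::SLRU_PAGES_PER_SEGMENT;
        // e.g. "pg_xact/0000" for the first pg_xact segment
        let segname = format!("{}/{:>04X}", kind, segno);
        let offs_start = (blknum % pg_constants::SLRU_PAGES_PER_SEGMENT) as usize
            * pg_constants::BLCKSZ as usize;
        let offs_end = offs_start + pg_constants::BLCKSZ as usize;
        (segname, offs_start..offs_end)
    }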
@@ -175,18 +143,36 @@ impl<'a> Basebackup<'a> {

    //
    // Extract pg_filenode.map files from repository
    // Along with them also send PG_VERSION for each database.
    //
    fn add_relmap_file(&mut self, tag: &ObjectTag, db: &DatabaseTag) -> anyhow::Result<()> {
        trace!("add_relmap_file {:?}", db);
        let img = self.timeline.get_page_at_lsn_nowait(*tag, self.lsn)?;
        info!("add_relmap_file {:?}", db);
        let path = if db.spcnode == pg_constants::GLOBALTABLESPACE_OID {
            let dst_path = "PG_VERSION";
            let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
            let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
            self.ar.append(&header, &version_bytes[..])?;

            let dst_path = format!("global/PG_VERSION");
            let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
            self.ar.append(&header, &version_bytes[..])?;

            String::from("global/pg_filenode.map") // filenode map for global tablespace
        } else {
            // User defined tablespaces are not supported
            assert!(db.spcnode == pg_constants::DEFAULTTABLESPACE_OID);
            let src_path = self.snappath.join("base/1/PG_VERSION");

            // Append dir path for each database
            let path = format!("base/{}", db.dbnode);
            let header = new_tar_header_dir(&path)?;
            self.ar.append(&header, &mut io::empty())?;

            let dst_path = format!("base/{}/PG_VERSION", db.dbnode);
            self.ar.append_path_with_name(&src_path, &dst_path)?;
            let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
            let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
            self.ar.append(&header, &version_bytes[..])?;

            format!("base/{}/pg_filenode.map", db.dbnode)
        };
        assert!(img.len() == 512);
@@ -216,7 +202,8 @@ impl<'a> Basebackup<'a> {
    }

    //
    // Add generated pg_control file
    // Add generated pg_control file and bootstrap WAL segment.
    // Also send zenith.signal file with extra bootstrap data.
    //
    fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
        let checkpoint_bytes = self
@@ -238,12 +225,13 @@ impl<'a> Basebackup<'a> {
        checkpoint.redo = self.lsn.0 + self.lsn.calc_padding(8u32);

        //reset some fields we don't want to preserve
        //TODO Check this.
        //We may need to determine the value from twophase data.
        checkpoint.oldestActiveXid = 0;

        //save new values in pg_control
        pg_control.checkPoint = checkpoint_lsn;
        pg_control.checkPointCopy = checkpoint;
        info!("pg_control.state = {}", pg_control.state);
        pg_control.state = pg_constants::DB_SHUTDOWNED;

        // add zenith.signal file
@@ -271,60 +259,6 @@ impl<'a> Basebackup<'a> {
        }
    }

///
/// Parse a path, relative to the root of PostgreSQL data directory, as
/// a PostgreSQL relation data file.
///
fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
    /*
     * Relation data files can be in one of the following directories:
     *
     * global/
     *      shared relations
     *
     * base/<db oid>/
     *      regular relations, default tablespace
     *
     * pg_tblspc/<tblspc oid>/<tblspc version>/
     *      within a non-default tablespace (the name of the directory
     *      depends on version)
     *
     * And the relation data files themselves have a filename like:
     *
     * <oid>.<segment number>
     */
    if let Some(fname) = path.strip_prefix("global/") {
        let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;

        Ok(())
    } else if let Some(dbpath) = path.strip_prefix("base/") {
        let mut s = dbpath.split('/');
        let dbnode_str = s.next().ok_or(FilePathError::InvalidFileName)?;
        let _dbnode = dbnode_str.parse::<u32>()?;
        let fname = s.next().ok_or(FilePathError::InvalidFileName)?;
        if s.next().is_some() {
            return Err(FilePathError::InvalidFileName);
        };

        let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;

        Ok(())
    } else if path.strip_prefix("pg_tblspc/").is_some() {
        // TODO
        error!("tablespaces not implemented yet");
        Err(FilePathError::InvalidFileName)
    } else {
        Err(FilePathError::InvalidFileName)
    }
}

//
// Check if it is relational file
//
fn is_rel_file_path(path: &str) -> bool {
    parse_rel_file_path(path).is_ok()
}

//
// Create new tarball entry header
//
@@ -343,3 +277,20 @@ fn new_tar_header(path: &str, size: u64) -> anyhow::Result<Header> {
    header.set_cksum();
    Ok(header)
}

fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
    let mut header = Header::new_gnu();
    header.set_size(0);
    header.set_path(path)?;
    header.set_mode(0o755); // -rw-------
    header.set_entry_type(EntryType::dir());
    header.set_mtime(
        // use currenttime as last modified time
        SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .unwrap()
            .as_secs(),
    );
    header.set_cksum();
    Ok(header)
}
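A few illustrative inputs make parse_rel_file_path()'s contract concrete (the OIDs below are made up; only the path shapes matter):

    // Paths that parse as relation data files:
    assert!(is_rel_file_path("global/1262"));            // shared relation
    assert!(is_rel_file_path("base/13690/16384"));       // relation in the default tablespace
    assert!(is_rel_file_path("base/13690/16384.1"));     // second 1 GB segment of the same relation

    // Paths that don't:
    assert!(!is_rel_file_path("base/13690/PG_VERSION")); // not a relfilenode name
    assert!(!is_rel_file_path("pg_xact/0000"));          // SLRU data, handled separately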
@@ -5,21 +5,19 @@
//

use anyhow::{bail, ensure, Context, Result};
use fs::File;
use postgres_ffi::{pg_constants, xlog_utils, ControlFileData};
use postgres_ffi::ControlFileData;
use serde::{Deserialize, Serialize};
use std::env;
use std::io::Read;
use std::sync::Arc;
use std::{
    fs,
    path::{Path, PathBuf},
    path::Path,
    process::{Command, Stdio},
    str::FromStr,
    sync::Arc,
};
use zenith_utils::lsn::Lsn;

use log::*;
use zenith_utils::lsn::Lsn;

use crate::logger;
use crate::object_repository::ObjectRepository;
@@ -65,6 +63,7 @@ pub fn init_pageserver(
        .with_context(|| "failed to create repo")?;
    }
    fs::create_dir_all(conf.tenants_path())?;

    println!("pageserver init succeeded");
    Ok(())
}
@@ -92,43 +91,7 @@ pub fn create_repo(

    info!("created directory structure in {}", repo_dir.display());

    // Run initdb
    //
    // We create the cluster temporarily in a "tmp" directory inside the repository,
    // and move it to the right location from there.
    let tmppath = conf.tenant_path(&tenantid).join("tmp");

    info!("running initdb... ");

    let initdb_path = conf.pg_bin_dir().join("initdb");
    let initdb_otput = Command::new(initdb_path)
        .args(&["-D", tmppath.to_str().unwrap()])
        .args(&["-U", &conf.superuser])
        .arg("--no-instructions")
        .env_clear()
        .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
        .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
        .stdout(Stdio::null())
        .output()
        .with_context(|| "failed to execute initdb")?;
    if !initdb_otput.status.success() {
        anyhow::bail!(
            "initdb failed: '{}'",
            String::from_utf8_lossy(&initdb_otput.stderr)
        );
    }
    info!("initdb succeeded");

    // Read control file to extract the LSN and system id
    let controlfile_path = tmppath.join("global").join("pg_control");
    let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
    // let systemid = controlfile.system_identifier;
    let lsn = controlfile.checkPoint;
    let lsnstr = format!("{:016X}", lsn);

    // Bootstrap the repository by loading the newly-initdb'd cluster into 'main' branch.
    let tli = create_timeline(conf, None, &tenantid)?;
    let timelinedir = conf.timeline_path(&tli, &tenantid);

    // We don't use page_cache here, because we don't want to spawn the WAL redo thread during
    // repository initialization.
@@ -147,38 +110,93 @@ pub fn create_repo(
        wal_redo_manager,
        tenantid,
    );
    let timeline = repo.create_empty_timeline(tli, Lsn(lsn))?;

    restore_local_repo::import_timeline_from_postgres_datadir(&tmppath, &*timeline, Lsn(lsn))?;
    // Load data into pageserver
    // TODO To implement zenith import we need to
    // move data loading out of create_repo()
    bootstrap_timeline(conf, tenantid, tli, &repo)?;

    // Move the initial WAL file
    fs::rename(
        tmppath.join("pg_wal").join("000000010000000000000001"),
        timelinedir
            .join("wal")
            .join("000000010000000000000001.partial"),
    )?;
    info!("created initial timeline {}", tli);
    Ok(repo)
}

// Returns checkpoint LSN from controlfile
fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
    // Read control file to extract the LSN
    let controlfile_path = path.join("global").join("pg_control");
    let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
    let lsn = controlfile.checkPoint;

    Ok(Lsn(lsn))
}

// Create the cluster temporarily in a initdbpath directory inside the repository
// to get bootstrap data for timeline initialization.
//
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
    info!("running initdb... ");

    let initdb_path = conf.pg_bin_dir().join("initdb");
    let initdb_otput = Command::new(initdb_path)
        .args(&["-D", initdbpath.to_str().unwrap()])
        .args(&["-U", &conf.superuser])
        .arg("--no-instructions")
        .env_clear()
        .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
        .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
        .stdout(Stdio::null())
        .output()
        .with_context(|| "failed to execute initdb")?;
    if !initdb_otput.status.success() {
        anyhow::bail!(
            "initdb failed: '{}'",
            String::from_utf8_lossy(&initdb_otput.stderr)
        );
    }
    info!("initdb succeeded");

    Ok(())
}

//
// - run initdb to init temporary instance and get bootstrap data
// - after initialization complete, remove the temp dir.
//
fn bootstrap_timeline(
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
    tli: ZTimelineId,
    repo: &dyn Repository,
) -> Result<()> {
    let initdb_path = conf.tenant_path(&tenantid).join("tmp");

    // Init temporarily repo to get bootstrap data
    run_initdb(conf, &initdb_path)?;
    let pgdata_path = initdb_path;

    let lsn = get_lsn_from_controlfile(&pgdata_path)?;

    info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn);

    let timeline = repo.create_empty_timeline(tli, lsn)?;
    restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;

    let wal_dir = pgdata_path.join("pg_wal");
    restore_local_repo::import_timeline_wal(&wal_dir, &*timeline, timeline.get_last_record_lsn())?;

    println!(
        "created initial timeline {} timeline.lsn {}",
        tli,
        timeline.get_last_record_lsn()
    );

    let data = tli.to_string();
    fs::write(conf.branch_path("main", &tenantid), data)?;
    info!("created main branch");
    println!("created main branch");

    // Remove pg_wal
    fs::remove_dir_all(tmppath.join("pg_wal"))?;
    // Remove temp dir. We don't need it anymore
    fs::remove_dir_all(pgdata_path)?;

    // Move the data directory as an initial base backup.
    // FIXME: It would be enough to only copy the non-relation files here, the relation
    // data was already loaded into the repository.
    let target = timelinedir.join("snapshots").join(&lsnstr);
    fs::rename(tmppath, &target)?;

    info!(
        "new zenith repository was created in {}",
        repo_dir.display()
    );

    Ok(repo)
    Ok(())
}
pub(crate) fn get_tenants(conf: &PageServerConf) -> Result<Vec<String>> {
@@ -269,26 +287,10 @@ pub(crate) fn create_branch(

    // create a new timeline directory for it
    let newtli = create_timeline(conf, Some(startpoint), tenantid)?;
    let newtimelinedir = conf.timeline_path(&newtli, tenantid);

    // Let the Repository backend do its initialization
    repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;

    // Copy the latest snapshot (TODO: before the startpoint) and all WAL
    // TODO: be smarter and avoid the copying...
    let (_maxsnapshot, oldsnapshotdir) =
        find_latest_snapshot(conf, &startpoint.timelineid, tenantid)?;
    let copy_opts = fs_extra::dir::CopyOptions::new();
    fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), &copy_opts)?;

    let oldtimelinedir = conf.timeline_path(&startpoint.timelineid, tenantid);
    copy_wal(
        &oldtimelinedir.join("wal"),
        &newtimelinedir.join("wal"),
        startpoint.lsn,
        pg_constants::WAL_SEGMENT_SIZE,
    )?;

    // Remember the human-readable branch name for the new timeline.
    // FIXME: there's a race condition, if you create a branch with the same
    // name concurrently.
@@ -389,7 +391,6 @@ fn create_timeline(
    let timelinedir = conf.timeline_path(&timelineid, tenantid);

    fs::create_dir(&timelinedir)?;
    fs::create_dir(&timelinedir.join("snapshots"))?;
    fs::create_dir(&timelinedir.join("wal"))?;

    if let Some(ancestor) = ancestor {
@@ -399,76 +400,3 @@ fn create_timeline(

    Ok(timelineid)
}

///
/// Copy all WAL segments from one directory to another, up to given LSN.
///
/// If the given LSN is in the middle of a segment, the last segment containing it
/// is written out as .partial, and padded with zeros.
///
fn copy_wal(src_dir: &Path, dst_dir: &Path, upto: Lsn, wal_seg_size: usize) -> Result<()> {
    let last_segno = upto.segment_number(wal_seg_size);
    let last_segoff = upto.segment_offset(wal_seg_size);

    for entry in fs::read_dir(src_dir).unwrap().flatten() {
        let entry_name = entry.file_name();
        let fname = entry_name.to_str().unwrap();

        // Check if the filename looks like an xlog file, or a .partial file.
        if !xlog_utils::IsXLogFileName(fname) && !xlog_utils::IsPartialXLogFileName(fname) {
            continue;
        }
        let (segno, _tli) = xlog_utils::XLogFromFileName(fname, wal_seg_size as usize);

        let copylen;
        let mut dst_fname = PathBuf::from(fname);
        if segno > last_segno {
            // future segment, skip
            continue;
        } else if segno < last_segno {
            copylen = wal_seg_size;
            dst_fname.set_extension("");
        } else {
            copylen = last_segoff;
            dst_fname.set_extension("partial");
        }

        let src_file = File::open(entry.path())?;
        let mut dst_file = File::create(dst_dir.join(&dst_fname))?;
        std::io::copy(&mut src_file.take(copylen as u64), &mut dst_file)?;

        if copylen < wal_seg_size {
            std::io::copy(
                &mut std::io::repeat(0).take((wal_seg_size - copylen) as u64),
                &mut dst_file,
            )?;
        }
    }
    Ok(())
}
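The full-versus-partial decision in copy_wal() reduces to two numbers derived from the cutoff LSN: the segment that contains it and the byte offset within that segment. Restated as plain arithmetic (the helper name is illustrative; Lsn::segment_number() and Lsn::segment_offset() compute the same thing):

    // Segments numbered below `segno` are copied whole; the segment equal to
    // it is truncated at `offset`, renamed to ".partial", and zero-padded.
    fn wal_cut_point(lsn: u64, wal_seg_size: u64) -> (u64, u64) {
        let segno = lsn / wal_seg_size;
        let offset = lsn % wal_seg_size;
        (segno, offset)
    }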
// Find the latest snapshot for a timeline
fn find_latest_snapshot(
    conf: &PageServerConf,
    timelineid: &ZTimelineId,
    tenantid: &ZTenantId,
) -> Result<(Lsn, PathBuf)> {
    let snapshotsdir = conf.snapshots_path(timelineid, tenantid);
    let paths = fs::read_dir(&snapshotsdir)?;
    let mut maxsnapshot = Lsn(0);
    let mut snapshotdir: Option<PathBuf> = None;
    for path in paths {
        let path = path?;
        let filename = path.file_name().to_str().unwrap().to_owned();
        if let Ok(lsn) = Lsn::from_hex(&filename) {
            maxsnapshot = std::cmp::max(lsn, maxsnapshot);
            snapshotdir = Some(path.path());
        }
    }
    if maxsnapshot == Lsn(0) {
        // TODO: check ancestor timeline
        anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
    }

    Ok((maxsnapshot, snapshotdir.unwrap()))
}
@@ -78,10 +78,6 @@ impl PageServerConf {
        self.timelines_path(tenantid).join(timelineid.to_string())
    }

    fn snapshots_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
        self.timeline_path(timelineid, tenantid).join("snapshots")
    }

    fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
        self.timeline_path(timelineid, tenantid).join("ancestor")
    }
@@ -145,7 +145,8 @@ impl Repository for ObjectRepository {
    fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, at_lsn: Lsn) -> Result<()> {
        let src_timeline = self.get_timeline(src)?;

        // Write a metadata key, noting the ancestor of th new timeline. There is initially
        trace!("branch_timeline at lsn {}", at_lsn);
        // Write a metadata key, noting the ancestor of the new timeline. There is initially
        // no data in it, but all the read-calls know to look into the ancestor.
        let metadata = MetadataEntry {
            last_valid_lsn: at_lsn,
@@ -32,7 +32,6 @@ use crate::branches;
use crate::object_key::ObjectTag;
use crate::page_cache;
use crate::repository::{BufferTag, Modification, RelTag};
use crate::restore_local_repo;
use crate::walreceiver;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
@@ -295,23 +294,15 @@ impl PageServerHandler {

        /* Send a tarball of the latest snapshot on the timeline */

        // find latest snapshot
        let snapshot_lsn =
            restore_local_repo::find_latest_snapshot(&self.conf, &timelineid, &tenantid).unwrap();

        let req_lsn = lsn.unwrap_or_else(|| timeline.get_last_valid_lsn());

        {
            let mut writer = CopyDataSink { pgb };
            let mut basebackup = basebackup::Basebackup::new(
                self.conf,
                &mut writer,
                tenantid,
                timelineid,
                &timeline,
                req_lsn,
                timeline.get_prev_record_lsn(),
                snapshot_lsn,
            );
            basebackup.send_tarball()?;
        }
@@ -3,7 +3,7 @@
//! zenith Timeline.
//!
use log::*;
use std::cmp::{max, min};
use std::cmp::min;
use std::fs;
use std::fs::File;
use std::io::Read;
@@ -17,9 +17,6 @@ use bytes::{Buf, Bytes};
use crate::object_key::*;
use crate::repository::*;
use crate::waldecoder::*;
use crate::PageServerConf;
use crate::ZTenantId;
use crate::ZTimelineId;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::{pg_constants, CheckPoint, ControlFileData};
@@ -28,36 +25,6 @@ use zenith_utils::lsn::Lsn;
const MAX_MBR_BLKNO: u32 =
    pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;

///
/// Find latest snapshot in a timeline's 'snapshots' directory
///
pub fn find_latest_snapshot(
    conf: &PageServerConf,
    timelineid: &ZTimelineId,
    tenantid: &ZTenantId,
) -> Result<Lsn> {
    let snapshotspath = conf.snapshots_path(timelineid, tenantid);
    let mut last_snapshot_lsn = Lsn(0);
    for direntry in fs::read_dir(&snapshotspath).unwrap() {
        let filename = direntry.unwrap().file_name();

        if let Ok(lsn) = Lsn::from_filename(&filename) {
            last_snapshot_lsn = max(lsn, last_snapshot_lsn);
        } else {
            error!("unrecognized file in snapshots directory: {:?}", filename);
        }
    }

    if last_snapshot_lsn == Lsn(0) {
        error!(
            "could not find valid snapshot in {}",
            snapshotspath.display()
        );
        // TODO return error?
    }
    Ok(last_snapshot_lsn)
}

///
/// Import all relation data pages from local disk into the repository.
///
@@ -108,6 +75,11 @@ pub fn import_timeline_from_postgres_datadir(
    for direntry in fs::read_dir(path.join("base"))? {
        let direntry = direntry?;

        //skip all temporary files
        if direntry.file_name().to_str().unwrap() == "pgsql_tmp" {
            continue;
        }

        let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;

        for direntry in fs::read_dir(direntry.path())? {
@@ -296,8 +268,8 @@ fn import_slru_file(
    Ok(())
}

/// Scan PostgreSQL WAL files in given directory, and load all records >= 'startpoint' into
/// the repository.
/// Scan PostgreSQL WAL files in given directory
/// and load all records >= 'startpoint' into the repository.
pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint: Lsn) -> Result<()> {
    let mut waldecoder = WalStreamDecoder::new(startpoint);
@@ -311,6 +283,9 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
    loop {
        // FIXME: assume postgresql tli 1 for now
        let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
        let mut buf = Vec::new();

        //Read local file
        let mut path = walpath.join(&filename);

        // It could be as .partial
@@ -331,12 +306,12 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
            file.seek(SeekFrom::Start(offset as u64))?;
        }

        let mut buf = Vec::new();
        let nread = file.read_to_end(&mut buf)?;
        if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
            // Maybe allow this for .partial files?
            error!("read only {} bytes from WAL file", nread);
        }

        waldecoder.feed_bytes(&buf);

        let mut nrecords = 0;
@@ -358,19 +333,19 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
            nrecords += 1;
        }

        info!(
            "imported {} records from WAL file {} up to {}",
            nrecords,
            path.display(),
            last_lsn
        );
        info!("imported {} records up to {}", nrecords, last_lsn);

        segno += 1;
        offset = 0;
    }

    info!("reached end of WAL at {}", last_lsn);
    let checkpoint_bytes = checkpoint.encode();
    timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes, false)?;

    timeline.advance_last_valid_lsn(last_lsn);
    timeline.checkpoint()?;

    Ok(())
}
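import_timeline_wal() walks the WAL directory segment by segment: it derives each expected file name from the segment number, and if the complete segment is missing it falls back to the same name with a ".partial" suffix. A sketch of that lookup (the helper name is illustrative; XLogFileName comes from postgres_ffi::xlog_utils, and PostgreSQL timeline 1 is assumed, as in the code above):

    use std::path::{Path, PathBuf};

    use postgres_ffi::pg_constants;
    use postgres_ffi::xlog_utils::XLogFileName;

    // Resolve the on-disk file holding a given WAL segment, preferring the
    // complete segment over its still-open ".partial" twin.
    fn wal_segment_file(walpath: &Path, segno: u64) -> Option<PathBuf> {
        let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
        let full = walpath.join(&filename);
        if full.exists() {
            return Some(full);
        }
        let partial = walpath.join(format!("{}.partial", filename));
        if partial.exists() {
            Some(partial)
        } else {
            None
        }
    }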
postgres_ffi/samples/pg_hba.conf (new file, 98 lines)
@@ -0,0 +1,98 @@
# PostgreSQL Client Authentication Configuration File
# ===================================================
#
# Refer to the "Client Authentication" section in the PostgreSQL
# documentation for a complete description of this file.  A short
# synopsis follows.
#
# This file controls: which hosts are allowed to connect, how clients
# are authenticated, which PostgreSQL user names they can use, which
# databases they can access.  Records take one of these forms:
#
# local         DATABASE  USER  METHOD  [OPTIONS]
# host          DATABASE  USER  ADDRESS  METHOD  [OPTIONS]
# hostssl       DATABASE  USER  ADDRESS  METHOD  [OPTIONS]
# hostnossl     DATABASE  USER  ADDRESS  METHOD  [OPTIONS]
# hostgssenc    DATABASE  USER  ADDRESS  METHOD  [OPTIONS]
# hostnogssenc  DATABASE  USER  ADDRESS  METHOD  [OPTIONS]
#
# (The uppercase items must be replaced by actual values.)
#
# The first field is the connection type:
# - "local" is a Unix-domain socket
# - "host" is a TCP/IP socket (encrypted or not)
# - "hostssl" is a TCP/IP socket that is SSL-encrypted
# - "hostnossl" is a TCP/IP socket that is not SSL-encrypted
# - "hostgssenc" is a TCP/IP socket that is GSSAPI-encrypted
# - "hostnogssenc" is a TCP/IP socket that is not GSSAPI-encrypted
#
# DATABASE can be "all", "sameuser", "samerole", "replication", a
# database name, or a comma-separated list thereof. The "all"
# keyword does not match "replication". Access to replication
# must be enabled in a separate record (see example below).
#
# USER can be "all", a user name, a group name prefixed with "+", or a
# comma-separated list thereof. In both the DATABASE and USER fields
# you can also write a file name prefixed with "@" to include names
# from a separate file.
#
# ADDRESS specifies the set of hosts the record matches. It can be a
# host name, or it is made up of an IP address and a CIDR mask that is
# an integer (between 0 and 32 (IPv4) or 128 (IPv6) inclusive) that
# specifies the number of significant bits in the mask. A host name
# that starts with a dot (.) matches a suffix of the actual host name.
# Alternatively, you can write an IP address and netmask in separate
# columns to specify the set of hosts. Instead of a CIDR-address, you
# can write "samehost" to match any of the server's own IP addresses,
# or "samenet" to match any address in any subnet that the server is
# directly connected to.
#
# METHOD can be "trust", "reject", "md5", "password", "scram-sha-256",
# "gss", "sspi", "ident", "peer", "pam", "ldap", "radius" or "cert".
# Note that "password" sends passwords in clear text; "md5" or
# "scram-sha-256" are preferred since they send encrypted passwords.
#
# OPTIONS are a set of options for the authentication in the format
# NAME=VALUE. The available options depend on the different
# authentication methods -- refer to the "Client Authentication"
# section in the documentation for a list of which options are
# available for which authentication methods.
#
# Database and user names containing spaces, commas, quotes and other
# special characters must be quoted. Quoting one of the keywords
# "all", "sameuser", "samerole" or "replication" makes the name lose
# its special character, and just match a database or username with
# that name.
#
# This file is read on server startup and when the server receives a
# SIGHUP signal. If you edit the file on a running system, you have to
# SIGHUP the server for the changes to take effect, run "pg_ctl reload",
# or execute "SELECT pg_reload_conf()".
#
# Put your actual configuration here
# ----------------------------------
#
# If you want to allow non-local connections, you need to add more
# "host" records. In that case you will also need to make PostgreSQL
# listen on a non-local interface via the listen_addresses
# configuration parameter, or via the -i or -h command line switches.

# CAUTION: Configuring the system for local "trust" authentication
# allows any local user to connect as any PostgreSQL user, including
# the database superuser. If you do not trust all your local users,
# use another authentication method.


# TYPE  DATABASE        USER            ADDRESS                 METHOD

# "local" is for Unix domain socket connections only
local   all             all                                     trust
# IPv4 local connections:
host    all             all             127.0.0.1/32            trust
# IPv6 local connections:
host    all             all             ::1/128                 trust
# Allow replication connections from localhost, by a user with the
# replication privilege.
local   replication     all                                     trust
host    replication     all             127.0.0.1/32            trust
host    replication     all             ::1/128                 trust
@@ -185,3 +185,41 @@ pub const XLOG_BLCKSZ: usize = 8192;
pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
pub const XLP_LONG_HEADER: u16 = 0x0002;

pub const PG_MAJORVERSION: &'static str = "14";

// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&'static str; 22] = [
    "global",
    "pg_wal/archive_status",
    "pg_commit_ts",
    "pg_dynshmem",
    "pg_notify",
    "pg_serial",
    "pg_snapshots",
    "pg_subtrans",
    "pg_twophase",
    "pg_multixact",
    "pg_multixact/members",
    "pg_multixact/offsets",
    "base",
    "base/1",
    "pg_replslot",
    "pg_tblspc",
    "pg_stat",
    "pg_stat_tmp",
    "pg_xact",
    "pg_logical",
    "pg_logical/snapshots",
    "pg_logical/mappings",
];

pub const PGDATA_SPECIAL_FILES: [&'static str; 4] = [
    "pg_hba.conf",
    "pg_ident.conf",
    "postgresql.conf",
    "postgresql.auto.conf",
];

pub static PG_HBA: &'static str = include_str!("../samples/pg_hba.conf");
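Because PG_HBA is pulled in with include_str!, the sample ships inside the pageserver binary and needs no filesystem lookup at run time. A sketch of how a consumer such as basebackup picks contents for the special files (the helper name is illustrative; it mirrors the send_tarball() logic above):

    use postgres_ffi::pg_constants;

    // pg_hba.conf gets the baked-in sample; the other special files are sent empty.
    fn special_file_contents(name: &str) -> &'static [u8] {
        match name {
            "pg_hba.conf" => pg_constants::PG_HBA.as_bytes(),
            _ => b"",
        }
    }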