Introduce a new "layered" repository implementation.

This replaces the RocksDB-based implementation with an approach using
"snapshot files" on disk, and in-memory btreemaps to hold the recent
changes.

This makes the repository implementation a configuration option. You can
choose 'layered' or 'rocksdb' with "zenith init --repository-format=<format>".
The unit tests have been refactored to exercise both implementations.
'layered' is now the default.

Push/pull is not implemented. The 'test_history_inmemory' test has been
commented out accordingly. It's not clear how we will implement that
functionality; probably by copying the snapshot files directly.
Heikki Linnakangas
2021-08-16 10:06:48 +03:00
parent 5eb1738e8b
commit 2450f82de5
24 changed files with 3435 additions and 75 deletions

Cargo.lock

@@ -1,7 +1,5 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "ahash"
version = "0.4.7"
@@ -82,6 +80,30 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "aversion"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41992ab8cfcc3026ef9abceffe0c2b0479c043183fc23825e30d22baab6df334"
dependencies = [
"aversion-macros",
"byteorder",
"serde",
"serde_cbor",
"thiserror",
]
[[package]]
name = "aversion-macros"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ba5785f953985aa0caca927ba4005880f3b4f53de87f134e810ae3549f744d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "aws-creds"
version = "0.26.0"
@@ -166,6 +188,18 @@ dependencies = [
"generic-array",
]
[[package]]
name = "bookfile"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efa3e2086414e1bbecbc10730f265e5b079ab4ea0b830e7219a70dab6471e753"
dependencies = [
"aversion",
"byteorder",
"serde",
"thiserror",
]
[[package]]
name = "boxfnonce"
version = "0.1.1"
@@ -646,6 +680,12 @@ dependencies = [
"tracing",
]
[[package]]
name = "half"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3"
[[package]]
name = "hashbrown"
version = "0.9.1"
@@ -1139,6 +1179,7 @@ name = "pageserver"
version = "0.1.0"
dependencies = [
"anyhow",
"bookfile",
"byteorder",
"bytes",
"chrono",
@@ -1276,24 +1317,6 @@ dependencies = [
"tokio-postgres 0.7.1",
]
[[package]]
name = "postgres-protocol"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff3e0f70d32e20923cabf2df02913be7c1842d4c772db8065c00fcfdd1d1bff3"
dependencies = [
"base64 0.13.0",
"byteorder",
"bytes",
"fallible-iterator",
"hmac",
"md-5",
"memchr",
"rand",
"sha2",
"stringprep",
]
[[package]]
name = "postgres-protocol"
version = "0.6.1"
@@ -1313,14 +1336,21 @@ dependencies = [
]
[[package]]
name = "postgres-types"
version = "0.2.1"
name = "postgres-protocol"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "430f4131e1b7657b0cd9a2b0c3408d77c9a43a042d300b8c77f981dffcc43a2f"
checksum = "ff3e0f70d32e20923cabf2df02913be7c1842d4c772db8065c00fcfdd1d1bff3"
dependencies = [
"base64 0.13.0",
"byteorder",
"bytes",
"fallible-iterator",
"postgres-protocol 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
"hmac",
"md-5",
"memchr",
"rand",
"sha2",
"stringprep",
]
[[package]]
@@ -1333,6 +1363,17 @@ dependencies = [
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
]
[[package]]
name = "postgres-types"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "430f4131e1b7657b0cd9a2b0c3408d77c9a43a042d300b8c77f981dffcc43a2f"
dependencies = [
"bytes",
"fallible-iterator",
"postgres-protocol 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "postgres_ffi"
version = "0.1.0"
@@ -1735,6 +1776,16 @@ dependencies = [
"xml-rs",
]
[[package]]
name = "serde_cbor"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622"
dependencies = [
"half",
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.126"


@@ -42,6 +42,9 @@ pub struct LocalEnv {
#[serde(with = "hex")]
pub tenantid: ZTenantId,
// Repository format, 'rocksdb' or 'layered' or None for default
pub repository_format: Option<String>,
// jwt auth token used for communication with pageserver
pub auth_token: String,
@@ -101,6 +104,7 @@ pub fn init(
remote_pageserver: Option<&str>,
tenantid: ZTenantId,
auth_type: AuthType,
repository_format: Option<&str>,
) -> Result<()> {
// check if config already exists
let base_path = base_path();
@@ -176,6 +180,7 @@ pub fn init(
base_data_dir: base_path,
remotes: BTreeMap::default(),
tenantid,
repository_format: repository_format.map(|x| x.into()),
auth_token,
auth_type,
private_key_path,
@@ -194,6 +199,7 @@ pub fn init(
base_data_dir: base_path,
remotes: BTreeMap::default(),
tenantid,
repository_format: repository_format.map(|x| x.into()),
auth_token,
auth_type,
private_key_path,


@@ -50,7 +50,7 @@ impl PageServerNode {
.unwrap()
}
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> Result<()> {
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool, repository_format: Option<&str>) -> Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let mut args = vec![
"--init",
@@ -65,6 +65,10 @@ impl PageServerNode {
args.extend(&["--auth-type", "ZenithJWT"]);
}
if let Some(repo_format) = repository_format {
args.extend(&["--repository-format", repo_format]);
}
create_tenant.map(|tenantid| args.extend(&["--create-tenant", tenantid]));
let status = cmd
.args(args)


@@ -7,6 +7,7 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
bookfile = "^0.3"
chrono = "0.4.19"
rand = "0.8.3"
regex = "1.4.5"


@@ -20,14 +20,14 @@ use anyhow::{ensure, Result};
use clap::{App, Arg, ArgMatches};
use daemonize::Daemonize;
use pageserver::{branches, logger, page_cache, page_service, PageServerConf};
use pageserver::{branches, logger, page_cache, page_service, PageServerConf, RepositoryFormat};
use zenith_utils::http_endpoint;
const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";
const DEFAULT_HTTP_ENDPOINT_ADDR: &str = "127.0.0.1:9898";
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
const DEFAULT_SUPERUSER: &str = "zenith_admin";
@@ -41,6 +41,7 @@ struct CfgFileParams {
pg_distrib_dir: Option<String>,
auth_validation_public_key_path: Option<String>,
auth_type: Option<String>,
repository_format: Option<String>,
}
impl CfgFileParams {
@@ -58,6 +59,7 @@ impl CfgFileParams {
pg_distrib_dir: get_arg("postgres-distrib"),
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
auth_type: get_arg("auth-type"),
repository_format: get_arg("repository-format"),
}
}
@@ -74,6 +76,7 @@ impl CfgFileParams {
.auth_validation_public_key_path
.or(other.auth_validation_public_key_path),
auth_type: self.auth_type.or(other.auth_type),
repository_format: self.repository_format.or(other.repository_format),
}
}
@@ -133,6 +136,16 @@ impl CfgFileParams {
);
}
let repository_format = match self.repository_format.as_ref() {
Some(repo_format_str) if repo_format_str == "rocksdb" => RepositoryFormat::RocksDb,
Some(repo_format_str) if repo_format_str == "layered" => RepositoryFormat::Layered,
Some(repo_format_str) => anyhow::bail!(
"invalid --repository-format '{}', must be 'rocksdb' or 'layered'",
repo_format_str
),
None => RepositoryFormat::Layered, // default
};
Ok(PageServerConf {
daemonize: false,
@@ -148,8 +161,9 @@ impl CfgFileParams {
pg_distrib_dir,
auth_validation_public_key_path,
auth_type,
repository_format,
})
}
}
@@ -221,6 +235,12 @@ fn main() -> Result<()> {
.takes_value(true)
.help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
)
.arg(
Arg::with_name("repository-format")
.long("repository-format")
.takes_value(true)
.help("Which repository implementation to use, 'rocksdb' or 'layered'"),
)
.get_matches();
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));


@@ -24,7 +24,7 @@ use crate::object_repository::ObjectRepository;
use crate::page_cache;
use crate::restore_local_repo;
use crate::walredo::WalRedoManager;
use crate::{repository::Repository, PageServerConf};
use crate::{repository::Repository, PageServerConf, RepositoryFormat};
#[derive(Serialize, Deserialize, Clone)]
pub struct BranchInfo {
@@ -65,8 +65,8 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
pub fn create_repo(
conf: &'static PageServerConf,
tenantid: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager>,
) -> Result<ObjectRepository> {
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Arc<dyn Repository>> {
let repo_dir = conf.tenant_path(&tenantid);
if repo_dir.exists() {
bail!("repo for {} already exists", tenantid)
@@ -96,19 +96,27 @@ pub fn create_repo(
// and we failed to run initdb again in the same directory. This has been solved for the
// rapid init+start case now, but the general race condition remains if you restart the
// server quickly.
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
let repo: Arc<dyn Repository + Sync + Send> =
match conf.repository_format {
RepositoryFormat::Layered => Arc::new(
crate::layered_repository::LayeredRepository::new(conf, wal_redo_manager, tenantid),
),
RepositoryFormat::RocksDb => {
let obj_store = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
let repo = crate::object_repository::ObjectRepository::new(
conf,
std::sync::Arc::new(storage),
wal_redo_manager,
tenantid,
);
Arc::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
wal_redo_manager,
tenantid,
))
}
};
// Load data into pageserver
// TODO To implement zenith import we need to
// move data loading out of create_repo()
bootstrap_timeline(conf, tenantid, tli, &repo)?;
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
Ok(repo)
}

File diff suppressed because it is too large


@@ -0,0 +1,298 @@
# Overview
The on-disk format is based on immutable files. The page server
receives a stream of incoming WAL, parses the WAL records to determine
which pages they apply to, and accumulates the incoming changes in
memory. Every now and then, the accumulated changes are written out to
new files.
The files are called "snapshot files". Each snapshot file corresponds
to one PostgreSQL relation fork. The snapshot files for each timeline
are stored in the timeline's subdirectory under
.zenith/tenants/<tenantid>/timelines.
The files are named like this:
rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>
For example:
rel_1663_13990_2609_0_000000000169C348_0000000001702000
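
For illustration, here is a minimal sketch (not the pageserver's actual
code; the real logic lives in SnapshotFileName in snapshot_layer.rs) of
how a relation snapshot file name is composed. The LSNs are zero-padded
16-digit hex, matching the example above:

```rust
/// Sketch: compose a relation snapshot file name from its parts.
fn rel_snapshot_file_name(
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
    start_lsn: u64,
    end_lsn: u64,
) -> String {
    format!(
        "rel_{}_{}_{}_{}_{:016X}_{:016X}",
        spcnode, dbnode, relnode, forknum, start_lsn, end_lsn
    )
}

// rel_snapshot_file_name(1663, 13990, 2609, 0, 0x169C348, 0x1702000)
// returns "rel_1663_13990_2609_0_000000000169C348_0000000001702000"
```
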
Some non-relation files are also stored in the repository. For example,
a CLOG segment would be named like this:
pg_xact_0000_00000000198B06B0_00000000198C2550
There is no difference in how the relation and non-relation files are
managed, except that the first part of file names is different.
Internally, the relations and non-relation files that are managed in
the versioned store are together called "relishes".
Each snapshot file contains a full snapshot, that is, full copy of all
pages in the relation, as of the "start LSN". It also contains all WAL
records applicable to the relation between the start and end
LSNs. With this information, the page server can reconstruct any page
version of the relation in the LSN range.
If a file has been dropped, the last snapshot file for it is created
with the _DROPPED suffix, e.g.
rel_1663_13990_2609_0_000000000169C348_0000000001702000_DROPPED
In addition to relations, which use the "rel_*" prefix, the same
format is used for storing various smaller files from the PostgreSQL
data directory. They use different prefixes, and the naming scheme
before the LSN range varies. The Zenith source code uses the term
"relish" to mean "a relation, or other file that's treated like a
relation in the storage".
## Notation used in this document
The full path of a snapshot file looks like this:
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_000000000169C348_0000000001702000
For readability, the examples below use a simplified notation for the
paths. The tenant ID is left out, the timeline ID is replaced with
the human-readable branch name, and spcnode+dbnode+relnode+forknum with
a human-readable table name. The LSNs are also shorter. For example, a
snapshot file for 'orders' table on 'main' branch, with LSN range
100-200 would be:
main/orders_100_200
# Creating snapshot files
Let's start with a simple example with a system that contains one
branch called 'main' and two tables, 'orders' and 'customers'. The end
of WAL is currently at LSN 250. In this starting situation, you would
have two files on disk:
main/orders_100_200
main/customers_100_200
In addition to those files, the recent changes between LSN 200 and the
end of WAL at 250 are kept in memory. If the page server crashes, the
latest records between 200-250 need to be re-read from the WAL.
Whenever enough WAL has been accumulated in memory, the page server
writes out the changes in memory into new snapshot files. This process
is called "checkpointing" (not to be confused with the PostgreSQL
checkpoints, that's a different thing). The page server only creates
snapshot files for relations that have been modified since the last
checkpoint. For example, if the current end of WAL is at LSN 450, and
the last checkpoint happened at LSN 400 but there hasn't been any
recent changes to 'customers' table, you would have these files on
disk:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_100_200
If the customers table is modified later, a new file is created for it
at the next checkpoint. The new file will cover the "gap" from the
last snapshot file, so the LSN ranges are always contiguous:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_100_200
main/customers_200_500
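
A simplified, standalone sketch of this checkpoint policy (the real logic
lives in the layered repository's checkpointing code; the types here are
illustrative only):

```rust
use std::collections::BTreeMap;

/// Illustrative stand-in for an in-memory layer: the LSN it starts at,
/// and whether it has received changes since the last checkpoint.
struct InMemLayer {
    start_lsn: u64,
    dirty: bool,
}

/// Freeze dirty layers at `cutoff_lsn`, emitting one snapshot file name
/// per modified relation. Unmodified relations ('customers' above) get no
/// new file, and each new file starts where the previous one ended, so
/// the LSN ranges stay contiguous.
fn checkpoint(layers: &mut BTreeMap<String, InMemLayer>, cutoff_lsn: u64) -> Vec<String> {
    let mut new_files = Vec::new();
    for (rel, layer) in layers.iter_mut() {
        if layer.dirty {
            new_files.push(format!("{}_{}_{}", rel, layer.start_lsn, cutoff_lsn));
            layer.start_lsn = cutoff_lsn; // the fresh in-memory layer begins here
            layer.dirty = false;
        }
    }
    new_files
}
```
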
## Reading page versions
Whenever a GetPage@LSN request comes in from the compute node, the
page server needs to reconstruct the requested page, as it was at the
requested LSN. To do that, the page server first checks the recent
in-memory layer; if the requested page version is found there, it can
be returned immediately without looking at the files on
disk. Otherwise the page server needs to locate the snapshot file that
contains the requested page version.
For example, if a request comes in for table 'orders' at LSN 250, the
page server would load the 'main/orders_200_300' file into memory, and
reconstruct and return the requested page from it, as it was at
LSN 250. Because the snapshot file consists of a full image of the
relation at the start LSN and the WAL, reconstructing the page
involves replaying any WAL records applicable to the page between LSNs
200-250, starting from the base image at LSN 200.
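
In rough pseudocode, the reconstruction looks like this (a hedged sketch;
`apply_wal_record` is a stand-in for the WAL redo machinery, which in
reality runs in a separate PostgreSQL-based redo process):

```rust
/// Sketch: reconstruct one page at `request_lsn` from a snapshot file,
/// starting from the materialized image at the file's start LSN and
/// replaying the block's WAL records, in LSN order, up to the request LSN.
fn reconstruct_page(
    base_image: Vec<u8>,        // page image at the file's start LSN
    records: &[(u64, Vec<u8>)], // (LSN, WAL record) for this block, ascending
    request_lsn: u64,
) -> Vec<u8> {
    let mut page = base_image;
    for (lsn, rec) in records {
        if *lsn > request_lsn {
            break; // newer records are not part of the requested version
        }
        page = apply_wal_record(page, rec);
    }
    page
}

/// Stand-in for WAL redo; assumed here, not defined in this document.
fn apply_wal_record(page: Vec<u8>, _record: &[u8]) -> Vec<u8> {
    page
}
```
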
A request at a file boundary can be satisfied using either file. For
example, if there are two files on disk:
main/orders_100_200
main/orders_200_300
and a request comes in at LSN 200, either file can be used to serve it. It
is better to use the later file, however, because it contains an
already materialized version of all the pages at LSN 200. Using the
first file, you would need to apply any WAL records between 100 and
200 to reconstruct the requested page.
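
The lookup rule can be stated compactly: among the files for a relation,
keyed by start LSN, take the one with the greatest start LSN at or before
the request LSN. A sketch (mirroring the shape of LayerMap::get in this
commit, with simplified types):

```rust
use std::collections::BTreeMap;

/// Sketch: pick the snapshot file to serve a request at `lsn`. At a
/// boundary (request at 200 with files 100-200 and 200-300) this prefers
/// the later file, which already holds materialized images at LSN 200.
fn pick_file(files_by_start_lsn: &BTreeMap<u64, String>, lsn: u64) -> Option<&String> {
    files_by_start_lsn
        .range(..=lsn)
        .next_back()
        .map(|(_start, name)| name)
}
```
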
# Multiple branches
Imagine that a child branch is created at LSN 250:
              @250
----main--+-------------------------->
           \
            +---child-------------->
Then, the 'orders' table is updated differently on the 'main' and
'child' branches. You now have this situation on disk:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_100_200
child/orders_250_300
child/orders_300_400
Because the 'customers' table hasn't been modified on the child
branch, there is no file for it there. If you request a page for it on
the 'child' branch, the page server will not find any snapshot file
for it in the 'child' directory, so it will recurse to look into the
parent 'main' branch instead.
From the 'child' branch's point of view, the history for each relation
is linear, and the request's LSN identifies unambiguously which file
you need to look at. For example, the history for the 'orders' table
on the 'main' branch consists of these files:
main/orders_100_200
main/orders_200_300
main/orders_300_400
And from the 'child' branch's point of view, it consists of these
files:
main/orders_100_200
main/orders_200_300
child/orders_250_300
child/orders_300_400
The branch metadata includes the point where the child branch was
created, LSN 250. If a page request comes with LSN 275, we read the
page version from the 'child/orders_250_300' file. If the request LSN
is 225, we read it from the 'main/orders_200_300' file instead. The
page versions between 250-300 in the 'main/orders_200_300' file are
ignored when operating on the child branch.
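
A hedged sketch of that resolution logic, using hypothetical types (each
branch knows its parent and the LSN at which it was forked; this is not
the pageserver's actual representation):

```rust
use std::collections::BTreeMap;

/// Hypothetical branch: its own snapshot files keyed by start LSN, plus
/// an optional (parent branch, branch point LSN) link.
struct Branch<'a> {
    files: BTreeMap<u64, String>,
    parent: Option<(&'a Branch<'a>, u64)>,
}

/// Find the file holding the page versions for a request at `lsn`. Files
/// on this branch win; otherwise recurse into the parent, capping the LSN
/// at the branch point so parent-side changes after the fork are ignored.
fn find_file<'a>(branch: &'a Branch<'a>, lsn: u64) -> Option<&'a String> {
    if let Some((_start, name)) = branch.files.range(..=lsn).next_back() {
        return Some(name);
    }
    branch
        .parent
        .and_then(|(parent, branch_lsn)| find_file(parent, lsn.min(branch_lsn)))
}
```
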
Note: It doesn't make any difference if the child branch is created
when the end of the main branch was at LSN 250, or later when the tip of
the main branch had already moved on. The latter case, creating a
branch at a historic LSN, is how we support PITR in Zenith.
# Garbage collection
In this scheme, we keep creating new snapshot files over time. We also
need a mechanism to remove old files that are no longer needed,
because disk space isn't infinite.
What files are still needed? Currently, the page server supports PITR
and branching from any branch at any LSN that is "recent enough" from
the tip of the branch. "Recent enough" is defined as an LSN horizon,
which by default is 64 MB. (See DEFAULT_GC_HORIZON). For this
example, let's assume that the LSN horizon is 150 units.
Let's look at the single branch scenario again. Imagine that the end
of the branch is LSN 525, so that the GC horizon is currently at
525-150 = 375
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/orders_400_500
main/customers_100_200
We can remove files 'main/orders_100_200' and 'main/orders_200_300',
because the end LSNs of those files are older than GC horizon 375, and
there are more recent snapshot files for the table. 'main/orders_300_400'
and 'main/orders_400_500' are still within the horizon, so they must be
retained. 'main/customers_100_200' is old enough, but it cannot be
removed because there is no newer snapshot file for the table.
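
The retention rule in this single-branch example boils down to one
predicate (a simplification; with branches, "still needed by a child
branch" also blocks removal, as described next):

```rust
/// Sketch: a snapshot file may be garbage collected only if it has fallen
/// behind the GC horizon AND a newer snapshot file exists for the relation.
fn can_gc(file_end_lsn: u64, gc_horizon_lsn: u64, newer_file_exists: bool) -> bool {
    file_end_lsn < gc_horizon_lsn && newer_file_exists
}

// With the horizon at 375:
// can_gc(200, 375, true)  == true   // main/orders_100_200: removable
// can_gc(400, 375, true)  == false  // main/orders_300_400: within horizon
// can_gc(200, 375, false) == false  // main/customers_100_200: no newer file
```
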
Things get slightly more complicated with multiple branches. All of
the above still holds, but in addition to recent files we must also
retain older snapshot files that are still needed by child branches.
For example, if a child branch is created at LSN 150, and the 'customers'
table is updated on the branch, you would have these files:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/orders_400_500
main/customers_100_200
child/customers_150_300
In this situation, the 'main/orders_100_200' file cannot be removed,
even though it is older than the GC horizon, because it is still
needed by the child branch. 'main/orders_200_300' can still be
removed. So after garbage collection, these files would remain:
main/orders_100_200
main/orders_300_400
main/orders_400_500
main/customers_100_200
child/customers_150_300
If 'orders' is modified later on the 'child' branch, we will create a
snapshot file for it on the child:
main/orders_100_200
main/orders_300_400
main/orders_400_500
main/customers_100_200
child/customers_150_300
child/orders_150_400
After this, the 'main/orders_100_200' file can be removed. It is no
longer needed by the child branch, because there is a newer snapshot
file there. TODO: This optimization hasn't been implemented! The GC
algorithm will currently keep the file on the 'main' branch anyway, for
as long as the child branch exists.
# TODO: On LSN ranges
In principle, each relation can be checkpointed separately, i.e. the
LSN ranges of the files don't need to line up. So this would be legal:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_150_250
main/customers_250_500
However, the code currently always checkpoints all relations together.
So that situation doesn't arise in practice.
It would also be OK to have overlapping LSN ranges for the same relation:
main/orders_100_200
main/orders_200_300
main/orders_250_350
main/orders_300_400
The code that reads the snapshot files should cope with this, but this
situation doesn't arise either, because the checkpointing code never
does that. It could be useful, however, as a transient state when
garbage collecting around branch points, or explicit recovery
points. For example, if we start with this:
main/orders_100_200
main/orders_200_300
main/orders_300_400
And there is a branch or explicit recovery point at LSN 150, we could
replace 'main/orders_100_200' with 'main/orders_150_150' to keep a
snapshot only at that exact point that's still needed, removing the
other page versions around it. But such compaction has not been
implemented yet.


@@ -0,0 +1,534 @@
//!
//! An in-memory layer stores recently received page versions in memory. The page versions
//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
//!
use crate::layered_repository::storage_layer::Layer;
use crate::layered_repository::storage_layer::PageVersion;
use crate::layered_repository::SnapshotLayer;
use crate::relish::*;
use crate::repository::WALRecord;
use crate::walredo::WalRedoManager;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use bytes::Bytes;
use log::*;
use std::collections::BTreeMap;
use std::ops::Bound::Included;
use std::sync::{Arc, Mutex};
use zenith_utils::lsn::Lsn;
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenantid: ZTenantId,
timelineid: ZTimelineId,
rel: RelishTag,
///
/// This layer contains all the changes from 'start_lsn'. The
/// start is inclusive. There is no end LSN; we only use an in-memory
/// layer at the end of a timeline.
///
start_lsn: Lsn,
/// The above fields never change. The parts that do change are in 'inner',
/// and protected by mutex.
inner: Mutex<InMemoryLayerInner>,
}
pub struct InMemoryLayerInner {
/// If this relation was dropped, remember when that happened.
drop_lsn: Option<Lsn>,
///
/// All versions of all pages in the layer are kept here.
/// Indexed by block number and LSN.
///
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
///
/// `relsizes` tracks the size of the relation at different points in time.
///
relsizes: BTreeMap<Lsn, u32>,
}
impl Layer for InMemoryLayer {
fn is_frozen(&self) -> bool {
return false;
}
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
}
fn get_relish_tag(&self) -> RelishTag {
return self.rel;
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
}
fn get_end_lsn(&self) -> Lsn {
return Lsn(u64::MAX);
}
fn is_dropped(&self) -> bool {
let inner = self.inner.lock().unwrap();
inner.drop_lsn.is_some()
}
/// Look up given page in the cache.
fn get_page_at_lsn(
&self,
walredo_mgr: &dyn WalRedoManager,
blknum: u32,
lsn: Lsn,
) -> Result<Bytes> {
// Scan the BTreeMap backwards, starting from the given entry.
let mut records: Vec<WALRecord> = Vec::new();
let mut page_img: Option<Bytes> = None;
let mut need_base_image_lsn: Option<Lsn> = Some(lsn);
{
let inner = self.inner.lock().unwrap();
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let mut iter = inner
.page_versions
.range((Included(&minkey), Included(&maxkey)));
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
if let Some(img) = &entry.page_image {
page_img = Some(img.clone());
need_base_image_lsn = None;
break;
} else if let Some(rec) = &entry.record {
records.push(rec.clone());
if rec.will_init {
// This WAL record initializes the page, so no need to go further back
need_base_image_lsn = None;
break;
} else {
need_base_image_lsn = Some(*entry_lsn);
}
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
}
}
// release lock on 'page_versions'
}
records.reverse();
// If we needed a base image to apply the WAL records against, we should have found it in memory.
if let Some(lsn) = need_base_image_lsn {
if records.is_empty() {
// no records, and no base image. This can happen if PostgreSQL extends a relation
// but never writes the page.
//
// Would be nice to detect that situation better.
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
return Ok(ZERO_PAGE.clone());
}
bail!(
"No base image found for page {} blk {} at {}/{}",
self.rel,
blknum,
self.timelineid,
lsn
);
}
// If we have a page image, and no WAL, we're all set
if records.is_empty() {
if let Some(img) = page_img {
trace!(
"found page image for blk {} in {} at {}/{}, no WAL redo required",
blknum,
self.rel,
self.timelineid,
lsn
);
Ok(img)
} else {
// FIXME: this ought to be an error?
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
Ok(ZERO_PAGE.clone())
}
} else {
// We need to do WAL redo.
//
// If we don't have a base image, then the oldest WAL record better initialize
// the page
if page_img.is_none() && !records.first().unwrap().will_init {
// FIXME: this ought to be an error?
warn!(
"Base image for page {}/{} at {} not found, but got {} WAL records",
self.rel,
blknum,
lsn,
records.len()
);
Ok(ZERO_PAGE.clone())
} else {
if page_img.is_some() {
trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
} else {
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
}
let img = walredo_mgr.request_redo(self.rel, blknum, lsn, page_img, records)?;
self.put_page_image(blknum, lsn, img.clone())?;
Ok(img)
}
}
}
/// Get size of the relation at given LSN
fn get_relish_size(&self, lsn: Lsn) -> Result<Option<u32>> {
// Scan the BTreeMap backwards, starting from the given entry.
let inner = self.inner.lock().unwrap();
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
if let Some((_entry_lsn, entry)) = iter.next_back() {
let result = *entry;
drop(inner);
trace!("get_relish_size: {} at {} -> {}", self.rel, lsn, result);
Ok(Some(result))
} else {
Ok(None)
}
}
/// Does this relation exist at given LSN?
fn get_rel_exists(&self, lsn: Lsn) -> Result<bool> {
let inner = self.inner.lock().unwrap();
// Is the requested LSN after the rel was dropped?
if let Some(drop_lsn) = inner.drop_lsn {
if lsn >= drop_lsn {
return Ok(false);
}
}
// Otherwise, it exists
Ok(true)
}
// Write operations
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
trace!(
"put_page_version blk {} of {} at {}/{}",
blknum,
self.rel,
self.timelineid,
lsn
);
let mut inner = self.inner.lock().unwrap();
let old = inner.page_versions.insert((blknum, lsn), pv);
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!(
"Page version of rel {:?} blk {} at {} already exists",
self.rel, blknum, lsn
);
}
// Also update the relation size, if this extended the relation.
if self.rel.is_blocky() {
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
let oldsize;
if let Some((_entry_lsn, entry)) = iter.next_back() {
oldsize = *entry;
} else {
oldsize = 0;
//bail!("No old size found for {} at {}", self.tag, lsn);
}
if blknum >= oldsize {
trace!(
"enlarging relation {} from {} to {} blocks at {}",
self.rel,
oldsize,
blknum + 1,
lsn
);
inner.relsizes.insert(lsn, blknum + 1);
}
}
Ok(())
}
/// Remember that the relation was truncated at given LSN
fn put_truncation(&self, lsn: Lsn, relsize: u32) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
let old = inner.relsizes.insert(lsn, relsize);
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Inserting truncation, but had an entry for the LSN already");
}
Ok(())
}
/// Remember that the relation was dropped at given LSN
fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
assert!(inner.drop_lsn.is_none());
inner.drop_lsn = Some(lsn);
info!("dropped relation {} at {}", self.rel, lsn);
Ok(())
}
///
/// Write this in-memory layer to disk, as a snapshot layer.
///
/// The cutoff point for the layer that's written to disk is 'end_lsn'.
///
/// Returns new layers that replace this one. Always returns a
/// SnapshotLayer containing the page versions that were written to disk,
/// but if there were page versions newer than 'end_lsn', also return a new
/// in-memory layer containing those page versions. The caller replaces
/// this layer with the returned layers in the layer map.
///
fn freeze(
&self,
cutoff_lsn: Lsn,
walredo_mgr: &dyn WalRedoManager,
) -> Result<Vec<Arc<dyn Layer>>> {
info!(
"freezing in memory layer for {} on timeline {} at {}",
self.rel, self.timelineid, cutoff_lsn
);
let inner = self.inner.lock().unwrap();
// Normally, use the cutoff LSN as the end of the frozen layer.
// But if the relation was dropped, we know that there are no
// more changes coming in for it, and in particular we know that
// there are no changes "in flight" for the LSN anymore, so we use
// the drop LSN instead. The drop-LSN could be ahead of the
// caller-specified LSN!
let dropped = inner.drop_lsn.is_some();
let end_lsn =
if dropped {
inner.drop_lsn.unwrap()
} else {
cutoff_lsn
};
// Divide all the page versions into old and new at the 'end_lsn' cutoff point.
let mut before_page_versions;
let mut before_relsizes;
let mut after_page_versions;
let mut after_relsizes;
if !dropped {
before_relsizes = BTreeMap::new();
after_relsizes = BTreeMap::new();
for (lsn, size) in inner.relsizes.iter() {
if *lsn > end_lsn {
after_relsizes.insert(*lsn, *size);
} else {
before_relsizes.insert(*lsn, *size);
}
}
before_page_versions = BTreeMap::new();
after_page_versions = BTreeMap::new();
for ((blknum, lsn), pv) in inner.page_versions.iter() {
if *lsn > end_lsn {
after_page_versions.insert((*blknum, *lsn), pv.clone());
} else {
before_page_versions.insert((*blknum, *lsn), pv.clone());
}
}
} else {
before_page_versions = inner.page_versions.clone();
before_relsizes = inner.relsizes.clone();
after_relsizes = BTreeMap::new();
after_page_versions = BTreeMap::new();
}
// we can release the lock now.
drop(inner);
// Write the page versions before the cutoff to disk.
let snapfile = SnapshotLayer::create(
self.conf,
self.timelineid,
self.tenantid,
self.rel,
self.start_lsn,
end_lsn,
dropped,
before_page_versions,
before_relsizes,
)?;
let mut result: Vec<Arc<dyn Layer>> = Vec::new();
// If there were any page versions after the cutoff, initialize a new in-memory layer
// to hold them
if !after_relsizes.is_empty() || !after_page_versions.is_empty() {
info!("created new in-mem layer for {} {}-", self.rel, end_lsn);
let new_layer = Self::copy_snapshot(
self.conf,
walredo_mgr,
&snapfile,
self.timelineid,
self.tenantid,
end_lsn,
)?;
let mut new_inner = new_layer.inner.lock().unwrap();
new_inner.page_versions.append(&mut after_page_versions);
new_inner.relsizes.append(&mut after_relsizes);
drop(new_inner);
result.push(Arc::new(new_layer));
}
result.push(Arc::new(snapfile));
Ok(result)
}
fn delete(&self) -> Result<()> {
// Nothing to do. When the reference is dropped, the memory is released.
Ok(())
}
fn unload(&self) -> Result<()> {
// cannot unload in-memory layer. Freeze instead
Ok(())
}
}
impl InMemoryLayer {
///
/// Create a new, empty, in-memory layer
///
pub fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
rel: RelishTag,
start_lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
rel,
timelineid,
start_lsn
);
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
rel,
start_lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: BTreeMap::new(),
relsizes: BTreeMap::new(),
}),
})
}
///
/// Initialize a new InMemoryLayer by copying the state at the given
/// point in time from the given existing layer.
///
pub fn copy_snapshot(
conf: &'static PageServerConf,
walredo_mgr: &dyn WalRedoManager,
src: &dyn Layer,
timelineid: ZTimelineId,
tenantid: ZTenantId,
lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
src.get_relish_tag(),
timelineid,
lsn
);
let mut page_versions = BTreeMap::new();
let mut relsizes = BTreeMap::new();
let size;
if src.get_relish_tag().is_blocky() {
if let Some(sz) = src.get_relish_size(lsn)? {
relsizes.insert(lsn, sz);
size = sz;
} else {
bail!("no size found or {} at {}", src.get_relish_tag(), lsn);
}
} else {
size = 1;
}
for blknum in 0..size {
let img = src.get_page_at_lsn(walredo_mgr, blknum, lsn)?;
let pv = PageVersion {
page_image: Some(img),
record: None,
};
page_versions.insert((blknum, lsn), pv);
}
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
rel: src.get_relish_tag(),
start_lsn: lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: page_versions,
relsizes: relsizes,
}),
})
}
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- inmemory layer for {} {}-> ----\n",
self.rel, self.start_lsn
);
let inner = self.inner.lock().unwrap();
for (k, v) in inner.relsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
for (k, v) in inner.page_versions.iter() {
result += &format!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
);
}
result
}
}


@@ -0,0 +1,132 @@
//!
//! The layer map tracks what layers exist for all the relations in a timeline.
//!
//! When the timeline is first accessed, the server lists all snapshot files
//! in the timelines/<timelineid> directory, and populates this map with
//! SnapshotLayers corresponding to each file. When new WAL is received,
//! we create InMemoryLayers to hold the incoming records. Now and then,
//! in the checkpoint() function, the in-memory layers are frozen into
//! new snapshot layers, and the corresponding files are written to disk.
//!
use crate::layered_repository::storage_layer::Layer;
use crate::relish::*;
use anyhow::Result;
use log::*;
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::ops::Bound::Included;
use std::sync::Arc;
use zenith_utils::lsn::Lsn;
/// LayerMap is a BTreeMap keyed by RelishTag and the layer's start LSN.
/// It provides a couple of convenience functions over a plain BTreeMap
pub struct LayerMap {
pub inner: BTreeMap<(RelishTag, Lsn), Arc<dyn Layer>>,
}
impl LayerMap {
///
/// Look up using the given rel tag and LSN. This differs from a plain
/// key-value lookup in that if there is any layer that covers the
/// given LSN, or precedes the given LSN, it is returned. In other words,
/// you don't need to know the exact start LSN of the layer.
///
pub fn get(&self, tag: RelishTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
let startkey = (tag, Lsn(0));
let endkey = (tag, lsn);
if let Some((_k, v)) = self
.inner
.range((Included(startkey), Included(endkey)))
.next_back()
{
Some(Arc::clone(v))
} else {
None
}
}
pub fn insert(&mut self, layer: Arc<dyn Layer>) {
let rel = layer.get_relish_tag();
let start_lsn = layer.get_start_lsn();
self.inner.insert((rel, start_lsn), Arc::clone(&layer));
}
pub fn remove(&mut self, layer: &dyn Layer) {
let rel = layer.get_relish_tag();
let start_lsn = layer.get_start_lsn();
self.inner.remove(&(rel, start_lsn));
}
pub fn list_rels(&self, spcnode: u32, dbnode: u32) -> Result<HashSet<RelTag>> {
let mut rels: HashSet<RelTag> = HashSet::new();
// Scan the timeline directory to get all rels in this timeline.
for ((rel, _lsn), _l) in self.inner.iter() {
if let RelishTag::Relation(reltag) = rel {
// FIXME: skip if it was dropped before the requested LSN. But there is no
// LSN argument
if (spcnode == 0 || reltag.spcnode == spcnode)
&& (dbnode == 0 || reltag.dbnode == dbnode)
{
rels.insert(*reltag);
}
}
}
Ok(rels)
}
pub fn list_nonrels(&self, _lsn: Lsn) -> Result<HashSet<RelishTag>> {
let mut rels: HashSet<RelishTag> = HashSet::new();
// Scan the timeline directory to get all rels in this timeline.
for ((rel, _lsn), _l) in self.inner.iter() {
// FIXME: skip if it was dropped before the requested LSN.
if let RelishTag::Relation(_) = rel {
} else {
rels.insert(*rel);
}
}
Ok(rels)
}
/// Is there a newer layer for given relation?
pub fn newer_layer_exists(&self, rel: RelishTag, lsn: Lsn) -> bool {
let startkey = (rel, lsn);
let endkey = (rel, Lsn(u64::MAX));
for ((_rel, newer_lsn), layer) in self.inner.range((Included(startkey), Included(endkey))) {
if layer.get_end_lsn() > lsn {
trace!(
"found later layer for rel {}, {} {}-{}",
rel,
lsn,
newer_lsn,
layer.get_end_lsn()
);
return true;
} else {
trace!(
"found singleton layer for rel {}, {} {}",
rel, lsn, newer_lsn
);
continue;
}
}
trace!("no later layer found for rel {}, {}", rel, lsn);
false
}
}
impl Default for LayerMap {
fn default() -> Self {
LayerMap {
inner: BTreeMap::new(),
}
}
}


@@ -0,0 +1,631 @@
//!
//! A SnapshotLayer represents one snapshot file on disk. One file holds all page
//! version and size information of one relation, in a range of LSN.
//! The name "snapshot file" is a bit of a misnomer because a snapshot file doesn't
//! contain a snapshot at a specific LSN, but rather all the page versions in a range
//! of LSNs.
//!
//! Currently, a snapshot file contains full information needed to reconstruct any
//! page version in the LSN range, without consulting any other snapshot files. When
//! a new snapshot file is created for writing, the full contents of the relation are
//! materialized as they are at the beginning of the LSN range. That can be very expensive,
//! so we should find a way to store differential files. But this keeps the read side
//! of things simple. You can find the correct snapshot file based on RelishTag and
//! timeline+LSN, and once you've located it, you have all the data you need in that
//! file.
//!
//! When a snapshot file needs to be accessed, we slurp the whole file into memory, into
//! the SnapshotLayer struct. See load() and unload() functions.
//!
//! On disk, the snapshot files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each snapshot file is named like this:
//!
//! rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>
//!
//! For example:
//!
//! rel_1663_13990_2609_0_000000000169C348_000000000169C349
//!
//! If a relation is dropped, we add a '_DROPPED' to the end of the filename to indicate that.
//! So the above example would become:
//!
//! rel_1663_13990_2609_0_000000000169C348_000000000169C349_DROPPED
//!
//! In that case, the end LSN indicates when the relation was dropped; we don't store
//! that in the file contents in any way.
//!
//! A snapshot file is constructed using the 'bookfile' crate. Each file consists of two
//! parts: the page versions and the relation sizes. They are stored as separate chapters.
//!
use crate::layered_repository::storage_layer::Layer;
use crate::layered_repository::storage_layer::PageVersion;
use crate::layered_repository::storage_layer::ZERO_PAGE;
use crate::relish::*;
use crate::repository::WALRecord;
use crate::walredo::WalRedoManager;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use bytes::Bytes;
use log::*;
use std::collections::BTreeMap;
use std::fmt;
use std::fs;
use std::fs::File;
use std::io::Write;
use std::ops::Bound::Included;
use std::path::PathBuf;
use std::sync::{Arc, Mutex, MutexGuard};
use bookfile::{Book, BookWriter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
// Magic constant to identify a Zenith snapshot file
static SNAPSHOT_FILE_MAGIC: u32 = 0x5A616E01;
static PAGE_VERSIONS_CHAPTER: u64 = 1;
static REL_SIZES_CHAPTER: u64 = 2;
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
struct SnapshotFileName {
rel: RelishTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
}
impl SnapshotFileName {
fn from_str(fname: &str) -> Option<Self> {
// Split the filename into parts
//
// <spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>
//
// or if it was dropped:
//
// <spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>_DROPPED
//
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
parts = rest.split('_');
rel = RelishTag::Relation(RelTag {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
relnode: parts.next()?.parse::<u32>().ok()?,
forknum: parts.next()?.parse::<u8>().ok()?,
});
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::Clog,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
parts = rest.split('_');
rel = RelishTag::FileNodeMap {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
parts = rest.split('_');
rel = RelishTag::TwoPhase {
xid: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
parts = rest.split('_');
rel = RelishTag::Checkpoint;
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
parts = rest.split('_');
rel = RelishTag::ControlFile;
} else {
return None;
}
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
let mut dropped = false;
if let Some(suffix) = parts.next() {
if suffix == "DROPPED" {
dropped = true;
} else {
warn!("unrecognized filename in timeline dir: {}", fname);
return None;
}
}
if parts.next().is_some() {
warn!("unrecognized filename in timeline dir: {}", fname);
return None;
}
Some(SnapshotFileName {
rel,
start_lsn,
end_lsn,
dropped,
})
}
fn to_string(&self) -> String {
let basename = match self.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
),
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
} => format!("pg_xact_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno,
} => format!("pg_multixact_members_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno,
} => format!("pg_multixact_offsets_{:04X}", segno),
RelishTag::FileNodeMap { spcnode, dbnode } => {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
RelishTag::ControlFile => format!("pg_control"),
};
format!(
"{}_{:016X}_{:016X}{}",
basename,
u64::from(self.start_lsn),
u64::from(self.end_lsn),
if self.dropped { "_DROPPED" } else { "" }
)
}
}
impl fmt::Display for SnapshotFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.to_string())
}
}
///
/// SnapshotLayer is the in-memory data structure associated with an
/// on-disk snapshot file. We keep a SnapshotLayer in memory for each
/// file, in the LayerMap. If a layer is in "loaded" state, we have a
/// copy of the file in memory, in 'inner'. Otherwise the struct is
/// just a placeholder for a file that exists on disk, and it needs to
/// be loaded before using it in queries.
///
pub struct SnapshotLayer {
conf: &'static PageServerConf,
pub tenantid: ZTenantId,
pub timelineid: ZTimelineId,
pub rel: RelishTag,
//
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
// start is inclusive, and end is exclusive.
pub start_lsn: Lsn,
pub end_lsn: Lsn,
dropped: bool,
inner: Mutex<SnapshotLayerInner>,
}
pub struct SnapshotLayerInner {
/// If false, the 'page_versions' and 'relsizes' have not been
/// loaded into memory yet.
loaded: bool,
/// All versions of all pages in the file are kept here.
/// Indexed by block number and LSN.
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
/// `relsizes` tracks the size of the relation at different points in time.
relsizes: BTreeMap<Lsn, u32>,
}
impl Layer for SnapshotLayer {
fn is_frozen(&self) -> bool {
return true;
}
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
}
fn get_relish_tag(&self) -> RelishTag {
return self.rel;
}
fn is_dropped(&self) -> bool {
return self.dropped;
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
}
fn get_end_lsn(&self) -> Lsn {
return self.end_lsn;
}
/// Look up given page in the cache.
fn get_page_at_lsn(
&self,
walredo_mgr: &dyn WalRedoManager,
blknum: u32,
lsn: Lsn,
) -> Result<Bytes> {
// Scan the BTreeMap backwards, starting from the given entry.
let mut records: Vec<WALRecord> = Vec::new();
let mut page_img: Option<Bytes> = None;
let mut need_base_image_lsn: Option<Lsn> = Some(lsn);
{
let inner = self.load()?;
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let mut iter = inner
.page_versions
.range((Included(&minkey), Included(&maxkey)));
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
if let Some(img) = &entry.page_image {
page_img = Some(img.clone());
need_base_image_lsn = None;
break;
} else if let Some(rec) = &entry.record {
records.push(rec.clone());
if rec.will_init {
// This WAL record initializes the page, so no need to go further back
need_base_image_lsn = None;
break;
} else {
need_base_image_lsn = Some(*entry_lsn);
}
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
}
}
// release lock on 'inner'
}
records.reverse();
// If we needed a base image to apply the WAL records against, we should have found it in memory.
if let Some(lsn) = need_base_image_lsn {
if records.is_empty() {
// no records, and no base image. This can happen if PostgreSQL extends a relation
// but never writes the page.
//
// Would be nice to detect that situation better.
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
return Ok(ZERO_PAGE.clone());
}
bail!(
"No base image found for page {} blk {} at {}/{}",
self.rel,
blknum,
self.timelineid,
lsn
);
}
// If we have a page image, and no WAL, we're all set
if records.is_empty() {
if let Some(img) = page_img {
trace!(
"found page image for blk {} in {} at {}/{}, no WAL redo required",
blknum,
self.rel,
self.timelineid,
lsn
);
Ok(img)
} else {
// FIXME: this ought to be an error?
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
Ok(ZERO_PAGE.clone())
}
} else {
// We need to do WAL redo.
//
// If we don't have a base image, then the oldest WAL record better initialize
// the page
if page_img.is_none() && !records.first().unwrap().will_init {
// FIXME: this ought to be an error?
warn!(
"Base image for page {} blk {} at {} not found, but got {} WAL records",
self.rel,
blknum,
lsn,
records.len()
);
Ok(ZERO_PAGE.clone())
} else {
if page_img.is_some() {
trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
} else {
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
}
let img = walredo_mgr.request_redo(self.rel, blknum, lsn, page_img, records)?;
// FIXME: Should we memoize the page image in memory, so that
// we wouldn't need to reconstruct it again, if it's requested again?
//self.put_page_image(blknum, lsn, img.clone())?;
Ok(img)
}
}
}
/// Get size of the relation at given LSN
fn get_relish_size(&self, lsn: Lsn) -> Result<Option<u32>> {
// Scan the BTreeMap backwards, starting from the given entry.
let inner = self.load()?;
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
if let Some((_entry_lsn, entry)) = iter.next_back() {
let result = *entry;
drop(inner);
trace!("get_relsize: {} at {} -> {}", self.rel, lsn, result);
Ok(Some(result))
} else {
Ok(None)
}
}
/// Does this relation exist at given LSN?
fn get_rel_exists(&self, lsn: Lsn) -> Result<bool> {
// Is the requested LSN after the rel was dropped?
if self.dropped && lsn >= self.end_lsn {
return Ok(false);
}
// Otherwise, it exists.
Ok(true)
}
// Unsupported write operations
fn put_page_version(&self, blknum: u32, lsn: Lsn, _pv: PageVersion) -> Result<()> {
panic!(
"cannot modify historical snapshot layer, rel {} blk {} at {}/{}, {}-{}",
self.rel, blknum, self.timelineid, lsn, self.start_lsn, self.end_lsn
);
}
fn put_truncation(&self, _lsn: Lsn, _relsize: u32) -> anyhow::Result<()> {
bail!("cannot modify historical snapshot layer");
}
fn put_unlink(&self, _lsn: Lsn) -> anyhow::Result<()> {
bail!("cannot modify historical snapshot layer");
}
fn freeze(
&self,
_end_lsn: Lsn,
_walredo_mgr: &dyn WalRedoManager,
) -> Result<Vec<Arc<dyn Layer>>> {
bail!("cannot freeze historical snapshot layer");
}
fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
Ok(())
}
///
/// Release most of the memory used by this layer. If it's accessed again later,
/// it will need to be loaded back.
///
fn unload(&self) -> Result<()> {
let mut inner = self.inner.lock().unwrap();
inner.page_versions = BTreeMap::new();
inner.relsizes = BTreeMap::new();
inner.loaded = false;
Ok(())
}
}
impl SnapshotLayer {
fn path(&self) -> PathBuf {
Self::path_for(
self.conf,
self.timelineid,
self.tenantid,
&SnapshotFileName {
rel: self.rel,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
)
}
fn path_for(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
fname: &SnapshotFileName,
) -> PathBuf {
conf.timeline_path(&timelineid, &tenantid)
.join(fname.to_string())
}
/// Create a new snapshot file, using the given btreemaps containing the page versions and
/// relsizes.
///
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
/// expedient.
pub fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
rel: RelishTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
relsizes: BTreeMap<Lsn, u32>,
) -> Result<SnapshotLayer> {
let snapfile = SnapshotLayer {
conf: conf,
timelineid: timelineid,
tenantid: tenantid,
rel: rel,
start_lsn: start_lsn,
end_lsn,
dropped,
inner: Mutex::new(SnapshotLayerInner {
loaded: true,
page_versions: page_versions,
relsizes: relsizes,
}),
};
let inner = snapfile.inner.lock().unwrap();
// Write the in-memory btreemaps into a file
let path = snapfile.path();
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let file = File::create(&path)?;
let book = BookWriter::new(file, SNAPSHOT_FILE_MAGIC)?;
// Write out page versions
let mut chapter = book.new_chapter(PAGE_VERSIONS_CHAPTER);
let buf = BTreeMap::ser(&inner.page_versions)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
// and relsizes to separate chapter
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
let buf = BTreeMap::ser(&inner.relsizes)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
book.close()?;
trace!("saved {}", &path.display());
drop(inner);
Ok(snapfile)
}
///
/// Load the contents of the file into memory
///
fn load(&self) -> Result<MutexGuard<SnapshotLayerInner>> {
// quick exit if already loaded
let mut inner = self.inner.lock().unwrap();
if inner.loaded {
return Ok(inner);
}
let path = Self::path_for(
self.conf,
self.timelineid,
self.tenantid,
&SnapshotFileName {
rel: self.rel,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
);
let file = File::open(&path)?;
let book = Book::new(file)?;
let chapter = book.read_chapter(PAGE_VERSIONS_CHAPTER)?;
let page_versions = BTreeMap::des(&chapter)?;
let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
let relsizes = BTreeMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
*inner = SnapshotLayerInner {
loaded: true,
page_versions,
relsizes,
};
Ok(inner)
}
/// Create SnapshotLayers representing all files on disk
///
// TODO: returning an Iterator would be more idiomatic
pub fn list_snapshot_files(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> Result<Vec<Arc<dyn Layer>>> {
let path = conf.timeline_path(&timelineid, &tenantid);
let mut snapfiles: Vec<Arc<dyn Layer>> = Vec::new();
for direntry in fs::read_dir(path)? {
let fname = direntry?.file_name();
let fname = fname.to_str().unwrap();
if let Some(snapfilename) = SnapshotFileName::from_str(fname) {
let snapfile = SnapshotLayer {
conf,
timelineid,
tenantid,
rel: snapfilename.rel,
start_lsn: snapfilename.start_lsn,
end_lsn: snapfilename.end_lsn,
dropped: snapfilename.dropped,
inner: Mutex::new(SnapshotLayerInner {
loaded: false,
page_versions: BTreeMap::new(),
relsizes: BTreeMap::new(),
}),
};
snapfiles.push(Arc::new(snapfile));
}
}
return Ok(snapfiles);
}
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- snapshot layer for {} {}-{} ----\n",
self.rel, self.start_lsn, self.end_lsn
);
let inner = self.inner.lock().unwrap();
for (k, v) in inner.relsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
//for (k, v) in inner.page_versions.iter() {
// result += &format!("blk {} at {}: {}/{}\n", k.0, k.1, v.page_image.is_some(), v.record.is_some());
//}
result
}
}


@@ -0,0 +1,123 @@
use crate::relish::RelishTag;
use crate::repository::WALRecord;
use crate::walredo::WalRedoManager;
use crate::ZTimelineId;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use zenith_utils::lsn::Lsn;
pub static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
///
/// Represents a version of a page at a specific LSN. The LSN is the key of the
/// entry in the 'page_versions' map; it is not duplicated here.
///
/// A page version can be stored as a full page image, or as WAL record that needs
/// to be applied over the previous page version to reconstruct this version.
///
/// It's also possible to have both a WAL record and a page image in the same
/// PageVersion. That happens if a page version is originally stored as a WAL record,
/// but is later reconstructed by a GetPage@LSN request by performing WAL
/// redo. The get_page_at_lsn() code will store the reconstructed page image next to
/// the WAL record in that case. TODO: That's pretty accidental, not the result
/// of any grand design. If we want to keep reconstructed page versions around, we
/// probably should have a separate buffer cache so that we could control the
/// replacement policy globally. Or if we keep a reconstructed page image, we
/// could throw away the WAL record.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageVersion {
/// an 8kb page image
pub page_image: Option<Bytes>,
/// WAL record to get from previous page version to this one.
pub record: Option<WALRecord>,
}
///
/// A Layer holds all page versions for one relish, in a range of LSNs.
/// There are two kinds of layers, in-memory and snapshot layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access
/// to the recent page versions. Snapshot layers are stored on disk, and
/// are immutable.
///
/// Each layer contains a full snapshot of the relish at the start
/// LSN. In addition to that, it contains WAL (or more page images)
/// needed to reconstruct any page version up to the end LSN.
///
pub trait Layer: Send + Sync {
// These functions identify the relish and the LSN range that this Layer
// holds.
fn get_timeline_id(&self) -> ZTimelineId;
fn get_relish_tag(&self) -> RelishTag;
fn get_start_lsn(&self) -> Lsn;
fn get_end_lsn(&self) -> Lsn;
fn is_dropped(&self) -> bool;
/// Frozen layers are stored on disk and cannot accept new WAL
/// records, whereas an unfrozen layer can still be modified, but is not
/// durable in case of a crash. Snapshot layers are always frozen, and
/// in-memory layers are always unfrozen.
fn is_frozen(&self) -> bool;
// Functions that correspond to the Timeline trait functions.
fn get_page_at_lsn(
&self,
walredo_mgr: &dyn WalRedoManager,
blknum: u32,
lsn: Lsn,
) -> Result<Bytes>;
fn get_relish_size(&self, lsn: Lsn) -> Result<Option<u32>>;
fn get_rel_exists(&self, lsn: Lsn) -> Result<bool>;
fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()>;
fn put_truncation(&self, lsn: Lsn, relsize: u32) -> anyhow::Result<()>;
fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()>;
/// Remember new page version, as a WAL record over previous version
fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> Result<()> {
self.put_page_version(
blknum,
rec.lsn,
PageVersion {
page_image: None,
record: Some(rec),
},
)
}
/// Remember new page version, as a full page image
fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> {
self.put_page_version(
blknum,
lsn,
PageVersion {
page_image: Some(img),
record: None,
},
)
}
///
/// Split off an immutable layer from an existing layer.
///
/// Returns new layers that replace this one.
///
fn freeze(&self, end_lsn: Lsn, walredo_mgr: &dyn WalRedoManager)
-> Result<Vec<Arc<dyn Layer>>>;
/// Permanently delete this layer
fn delete(&self) -> Result<()>;
/// Try to release memory used by this layer. This is currently
/// only used by snapshot layers, to free the copy of the file
/// from memory. (TODO: a smarter, more granular caching scheme
/// would be nice)
fn unload(&self) -> Result<()>;
}
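To illustrate how the default put_wal_record/put_page_image helpers compose with put_page_version and the read path, here is a rough usage sketch against an unfrozen layer. Assumptions not in this commit: 'layer' is any in-memory Layer, 'redo_mgr' any WalRedoManager, and the record's LSN falls inside the layer's range.

// Sketch: write a base image, stack a WAL record on it, read it back.
fn example(layer: &dyn Layer, redo_mgr: &dyn WalRedoManager,
           img: Bytes, rec: WALRecord) -> Result<Bytes> {
    layer.put_page_image(0, Lsn(0x10), img)?; // full image of block 0
    layer.put_wal_record(0, rec)?;            // delta on top (uses rec.lsn)
    // get_page_at_lsn() replays the record over the image via WAL redo.
    layer.get_page_at_lsn(redo_mgr, 0, Lsn(0x20))
}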

View File

@@ -9,6 +9,7 @@ use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
pub mod basebackup;
pub mod branches;
pub mod layered_repository;
pub mod logger;
pub mod object_key;
pub mod object_repository;
@@ -54,6 +55,14 @@ pub struct PageServerConf {
pub auth_type: AuthType,
pub auth_validation_public_key_path: Option<PathBuf>,
pub repository_format: RepositoryFormat,
}
#[derive(Debug, Clone, PartialEq)]
pub enum RepositoryFormat {
Layered,
RocksDb,
}
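Somewhere the 'layered'/'rocksdb' strings from "zenith init --repository-format=<format>" must map onto this enum. A minimal sketch of such a parser; hypothetical, the actual mapping may live elsewhere in the commit:

impl std::str::FromStr for RepositoryFormat {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        match s {
            "layered" => Ok(RepositoryFormat::Layered),
            "rocksdb" => Ok(RepositoryFormat::RocksDb),
            other => anyhow::bail!("unknown repository format '{}'", other),
        }
    }
}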
impl PageServerConf {

View File

@@ -2,11 +2,12 @@
//! page server.
use crate::branches;
use crate::layered_repository::LayeredRepository;
use crate::object_repository::ObjectRepository;
use crate::repository::Repository;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use crate::{PageServerConf, RepositoryFormat};
use anyhow::{anyhow, bail, Result};
use lazy_static::lazy_static;
use log::info;
@@ -27,16 +28,35 @@ pub fn init(conf: &'static PageServerConf) {
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
let tenantid =
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
// Set up an object repository, for actual data storage.
let repo =
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
let repo: Arc<dyn Repository + Sync + Send> = match conf.repository_format {
RepositoryFormat::Layered => {
let repo = Arc::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
));
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
repo
}
RepositoryFormat::RocksDb => {
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
Arc::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
Arc::new(walredo_mgr),
tenantid,
))
}
};
info!("initialized storage for tenant: {}", &tenantid);
m.insert(tenantid, Arc::new(repo));
m.insert(tenantid, repo);
}
}
@@ -53,7 +73,7 @@ pub fn create_repository_for_tenant(
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
m.insert(tenantid, Arc::new(repo));
m.insert(tenantid, repo);
Ok(())
}

View File

@@ -693,6 +693,18 @@ impl postgres_backend::Handler for PageServerHandler {
RowDescriptor::int8_col(b"control_deleted"),
RowDescriptor::int8_col(b"filenodemap_deleted"),
RowDescriptor::int8_col(b"dropped"),
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
@@ -705,6 +717,43 @@ impl postgres_backend::Handler for PageServerHandler {
Some(&result.control_deleted.to_string().as_bytes()),
Some(&result.filenodemap_deleted.to_string().as_bytes()),
Some(&result.dropped.to_string().as_bytes()),
Some(&result.snapshot_relfiles_total.to_string().as_bytes()),
Some(
&result
.snapshot_relfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
.snapshot_relfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(&result.snapshot_relfiles_not_updated.to_string().as_bytes()),
Some(&result.snapshot_relfiles_removed.to_string().as_bytes()),
Some(&result.snapshot_relfiles_dropped.to_string().as_bytes()),
Some(&result.snapshot_nonrelfiles_total.to_string().as_bytes()),
Some(
&result
.snapshot_nonrelfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
.snapshot_nonrelfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(
&result
.snapshot_nonrelfiles_not_updated
.to_string()
.as_bytes(),
),
Some(&result.snapshot_nonrelfiles_removed.to_string().as_bytes()),
Some(&result.snapshot_nonrelfiles_dropped.to_string().as_bytes()),
Some(&result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;

View File

@@ -120,7 +120,16 @@ impl RelishTag {
// and these don't
| RelishTag::ControlFile
| RelishTag::Checkpoint => false,
}
}
/// Convenience function to check if this relish is a normal relation.
pub const fn is_relation(&self) -> bool {
if let RelishTag::Relation(_) = self {
true
} else {
false
}
}
}

View File

@@ -5,6 +5,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::iter::Iterator;
use std::ops::AddAssign;
use std::sync::Arc;
use std::time::Duration;
use zenith_utils::lsn::Lsn;
@@ -56,6 +57,8 @@ pub trait Repository: Send + Sync {
///
#[derive(Default)]
pub struct GcResult {
// FIXME: These counters make sense for the ObjectRepository. They are not used
// by the LayeredRepository.
pub n_relations: u64,
pub inspected: u64,
pub truncated: u64,
@@ -66,9 +69,51 @@ pub struct GcResult {
pub control_deleted: u64, // RelishTag::ControlFile
pub filenodemap_deleted: u64, // RelishTag::FileNodeMap
pub dropped: u64,
// These are used for the LayeredRepository instead
pub snapshot_relfiles_total: u64,
pub snapshot_relfiles_needed_by_cutoff: u64,
pub snapshot_relfiles_needed_by_branches: u64,
pub snapshot_relfiles_not_updated: u64,
pub snapshot_relfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
pub snapshot_relfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
pub snapshot_nonrelfiles_total: u64,
pub snapshot_nonrelfiles_needed_by_cutoff: u64,
pub snapshot_nonrelfiles_needed_by_branches: u64,
pub snapshot_nonrelfiles_not_updated: u64,
pub snapshot_nonrelfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
pub snapshot_nonrelfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
pub elapsed: Duration,
}
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.n_relations += other.n_relations;
self.truncated += other.truncated;
self.deleted += other.deleted;
self.dropped += other.dropped;
self.snapshot_relfiles_total += other.snapshot_relfiles_total;
self.snapshot_relfiles_needed_by_cutoff += other.snapshot_relfiles_needed_by_cutoff;
self.snapshot_relfiles_needed_by_branches += other.snapshot_relfiles_needed_by_branches;
self.snapshot_relfiles_not_updated += other.snapshot_relfiles_not_updated;
self.snapshot_relfiles_removed += other.snapshot_relfiles_removed;
self.snapshot_relfiles_dropped += other.snapshot_relfiles_dropped;
self.snapshot_nonrelfiles_total += other.snapshot_nonrelfiles_total;
self.snapshot_nonrelfiles_needed_by_cutoff += other.snapshot_nonrelfiles_needed_by_cutoff;
self.snapshot_nonrelfiles_needed_by_branches +=
other.snapshot_nonrelfiles_needed_by_branches;
self.snapshot_nonrelfiles_not_updated += other.snapshot_nonrelfiles_not_updated;
self.snapshot_nonrelfiles_removed += other.snapshot_nonrelfiles_removed;
self.snapshot_nonrelfiles_dropped += other.snapshot_nonrelfiles_dropped;
self.elapsed += other.elapsed;
}
}
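The AddAssign impl exists so per-timeline results can be folded into one tenant-wide total, roughly like this sketch (per_timeline_results is an assumed iterable of GcResult, one per timeline; it is not a name from this commit):

// Sketch: accumulate GC statistics across timelines.
let mut total = GcResult::default();
for result in per_timeline_results {
    total += result; // uses the AddAssign impl above
}
println!("removed {} snapshot relfiles in {} ms",
    total.snapshot_relfiles_removed, total.elapsed.as_millis());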
pub trait Timeline: Send + Sync {
//------------------------------------------------------------------------------
// Public GET functions
@@ -234,11 +279,12 @@ impl WALRecord {
#[cfg(test)]
mod tests {
use super::*;
use crate::layered_repository::LayeredRepository;
use crate::object_repository::ObjectRepository;
use crate::object_repository::{ObjectValue, PageEntry, RelationSizeEntry};
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use crate::{PageServerConf, RepositoryFormat};
use postgres_ffi::pg_constants;
use std::fs;
use std::path::PathBuf;
@@ -272,10 +318,16 @@ mod tests {
buf.freeze()
}
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
fn get_test_repo(
test_name: &str,
repository_format: RepositoryFormat,
) -> Result<Box<dyn Repository>> {
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir).unwrap();
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;
let conf = PageServerConf {
daemonize: false,
@@ -288,6 +340,7 @@ mod tests {
pg_distrib_dir: "".into(),
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
repository_format,
};
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
@@ -295,24 +348,47 @@ mod tests {
let tenantid = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
let walredo_mgr = TestRedoManager {};
let repo =
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
let repo: Box<dyn Repository + Sync + Send> = match conf.repository_format {
RepositoryFormat::Layered => Box::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
)),
RepositoryFormat::RocksDb => {
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
Ok(Box::new(repo))
Box::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
Arc::new(walredo_mgr),
tenantid,
))
}
};
Ok(repo)
}
/// Test get_relsize() and truncation.
#[test]
fn test_relsize() -> Result<()> {
fn test_relsize_rocksdb() -> Result<()> {
let repo = get_test_repo("test_relsize_rocksdb", RepositoryFormat::RocksDb)?;
test_relsize(&*repo)
}
#[test]
fn test_relsize_layered() -> Result<()> {
let repo = get_test_repo("test_relsize_layered", RepositoryFormat::Layered)?;
test_relsize(&*repo)
}
fn test_relsize(repo: &dyn Repository) -> Result<()> {
// get_timeline() with non-existent timeline id should fail
//repo.get_timeline("11223344556677881122334455667788");
// Create timeline to work on
let repo = get_test_repo("test_relsize")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
@@ -397,14 +473,24 @@ mod tests {
/// This isn't very interesting with the RocksDb implementation, as we don't pay
/// any attention to Postgres segment boundaries there.
#[test]
fn test_large_rel() -> Result<()> {
let repo = get_test_repo("test_large_rel")?;
fn test_large_rel_rocksdb() -> Result<()> {
let repo = get_test_repo("test_large_rel_rocksdb", RepositoryFormat::RocksDb)?;
test_large_rel(&*repo)
}
#[test]
fn test_large_rel_layered() -> Result<()> {
let repo = get_test_repo("test_large_rel_layered", RepositoryFormat::Layered)?;
test_large_rel(&*repo)
}
fn test_large_rel(repo: &dyn Repository) -> Result<()> {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
tline.init_valid_lsn(Lsn(1));
let mut lsn = 0;
let mut lsn = 1;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
lsn += 1;
@@ -450,15 +536,29 @@ mod tests {
}))
}
#[test]
fn test_branch_rocksdb() -> Result<()> {
let repo = get_test_repo("test_branch_rocksdb", RepositoryFormat::RocksDb)?;
test_branch(&*repo)
}
#[test]
fn test_branch_layered() -> Result<()> {
let repo = get_test_repo("test_branch_layered", RepositoryFormat::Layered)?;
test_branch(&*repo)
}
///
/// Test branch creation
///
#[test]
fn test_branch() -> Result<()> {
let repo = get_test_repo("test_branch")?;
fn test_branch(repo: &dyn Repository) -> Result<()> {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(1), ZERO_PAGE.clone(), false)?;
// Create a relation on the timeline
tline.init_valid_lsn(Lsn(1));
tline.put_page_image(TESTREL_A, 0, Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
@@ -500,8 +600,19 @@ mod tests {
}
#[test]
fn test_history() -> Result<()> {
let repo = get_test_repo("test_snapshot")?;
fn test_history_rocksdb() -> Result<()> {
let repo = get_test_repo("test_history_rocksdb", RepositoryFormat::RocksDb)?;
test_history(&*repo)
}
#[test]
// TODO: This doesn't work with the layered storage; the functions needed for push/pull
// functionality haven't been implemented yet.
#[ignore]
fn test_history_layered() -> Result<()> {
let repo = get_test_repo("test_history_layered", RepositoryFormat::Layered)?;
test_history(&*repo)
}
fn test_history(repo: &dyn Repository) -> Result<()> {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;

View File

@@ -132,6 +132,7 @@ pub fn import_timeline_from_postgres_datadir(
}
// TODO: Scan pg_tblspc
timeline.advance_last_valid_lsn(lsn);
timeline.checkpoint()?;
Ok(())
@@ -425,12 +426,12 @@ pub fn save_decoded_record(
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
info!(
"unlink twophaseFile for xid {} parsed_xact.xid {} here",
decoded.xl_xid, parsed_xact.xid
"unlink twophaseFile for xid {} parsed_xact.xid {} here at {}",
decoded.xl_xid, parsed_xact.xid, lsn
);
timeline.put_unlink(
RelishTag::TwoPhase {
xid: decoded.xl_xid,
xid: parsed_xact.xid,
},
lsn,
)?;
@@ -795,7 +796,13 @@ fn save_clog_truncate_record(
// Iterate over SLRU CLOG segments and unlink segments that we're ready to truncate
// TODO This implementation is very inefficient -
// it scans all non-rels only to find Clog
for obj in timeline.list_nonrels(lsn)? {
//
// We cannot pass 'lsn' to the Timeline.list_nonrels(), or it
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
for obj in timeline.list_nonrels(req_lsn)? {
match obj {
RelishTag::Slru { slru, segno } => {
if slru == SlruKind::Clog {

View File

@@ -8,7 +8,7 @@ use crate::page_cache;
use crate::relish::*;
use crate::restore_local_repo;
use crate::waldecoder::*;
use crate::PageServerConf;
use crate::{PageServerConf, RepositoryFormat};
use anyhow::{Error, Result};
use lazy_static::lazy_static;
use log::*;
@@ -264,7 +264,11 @@ fn walreceiver_main(
)?;
if newest_segno - oldest_segno >= 10 {
timeline.checkpoint()?;
// FIXME: The layered repository performs checkpointing in a separate thread, so this
// isn't needed anymore. Remove 'checkpoint' from the Timeline trait altogether?
if conf.repository_format == RepositoryFormat::RocksDb {
timeline.checkpoint()?;
}
// TODO: This is where we could remove WAL older than last_rec_lsn.
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;

View File

@@ -14,7 +14,8 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
@pytest.mark.skip(reason="""
Current GC test is flaky and overly strict. Since we are migrating to the layered repo format
with different GC implementation let's just silence this test for now.
with a different GC implementation, let's just silence this test for now. This test only
works with the RocksDB implementation.
""")
def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
zenith_cli.run(["branch", "test_gc", "empty"])

View File

@@ -0,0 +1,122 @@
from contextlib import closing
import psycopg2.extras
import time
pytest_plugins = ("fixtures.zenith_fixtures")
def print_gc_result(row):
print("GC duration {elapsed} ms".format_map(row));
print(" REL total: {snapshot_relfiles_total}, needed_by_cutoff {snapshot_relfiles_needed_by_cutoff}, needed_by_branches: {snapshot_relfiles_needed_by_branches}, not_updated: {snapshot_relfiles_not_updated}, removed: {snapshot_relfiles_removed}, dropped: {snapshot_relfiles_dropped}".format_map(row))
print(" NONREL total: {snapshot_nonrelfiles_total}, needed_by_cutoff {snapshot_nonrelfiles_needed_by_cutoff}, needed_by_branches: {snapshot_nonrelfiles_needed_by_branches}, not_updated: {snapshot_nonrelfiles_not_updated}, removed: {snapshot_nonrelfiles_removed}, dropped: {snapshot_nonrelfiles_dropped}".format_map(row))
#
# Test Garbage Collection of old snapshot files
#
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_snapfiles_gc", "empty"])
pg = postgres.create_start('test_snapfiles_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create a test table
cur.execute("CREATE TABLE foo(x integer)")
print("Inserting two more rows and running GC")
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass");
row = cur.fetchone();
print("relfilenode is {}", row[0]);
# Run GC, to clear out any garbage left behind in the catalogs by
# the CREATE TABLE command. We want to have a clean slate with no garbage
# before running the actual tests below, otherwise the counts won't match
# what we expect.
#
# Also run vacuum first to make it less likely that autovacuum or pruning
# kicks in and confuses our numbers.
cur.execute("VACUUM")
print("Running GC before test")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
# remember the number of files
snapshot_relfiles_remain = row['snapshot_relfiles_total'] - row['snapshot_relfiles_removed']
assert snapshot_relfiles_remain > 0
# Insert a row. The first insert will also create a metadata entry for the
# relation, with size == 1 block. Hence, bump up the expected relation count.
snapshot_relfiles_remain += 1
print("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
assert row['snapshot_relfiles_removed'] == 0
assert row['snapshot_relfiles_dropped'] == 0
# Insert two more rows and run GC.
# This should create a new snapshot file with the new contents, and
# remove the old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
# Do it again. Should again create a new snapshot file and remove old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
# Run GC again, with no changes in the database. Should not remove anything.
print("Run GC again, with nothing to do")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
assert row['snapshot_relfiles_removed'] == 0
assert row['snapshot_relfiles_dropped'] == 0
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
#
print("Drop table and run GC again");
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)
# Each relation fork is counted separately, hence 3.
assert row['snapshot_relfiles_dropped'] == 3
# The catalog updates also create new snapshot files of the catalogs, which
# are counted as 'removed'
assert row['snapshot_relfiles_removed'] > 0
# TODO: perhaps we should count catalog and user relations separately,
# to make this kind of testing more robust

View File

@@ -61,6 +61,13 @@ fn main() -> Result<()> {
.long("enable-auth")
.takes_value(false)
.help("Enable authentication using ZenithJWT")
)
.arg(
Arg::with_name("repository-format")
.long("repository-format")
.takes_value(true)
.value_name("repository-format")
.help("Choose repository format, 'layered' or 'rocksdb'")
),
)
.subcommand(
@@ -131,8 +138,8 @@ fn main() -> Result<()> {
} else {
AuthType::Trust
};
local_env::init(pageserver_uri, tenantid, auth_type)
let repository_format = init_match.value_of("repository-format");
local_env::init(pageserver_uri, tenantid, auth_type, repository_format)
.with_context(|| "Failed to create config file")?;
}
@@ -151,6 +158,7 @@ fn main() -> Result<()> {
if let Err(e) = pageserver.init(
Some(&env.tenantid.to_string()),
init_match.is_present("enable-auth"),
init_match.value_of("repository-format"),
) {
eprintln!("pageserver init failed: {}", e);
exit(1);

View File

@@ -126,7 +126,7 @@ macro_rules! zid_newtype {
/// is separate from PostgreSQL timelines, and doesn't have those
/// limitations. A zenith timeline is identified by a 128-bit ID, which
/// is usually printed out as a hex string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
pub struct ZTimelineId(ZId);
zid_newtype!(ZTimelineId);
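Deriving Ord/PartialOrd here presumably lets ZTimelineId serve as a key in ordered collections such as the layered repository's BTreeMaps; a quick sketch of what the derive enables:

use std::collections::BTreeMap;
use std::str::FromStr;

// Sketch: with Ord derived, timeline IDs can key an ordered map.
let id = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let mut per_timeline: BTreeMap<ZTimelineId, u64> = BTreeMap::new();
per_timeline.insert(id, 0); // requires Ord on the key type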