Merge with main

Konstantin Knizhnik
2021-08-20 10:55:34 +03:00
41 changed files with 4298 additions and 245 deletions


@@ -7,7 +7,7 @@ executors:
zenith-build-executor:
resource_class: xlarge
docker:
- image: cimg/rust:1.51.0
- image: cimg/rust:1.52.1
jobs:
@@ -237,6 +237,23 @@ jobs:
- store_test_results:
path: /tmp/test_output
# Build zenithdb/zenith:latest image and push it to Docker Hub
docker-image:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push Docker image
command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
docker build -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
workflows:
build_and_test:
jobs:
@@ -265,3 +282,14 @@ workflows:
test_selection: batch_others
requires:
- build-zenith-<< matrix.build_type >>
- docker-image:
# Context gives the ability to log in
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- main
requires:
- pg_regress tests release
- other tests release

Cargo.lock (generated)

@@ -82,6 +82,30 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "aversion"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41992ab8cfcc3026ef9abceffe0c2b0479c043183fc23825e30d22baab6df334"
dependencies = [
"aversion-macros",
"byteorder",
"serde",
"serde_cbor",
"thiserror",
]
[[package]]
name = "aversion-macros"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ba5785f953985aa0caca927ba4005880f3b4f53de87f134e810ae3549f744d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "aws-creds"
version = "0.26.0"
@@ -166,6 +190,18 @@ dependencies = [
"generic-array",
]
[[package]]
name = "bookfile"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efa3e2086414e1bbecbc10730f265e5b079ab4ea0b830e7219a70dab6471e753"
dependencies = [
"aversion",
"byteorder",
"serde",
"thiserror",
]
[[package]]
name = "boxfnonce"
version = "0.1.1"
@@ -646,6 +682,12 @@ dependencies = [
"tracing",
]
[[package]]
name = "half"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3"
[[package]]
name = "hashbrown"
version = "0.9.1"
@@ -1139,6 +1181,7 @@ name = "pageserver"
version = "0.1.0"
dependencies = [
"anyhow",
"bookfile",
"byteorder",
"bytes",
"chrono",
@@ -1735,6 +1778,16 @@ dependencies = [
"xml-rs",
]
[[package]]
name = "serde_cbor"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622"
dependencies = [
"half",
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.126"


@@ -1,94 +1,77 @@
#
# Docker image for console integration testing.
#
# We may also reuse it in CI to unify the installation process and as a general
# binary-building tool for production servers.
#
# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls
# bindgen with the "dynamic" feature flag. This also prevents usage of the dockerhub alpine-rust
# images, which are statically linked and have guards against any dlopen. I would
# prefer all-static binaries, so we may change the way librocksdb-sys builds or wait until
# we have our own storage and drop the rocksdb dependency.
#
# Cargo-chef is used to separate dependency building from main binary building. This
# way `docker build` will download and install dependencies only if there are changes to
# our Cargo.toml files.
#
#
# build postgres separately -- this layer will be rebuilt only if one of
# mentioned paths will get any changes
# Build Postgres separately --- this layer will be rebuilt only if one of
# the mentioned paths changes.
#
FROM alpine:3.13 as pg-build
RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \
make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev
WORKDIR zenith
FROM zenithdb/build:buster AS pg-build
WORKDIR /zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
# Build using clang and lld
RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4
RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
#
# Calculate cargo dependencies.
# This will always run, but only generate recipe.json with list of dependencies without
# installing them.
#
FROM alpine:20210212 as cargo-deps-inspect
RUN apk add --update rust cargo
RUN cargo install cargo-chef
WORKDIR zenith
FROM zenithdb/build:buster AS cargo-deps-inspect
WORKDIR /zenith
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
RUN cargo chef prepare --recipe-path /zenith/recipe.json
#
# Build cargo dependencies.
# This temp container would be built only if recipe.json was changed.
# This temp container should be rebuilt only if recipe.json was changed.
#
FROM alpine:20210212 as deps-build
RUN apk add --update rust cargo openssl-dev clang build-base
# rust-rocksdb can be built against system-wide rocksdb -- that saves about
# 10 minutes during build. Rocksdb apk package is in testing now, but use it
# anyway. In case of any troubles we can download and build rocksdb here manually
# (to cache it as a docker layer).
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
FROM zenithdb/build:buster AS deps-build
WORKDIR /zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/
COPY --from=cargo-deps-inspect /usr/local/cargo/bin/cargo-chef /usr/local/cargo/bin/
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
#
# Build zenith binaries
#
FROM alpine:20210212 as build
RUN apk add --update rust cargo openssl-dev clang build-base
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
FROM zenithdb/build:buster AS build
WORKDIR /zenith
COPY . .
# Copy cached dependencies
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=deps-build /zenith/target target
COPY --from=deps-build /root/.cargo /root/.cargo
COPY --from=deps-build /usr/local/cargo/ /usr/local/cargo/
RUN cargo build --release
#
# Copy binaries to resulting image.
# build-base is here to provide libstdc++ (it will also bring gcc, but leave it this way until we figure
# out how to statically link rocksdb or avoid it altogether).
#
FROM alpine:3.13
RUN apk add --update openssl build-base libseccomp-dev
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
FROM debian:buster-slim
WORKDIR /data
RUN apt-get update && apt-get -yq install librocksdb-dev libseccomp-dev openssl && \
mkdir zenith_install
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY --from=pg-build /zenith/tmp_install postgres_install
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN addgroup zenith && adduser -h /data -D -G zenith zenith
# Remove build artifacts (~ 500 MB)
RUN rm -rf postgres_install/build && \
# 'Install' Postgres binaries locally
cp -r postgres_install/* /usr/local/ && \
# Prepare an archive of Postgres binaries (should be around 11 MB)
# and keep it inside the container for ease of the deploy pipeline.
cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \
rm -rf postgres_install
RUN useradd -m -d /data zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]

Dockerfile.alpine

@@ -0,0 +1,95 @@
#
# Docker image for console integration testing.
#
# We may also reuse it in CI to unify the installation process and as a general
# binary-building tool for production servers.
#
# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls
# bindgen with the "dynamic" feature flag. This also prevents usage of the dockerhub alpine-rust
# images, which are statically linked and have guards against any dlopen. I would
# prefer all-static binaries, so we may change the way librocksdb-sys builds or wait until
# we have our own storage and drop the rocksdb dependency.
#
# Cargo-chef is used to separate dependency building from main binary building. This
# way `docker build` will download and install dependencies only if there are changes to
# our Cargo.toml files.
#
#
# build postgres separately -- this layer will be rebuilt only if one of
# the mentioned paths changes
#
FROM alpine:3.13 as pg-build
RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \
make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev
WORKDIR zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
# Build using clang and lld
RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4
#
# Calculate cargo dependencies.
# This will always run, but only generate recipe.json with list of dependencies without
# installing them.
#
FROM alpine:20210212 as cargo-deps-inspect
RUN apk add --update rust cargo
RUN cargo install cargo-chef
WORKDIR zenith
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
#
# Build cargo dependencies.
# This temp container would be built only if recipe.json was changed.
#
FROM alpine:20210212 as deps-build
RUN apk add --update rust cargo openssl-dev clang build-base
# rust-rocksdb can be built against system-wide rocksdb -- that saves about
# 10 minutes during build. Rocksdb apk package is in testing now, but use it
# anyway. In case of any troubles we can download and build rocksdb here manually
# (to cache it as a docker layer).
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
#
# Build zenith binaries
#
FROM alpine:20210212 as build
RUN apk add --update rust cargo openssl-dev clang build-base
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY . .
# Copy cached dependencies
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=deps-build /zenith/target target
COPY --from=deps-build /root/.cargo /root/.cargo
RUN cargo build --release
#
# Copy binaries to resulting image.
# build-base is here to provide libstdc++ (it will also bring gcc, but leave it this way until we figure
# out how to statically link rocksdb or avoid it altogether).
#
FROM alpine:3.13
RUN apk add --update openssl build-base libseccomp-dev
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN addgroup zenith && adduser -h /data -D -G zenith zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["pageserver"]

Dockerfile.build

@@ -0,0 +1,15 @@
#
# Image with all the required dependencies to build https://github.com/zenithdb/zenith
# and Postgres from https://github.com/zenithdb/postgres
# Also includes some rust development and build tools.
#
FROM rust:slim-buster
WORKDIR /zenith
# Install postgres and zenith build dependencies
# clang is for rocksdb
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libseccomp-dev pkg-config libssl-dev librocksdb-dev clang
# Install rust tools
RUN rustup component add clippy && cargo install cargo-chef cargo-audit


@@ -12,15 +12,12 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec
libssl-dev clang
```
[Rust] 1.48 or later is also required.
[Rust] 1.52 or later is also required.
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
To run the integration tests (not required to use the code), install
Python (3.6 or higher), and install python3 packages with `pip` (called `pip3` on some systems):
```
pip install pytest psycopg2
```
Python (3.6 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
2. Build zenith and patched postgres
```sh
@@ -106,10 +103,9 @@ pytest
## Documentation
Now we use README files to cover design ideas and overall architecture for each module.
And rustdoc style documentation comments.
Now we use README files to cover design ideas and overall architecture for each module, and `rustdoc`-style documentation comments. See also [/docs/](/docs/) for a top-level overview of all available markdown documentation.
To view your documentation in a browser, try running `cargo doc --no-deps --open`
To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`
## Source tree layout


@@ -7,7 +7,7 @@ use std::sync::Arc;
use std::time::Duration;
use std::{collections::BTreeMap, path::PathBuf};
use std::{
fs::{self, OpenOptions},
fs::{self, File, OpenOptions},
io::Read,
};
@@ -85,48 +85,36 @@ impl ComputeControlPlane {
}
}
/// Connect to a page server, get base backup, and untar it to initialize a
/// new data directory
pub fn new_from_page_server(
&mut self,
is_test: bool,
timelineid: ZTimelineId,
name: &str,
tenantid: ZTenantId,
) -> Result<Arc<PostgresNode>> {
let node = Arc::new(PostgresNode {
name: name.to_owned(),
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
is_test,
timelineid,
tenantid,
});
node.init_from_page_server(self.env.auth_type)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
Ok(node)
}
pub fn new_node(
&mut self,
tenantid: ZTenantId,
branch_name: &str,
config_only: bool,
) -> Result<Arc<PostgresNode>> {
let timeline_id = self
.pageserver
.branch_get_by_name(&tenantid, branch_name)?
.timeline_id;
let node = self.new_from_page_server(false, timeline_id, branch_name, tenantid)?;
let node = Arc::new(PostgresNode {
name: branch_name.to_owned(),
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
is_test: false,
timelineid: timeline_id,
tenantid,
});
node.init_from_page_server(self.env.auth_type, config_only)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
// Configure the node to stream WAL directly to the pageserver
node.append_conf(
"postgresql.conf",
format!(
concat!(
"shared_preload_libraries = zenith\n",
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
),
@@ -246,39 +234,15 @@ impl PostgresNode {
})
}
// Connect to a page server, get base backup, and untar it to initialize a
// new data directory
pub fn init_from_page_server(&self, auth_type: AuthType) -> Result<()> {
pub fn do_basebackup(&self) -> Result<()> {
let pgdata = self.pgdata();
println!(
"Extracting base backup to create postgres instance: path={} port={}",
pgdata.display(),
self.address.port()
);
// initialize data directory
if self.is_test {
fs::remove_dir_all(&pgdata).ok();
}
let sql = format!("basebackup {} {}", self.tenantid, self.timelineid);
let mut client = self
.pageserver
.page_server_psql_client()
.with_context(|| "connecting to page server failed")?;
fs::create_dir_all(&pgdata)
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|| {
format!(
"could not set permissions in data directory {}",
pgdata.display()
)
},
)?;
let mut copyreader = client
.copy_out(sql.as_str())
.with_context(|| "page server 'basebackup' command failed")?;
@@ -294,6 +258,45 @@ impl PostgresNode {
ar.unpack(&pgdata)
.with_context(|| "extracting page backup failed")?;
Ok(())
}
// Connect to a page server, get base backup, and untar it to initialize a
// new data directory
pub fn init_from_page_server(&self, auth_type: AuthType, config_only: bool) -> Result<()> {
let pgdata = self.pgdata();
println!(
"Extracting base backup to create postgres instance: path={} port={}",
pgdata.display(),
self.address.port()
);
// initialize data directory
if self.is_test {
fs::remove_dir_all(&pgdata).ok();
}
fs::create_dir_all(&pgdata)
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|| {
format!(
"could not set permissions in data directory {}",
pgdata.display()
)
},
)?;
if config_only {
// Just create an empty config file
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
} else {
self.do_basebackup()?;
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
}
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
self.append_conf(
@@ -321,8 +324,6 @@ impl PostgresNode {
// page server yet. (gh issue #349)
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;
// Connect it to the page server.
// set up authentication
let password = if let AuthType::ZenithJWT = auth_type {
"$ZENITH_AUTH_TOKEN"
@@ -348,8 +349,6 @@ impl PostgresNode {
.as_str(),
)?;
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
Ok(())
}
@@ -410,6 +409,46 @@ impl PostgresNode {
}
pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
// Bail if the node is already running.
if self.status() == "running" {
anyhow::bail!("The node is already running");
}
// 1. We always start the compute node from scratch, so
// if an old dir exists, preserve the config files and drop the directory
// XXX Now we only use 'postgresql.conf'.
// If we ever need 'pg_hba.conf', support it here too
let postgresql_conf_path = self.pgdata().join("postgresql.conf");
let postgresql_conf = fs::read(postgresql_conf_path.clone()).with_context(|| {
format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)
})?;
println!(
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
fs::remove_dir_all(&self.pgdata())?;
// 2. Create new node
self.init_from_page_server(self.env.auth_type, false)?;
// 3. Bring back config files
if let Ok(mut file) = OpenOptions::new()
.append(false)
.write(true)
.open(&postgresql_conf_path)
{
file.write_all(&postgresql_conf)?;
file.sync_all()?;
}
// 4. Finally start the compute node postgres
println!("Starting postgres node at '{}'", self.connstr());
self.pg_ctl(&["start"], auth_token)
}


@@ -42,6 +42,9 @@ pub struct LocalEnv {
#[serde(with = "hex")]
pub tenantid: ZTenantId,
// Repository format, 'rocksdb' or 'layered' or None for default
pub repository_format: Option<String>,
// jwt auth token used for communication with pageserver
pub auth_token: String,
@@ -101,6 +104,7 @@ pub fn init(
remote_pageserver: Option<&str>,
tenantid: ZTenantId,
auth_type: AuthType,
repository_format: Option<&str>,
) -> Result<()> {
// check if config already exists
let base_path = base_path();
@@ -176,6 +180,7 @@ pub fn init(
base_data_dir: base_path,
remotes: BTreeMap::default(),
tenantid,
repository_format: repository_format.map(|x| x.into()),
auth_token,
auth_type,
private_key_path,
@@ -194,6 +199,7 @@ pub fn init(
base_data_dir: base_path,
remotes: BTreeMap::default(),
tenantid,
repository_format: repository_format.map(|x| x.into()),
auth_token,
auth_type,
private_key_path,


@@ -50,7 +50,12 @@ impl PageServerNode {
.unwrap()
}
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> Result<()> {
pub fn init(
&self,
create_tenant: Option<&str>,
enable_auth: bool,
repository_format: Option<&str>,
) -> Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let mut args = vec![
"--init",
@@ -65,6 +70,10 @@ impl PageServerNode {
args.extend(&["--auth-type", "ZenithJWT"]);
}
if let Some(repo_format) = repository_format {
args.extend(&["--repository-format", repo_format]);
}
create_tenant.map(|tenantid| args.extend(&["--create-tenant", tenantid]));
let status = cmd
.args(args)


@@ -1,6 +1,6 @@
#!/bin/sh
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/timelines" ]; then
if [ ! -d "/data/tenants" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data --postgres-distrib /usr/local
fi

docs/README.md

@@ -0,0 +1,11 @@
# Zenith documentation
## Table of contents
- [authentication.md](authentication.md) — pageserver JWT authentication.
- [docker.md](docker.md) — Docker images and building pipeline.
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [pageserver/README](/pageserver/README) — pageserver overview.
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.

docs/docker.md

@@ -0,0 +1,38 @@
# Docker images of Zenith
## Images
Currently we build two main images:
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
- [zenithdb/build](https://hub.docker.com/repository/docker/zenithdb/build) — image with all the dependencies required to build Zenith and compute node images. This image is based on `rust:slim-buster`, so it also has a proper `rust` environment. Built from [/Dockerfile.build](/Dockerfile.build).
- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.
## Building pipeline
1. Image `zenithdb/compute-tools` is re-built automatically.
2. Image `zenithdb/build` is built manually. If you want to introduce any new compile-time dependencies to Zenith or the compute node, you have to update this image as well, build it, and push it to Docker Hub.
Build:
```sh
docker build -t zenithdb/build:buster -f Dockerfile.build .
```
Login:
```sh
docker login
```
Push to Docker Hub:
```sh
docker push zenithdb/build:buster
```
3. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
4. Image `zenithdb/zenith` is built in this repo after a successful `release` test run and is pushed to Docker Hub automatically.


@@ -7,6 +7,7 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
bookfile = "^0.3"
chrono = "0.4.19"
rand = "0.8.3"
regex = "1.4.5"


@@ -179,7 +179,9 @@ impl<'a> Basebackup<'a> {
// Extract twophase state files
//
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
if let Ok(img) = self.timeline.get_page_at_lsn_nowait(RelishTag::TwoPhase { xid }, 0, self.lsn)
if let Ok(img) =
self.timeline
.get_page_at_lsn_nowait(RelishTag::TwoPhase { xid }, 0, self.lsn)
{
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);


@@ -20,14 +20,14 @@ use anyhow::{ensure, Result};
use clap::{App, Arg, ArgMatches};
use daemonize::Daemonize;
use pageserver::{branches, logger, page_cache, page_service, PageServerConf};
use pageserver::{branches, logger, page_cache, page_service, PageServerConf, RepositoryFormat};
use zenith_utils::http_endpoint;
const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";
const DEFAULT_HTTP_ENDPOINT_ADDR: &str = "127.0.0.1:9898";
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
const DEFAULT_SUPERUSER: &str = "zenith_admin";
@@ -41,6 +41,7 @@ struct CfgFileParams {
pg_distrib_dir: Option<String>,
auth_validation_public_key_path: Option<String>,
auth_type: Option<String>,
repository_format: Option<String>,
}
impl CfgFileParams {
@@ -58,6 +59,7 @@ impl CfgFileParams {
pg_distrib_dir: get_arg("postgres-distrib"),
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
auth_type: get_arg("auth-type"),
repository_format: get_arg("repository-format"),
}
}
@@ -74,6 +76,7 @@ impl CfgFileParams {
.auth_validation_public_key_path
.or(other.auth_validation_public_key_path),
auth_type: self.auth_type.or(other.auth_type),
repository_format: self.repository_format.or(other.repository_format),
}
}
@@ -133,6 +136,16 @@ impl CfgFileParams {
);
}
let repository_format = match self.repository_format.as_ref() {
Some(repo_format_str) if repo_format_str == "rocksdb" => RepositoryFormat::RocksDb,
Some(repo_format_str) if repo_format_str == "layered" => RepositoryFormat::Layered,
Some(repo_format_str) => anyhow::bail!(
"invalid --repository-format '{}', must be 'rocksdb' or 'layered'",
repo_format_str
),
None => RepositoryFormat::Layered, // default
};
Ok(PageServerConf {
daemonize: false,
@@ -148,8 +161,9 @@ impl CfgFileParams {
pg_distrib_dir,
auth_validation_public_key_path,
auth_type,
repository_format,
})
}
}
@@ -221,6 +235,12 @@ fn main() -> Result<()> {
.takes_value(true)
.help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
)
.arg(
Arg::with_name("repository-format")
.long("repository-format")
.takes_value(true)
.help("Which repository implementation to use, 'rocksdb' or 'layered'"),
)
.get_matches();
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));


@@ -24,7 +24,7 @@ use crate::object_repository::ObjectRepository;
use crate::page_cache;
use crate::restore_local_repo;
use crate::walredo::WalRedoManager;
use crate::{repository::Repository, PageServerConf};
use crate::{repository::Repository, PageServerConf, RepositoryFormat};
#[derive(Serialize, Deserialize, Clone)]
pub struct BranchInfo {
@@ -65,8 +65,8 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
pub fn create_repo(
conf: &'static PageServerConf,
tenantid: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager>,
) -> Result<ObjectRepository> {
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Arc<dyn Repository>> {
let repo_dir = conf.tenant_path(&tenantid);
if repo_dir.exists() {
bail!("repo for {} already exists", tenantid)
@@ -96,19 +96,27 @@ pub fn create_repo(
// and we failed to run initdb again in the same directory. This has been solved for the
// rapid init+start case now, but the general race condition remains if you restart the
// server quickly.
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
let repo: Arc<dyn Repository + Sync + Send> =
match conf.repository_format {
RepositoryFormat::Layered => Arc::new(
crate::layered_repository::LayeredRepository::new(conf, wal_redo_manager, tenantid),
),
RepositoryFormat::RocksDb => {
let obj_store = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
let repo = crate::object_repository::ObjectRepository::new(
conf,
std::sync::Arc::new(storage),
wal_redo_manager,
tenantid,
);
Arc::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
wal_redo_manager,
tenantid,
))
}
};
// Load data into pageserver
// TODO To implement zenith import we need to
// move data loading out of create_repo()
bootstrap_timeline(conf, tenantid, tli, &repo)?;
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
Ok(repo)
}

File diff suppressed because it is too large.


@@ -0,0 +1,298 @@
# Overview
The on-disk format is based on immutable files. The page server
receives a stream of incoming WAL, parses the WAL records to determine
which pages they apply to, and accumulates the incoming changes in
memory. Every now and then, the accumulated changes are written out to
new files.
The files are called "snapshot files". Each snapshot file corresponds
to one 10 MB slice of a PostgreSQL relation fork. The snapshot files
for each timeline are stored in the timeline's subdirectory under
.zenith/tenants/<tenantid>/timelines.
The files are named like this:
rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
For example:
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
Some non-relation files are also stored in the repository. For example,
a CLOG segment would be named like this:
pg_xact_0000_0_00000000198B06B0_00000000198C2550
There is no difference in how the relation and non-relation files are
managed, except that the first part of file names is different.
Internally, the relations and non-relation files that are managed in
the versioned store are together called "relishes".
Each snapshot file contains a full snapshot, that is, a full copy of all
pages in the relation, as of the "start LSN". It also contains all WAL
records applicable to the relation between the start and end
LSNs. With this information, the page server can reconstruct any page
version of the relation in the LSN range.
If a file has been dropped, the last snapshot file for it is created
with the _DROPPED suffix, e.g.
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED
In addition to the relations, with "rel_*" prefix, we use the same
format for storing various smaller files from the PostgreSQL data
directory. They use different prefixes, and the naming scheme
up to the LSN range varies. The Zenith source code uses the term
"relish" to mean "a relation, or other file that's treated like a
relation in the storage".
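To make the naming scheme concrete, here is a minimal Rust sketch (not part of the page server source; the struct and function names are hypothetical) that splits a relation snapshot file name into its components:

```rust
// Illustrative sketch only: the components of a relation snapshot file name.
struct SnapshotFileName {
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
    segno: u32,
    start_lsn: u64, // LSNs are zero-padded hexadecimal in the file name
    end_lsn: u64,
    dropped: bool,
}

fn parse_rel_snapshot_name(name: &str) -> Option<SnapshotFileName> {
    // e.g. "rel_1663_13990_2609_0_10_000000000169C348_0000000001702000"
    let rest = name.strip_prefix("rel_")?;
    let dropped = rest.ends_with("_DROPPED");
    let rest = rest.strip_suffix("_DROPPED").unwrap_or(rest);
    let parts: Vec<&str> = rest.split('_').collect();
    if parts.len() != 7 {
        return None;
    }
    Some(SnapshotFileName {
        spcnode: parts[0].parse().ok()?,
        dbnode: parts[1].parse().ok()?,
        relnode: parts[2].parse().ok()?,
        forknum: parts[3].parse().ok()?,
        segno: parts[4].parse().ok()?,
        start_lsn: u64::from_str_radix(parts[5], 16).ok()?,
        end_lsn: u64::from_str_radix(parts[6], 16).ok()?,
        dropped,
    })
}
```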
## Notation used in this document
The full path of a snapshot file looks like this:
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
For simplicity, the examples below use a simplified notation for the
paths. The tenant ID is left out, the timeline ID is replaced with
the human-readable branch name, and spcnode+dbnode+relnode+forkum+segno
with a human-readable table name. The LSNs are also shorter. For
example, a snapshot file for 'orders' table on 'main' branch, with LSN
range 100-200 would be:
main/orders_100_200
# Creating snapshot files
Let's start with a simple example with a system that contains one
branch called 'main' and two tables, 'orders' and 'customers'. The end
of WAL is currently at LSN 250. In this starting situation, you would
have two files on disk:
main/orders_100_200
main/customers_100_200
In addition to those files, the recent changes between LSN 200 and the
end of WAL at 250 are kept in memory. If the page server crashes, the
latest records between 200-250 need to be re-read from the WAL.
Whenever enough WAL has been accumulated in memory, the page server
writes out the changes in memory into new snapshot files. This process
is called "checkpointing" (not to be confused with the PostgreSQL
checkpoints, that's a different thing). The page server only creates
snapshot files for relations that have been modified since the last
checkpoint. For example, if the current end of WAL is at LSN 450, and
the last checkpoint happened at LSN 400 but there haven't been any
recent changes to the 'customers' table, you would have these files on
disk:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_100_200
If the customers table is modified later, a new file is created for it
at the next checkpoint. The new file will cover the "gap" from the
last snapshot file, so the LSN ranges are always contiguous:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_100_200
main/customers_200_500
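As a rough sketch of the rule above (assumed helper names, not the actual checkpoint code): every relation with in-memory changes gets a new snapshot file that starts where its previous file ended, which is why the LSN ranges stay contiguous.

```rust
use std::collections::HashMap;

// Sketch only: which snapshot files a checkpoint at `checkpoint_lsn` would create.
// `last_snapshot_end` maps a relation name to the end LSN of its latest snapshot
// file; relations without in-memory changes are simply skipped.
fn files_to_write(
    modified_since_last_checkpoint: &[&str],
    last_snapshot_end: &HashMap<String, u64>,
    checkpoint_lsn: u64,
) -> Vec<String> {
    modified_since_last_checkpoint
        .iter()
        .map(|rel| {
            // The new file covers the gap from the previous snapshot to the checkpoint LSN.
            let start = last_snapshot_end.get(*rel).copied().unwrap_or(0);
            format!("main/{}_{}_{}", rel, start, checkpoint_lsn)
        })
        .collect()
}
```

In the example above, a checkpoint at LSN 500 with only 'customers' modified would produce `main/customers_200_500`, matching the listing.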
## Reading page versions
Whenever a GetPage@LSN request comes in from the compute node, the
page server needs to reconstruct the requested page, as it was at the
requested LSN. To do that, the page server first checks the recent
in-memory layer; if the requested page version is found there, it can
be returned immediately without looking at the files on
disk. Otherwise the page server needs to locate the snapshot file that
contains the requested page version.
For example, if a request comes in for table 'orders' at LSN 250, the
page server would load the 'main/orders_200_300' file into memory, and
reconstruct and return the requested page from it, as it was at
LSN 250. Because the snapshot file consists of a full image of the
relation at the start LSN and the WAL, reconstructing the page
involves replaying any WAL records applicable to the page between LSNs
200-250, starting from the base image at LSN 200.
A request at a file boundary can be satisfied using either file. For
example, if there are two files on disk:
main/orders_100_200
main/orders_200_300
And a request comes with LSN 200, either file can be used for it. It
is better to use the later file, however, because it contains an
already materialized version of all the pages at LSN 200. Using the
first file, you would need to apply any WAL records between 100 and
200 to reconstruct the requested page.
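The same lookup rule as a small sketch, using hypothetical types (the real page server tracks layers in a layer map rather than scanning file names):

```rust
// One snapshot file for a single relation, identified by its LSN range.
#[derive(Clone, Copy)]
struct SnapshotRange {
    start_lsn: u64,
    end_lsn: u64,
}

// Pick the file to serve a request at `lsn`. At a boundary LSN the later file
// wins, because it already holds materialized page images at its start LSN.
fn pick_snapshot(files: &[SnapshotRange], lsn: u64) -> Option<SnapshotRange> {
    files
        .iter()
        .copied()
        .filter(|f| f.start_lsn <= lsn && lsn <= f.end_lsn)
        .max_by_key(|f| f.start_lsn)
}
```

With `orders_100_200` and `orders_200_300` on disk, a request at LSN 200 picks the latter, so no WAL replay is needed.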
# Multiple branches
Imagine that a child branch is created at LSN 250:
          @250
----main--+-------------------------->
           \
            +---child-------------->
Then, the 'orders' table is updated differently on the 'main' and
'child' branches. You now have this situation on disk:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_100_200
child/orders_250_300
child/orders_300_400
Because the 'customers' table hasn't been modified on the child
branch, there is no file for it there. If you request a page for it on
the 'child' branch, the page server will not find any snapshot file
for it in the 'child' directory, so it will recurse to look into the
parent 'main' branch instead.
From the 'child' branch's point of view, the history for each relation
is linear, and the request's LSN identifies unambiguously which file
you need to look at. For example, the history for the 'orders' table
on the 'main' branch consists of these files:
main/orders_100_200
main/orders_200_300
main/orders_300_400
And from the 'child' branch's point of view, it consists of these
files:
main/orders_100_200
main/orders_200_300
child/orders_250_300
child/orders_300_400
The branch metadata includes the point where the child branch was
created, LSN 250. If a page request comes with LSN 275, we read the
page version from the 'child/orders_250_300' file. If the request LSN
is 225, we read it from the 'main/orders_200_300' file instead. The
page versions between 250-300 in the 'main/orders_200_300' file are
ignored when operating on the child branch.
Note: It doesn't make any difference if the child branch is created
when the end of the main branch was at LSN 250, or later when the tip of
the main branch had already moved on. The latter case, creating a
branch at a historic LSN, is how we support PITR in Zenith.
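A sketch of that resolution order, again with hypothetical types: look for a snapshot file on the requested branch first; if there is none, move to the parent branch, never looking past the fork point.

```rust
// Hypothetical branch description: the timeline directory to search, the LSN at
// which the branch was forked off its parent (None for the root branch), and the parent.
struct Branch<'a> {
    timeline_dir: &'a str,
    fork_lsn: Option<u64>,
    parent: Option<&'a Branch<'a>>,
}

// Resolve which timeline directory (and effective LSN) to read `rel` from.
// `has_file` stands in for "does this timeline have a snapshot file covering this LSN?".
fn resolve<'a>(
    branch: &'a Branch<'a>,
    rel: &str,
    mut lsn: u64,
    has_file: &dyn Fn(&str, &str, u64) -> bool,
) -> Option<(&'a str, u64)> {
    let mut cur = Some(branch);
    while let Some(b) = cur {
        if has_file(b.timeline_dir, rel, lsn) {
            return Some((b.timeline_dir, lsn));
        }
        // Not found here: continue on the parent, ignoring anything past the fork point.
        if let Some(fork_lsn) = b.fork_lsn {
            lsn = lsn.min(fork_lsn);
        }
        cur = b.parent;
    }
    None
}
```

For 'orders' at LSN 275 on 'child' this stays on the child timeline; for LSN 225 it falls through to 'main', and the child never sees 'main' page versions newer than the fork point.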
# Garbage collection
In this scheme, we keep creating new snapshot files over time. We also
need a mechanism to remove old files that are no longer needed,
because disk space isn't infinite.
What files are still needed? Currently, the page server supports PITR
and branching from any branch at any LSN that is "recent enough" from
the tip of the branch. "Recent enough" is defined as an LSN horizon,
which by default is 64 MB. (See DEFAULT_GC_HORIZON). For this
example, let's assume that the LSN horizon is 150 units.
Let's look at the single branch scenario again. Imagine that the end
of the branch is LSN 525, so that the GC horizon is currently at
525-150 = 375
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/orders_400_500
main/customers_100_200
We can remove files 'main/orders_100_200' and 'main/orders_200_300',
because the end LSNs of those files are older than GC horizon 375, and
there are more recent snapshot files for the table. 'main/orders_300_400'
and 'main/orders_400_500' are still within the horizon, so they must be
retained. 'main/customers_100_200' is old enough, but it cannot be
removed because there is no newer snapshot file for the table.
Things get slightly more complicated with multiple branches. All of
the above still holds, but in addition to recent files we must also
retain older snapshot files that are still needed by child branches.
For example, if child branch is created at LSN 150, and the 'customers'
table is updated on the branch, you would have these files:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/orders_400_500
main/customers_100_200
child/customers_150_300
In this situation, the 'main/orders_100_200' file cannot be removed,
even though it is older than the GC horizon, because it is still
needed by the child branch. 'main/orders_200_300' can still be
removed. So after garbage collection, these files would remain:
main/orders_100_200
main/orders_300_400
main/orders_400_500
main/customers_100_200
child/customers_150_300
If 'orders' is modified later on the 'child' branch, we will create a
snapshot file for it on the child:
main/orders_100_200
main/orders_300_400
main/orders_400_500
main/customers_100_200
child/customers_150_300
child/orders_150_400
After this, the 'main/orders_100_200' file can be removed. It is no
longer needed by the child branch, because there is a newer snapshot
file there. TODO: This optimization hasn't been implemented! The GC
algorithm will currently keep the file on the 'main' branch anyway, for
as long as the child branch exists.
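The retention rule can be summarized as a predicate (a sketch with assumed inputs, not the actual GC code): a snapshot file may be removed only if it is entirely older than the GC horizon, a newer snapshot file for the same relation exists on the same branch, and no child branch still depends on it.

```rust
// Sketch of the GC retention rule with assumed inputs.
fn can_remove(
    file_end_lsn: u64,
    gc_horizon_lsn: u64,
    newer_file_exists_on_branch: bool,
    needed_by_child_branch: bool,
) -> bool {
    file_end_lsn < gc_horizon_lsn
        && newer_file_exists_on_branch
        && !needed_by_child_branch
}
```

In the example above, `main/orders_200_300` passes all three checks (end LSN 300 is below the horizon at 375, `main/orders_300_400` exists, and the child forked at 150 does not need it), while `main/customers_100_200` fails the second check and `main/orders_100_200` fails the third.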
# TODO: On LSN ranges
In principle, each relation can be checkpointed separately, i.e. the
LSN ranges of the files don't need to line up. So this would be legal:
main/orders_100_200
main/orders_200_300
main/orders_300_400
main/customers_150_250
main/customers_250_500
However, the code currently always checkpoints all relations together.
So that situation doesn't arise in practice.
It would also be OK to have overlapping LSN ranges for the same relation:
main/orders_100_200
main/orders_200_300
main/orders_250_350
main/orders_300_400
The code that reads the snapshot files should cope with this, but this
situation doesn't arise either, because the checkpointing code never
does that. It could be useful, however, as a transient state when
garbage collecting around branch points, or explicit recovery
points. For example, if we start with this:
main/orders_100_200
main/orders_200_300
main/orders_300_400
And there is a branch or explicit recovery point at LSN 150, we could
replace 'main/orders_100_200' with 'main/orders_150_150' to keep a
snapshot only at that exact point that's still needed, removing the
other page versions around it. But such compaction has not been
implemented yet.


@@ -0,0 +1,491 @@
//!
//! An in-memory layer stores recently received page versions in memory. The page versions
//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
//!
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageVersion, SegmentTag, RELISH_SEG_SIZE,
};
use crate::layered_repository::LayeredTimeline;
use crate::layered_repository::SnapshotLayer;
use crate::repository::WALRecord;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use bytes::Bytes;
use log::*;
use std::collections::BTreeMap;
use std::ops::Bound::Included;
use std::sync::{Arc, Mutex};
use zenith_utils::lsn::Lsn;
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenantid: ZTenantId,
timelineid: ZTimelineId,
seg: SegmentTag,
///
/// This layer contains all the changes from 'start_lsn'. The
/// start is inclusive. There is no end LSN; we only use an in-memory
/// layer at the end of a timeline.
///
start_lsn: Lsn,
/// The above fields never change. The parts that do change are in 'inner',
/// and protected by mutex.
inner: Mutex<InMemoryLayerInner>,
}
pub struct InMemoryLayerInner {
/// If this relation was dropped, remember when that happened.
drop_lsn: Option<Lsn>,
///
/// All versions of all pages in the layer are kept here.
/// Indexed by block number and LSN.
///
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
///
/// `segsizes` tracks the size of the segment at different points in time.
///
segsizes: BTreeMap<Lsn, u32>,
}
impl Layer for InMemoryLayer {
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
}
fn get_end_lsn(&self) -> Lsn {
let inner = self.inner.lock().unwrap();
if let Some(drop_lsn) = inner.drop_lsn {
drop_lsn
} else {
Lsn(u64::MAX)
}
}
fn is_dropped(&self) -> bool {
let inner = self.inner.lock().unwrap();
inner.drop_lsn.is_some()
}
/// Look up given page in the cache.
fn get_page_reconstruct_data(
&self,
blknum: u32,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<Option<Lsn>> {
// Scan the BTreeMap backwards, starting from the given lsn.
let mut need_base_image_lsn: Option<Lsn> = Some(lsn);
assert!(self.seg.blknum_in_seg(blknum));
{
let inner = self.inner.lock().unwrap();
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let mut iter = inner
.page_versions
.range((Included(&minkey), Included(&maxkey)));
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
if let Some(img) = &entry.page_image {
reconstruct_data.page_img = Some(img.clone());
need_base_image_lsn = None;
break;
} else if let Some(rec) = &entry.record {
reconstruct_data.records.push(rec.clone());
if rec.will_init {
// This WAL record initializes the page, so no need to go further back
need_base_image_lsn = None;
break;
} else {
need_base_image_lsn = Some(*entry_lsn);
}
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
}
}
// release lock on 'page_versions'
}
Ok(need_base_image_lsn)
}
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
// Scan the BTreeMap backwards, starting from the given entry.
let inner = self.inner.lock().unwrap();
let mut iter = inner.segsizes.range((Included(&Lsn(0)), Included(&lsn)));
if let Some((_entry_lsn, entry)) = iter.next_back() {
let result = *entry;
drop(inner);
trace!("get_seg_size: {} at {} -> {}", self.seg, lsn, result);
Ok(result)
} else {
bail!("No size found for {} at {} in memory", self.seg, lsn);
}
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
let inner = self.inner.lock().unwrap();
// Is the requested LSN after the segment was dropped?
if let Some(drop_lsn) = inner.drop_lsn {
if lsn >= drop_lsn {
return Ok(false);
}
}
// Otherwise, it exists
Ok(true)
}
}
impl InMemoryLayer {
///
/// Create a new, empty, in-memory layer
///
pub fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
seg,
timelineid,
start_lsn
);
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
seg,
start_lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: BTreeMap::new(),
segsizes: BTreeMap::new(),
}),
})
}
// Write operations
/// Remember new page version, as a WAL record over previous version
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> Result<()> {
self.put_page_version(
blknum,
rec.lsn,
PageVersion {
page_image: None,
record: Some(rec),
},
)
}
/// Remember new page version, as a full page image
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> {
self.put_page_version(
blknum,
lsn,
PageVersion {
page_image: Some(img),
record: None,
},
)
}
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
assert!(self.seg.blknum_in_seg(blknum));
trace!(
"put_page_version blk {} of {} at {}/{}",
blknum,
self.seg.rel,
self.timelineid,
lsn
);
let mut inner = self.inner.lock().unwrap();
let old = inner.page_versions.insert((blknum, lsn), pv);
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!(
"Page version of rel {} blk {} at {} already exists",
self.seg.rel, blknum, lsn
);
}
// Also update the relation size, if this extended the relation.
if self.seg.rel.is_blocky() {
let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1;
let mut iter = inner.segsizes.range((Included(&Lsn(0)), Included(&lsn)));
let oldsize;
if let Some((_entry_lsn, entry)) = iter.next_back() {
oldsize = *entry;
} else {
oldsize = 0;
//bail!("No old size found for {} at {}", self.tag, lsn);
}
if newsize > oldsize {
trace!(
"enlarging segment {} from {} to {} blocks at {}",
self.seg,
oldsize,
newsize,
lsn
);
inner.segsizes.insert(lsn, newsize);
}
}
Ok(())
}
/// Remember that the relation was truncated at given LSN
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
let old = inner.segsizes.insert(lsn, segsize);
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Inserting truncation, but had an entry for the LSN already");
}
Ok(())
}
/// Remember that the segment was dropped at given LSN
pub fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
let mut inner = self.inner.lock().unwrap();
assert!(inner.drop_lsn.is_none());
inner.drop_lsn = Some(lsn);
info!("dropped segment {} at {}", self.seg, lsn);
Ok(())
}
///
/// Initialize a new InMemoryLayer by copying the state at the given
/// point in time from a given existing layer.
///
pub fn copy_snapshot(
conf: &'static PageServerConf,
timeline: &LayeredTimeline,
src: &dyn Layer,
timelineid: ZTimelineId,
tenantid: ZTenantId,
lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
src.get_seg_tag(),
timelineid,
lsn
);
let mut page_versions = BTreeMap::new();
let mut segsizes = BTreeMap::new();
let seg = src.get_seg_tag();
let startblk;
let size;
if seg.rel.is_blocky() {
size = src.get_seg_size(lsn)?;
segsizes.insert(lsn, size);
startblk = seg.segno * RELISH_SEG_SIZE;
} else {
size = 1;
startblk = 0;
}
for blknum in startblk..(startblk + size) {
let img = timeline.materialize_page(seg, blknum, lsn, src)?;
let pv = PageVersion {
page_image: Some(img),
record: None,
};
page_versions.insert((blknum, lsn), pv);
}
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
seg: src.get_seg_tag(),
start_lsn: lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: page_versions,
segsizes: segsizes,
}),
})
}
///
/// Write this in-memory layer to disk, as a snapshot layer.
///
/// The cutoff point for the layer that's written to disk is 'end_lsn'.
///
/// Returns new layers that replace this one. Always returns a
/// SnapshotLayer containing the page versions that were written to disk,
/// but if there were page versions newer than 'end_lsn', also return a new
/// in-memory layer containing those page versions. The caller replaces
/// this layer with the returned layers in the layer map.
///
pub fn freeze(
&self,
cutoff_lsn: Lsn,
// This is needed just to call materialize_page()
timeline: &LayeredTimeline,
) -> Result<(Option<Arc<SnapshotLayer>>, Option<Arc<InMemoryLayer>>)> {
info!(
"freezing in memory layer for {} on timeline {} at {}",
self.seg, self.timelineid, cutoff_lsn
);
let inner = self.inner.lock().unwrap();
// Normally, use the cutoff LSN as the end of the frozen layer.
// But if the relation was dropped, we know that there are no
// more changes coming in for it, and in particular we know that
// there are no changes "in flight" for the LSN anymore, so we use
// the drop LSN instead. The drop-LSN could be ahead of the
// caller-specified LSN!
let dropped = inner.drop_lsn.is_some();
let end_lsn = if dropped {
inner.drop_lsn.unwrap()
} else {
cutoff_lsn
};
// Divide all the page versions into old and new at the 'end_lsn' cutoff point.
let mut before_page_versions;
let mut before_segsizes;
let mut after_page_versions;
let mut after_segsizes;
if !dropped {
before_segsizes = BTreeMap::new();
after_segsizes = BTreeMap::new();
for (lsn, size) in inner.segsizes.iter() {
if *lsn > end_lsn {
after_segsizes.insert(*lsn, *size);
} else {
before_segsizes.insert(*lsn, *size);
}
}
before_page_versions = BTreeMap::new();
after_page_versions = BTreeMap::new();
for ((blknum, lsn), pv) in inner.page_versions.iter() {
if *lsn > end_lsn {
after_page_versions.insert((*blknum, *lsn), pv.clone());
} else {
before_page_versions.insert((*blknum, *lsn), pv.clone());
}
}
} else {
before_page_versions = inner.page_versions.clone();
before_segsizes = inner.segsizes.clone();
after_segsizes = BTreeMap::new();
after_page_versions = BTreeMap::new();
}
// we can release the lock now.
drop(inner);
// Write the page versions before the cutoff to disk.
let snapfile = SnapshotLayer::create(
self.conf,
self.timelineid,
self.tenantid,
self.seg,
self.start_lsn,
end_lsn,
dropped,
before_page_versions,
before_segsizes,
)?;
// If there were any "new" page versions, initialize a new in-memory layer to hold
// them
let new_open = if !after_segsizes.is_empty() || !after_page_versions.is_empty() {
info!("created new in-mem layer for {} {}-", self.seg, end_lsn);
let new_open = Self::copy_snapshot(
self.conf,
timeline,
&snapfile,
self.timelineid,
self.tenantid,
end_lsn,
)?;
let mut new_inner = new_open.inner.lock().unwrap();
new_inner.page_versions.append(&mut after_page_versions);
new_inner.segsizes.append(&mut after_segsizes);
drop(new_inner);
Some(Arc::new(new_open))
} else {
None
};
let new_historic = Some(Arc::new(snapfile));
Ok((new_historic, new_open))
}
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- inmemory layer for {} {}-> ----\n",
self.seg, self.start_lsn
);
let inner = self.inner.lock().unwrap();
for (k, v) in inner.segsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
for (k, v) in inner.page_versions.iter() {
result += &format!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
);
}
result
}
}


@@ -0,0 +1,281 @@
//!
//! The layer map tracks what layers exist for all the relations in a timeline.
//!
//! When the timeline is first accessed, the server lists all snapshot files
//! in the timelines/<timelineid> directory, and populates this map with
//! SnapshotLayers corresponding to each file. When new WAL is received,
//! we create InMemoryLayers to hold the incoming records. Now and then,
//! in the checkpoint() function, the in-memory layers are frozen, forming
//! new snapshot layers and corresponding files are written to disk.
//!
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
use crate::layered_repository::{InMemoryLayer, SnapshotLayer};
use crate::relish::*;
use anyhow::Result;
use log::*;
use std::collections::HashSet;
use std::collections::{BTreeMap, HashMap};
use std::ops::Bound::Included;
use std::sync::Arc;
use zenith_utils::lsn::Lsn;
///
/// LayerMap tracks what layers exist for a timeline. The last layer that is
/// open for writes is always an InMemoryLayer, and is tracked separately
/// because there can be only one for each segment. The older layers,
/// stored on disk, are kept in a BTreeMap keyed by the layer's start LSN.
///
pub struct LayerMap {
segs: HashMap<SegmentTag, SegEntry>,
}
struct SegEntry {
pub open: Option<Arc<InMemoryLayer>>,
pub historic: BTreeMap<Lsn, Arc<SnapshotLayer>>,
}
impl LayerMap {
///
/// Look up using the given segment tag and LSN. This differs from a plain
/// key-value lookup in that if there is any layer that covers the
/// given LSN, or precedes the given LSN, it is returned. In other words,
/// you don't need to know the exact start LSN of the layer.
///
pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
let segentry = self.segs.get(tag)?;
if let Some(open) = &segentry.open {
if open.get_start_lsn() <= lsn {
let x: Arc<dyn Layer> = Arc::clone(&open) as _;
return Some(x);
}
}
if let Some((_k, v)) = segentry
.historic
.range((Included(Lsn(0)), Included(lsn)))
.next_back()
{
let x: Arc<dyn Layer> = Arc::clone(&v) as _;
Some(x)
} else {
None
}
}
///
/// Get the open layer for given segment for writing. Or None if no open
/// layer exists.
///
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
let segentry = self.segs.get(tag)?;
if let Some(open) = &segentry.open {
Some(Arc::clone(open))
} else {
None
}
}
///
/// Insert an open in-memory layer
///
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
let tag = layer.get_seg_tag();
if let Some(segentry) = self.segs.get_mut(&tag) {
if let Some(_old) = &segentry.open {
// FIXME: shouldn't exist, but check
}
segentry.open = Some(layer);
} else {
let segentry = SegEntry {
open: Some(layer),
historic: BTreeMap::new(),
};
self.segs.insert(tag, segentry);
}
}
///
/// Insert an on-disk layer
///
pub fn insert_historic(&mut self, layer: Arc<SnapshotLayer>) {
let tag = layer.get_seg_tag();
let start_lsn = layer.get_start_lsn();
if let Some(segentry) = self.segs.get_mut(&tag) {
segentry.historic.insert(start_lsn, layer);
} else {
let mut historic = BTreeMap::new();
historic.insert(start_lsn, layer);
let segentry = SegEntry {
open: None,
historic,
};
self.segs.insert(tag, segentry);
}
}
///
/// Remove an on-disk layer from the map.
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer: &SnapshotLayer) {
let tag = layer.get_seg_tag();
let start_lsn = layer.get_start_lsn();
if let Some(segentry) = self.segs.get_mut(&tag) {
segentry.historic.remove(&start_lsn);
}
}
pub fn list_rels(&self, spcnode: u32, dbnode: u32) -> Result<HashSet<RelTag>> {
let mut rels: HashSet<RelTag> = HashSet::new();
for (seg, _entry) in self.segs.iter() {
if let RelishTag::Relation(reltag) = seg.rel {
// FIXME: skip if it was dropped before the requested LSN. But there is no
// LSN argument
if (spcnode == 0 || reltag.spcnode == spcnode)
&& (dbnode == 0 || reltag.dbnode == dbnode)
{
rels.insert(reltag);
}
}
}
Ok(rels)
}
pub fn list_nonrels(&self, _lsn: Lsn) -> Result<HashSet<RelishTag>> {
let mut rels: HashSet<RelishTag> = HashSet::new();
// Scan the timeline directory to get all rels in this timeline.
for (seg, _entry) in self.segs.iter() {
// FIXME: skip if it was dropped before the requested LSN.
if let RelishTag::Relation(_) = seg.rel {
} else {
rels.insert(seg.rel);
}
}
Ok(rels)
}
/// Is there a newer layer for given segment?
pub fn newer_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
if let Some(segentry) = self.segs.get(&seg) {
if let Some(_open) = &segentry.open {
return true;
}
for (newer_lsn, layer) in segentry
.historic
.range((Included(lsn), Included(Lsn(u64::MAX))))
{
if layer.get_end_lsn() > lsn {
trace!(
"found later layer for {}, {} {}-{}",
seg,
lsn,
newer_lsn,
layer.get_end_lsn()
);
return true;
} else {
trace!("found singleton layer for {}, {} {}", seg, lsn, newer_lsn);
continue;
}
}
}
trace!("no later layer found for {}, {}", seg, lsn);
false
}
pub fn iter_open_layers(&mut self) -> OpenLayerIter {
OpenLayerIter {
last: None,
segiter: self.segs.iter_mut(),
}
}
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
HistoricLayerIter {
segiter: self.segs.iter(),
iter: None,
}
}
}
impl Default for LayerMap {
fn default() -> Self {
LayerMap {
segs: HashMap::new(),
}
}
}
pub struct OpenLayerIter<'a> {
last: Option<&'a mut SegEntry>,
segiter: std::collections::hash_map::IterMut<'a, SegmentTag, SegEntry>,
}
impl<'a> OpenLayerIter<'a> {
pub fn replace(&mut self, replacement: Option<Arc<InMemoryLayer>>) {
let segentry = self.last.as_mut().unwrap();
segentry.open = replacement;
}
pub fn insert_historic(&mut self, new_layer: Arc<SnapshotLayer>) {
let start_lsn = new_layer.get_start_lsn();
let segentry = self.last.as_mut().unwrap();
segentry.historic.insert(start_lsn, new_layer);
}
}
impl<'a> Iterator for OpenLayerIter<'a> {
type Item = Arc<InMemoryLayer>;
fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
while let Some((_seg, entry)) = self.segiter.next() {
if let Some(open) = &entry.open {
let op = Arc::clone(&open);
self.last = Some(entry);
return Some(op);
}
}
self.last = None;
None
}
}
pub struct HistoricLayerIter<'a> {
segiter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
iter: Option<std::collections::btree_map::Iter<'a, Lsn, Arc<SnapshotLayer>>>,
}
impl<'a> Iterator for HistoricLayerIter<'a> {
type Item = Arc<SnapshotLayer>;
fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
loop {
if let Some(x) = &mut self.iter {
if let Some(x) = x.next() {
return Some(Arc::clone(&*x.1));
}
}
if let Some(seg) = self.segiter.next() {
self.iter = Some(seg.1.historic.iter());
continue;
} else {
return None;
}
}
}
}

View File

@@ -0,0 +1,547 @@
//!
//! A SnapshotLayer represents one snapshot file on disk. One file holds all page
//! version and size information of one relation, in a range of LSN.
//! The name "snapshot file" is a bit of a misnomer because a snapshot file doesn't
//! contain a snapshot at a specific LSN, but rather all the page versions in a range
//! of LSNs.
//!
//! Currently, a snapshot file contains full information needed to reconstruct any
//! page version in the LSN range, without consulting any other snapshot files. When
//! a new snapshot file is created for writing, the full contents of the relation are
//! materialized as they are at the beginning of the LSN range. That can be very expensive;
//! we should find a way to store differential files. But this keeps the read side
//! of things simple: you can find the correct snapshot file based on RelishTag and
//! timeline+LSN, and once you've located it, you have all the data you need in that
//! file.
//!
//! When a snapshot file needs to be accessed, we slurp the whole file into memory, into
//! the SnapshotLayer struct. See load() and unload() functions.
//!
//! On disk, the snapshot files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each snapshot file is named like this:
//!
//! rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
//!
//! For example:
//!
//! rel_1663_13990_2609_0_0_000000000169C348_000000000169C349
//!
//! Non-relation relishes use other prefixes instead of "rel_", such as "pg_xact_" or
//! "pg_control_"; see SnapshotFileName below. If a relish is dropped, we add a '_DROPPED'
//! suffix to the filename to indicate that. So the above example would become:
//!
//! rel_1663_13990_2609_0_0_000000000169C348_000000000169C349_DROPPED
//!
//! In that case, the end LSN indicates when the relish was dropped; the drop is not
//! recorded in the file contents in any way.
//!
//! A snapshot file is constructed using the 'bookfile' crate. Each file consists of two
//! parts: the page versions and the relation sizes. They are stored as separate chapters.
//!
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageVersion, SegmentTag,
};
use crate::relish::*;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use log::*;
use std::collections::BTreeMap;
use std::fmt;
use std::fs;
use std::fs::File;
use std::io::Write;
use std::ops::Bound::Included;
use std::path::PathBuf;
use std::sync::{Arc, Mutex, MutexGuard};
use bookfile::{Book, BookWriter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
// Magic constant to identify a Zenith snapshot file
static SNAPSHOT_FILE_MAGIC: u32 = 0x5A616E01;
static PAGE_VERSIONS_CHAPTER: u64 = 1;
static REL_SIZES_CHAPTER: u64 = 2;
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
struct SnapshotFileName {
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
}
impl SnapshotFileName {
fn from_str(fname: &str) -> Option<Self> {
// Split the filename into parts
//
// rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
//
// or if it was dropped:
//
// rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>_DROPPED
//
// Non-relation relishes use a different prefix (pg_xact_, pg_multixact_members_, etc.)
// but the same <segno>_<start LSN>_<end LSN> tail.
//
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
parts = rest.split('_');
rel = RelishTag::Relation(RelTag {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
relnode: parts.next()?.parse::<u32>().ok()?,
forknum: parts.next()?.parse::<u8>().ok()?,
});
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::Clog,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
parts = rest.split('_');
rel = RelishTag::FileNodeMap {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
parts = rest.split('_');
rel = RelishTag::TwoPhase {
xid: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
parts = rest.split('_');
rel = RelishTag::Checkpoint;
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
parts = rest.split('_');
rel = RelishTag::ControlFile;
} else {
return None;
}
let segno = parts.next()?.parse::<u32>().ok()?;
let seg = SegmentTag { rel, segno };
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
let mut dropped = false;
if let Some(suffix) = parts.next() {
if suffix == "DROPPED" {
dropped = true;
} else {
warn!("unrecognized filename in timeline dir: {}", fname);
return None;
}
}
if parts.next().is_some() {
warn!("unrecognized filename in timeline dir: {}", fname);
return None;
}
Some(SnapshotFileName {
seg,
start_lsn,
end_lsn,
dropped,
})
}
fn to_string(&self) -> String {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
),
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
} => format!("pg_xact_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno,
} => format!("pg_multixact_members_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno,
} => format!("pg_multixact_offsets_{:04X}", segno),
RelishTag::FileNodeMap { spcnode, dbnode } => {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
RelishTag::ControlFile => format!("pg_control"),
};
format!(
"{}_{}_{:016X}_{:016X}{}",
basename,
self.seg.segno,
u64::from(self.start_lsn),
u64::from(self.end_lsn),
if self.dropped { "_DROPPED" } else { "" }
)
}
}
impl fmt::Display for SnapshotFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.to_string())
}
}
///
/// SnapshotLayer is the in-memory data structure associated with an
/// on-disk snapshot file. We keep a SnapshotLayer in memory for each
/// file, in the LayerMap. If a layer is in "loaded" state, we have a
/// copy of the file in memory, in 'inner'. Otherwise the struct is
/// just a placeholder for a file that exists on disk, and it needs to
/// be loaded before using it in queries.
///
pub struct SnapshotLayer {
conf: &'static PageServerConf,
pub tenantid: ZTenantId,
pub timelineid: ZTimelineId,
pub seg: SegmentTag,
//
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
// start is inclusive, and end is exclusive.
pub start_lsn: Lsn,
pub end_lsn: Lsn,
dropped: bool,
inner: Mutex<SnapshotLayerInner>,
}
pub struct SnapshotLayerInner {
/// If false, the 'page_versions' and 'relsizes' have not been
/// loaded into memory yet.
loaded: bool,
/// All versions of all pages in the file are kept here.
/// Indexed by block number and LSN.
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
/// `relsizes` tracks the size of the relation at different points in time.
relsizes: BTreeMap<Lsn, u32>,
}
impl Layer for SnapshotLayer {
fn get_timeline_id(&self) -> ZTimelineId {
return self.timelineid;
}
fn get_seg_tag(&self) -> SegmentTag {
return self.seg;
}
fn is_dropped(&self) -> bool {
return self.dropped;
}
fn get_start_lsn(&self) -> Lsn {
return self.start_lsn;
}
fn get_end_lsn(&self) -> Lsn {
return self.end_lsn;
}
/// Look up given page in the cache.
fn get_page_reconstruct_data(
&self,
blknum: u32,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<Option<Lsn>> {
// Scan the BTreeMap backwards, starting from the given entry.
let mut need_base_image_lsn: Option<Lsn> = Some(lsn);
{
let inner = self.load()?;
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let mut iter = inner
.page_versions
.range((Included(&minkey), Included(&maxkey)));
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
if let Some(img) = &entry.page_image {
reconstruct_data.page_img = Some(img.clone());
need_base_image_lsn = None;
break;
} else if let Some(rec) = &entry.record {
reconstruct_data.records.push(rec.clone());
if rec.will_init {
// This WAL record initializes the page, so no need to go further back
need_base_image_lsn = None;
break;
} else {
need_base_image_lsn = Some(*entry_lsn);
}
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
}
}
// release lock on 'inner'
}
Ok(need_base_image_lsn)
}
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
// Scan the BTreeMap backwards, starting from the given entry.
let inner = self.load()?;
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
if let Some((_entry_lsn, entry)) = iter.next_back() {
let result = *entry;
drop(inner);
trace!("get_seg_size: {} at {} -> {}", self.seg, lsn, result);
Ok(result)
} else {
error!(
"No size found for {} at {} in snapshot layer {} {}-{}",
self.seg, lsn, self.seg, self.start_lsn, self.end_lsn
);
bail!(
"No size found for {} at {} in snapshot layer",
self.seg,
lsn
);
}
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
// Is the requested LSN after the rel was dropped?
if self.dropped && lsn >= self.end_lsn {
return Ok(false);
}
// Otherwise, it exists.
Ok(true)
}
}
impl SnapshotLayer {
fn path(&self) -> PathBuf {
Self::path_for(
self.conf,
self.timelineid,
self.tenantid,
&SnapshotFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
)
}
fn path_for(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
fname: &SnapshotFileName,
) -> PathBuf {
conf.timeline_path(&timelineid, &tenantid)
.join(fname.to_string())
}
/// Create a new snapshot file, using the given btreemaps containing the page versions and
/// relsizes.
///
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
/// expedient.
pub fn create(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
relsizes: BTreeMap<Lsn, u32>,
) -> Result<SnapshotLayer> {
let snapfile = SnapshotLayer {
conf,
timelineid,
tenantid,
seg,
start_lsn,
end_lsn,
dropped,
inner: Mutex::new(SnapshotLayerInner {
loaded: true,
page_versions,
relsizes,
}),
};
let inner = snapfile.inner.lock().unwrap();
// Write the in-memory btreemaps into a file
let path = snapfile.path();
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let file = File::create(&path)?;
let book = BookWriter::new(file, SNAPSHOT_FILE_MAGIC)?;
// Write out page versions
let mut chapter = book.new_chapter(PAGE_VERSIONS_CHAPTER);
let buf = BTreeMap::ser(&inner.page_versions)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
// and relsizes to separate chapter
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
let buf = BTreeMap::ser(&inner.relsizes)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
book.close()?;
trace!("saved {}", &path.display());
drop(inner);
Ok(snapfile)
}
///
/// Load the contents of the file into memory
///
fn load(&self) -> Result<MutexGuard<SnapshotLayerInner>> {
// quick exit if already loaded
let mut inner = self.inner.lock().unwrap();
if inner.loaded {
return Ok(inner);
}
let path = Self::path_for(
self.conf,
self.timelineid,
self.tenantid,
&SnapshotFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
);
let file = File::open(&path)?;
let book = Book::new(file)?;
let chapter = book.read_chapter(PAGE_VERSIONS_CHAPTER)?;
let page_versions = BTreeMap::des(&chapter)?;
let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
let relsizes = BTreeMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
*inner = SnapshotLayerInner {
loaded: true,
page_versions,
relsizes,
};
Ok(inner)
}
/// Create SnapshotLayers representing all files on disk
///
// TODO: returning an Iterator would be more idiomatic
pub fn list_snapshot_files(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> Result<Vec<Arc<SnapshotLayer>>> {
let path = conf.timeline_path(&timelineid, &tenantid);
let mut snapfiles: Vec<Arc<SnapshotLayer>> = Vec::new();
for direntry in fs::read_dir(path)? {
let fname = direntry?.file_name();
let fname = fname.to_str().unwrap();
if let Some(snapfilename) = SnapshotFileName::from_str(fname) {
let snapfile = SnapshotLayer {
conf,
timelineid,
tenantid,
seg: snapfilename.seg,
start_lsn: snapfilename.start_lsn,
end_lsn: snapfilename.end_lsn,
dropped: snapfilename.dropped,
inner: Mutex::new(SnapshotLayerInner {
loaded: false,
page_versions: BTreeMap::new(),
relsizes: BTreeMap::new(),
}),
};
snapfiles.push(Arc::new(snapfile));
}
}
return Ok(snapfiles);
}
pub fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
Ok(())
}
///
/// Release most of the memory used by this layer. If it's accessed again later,
/// it will need to be loaded back.
///
pub fn unload(&self) -> Result<()> {
let mut inner = self.inner.lock().unwrap();
inner.page_versions = BTreeMap::new();
inner.relsizes = BTreeMap::new();
inner.loaded = false;
Ok(())
}
/// debugging function to print out the contents of the layer
#[allow(unused)]
pub fn dump(&self) -> String {
let mut result = format!(
"----- snapshot layer for {} {}-{} ----\n",
self.seg, self.start_lsn, self.end_lsn
);
let inner = self.inner.lock().unwrap();
for (k, v) in inner.relsizes.iter() {
result += &format!("{}: {}\n", k, v);
}
//for (k, v) in inner.page_versions.iter() {
// result += &format!("blk {} at {}: {}/{}\n", k.0, k.1, v.page_image.is_some(), v.record.is_some());
//}
result
}
}
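// A small illustrative test module, not part of the original commit: it exercises the
// SnapshotFileName parsing and formatting defined above, using the example filename from
// the module comment. It relies only on items declared in this file.
#[cfg(test)]
mod snapshot_file_name_examples {
    use super::*;

    #[test]
    fn filename_round_trip() {
        let name = "rel_1663_13990_2609_0_0_000000000169C348_000000000169C349";
        let parsed = SnapshotFileName::from_str(name).expect("filename should parse");
        assert_eq!(parsed.seg.segno, 0);
        assert_eq!(parsed.start_lsn, Lsn(0x0169C348));
        assert_eq!(parsed.end_lsn, Lsn(0x0169C349));
        assert!(!parsed.dropped);
        // Formatting the parsed name must reproduce the original string.
        assert_eq!(parsed.to_string(), name);

        // The '_DROPPED' suffix only sets the 'dropped' flag.
        let dropped = SnapshotFileName::from_str(&format!("{}_DROPPED", name)).unwrap();
        assert!(dropped.dropped);
    }
}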

View File

@@ -0,0 +1,128 @@
//!
//! Common traits and structs for layers
//!
use crate::relish::RelishTag;
use crate::repository::WALRecord;
use crate::ZTimelineId;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use zenith_utils::lsn::Lsn;
// Size of one segment in pages (10 MB)
pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
///
/// Each relish stored in the repository is divided into fixed-sized "segments",
/// with 10 MB of key-space, or 1280 8k pages each.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
pub struct SegmentTag {
pub rel: RelishTag,
pub segno: u32,
}
impl fmt::Display for SegmentTag {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}.{}", self.rel, self.segno)
}
}
impl SegmentTag {
pub const fn from_blknum(rel: RelishTag, blknum: u32) -> SegmentTag {
SegmentTag {
rel,
segno: blknum / RELISH_SEG_SIZE,
}
}
pub fn blknum_in_seg(&self, blknum: u32) -> bool {
blknum / RELISH_SEG_SIZE == self.segno
}
}
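// Illustrative sketch, not part of the original commit: how block numbers map to segments
// when each segment covers RELISH_SEG_SIZE = 1280 pages (10 MB).
#[cfg(test)]
mod segment_tag_examples {
    use super::*;

    #[test]
    fn blknum_to_segment_mapping() {
        // Blocks 0..=1279 belong to segment 0; block 1280 starts segment 1.
        let seg0 = SegmentTag::from_blknum(RelishTag::Checkpoint, 0);
        assert_eq!(seg0.segno, 0);
        assert!(seg0.blknum_in_seg(RELISH_SEG_SIZE - 1));
        assert!(!seg0.blknum_in_seg(RELISH_SEG_SIZE));

        let seg1 = SegmentTag::from_blknum(RelishTag::Checkpoint, RELISH_SEG_SIZE);
        assert_eq!(seg1.segno, 1);
    }
}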
///
/// Represents a version of a page at a specific LSN. The LSN is the key of the
/// entry in the 'page_versions' hash, it is not duplicated here.
///
/// A page version can be stored as a full page image, or as a WAL record that needs
/// to be applied over the previous page version to reconstruct this version.
///
/// It's also possible to have both a WAL record and a page image in the same
/// PageVersion. That happens if a page version is originally stored as a WAL record
/// but is later reconstructed by a GetPage@LSN request by performing WAL
/// redo. The get_page_at_lsn() code will store the reconstructed page image next to
/// the WAL record in that case. TODO: That's pretty accidental, not the result
/// of any grand design. If we want to keep reconstructed page versions around, we
/// probably should have a separate buffer cache so that we could control the
/// replacement policy globally. Or if we keep a reconstructed page image, we
/// could throw away the WAL record.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageVersion {
/// an 8kb page image
pub page_image: Option<Bytes>,
/// WAL record to get from previous page version to this one.
pub record: Option<WALRecord>,
}
///
/// Data needed to reconstruct a page version
///
/// 'page_img' is the old base image of the page to start the WAL replay with.
/// It can be None, if the first WAL record initializes the page (will_init)
/// 'records' contains the records to apply over the base image.
///
pub struct PageReconstructData {
pub records: Vec<WALRecord>,
pub page_img: Option<Bytes>,
}
///
/// A Layer holds all page versions for one segment of a relish, in a range of LSNs.
/// There are two kinds of layers, in-memory and snapshot layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access
/// to the recent page versions. Snapshot layers are stored on disk, and
/// are immutable. This trait presents the common functionality of
/// in-memory and snapshot layers.
///
/// Each layer contains a full snapshot of the segment at the start
/// LSN. In addition to that, it contains WAL (or more page images)
/// needed to reconstruct any page version up to the end LSN.
///
pub trait Layer: Send + Sync {
// These functions identify the relish segment and the LSN range
// that this Layer holds.
fn get_timeline_id(&self) -> ZTimelineId;
fn get_seg_tag(&self) -> SegmentTag;
fn get_start_lsn(&self) -> Lsn;
fn get_end_lsn(&self) -> Lsn;
fn is_dropped(&self) -> bool;
///
/// Return data needed to reconstruct given page at LSN.
///
/// It is up to the caller to collect more data from previous layers and
/// perform WAL redo, if necessary.
///
/// If this returns Some, the returned data is not complete. The caller needs
/// to continue with the returned 'lsn'.
///
/// Note that the 'blknum' is the offset of the page from the beginning
/// of the *relish*, not the beginning of the segment. The requested
/// 'blknum' must be covered by this segment.
fn get_page_reconstruct_data(
&self,
blknum: u32,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<Option<Lsn>>;
// Functions that correspond to the Timeline trait functions.
fn get_seg_size(&self, lsn: Lsn) -> Result<u32>;
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool>;
}
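// Hypothetical sketch, not part of the original commit: how a caller might interpret the
// result of a single get_page_reconstruct_data() call. Multi-layer traversal and the actual
// WAL redo are the repository's job and are deliberately omitted here.
#[allow(dead_code)]
fn describe_reconstruct_step(layer: &dyn Layer, blknum: u32, lsn: Lsn) -> Result<String> {
    let mut data = PageReconstructData {
        records: Vec::new(),
        page_img: None,
    };
    match layer.get_page_reconstruct_data(blknum, lsn, &mut data)? {
        // Some(cont_lsn): this layer alone is not enough; the caller must fetch an older
        // page version at 'cont_lsn' before replaying 'data.records' on top of it.
        Some(cont_lsn) => Ok(format!(
            "collected {} records, still need an older page version at {}",
            data.records.len(),
            cont_lsn
        )),
        // None: 'data' is complete; apply the records on top of 'data.page_img' (if any).
        None => Ok(format!(
            "complete: {} records, base image present: {}",
            data.records.len(),
            data.page_img.is_some()
        )),
    }
}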

View File

@@ -9,6 +9,7 @@ use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
pub mod basebackup;
pub mod branches;
pub mod layered_repository;
pub mod logger;
pub mod object_key;
pub mod object_repository;
@@ -54,6 +55,14 @@ pub struct PageServerConf {
pub auth_type: AuthType,
pub auth_validation_public_key_path: Option<PathBuf>,
pub repository_format: RepositoryFormat,
}
#[derive(Debug, Clone, PartialEq)]
pub enum RepositoryFormat {
Layered,
RocksDb,
}
impl PageServerConf {

View File

@@ -299,15 +299,13 @@ impl Timeline for ObjectTimeline {
// move this check out of the function.
//
match rel {
RelishTag::Slru { .. } |
RelishTag::TwoPhase{ .. } =>
{
RelishTag::Slru { .. } | RelishTag::TwoPhase { .. } => {
if !self.get_rel_exists(rel, req_lsn).unwrap_or(false) {
trace!("{:?} at {} doesn't exist", rel, req_lsn);
return Err(anyhow!("non-rel relish doesn't exist"));
}
},
_ => ()
}
_ => (),
};
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];

View File

@@ -2,11 +2,12 @@
//! page server.
use crate::branches;
use crate::layered_repository::LayeredRepository;
use crate::object_repository::ObjectRepository;
use crate::repository::Repository;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use crate::{PageServerConf, RepositoryFormat};
use anyhow::{anyhow, bail, Result};
use lazy_static::lazy_static;
use log::info;
@@ -27,16 +28,35 @@ pub fn init(conf: &'static PageServerConf) {
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
let tenantid =
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
// Set up an object repository, for actual data storage.
let repo =
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
let repo: Arc<dyn Repository + Sync + Send> = match conf.repository_format {
RepositoryFormat::Layered => {
let repo = Arc::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
));
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
repo
}
RepositoryFormat::RocksDb => {
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
Arc::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
Arc::new(walredo_mgr),
tenantid,
))
}
};
info!("initialized storage for tenant: {}", &tenantid);
m.insert(tenantid, Arc::new(repo));
m.insert(tenantid, repo);
}
}
@@ -53,7 +73,7 @@ pub fn create_repository_for_tenant(
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
m.insert(tenantid, Arc::new(repo));
m.insert(tenantid, repo);
Ok(())
}

View File

@@ -703,6 +703,18 @@ impl postgres_backend::Handler for PageServerHandler {
RowDescriptor::int8_col(b"control_deleted"),
RowDescriptor::int8_col(b"filenodemap_deleted"),
RowDescriptor::int8_col(b"dropped"),
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
@@ -715,6 +727,43 @@ impl postgres_backend::Handler for PageServerHandler {
Some(&result.control_deleted.to_string().as_bytes()),
Some(&result.filenodemap_deleted.to_string().as_bytes()),
Some(&result.dropped.to_string().as_bytes()),
Some(&result.snapshot_relfiles_total.to_string().as_bytes()),
Some(
&result
.snapshot_relfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
.snapshot_relfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(&result.snapshot_relfiles_not_updated.to_string().as_bytes()),
Some(&result.snapshot_relfiles_removed.to_string().as_bytes()),
Some(&result.snapshot_relfiles_dropped.to_string().as_bytes()),
Some(&result.snapshot_nonrelfiles_total.to_string().as_bytes()),
Some(
&result
.snapshot_nonrelfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
&result
.snapshot_nonrelfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(
&result
.snapshot_nonrelfiles_not_updated
.to_string()
.as_bytes(),
),
Some(&result.snapshot_nonrelfiles_removed.to_string().as_bytes()),
Some(&result.snapshot_nonrelfiles_dropped.to_string().as_bytes()),
Some(&result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;

View File

@@ -119,8 +119,16 @@ impl RelishTag {
| RelishTag::TwoPhase { .. } => true,
// and these don't
| RelishTag::ControlFile
| RelishTag::Checkpoint => false,
RelishTag::ControlFile | RelishTag::Checkpoint => false,
}
}
// convenience function to check if this relish is a normal relation.
pub const fn is_relation(&self) -> bool {
if let RelishTag::Relation(_) = self {
true
} else {
false
}
}
}

View File

@@ -5,6 +5,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::iter::Iterator;
use std::ops::AddAssign;
use std::sync::Arc;
use std::time::Duration;
use zenith_utils::lsn::Lsn;
@@ -56,6 +57,8 @@ pub trait Repository: Send + Sync {
///
#[derive(Default)]
pub struct GcResult {
// FIXME: These counters make sense for the ObjectRepository. They are not used
// by the LayeredRepository.
pub n_relations: u64,
pub inspected: u64,
pub truncated: u64,
@@ -66,9 +69,51 @@ pub struct GcResult {
pub control_deleted: u64, // RelishTag::ControlFile
pub filenodemap_deleted: u64, // RelishTag::FileNodeMap
pub dropped: u64,
// These are used for the LayeredRepository instead
pub snapshot_relfiles_total: u64,
pub snapshot_relfiles_needed_by_cutoff: u64,
pub snapshot_relfiles_needed_by_branches: u64,
pub snapshot_relfiles_not_updated: u64,
pub snapshot_relfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
pub snapshot_relfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
pub snapshot_nonrelfiles_total: u64,
pub snapshot_nonrelfiles_needed_by_cutoff: u64,
pub snapshot_nonrelfiles_needed_by_branches: u64,
pub snapshot_nonrelfiles_not_updated: u64,
pub snapshot_nonrelfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
pub snapshot_nonrelfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
pub elapsed: Duration,
}
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.n_relations += other.n_relations;
self.truncated += other.truncated;
self.deleted += other.deleted;
self.dropped += other.dropped;
self.snapshot_relfiles_total += other.snapshot_relfiles_total;
self.snapshot_relfiles_needed_by_cutoff += other.snapshot_relfiles_needed_by_cutoff;
self.snapshot_relfiles_needed_by_branches += other.snapshot_relfiles_needed_by_branches;
self.snapshot_relfiles_not_updated += other.snapshot_relfiles_not_updated;
self.snapshot_relfiles_removed += other.snapshot_relfiles_removed;
self.snapshot_relfiles_dropped += other.snapshot_relfiles_dropped;
self.snapshot_nonrelfiles_total += other.snapshot_nonrelfiles_total;
self.snapshot_nonrelfiles_needed_by_cutoff += other.snapshot_nonrelfiles_needed_by_cutoff;
self.snapshot_nonrelfiles_needed_by_branches +=
other.snapshot_nonrelfiles_needed_by_branches;
self.snapshot_nonrelfiles_not_updated += other.snapshot_nonrelfiles_not_updated;
self.snapshot_nonrelfiles_removed += other.snapshot_nonrelfiles_removed;
self.snapshot_nonrelfiles_dropped += other.snapshot_nonrelfiles_dropped;
self.elapsed += other.elapsed;
}
}
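// Illustrative test, not part of the original commit: per-timeline GC runs can be
// accumulated into one total with the AddAssign impl above. Only fields that the
// impl actually sums are checked here.
#[cfg(test)]
mod gc_result_examples {
    use super::*;

    #[test]
    fn accumulate_gc_results() {
        let mut one = GcResult::default();
        one.snapshot_relfiles_total = 3;
        one.snapshot_relfiles_removed = 1;
        one.elapsed = Duration::from_millis(5);

        let mut two = GcResult::default();
        two.snapshot_relfiles_total = 2;
        two.snapshot_relfiles_removed = 2;
        two.elapsed = Duration::from_millis(7);

        let mut total = GcResult::default();
        total += one;
        total += two;
        assert_eq!(total.snapshot_relfiles_total, 5);
        assert_eq!(total.snapshot_relfiles_removed, 3);
        assert_eq!(total.elapsed, Duration::from_millis(12));
    }
}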
pub trait Timeline: Send + Sync {
//------------------------------------------------------------------------------
// Public GET functions
@@ -239,11 +284,12 @@ impl WALRecord {
#[cfg(test)]
mod tests {
use super::*;
use crate::layered_repository::LayeredRepository;
use crate::object_repository::ObjectRepository;
use crate::object_repository::{ObjectValue, PageEntry, RelationSizeEntry};
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use crate::{PageServerConf, RepositoryFormat};
use postgres_ffi::pg_constants;
use std::fs;
use std::path::PathBuf;
@@ -277,10 +323,16 @@ mod tests {
buf.freeze()
}
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
fn get_test_repo(
test_name: &str,
repository_format: RepositoryFormat,
) -> Result<Box<dyn Repository>> {
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir).unwrap();
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;
let conf = PageServerConf {
daemonize: false,
@@ -293,6 +345,7 @@ mod tests {
pg_distrib_dir: "".into(),
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
repository_format,
};
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
@@ -300,24 +353,47 @@ mod tests {
let tenantid = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
let walredo_mgr = TestRedoManager {};
let repo =
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
let repo: Box<dyn Repository + Sync + Send> = match conf.repository_format {
RepositoryFormat::Layered => Box::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
)),
RepositoryFormat::RocksDb => {
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
Ok(Box::new(repo))
Box::new(ObjectRepository::new(
conf,
Arc::new(obj_store),
Arc::new(walredo_mgr),
tenantid,
))
}
};
Ok(repo)
}
/// Test get_relsize() and truncation.
#[test]
fn test_relsize() -> Result<()> {
fn test_relsize_rocksdb() -> Result<()> {
let repo = get_test_repo("test_relsize_rocksdb", RepositoryFormat::RocksDb)?;
test_relsize(&*repo)
}
#[test]
fn test_relsize_layered() -> Result<()> {
let repo = get_test_repo("test_relsize_layered", RepositoryFormat::Layered)?;
test_relsize(&*repo)
}
fn test_relsize(repo: &dyn Repository) -> Result<()> {
// get_timeline() with non-existent timeline id should fail
//repo.get_timeline("11223344556677881122334455667788");
// Create timeline to work on
let repo = get_test_repo("test_relsize")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
@@ -402,14 +478,24 @@ mod tests {
/// This isn't very interesting with the RocksDb implementation, as we don't pay
/// any attention to Postgres segment boundaries there.
#[test]
fn test_large_rel() -> Result<()> {
let repo = get_test_repo("test_large_rel")?;
fn test_large_rel_rocksdb() -> Result<()> {
let repo = get_test_repo("test_large_rel_rocksdb", RepositoryFormat::RocksDb)?;
test_large_rel(&*repo)
}
#[test]
fn test_large_rel_layered() -> Result<()> {
let repo = get_test_repo("test_large_rel_layered", RepositoryFormat::Layered)?;
test_large_rel(&*repo)
}
fn test_large_rel(repo: &dyn Repository) -> Result<()> {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
tline.init_valid_lsn(Lsn(1));
let mut lsn = 0;
let mut lsn = 1;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
lsn += 1;
@@ -440,6 +526,21 @@ mod tests {
pg_constants::RELSEG_SIZE - 1
);
// Truncate the relation to 3000 blocks, and then truncate it all the way down to 0, one block at a time
// This tests the behavior at segment boundaries
let mut size: i32 = 3000;
while size >= 0 {
lsn += 1;
tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
tline.advance_last_valid_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
size as u32
);
size -= 1;
}
Ok(())
}
@@ -455,15 +556,29 @@ mod tests {
}))
}
#[test]
fn test_branch_rocksdb() -> Result<()> {
let repo = get_test_repo("test_branch_rocksdb", RepositoryFormat::RocksDb)?;
test_branch(&*repo)
}
#[test]
fn test_branch_layered() -> Result<()> {
let repo = get_test_repo("test_branch_layered", RepositoryFormat::Layered)?;
test_branch(&*repo)
}
///
/// Test branch creation
///
#[test]
fn test_branch() -> Result<()> {
let repo = get_test_repo("test_branch")?;
fn test_branch(repo: &dyn Repository) -> Result<()> {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(1), ZERO_PAGE.clone(), false)?;
// Create a relation on the timeline
tline.init_valid_lsn(Lsn(1));
tline.put_page_image(TESTREL_A, 0, Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
@@ -505,8 +620,19 @@ mod tests {
}
#[test]
fn test_history() -> Result<()> {
let repo = get_test_repo("test_snapshot")?;
fn test_history_rocksdb() -> Result<()> {
let repo = get_test_repo("test_history_rocksdb", RepositoryFormat::RocksDb)?;
test_history(&*repo)
}
#[test]
// TODO: This doesn't work with the layered storage, the functions needed for push/pull
// functionality haven't been implemented yet.
#[ignore]
fn test_history_layered() -> Result<()> {
let repo = get_test_repo("test_history_layered", RepositoryFormat::Layered)?;
test_history(&*repo)
}
fn test_history(repo: &dyn Repository) -> Result<()> {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;

View File

@@ -132,6 +132,7 @@ pub fn import_timeline_from_postgres_datadir(
}
// TODO: Scan pg_tblspc
timeline.advance_last_valid_lsn(lsn);
timeline.checkpoint()?;
Ok(())
@@ -424,13 +425,13 @@ pub fn save_decoded_record(
let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
info!(
"unlink twophaseFile for xid {} parsed_xact.xid {} here",
decoded.xl_xid, parsed_xact.xid
trace!(
"unlink twophaseFile for xid {} parsed_xact.xid {} here at {}",
decoded.xl_xid, parsed_xact.xid, lsn
);
timeline.put_unlink(
RelishTag::TwoPhase {
xid: decoded.xl_xid,
xid: parsed_xact.xid,
},
lsn,
)?;
@@ -795,7 +796,13 @@ fn save_clog_truncate_record(
// Iterate via SLRU CLOG segments and unlink segments that we're ready to truncate
// TODO This implementation is very inefficient -
// it scans all non-rels only to find Clog
for obj in timeline.list_nonrels(lsn)? {
//
// We cannot pass 'lsn' to the Timeline.list_nonrels(), or it
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
let req_lsn = min(timeline.get_last_record_lsn(), lsn);
for obj in timeline.list_nonrels(req_lsn)? {
match obj {
RelishTag::Slru { slru, segno } => {
if slru == SlruKind::Clog {

View File

@@ -8,7 +8,7 @@ use crate::page_cache;
use crate::relish::*;
use crate::restore_local_repo;
use crate::waldecoder::*;
use crate::PageServerConf;
use crate::{PageServerConf, RepositoryFormat};
use anyhow::{Error, Result};
use lazy_static::lazy_static;
use log::*;
@@ -264,7 +264,11 @@ fn walreceiver_main(
)?;
if newest_segno - oldest_segno >= 10 {
timeline.checkpoint()?;
// FIXME: The layered repository performs checkpointing in a separate thread, so this
// isn't needed anymore. Remove 'checkpoint' from the Timeline trait altogether?
if conf.repository_format == RepositoryFormat::RocksDb {
timeline.checkpoint()?;
}
// TODO: This is where we could remove WAL older than last_rec_lsn.
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;

View File

@@ -20,6 +20,7 @@
//!
use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use serde::{Deserialize, Serialize};
use std::cell::RefCell;
@@ -36,6 +37,7 @@ use tokio::io::AsyncBufReadExt;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::process::{ChildStdin, ChildStdout, Command};
use tokio::time::timeout;
use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
@@ -103,6 +105,27 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
static TIMEOUT: Duration = Duration::from_secs(20);
// Metrics collected on WAL redo operations
//
// We collect the time spent in actual WAL redo ('redo'), and time waiting
// for access to the postgres process ('wait') since there is only one for
// each tenant.
lazy_static! {
static ref WAL_REDO_TIME: Histogram =
register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo")
.expect("failed to define a metric");
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
"pageserver_wal_redo_wait_time",
"Time spent waiting for access to the WAL redo process"
)
.expect("failed to define a metric");
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
"pageserver_wal_records_replayed",
"Number of WAL records replayed"
)
.unwrap();
}
///
/// This is the real implementation that uses a Postgres process to
/// perform WAL replay. Only one thread can use the process at a time,
@@ -156,6 +179,9 @@ impl WalRedoManager for PostgresRedoManager {
base_img: Option<Bytes>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
let start_time;
let lock_time;
let end_time;
let request = WalRedoRequest {
rel,
@@ -165,16 +191,29 @@ impl WalRedoManager for PostgresRedoManager {
records,
};
// launch the WAL redo process on first use
let mut process_guard = self.process.lock().unwrap();
if process_guard.is_none() {
let p = self.runtime
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
*process_guard = Some(p);
}
let process = (*process_guard).as_ref().unwrap();
start_time = Instant::now();
let result = {
let mut process_guard = self.process.lock().unwrap();
lock_time = Instant::now();
self.runtime.block_on(self.handle_apply_request(&process, &request))
// launch the WAL redo process on first use
if process_guard.is_none() {
let p = self
.runtime
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
*process_guard = Some(p);
}
let process = (*process_guard).as_ref().unwrap();
self.runtime
.block_on(self.handle_apply_request(&process, &request))
};
end_time = Instant::now();
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
result
}
}
@@ -197,12 +236,10 @@ fn mx_offset_to_member_offset(xid: MultiXactId) -> usize {
}
impl PostgresRedoManager {
///
/// Create a new PostgresRedoManager.
///
pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
// We block on waiting for requests on the walredo request channel, but
// use async I/O to communicate with the child process. Initialize the
// runtime for the async part.
@@ -244,6 +281,8 @@ impl PostgresRedoManager {
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
} else {
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
@@ -257,6 +296,8 @@ impl PostgresRedoManager {
for record in records {
let mut buf = record.rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
@@ -378,7 +419,7 @@ impl PostgresRedoManager {
panic!();
}
} else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
// Ralation map file has size 512 bytes
// Relation map file has size 512 bytes
page.clear();
page.extend_from_slice(&buf[12..]); // skip xl_relmap_update
assert!(page.len() == 512); // size of pg_filenode.map
@@ -557,6 +598,8 @@ impl PostgresRedoProcess {
for rec in records.iter() {
let r = rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
stdin
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.await?;

View File

@@ -14,7 +14,8 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
@pytest.mark.skip(reason=""""
Current GC test is flaky and overly strict. Since we are migrating to the layered repo format
with different GC implementation let's just silence this test for now.
with different GC implementation let's just silence this test for now. This test only
works with the RocksDB implementation.
""")
def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
zenith_cli.run(["branch", "test_gc", "empty"])

View File

@@ -0,0 +1,66 @@
import pytest
import random
import time
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
pytest_plugins = ("fixtures.zenith_fixtures")
# Test that the pageserver can be stopped and restarted, and that a compute node
# can still read its data through the pageserver after the restart.
def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
# One safekeeper is enough for this test.
wa_factory.start_n_new(1)
zenith_cli.run(["branch", "test_pageserver_restart", "empty"])
pg = postgres.create_start('test_pageserver_restart',
wal_acceptors=wa_factory.get_connstrs())
pg_conn = pg.connect()
cur = pg_conn.cursor()
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
# of this test.
cur.execute('CREATE TABLE foo (t text)')
cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100000) g
''')
# Verify that the table is larger than shared_buffers
cur.execute('''
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
from pg_settings where name = 'shared_buffers'
''')
row = cur.fetchone()
print("shared_buffers is {}, table size {}", row[0], row[1]);
assert int(row[0]) < int(row[1])
# Stop and restart pageserver. This is a more or less graceful shutdown, although
# the page server doesn't currently have a shutdown routine so there's no difference
# between stopping and crashing.
pageserver.stop()
pageserver.start()
# Stopping the pageserver breaks the connection from the postgres backend to
# the page server, and causes the next query on the connection to fail. Start a new
# postgres connection too, to avoid that error. (Ideally, the compute node would
# handle that and retry internally, without propagating the error to the user, but
# currently it doesn't...)
pg_conn = pg.connect()
cur = pg_conn.cursor()
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000, )
# Stop the page server by force, and restart it
pageserver.stop()
pageserver.start()

View File

@@ -0,0 +1,124 @@
from contextlib import closing
import psycopg2.extras
import time
pytest_plugins = ("fixtures.zenith_fixtures")
def print_gc_result(row):
print("GC duration {elapsed} ms".format_map(row));
print(" REL total: {snapshot_relfiles_total}, needed_by_cutoff {snapshot_relfiles_needed_by_cutoff}, needed_by_branches: {snapshot_relfiles_needed_by_branches}, not_updated: {snapshot_relfiles_not_updated}, removed: {snapshot_relfiles_removed}, dropped: {snapshot_relfiles_dropped}".format_map(row))
print(" NONREL total: {snapshot_nonrelfiles_total}, needed_by_cutoff {snapshot_nonrelfiles_needed_by_cutoff}, needed_by_branches: {snapshot_nonrelfiles_needed_by_branches}, not_updated: {snapshot_nonrelfiles_not_updated}, removed: {snapshot_nonrelfiles_removed}, dropped: {snapshot_nonrelfiles_dropped}".format_map(row))
#
# Test Garbage Collection of old snapshot files
#
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_snapfiles_gc", "empty"])
pg = postgres.create_start('test_snapfiles_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create a test table
cur.execute("CREATE TABLE foo(x integer)")
cur.execute("INSERT INTO foo VALUES (1)")
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass");
row = cur.fetchone();
print("relfilenode is {}", row[0]);
# Run GC, to clear out any garbage left behind in the catalogs by
# the CREATE TABLE command. We want to have a clean slate with no garbage
# before running the actual tests below, otherwise the counts won't match
# what we expect.
#
# Also run vacuum first to make it less likely that autovacuum or pruning
# kicks in and confuses our numbers.
cur.execute("VACUUM")
# delete the row, to update the Visibility Map. We don't want the VM
# update to confuse our numbers either.
cur.execute("DELETE FROM foo")
print("Running GC before test")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
# remember the number of files
snapshot_relfiles_remain = row['snapshot_relfiles_total'] - row['snapshot_relfiles_removed']
assert snapshot_relfiles_remain > 0
# Insert a row.
print("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
# Insert two more rows and run GC.
# This should create a new snapshot file with the new contents, and
# remove the old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
# Do it again. Should again create a new snapshot file and remove old one.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
assert row['snapshot_relfiles_removed'] == 1
assert row['snapshot_relfiles_dropped'] == 0
# Run GC again, with no changes in the database. Should not remove anything.
print("Run GC again, with nothing to do")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
assert row['snapshot_relfiles_removed'] == 0
assert row['snapshot_relfiles_dropped'] == 0
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
#
print("Drop table and run GC again");
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
# Each relation fork is counted separately, hence 3.
assert row['snapshot_relfiles_dropped'] == 3
# The catalog updates also create new snapshot files of the catalogs, which
# are counted as 'removed'
assert row['snapshot_relfiles_removed'] > 0
# TODO: perhaps we should count catalog and user relations separately,
# to make this kind of testing more robust

View File

@@ -1,3 +1,5 @@
import os
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
@@ -28,24 +30,59 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
cur.execute("INSERT INTO foo VALUES ('two')")
cur.execute("PREPARE TRANSACTION 'insert_two'")
# Prepare a transaction that will insert a row
cur.execute('BEGIN')
cur.execute("INSERT INTO foo VALUES ('three')")
cur.execute("PREPARE TRANSACTION 'insert_three'")
# Prepare another transaction that will insert a row
cur.execute('BEGIN')
cur.execute("INSERT INTO foo VALUES ('four')")
cur.execute("PREPARE TRANSACTION 'insert_four'")
# On checkpoint state data copied to files in
# pg_twophase directory and fsynced
cur.execute('CHECKPOINT')
twophase_files = os.listdir(pg.pg_twophase_dir_path())
print(twophase_files)
assert len(twophase_files) == 4
cur.execute("COMMIT PREPARED 'insert_three'")
cur.execute("ROLLBACK PREPARED 'insert_four'")
cur.execute('CHECKPOINT')
twophase_files = os.listdir(pg.pg_twophase_dir_path())
print(twophase_files)
assert len(twophase_files) == 2
# Create a branch with the transaction in prepared state
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
pg2 = postgres.create_start(
# Create compute node, but don't start.
# We want to observe pgdata before postgres starts
pg2 = postgres.create(
'test_twophase_prepared',
config_lines=['max_prepared_transactions=5'],
)
# Check that we restored only needed twophase files
twophase_files2 = os.listdir(pg2.pg_twophase_dir_path())
print(twophase_files2)
assert sorted(twophase_files2) == sorted(twophase_files)
pg2 = pg2.start()
conn2 = pg2.connect()
cur2 = conn2.cursor()
# On the new branch, commit one of the prepared transactions, abort the other one.
# On the new branch, commit one of the prepared transactions,
# abort the other one.
cur2.execute("COMMIT PREPARED 'insert_one'")
cur2.execute("ROLLBACK PREPARED 'insert_two'")
cur2.execute('SELECT * FROM foo')
assert cur2.fetchall() == [('one', )]
assert cur2.fetchall() == [('one',), ('three',)]
# Neither insert is visible on the original branch, the transactions are still
# in prepared state there.
# Only one committed insert is visible on the original branch
cur.execute('SELECT * FROM foo')
assert cur.fetchall() == []
assert cur.fetchall() == [('three',)]

View File

@@ -4,7 +4,7 @@ import time
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -61,7 +61,7 @@ def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: Post
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
# times, with fault_probability chance of getting a wal acceptor down or up
# along the way. 2 of 3 are always alive, so the work keeps going.
def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory):
def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
fault_probability = 0.01
n_inserts = 1000
n_acceptors = 3

View File

@@ -1,4 +1,3 @@
import getpass
import os
import pathlib
import uuid
@@ -7,9 +6,11 @@ import pytest
import shutil
import signal
import subprocess
import time
from contextlib import closing
from pathlib import Path
from dataclasses import dataclass
# Type-related stuff
from psycopg2.extensions import connection as PgConnection
@@ -266,6 +267,7 @@ class Postgres(PgProtocol):
branch: str,
wal_acceptors: Optional[str] = None,
config_lines: Optional[List[str]] = None,
config_only: bool = False,
) -> 'Postgres':
"""
Create the pg data directory.
@@ -277,7 +279,10 @@ class Postgres(PgProtocol):
if not config_lines:
config_lines = []
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
if config_only:
self.zenith_cli.run(['pg', 'create', '--config-only', branch, f'--tenantid={self.tenant_id}'])
else:
self.zenith_cli.run(['pg', 'create', branch, f'--tenantid={self.tenant_id}'])
self.branch = branch
if wal_acceptors is not None:
self.adjust_for_wal_acceptors(wal_acceptors)
@@ -304,6 +309,13 @@ class Postgres(PgProtocol):
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch / 'pg_xact'
return os.path.join(self.repo_dir, path)
def pg_twophase_dir_path(self) -> str:
""" Path to pg_twophase dir """
print(self.tenant_id)
print(self.branch)
path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch / 'pg_twophase'
return os.path.join(self.repo_dir, path)
def config_file_path(self) -> str:
""" Path to postgresql.conf """
filename = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id / self.branch / 'postgresql.conf'
@@ -376,7 +388,8 @@ class Postgres(PgProtocol):
config_lines: Optional[List[str]] = None,
) -> 'Postgres':
"""
Create a Postgres instance, then start it.
Create a Postgres instance, apply config
and then start it.
Returns self.
"""
@@ -384,6 +397,7 @@ class Postgres(PgProtocol):
branch=branch,
wal_acceptors=wal_acceptors,
config_lines=config_lines,
config_only=True,
).start()
return self
@@ -429,6 +443,54 @@ class PostgresFactory:
config_lines=config_lines,
)
def create(
self,
branch: str = "main",
tenant_id: Optional[str] = None,
wal_acceptors: Optional[str] = None,
config_lines: Optional[List[str]] = None
) -> Postgres:
pg = Postgres(
zenith_cli=self.zenith_cli,
repo_dir=self.repo_dir,
tenant_id=tenant_id or self.initial_tenant,
port=self.base_port + self.num_instances + 1,
)
self.num_instances += 1
self.instances.append(pg)
return pg.create(
branch=branch,
wal_acceptors=wal_acceptors,
config_lines=config_lines,
)
def config(
self,
branch: str = "main",
tenant_id: Optional[str] = None,
wal_acceptors: Optional[str] = None,
config_lines: Optional[List[str]] = None
) -> Postgres:
pg = Postgres(
zenith_cli=self.zenith_cli,
repo_dir=self.repo_dir,
tenant_id=tenant_id or self.initial_tenant,
port=self.base_port + self.num_instances + 1,
)
self.num_instances += 1
self.instances.append(pg)
return pg.config(
branch=branch,
wal_acceptors=wal_acceptors,
config_lines=config_lines,
)
def stop_all(self) -> 'PostgresFactory':
for pg in self.instances:
pg.stop()
@@ -511,26 +573,27 @@ def pg_bin(test_output_dir: str, pg_distrib_dir: str) -> PgBin:
return PgBin(test_output_dir, pg_distrib_dir)
def read_pid(path):
def read_pid(path: Path):
""" Read content of file into number """
return int(Path(path).read_text())
return int(path.read_text())
@dataclass
class WalAcceptor:
""" An object representing a running wal acceptor daemon. """
def __init__(self, wa_binpath, data_dir, port, num, auth_token: Optional[str] = None):
self.wa_binpath = wa_binpath
self.data_dir = data_dir
self.port = port
self.num = num # identifier for logging
self.auth_token = auth_token
bin_path: Path
data_dir: Path
port: int
num: int # identifier for logging
auth_token: Optional[str] = None
def start(self) -> 'WalAcceptor':
# create data directory if not exists
Path(self.data_dir).mkdir(parents=True, exist_ok=True)
self.data_dir.mkdir(parents=True, exist_ok=True)
self.pidfile.unlink(missing_ok=True)
cmd = [self.wa_binpath]
cmd.extend(["-D", self.data_dir])
cmd = [str(self.bin_path)]
cmd.extend(["-D", str(self.data_dir)])
cmd.extend(["-l", "localhost:{}".format(self.port)])
cmd.append("--daemonize")
cmd.append("--no-sync")
@@ -541,38 +604,51 @@ class WalAcceptor:
env = {'PAGESERVER_AUTH_TOKEN': self.auth_token} if self.auth_token else None
subprocess.run(cmd, check=True, env=env)
return self
# wait for the wal acceptor to start by checking that the pid is readable
for _ in range(3):
pid = self.get_pid()
if pid is not None:
return self
time.sleep(0.5)
raise RuntimeError("cannot get wal acceptor pid")
@property
def pidfile(self) -> Path:
return self.data_dir / "wal_acceptor.pid"
def get_pid(self) -> Optional[int]:
if not self.pidfile.exists():
return None
try:
pid = read_pid(self.pidfile)
except ValueError:
return None
return pid
def stop(self) -> 'WalAcceptor':
print('Stopping wal acceptor {}'.format(self.num))
pidfile_path = os.path.join(self.data_dir, "wal_acceptor.pid")
try:
pid = read_pid(pidfile_path)
try:
os.kill(pid, signal.SIGTERM)
except Exception:
pass # pidfile might be obsolete
# TODO: cleanup pid file on exit in wal acceptor
return self
# for _ in range(5):
# print('waiting wal acceptor {} (pid {}) to stop...', self.num, pid)
# try:
# read_pid(pidfile_path)
# except FileNotFoundError:
# return # done
# time.sleep(1)
# raise Exception('Failed to wait for wal acceptor {} shutdown'.format(self.num))
except FileNotFoundError:
pid = self.get_pid()
if pid is None:
print("Wal acceptor {} is not running".format(self.num))
return self
try:
os.kill(pid, signal.SIGTERM)
except Exception:
# TODO: cleanup pid file on exit in wal acceptor
pass # pidfile might be obsolete
return self
class WalAcceptorFactory:
""" An object representing multiple running wal acceptors. """
def __init__(self, zenith_binpath, data_dir):
self.wa_binpath = os.path.join(zenith_binpath, 'wal_acceptor')
def __init__(self, zenith_binpath: Path, data_dir: Path):
self.wa_binpath = zenith_binpath / 'wal_acceptor'
self.data_dir = data_dir
self.instances = []
self.instances: List[WalAcceptor] = []
self.initial_port = 54321
def start_new(self, auth_token: Optional[str] = None) -> WalAcceptor:
@@ -583,7 +659,7 @@ class WalAcceptorFactory:
wa_num = len(self.instances)
wa = WalAcceptor(
self.wa_binpath,
os.path.join(self.data_dir, "wal_acceptor_{}".format(wa_num)),
self.data_dir / "wal_acceptor_{}".format(wa_num),
self.initial_port + wa_num,
wa_num,
auth_token,
@@ -613,7 +689,7 @@ class WalAcceptorFactory:
@zenfixture
def wa_factory(zenith_binpath: str, repo_dir: str) -> Iterator[WalAcceptorFactory]:
""" Gives WalAcceptorFactory providing wal acceptors. """
wafactory = WalAcceptorFactory(zenith_binpath, os.path.join(repo_dir, "wal_acceptors"))
wafactory = WalAcceptorFactory(Path(zenith_binpath), Path(repo_dir) / "wal_acceptors")
yield wafactory
# After the yield comes any cleanup code we need.
print('Starting wal acceptors cleanup')

View File

@@ -2,7 +2,7 @@
//!
//! FIXME: better description needed here
use anyhow::{bail, Result};
use anyhow::{bail, Context, Result};
use bincode::config::Options;
use bytes::{Buf, Bytes};
use log::*;
@@ -236,7 +236,9 @@ impl<'pg> ReceiveWalConn<'pg> {
.write_message(&BeMessage::CopyBothResponse)?;
// Receive information about server
let server_info = self.read_msg::<ServerInfo>()?;
let server_info = self
.read_msg::<ServerInfo>()
.context("Failed to receive server info")?;
info!(
"Start handshake with wal_proposer {} sysid {} timeline {} tenant {}",
self.peer_addr, server_info.system_id, server_info.timeline_id, server_info.tenant_id,
@@ -284,7 +286,9 @@ impl<'pg> ReceiveWalConn<'pg> {
self.write_msg(&my_info)?;
/* Wait for vote request */
let prop = self.read_msg::<RequestVote>()?;
let prop = self
.read_msg::<RequestVote>()
.context("Failed to read vote request")?;
/* This is Paxos check which should ensure that only one master can perform commits */
if prop.node_id < my_info.server.node_id {
/* Send my node-id to inform proposer that it's candidate was rejected */
@@ -331,7 +335,8 @@ impl<'pg> ReceiveWalConn<'pg> {
let msg_bytes = self.read_msg_bytes()?;
let mut msg_reader = msg_bytes.reader();
let req = SafeKeeperRequest::des_from(&mut msg_reader)?;
let req = SafeKeeperRequest::des_from(&mut msg_reader)
.context("Failed to get WAL message header")?;
if req.sender_id != my_info.server.node_id {
bail!("Sender NodeId is changed");
}

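The hunks above wrap the safekeeper's message reads in anyhow's `Context` so that a decoding failure reports which message was being read. Below is a minimal, self-contained sketch of that pattern; `parse_len` and its byte layout are invented for illustration and are not part of this codebase.

```rust
use std::convert::TryInto;

use anyhow::{Context, Result};

// Hypothetical header parser: Context converts an Option into a Result and
// annotates an Err, just like the read_msg()/des_from() calls above.
fn parse_len(buf: &[u8]) -> Result<u32> {
    let bytes: [u8; 4] = buf
        .get(..4)
        .context("message too short to contain a length header")?
        .try_into()
        .context("Failed to decode length header")?;
    Ok(u32::from_be_bytes(bytes))
}

fn main() {
    // Printing with {:#} shows the added context along with the underlying cause.
    if let Err(e) = parse_len(&[0x01]) {
        eprintln!("{:#}", e);
    }
}
```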
View File

@@ -61,6 +61,13 @@ fn main() -> Result<()> {
.long("enable-auth")
.takes_value(false)
.help("Enable authentication using ZenithJWT")
)
.arg(
Arg::with_name("repository-format")
.long("repository-format")
.takes_value(true)
.value_name("repository-format")
.help("Choose repository format, 'layered' or 'rocksdb'")
),
)
.subcommand(
@@ -85,8 +92,18 @@ fn main() -> Result<()> {
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage postgres instances")
.subcommand(SubCommand::with_name("list").arg(tenantid_arg.clone()))
.subcommand(SubCommand::with_name("create").arg(timeline_arg.clone()).arg(tenantid_arg.clone()))
.subcommand(SubCommand::with_name("start").arg(timeline_arg.clone()).arg(tenantid_arg.clone()))
.subcommand(SubCommand::with_name("create")
.about("Create a postgres compute node")
.arg(timeline_arg.clone()).arg(tenantid_arg.clone())
.arg(
Arg::with_name("config-only")
.help("Don't do basebackup, create compute node with only config files")
.long("config-only")
.required(false)
))
.subcommand(SubCommand::with_name("start")
.about("Start a postrges compute node.\n This command actually creates new node from scrath, but preserves existing config files")
.arg(timeline_arg.clone()).arg(tenantid_arg.clone()))
.subcommand(
SubCommand::with_name("stop")
.arg(timeline_arg.clone())
@@ -131,8 +148,8 @@ fn main() -> Result<()> {
} else {
AuthType::Trust
};
local_env::init(pageserver_uri, tenantid, auth_type)
let repository_format = init_match.value_of("repository-format");
local_env::init(pageserver_uri, tenantid, auth_type, repository_format)
.with_context(|| "Failed to create config file")?;
}
@@ -151,6 +168,7 @@ fn main() -> Result<()> {
if let Err(e) = pageserver.init(
Some(&env.tenantid.to_string()),
init_match.is_present("enable-auth"),
init_match.value_of("repository-format"),
) {
eprintln!("pageserver init failed: {}", e);
exit(1);
@@ -451,10 +469,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let timeline_name = create_match.value_of("timeline").unwrap_or("main");
// check is that timeline doesnt already exist
// this check here is because it
let config_only = create_match.is_present("config-only");
cplane.new_node(tenantid, timeline_name)?;
cplane.new_node(tenantid, timeline_name, config_only)?;
}
("start", Some(start_match)) => {
let tenantid: ZTenantId = start_match
@@ -475,7 +492,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
if let Some(node) = node {
node.start(&auth_token)?;
} else {
let node = cplane.new_node(tenantid, timeline_name)?;
let node = cplane.new_node(tenantid, timeline_name, false)?;
node.start(&auth_token)?;
}
}

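The `--config-only` flag and `--repository-format` option added above use the two basic clap 2.x argument shapes: a bare flag queried with `is_present()`, and a valued option read with `value_of()` (hence `takes_value(true)` on the latter). A self-contained sketch with invented program and argument names:

```rust
use clap::{App, Arg};

fn main() {
    // Hypothetical CLI mirroring the two argument shapes used above.
    let matches = App::new("example")
        .arg(
            Arg::with_name("config-only")
                .long("config-only")
                .takes_value(false), // boolean flag: only its presence matters
        )
        .arg(
            Arg::with_name("repository-format")
                .long("repository-format")
                .takes_value(true), // valued option: --repository-format layered
        )
        .get_matches_from(vec![
            "example",
            "--config-only",
            "--repository-format",
            "layered",
        ]);

    let config_only = matches.is_present("config-only"); // true
    let repo_format = matches.value_of("repository-format"); // Some("layered")
    println!("config_only={}, repository_format={:?}", config_only, repo_format);
}
```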
View File

@@ -301,8 +301,9 @@ impl PostgresBackend {
FeMessage::Query(m) => {
trace!("got query {:?}", m.body);
// xxx distinguish fatal and recoverable errors?
if let Err(e) = handler.process_query(self, m.body) {
if let Err(e) = handler.process_query(self, m.body.clone()) {
let errmsg = format!("{}", e);
warn!("query handler for {:?} failed: {}", m.body, errmsg);
self.write_message_noflush(&BeMessage::ErrorResponse(errmsg))?;
}
self.write_message(&BeMessage::ReadyForQuery)?;

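The `.clone()` added above keeps the query body around for the error log after `process_query` has taken its copy; assuming `m.body` is a `bytes::Bytes` buffer, as elsewhere in this codebase, the clone is a cheap reference-count bump rather than a copy of the query text. A tiny sketch of that property:

```rust
use bytes::Bytes;

fn main() {
    let body = Bytes::from_static(b"SELECT 1");
    // Cloning Bytes shares the underlying buffer; no bytes are copied.
    let for_handler = body.clone();
    drop(for_handler); // pretend the query handler consumed its copy
    // The original is still available for logging the failed query.
    println!("query handler for {:?} failed", body);
}
```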
View File

@@ -126,7 +126,7 @@ macro_rules! zid_newtype {
/// is separate from PostgreSQL timelines, and doesn't have those
/// limitations. A zenith timeline is identified by a 128-bit ID, which
/// is usually printed out as a hex string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
pub struct ZTimelineId(ZId);
zid_newtype!(ZTimelineId);
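Adding `Ord`/`PartialOrd` to the derive list lets callers sort timeline IDs and use them as keys in ordered collections such as `BTreeMap`. A small sketch of the idea on a stand-in newtype (`ExampleId` is invented; `ZTimelineId` itself wraps the 128-bit ID described above):

```rust
use std::collections::BTreeMap;

// Stand-in for a 128-bit ID newtype; the derives match the ones added above.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)]
struct ExampleId(u128);

fn main() {
    let mut timelines: BTreeMap<ExampleId, &str> = BTreeMap::new();
    timelines.insert(ExampleId(0xbeef), "branch");
    timelines.insert(ExampleId(0x1), "main");

    // Iteration order follows the derived Ord implementation.
    for (id, name) in &timelines {
        println!("{:032x} -> {}", id.0, name);
    }
}
```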