add missing expected file

Add test for relation truncation
[refer #190] Refactoring based on the reviews
2026-05-22 15:41:15 +00:00 · 2021-05-27 20:52:33 +03:00 · 2021-05-27 15:52:19 +03:00 · 2021-05-27 10:11:01 +03:00 · 2021-05-27 10:00:04 +03:00 · 2021-05-26 20:27:43 +03:00
108 changed files with 8718 additions and 12753 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -0,0 +1,267 @@
+version: 2.1
+
+orbs:
+  python: circleci/python@1.4.0
+
+executors:
+  zenith-build-executor:
+    resource_class: xlarge
+    docker:
+      - image: cimg/rust:1.51.0
+
+jobs:
+
+  # A job to build postgres
+  build-postgres:
+    executor: zenith-build-executor
+    steps:
+        # Checkout the git repo (circleci doesn't have a flag to enable submodules here)
+      - checkout
+
+        # Grab the postgres git revision to build a cache key.
+        # Note this works even though the submodule hasn't been checked out yet.
+      - run:
+          name: Get postgres cache key
+          command: |
+            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
+
+      - restore_cache:
+          name: Restore postgres cache
+          keys:
+            # Restore ONLY if the rev key matches exactly
+            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
+
+        # FIXME We could cache our own docker container, instead of installing packages every time.
+      - run:
+          name: apt install dependencies
+          command: |
+            if [ ! -e tmp_install/bin/postgres ]; then
+              sudo apt update
+              sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libxml2-dev libcurl4-openssl-dev
+            fi
+
+        # Build postgres if the restore_cache didn't find a build.
+        # `make` can't figure out whether the cache is valid, since
+        # it only compares file timestamps.
+      - run:
+          name: build postgres
+          command: |
+            if [ ! -e tmp_install/bin/postgres ]; then
+              # "depth 1" saves some time by not cloning the whole repo
+              git submodule update --init --depth 1
+              make postgres
+            fi
+
+      - save_cache:
+          name: Save postgres cache
+          key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
+          paths:
+            - tmp_install
+
+  # A job to build zenith rust code
+  build-zenith:
+    executor: zenith-build-executor
+    parameters:
+      build_type:
+        type: enum
+        enum: ["debug", "release"]
+    steps:
+      - run:
+          name: apt install dependencies
+          command: |
+            sudo apt update
+            sudo apt install libssl-dev clang
+
+        # Checkout the git repo (without submodules)
+      - checkout
+
+        # Grab the postgres git revision to build a cache key.
+        # Note this works even though the submodule hasn't been checked out yet.
+      - run:
+          name: Get postgres cache key
+          command: |
+            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
+
+      - restore_cache:
+          name: Restore postgres cache
+          keys:
+            # Restore ONLY if the rev key matches exactly
+            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
+
+      - restore_cache:
+          name: Restore rust cache
+          keys:
+            # Require an exact match. While an out of date cache might speed up the build,
+            # there's no way to clean out old packages, so the cache grows every time something
+            # changes.
+            - v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+
+        # Build the rust code, including test binaries
+      - run:
+          name: Rust build << parameters.build_type >>
+          command: |
+            export CARGO_INCREMENTAL=0
+            BUILD_TYPE="<< parameters.build_type >>"
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              echo "Build in debug mode"
+              cargo build --bins --tests
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              echo "Build in release mode"
+              cargo build --release --bins --tests
+            fi
+
+      - save_cache:
+          name: Save rust cache
+          key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+          paths:
+            - ~/.cargo/registry
+            - ~/.cargo/git
+            - target
+
+        # Run rust unit tests
+        # FIXME: remove -p zenith_utils once integration tests are moved to python
+      - run: cargo test -p zenith_utils
+
+        # Install the rust binaries, for use by test jobs
+        # `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
+        # FIXME: this is a really silly way to install; maybe we should just output
+        # a tarball as an artifact? Or a .deb package?
+      - run:
+          name: cargo install
+          command: |
+            export CARGO_INCREMENTAL=0
+            BUILD_TYPE="<< parameters.build_type >>"
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              echo "Install debug mode"
+              CARGO_FLAGS="--debug"
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              echo "Install release mode"
+              # The default is release mode; there is no --release flag.
+              CARGO_FLAGS=""
+            fi
+            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
+            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
+            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith
+
+        # Install the postgres binaries, for use by test jobs
+        # FIXME: this is a silly way to do "install"; maybe just output a standard
+        # postgres package, whatever the favored form is (tarball? .deb package?)
+        # Note that pg_regress needs some build artifacts that probably aren't
+        # in the usual package...?
+      - run:
+          name: postgres install
+          command: |
+            cp -a tmp_install /tmp/zenith/pg_install
+
+        # Save the rust output binaries for other jobs in this workflow.
+      - persist_to_workspace:
+          root: /tmp/zenith
+          paths:
+            - "*"
+
+  run-pytest:
+    #description: "Run pytest"
+    executor: python/default
+    parameters:
+      # Select the type of Rust build. Must be "release" or "debug".
+      #
+      # Within this file it is only referenced by the workflow matrix (job name
+      # and `requires`), not by the steps of this job.
+      build_type:
+        type: string
+        default: "debug"
+      # pytest args to specify the tests to run.
+      #
+      # This can be a test file name, e.g. 'test_pgbench.py', or a subdirectory,
+      # or '-k foobar' to run tests containing string 'foobar'. See pytest man page
+      # section SPECIFYING TESTS / SELECTING TESTS for details.
+      #
+      # This parameter is required (the "Run pytest" step exits with an error if it
+      # is left empty), to prevent the mistake of running all tests in one job.
+      test_selection:
+        type: string
+        default: ""
+      # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
+      extra_params:
+        type: string
+        default: ""
+      # When true, the git submodules are checked out (with --depth 1) before the
+      # tests run, for tests that need the postgres source tree.
+      needs_postgres_source:
+        type: boolean
+        default: false
+    steps:
+      - attach_workspace:
+          at: /tmp/zenith
+      - checkout
+      - when:
+          condition: << parameters.needs_postgres_source >>
+          steps:
+            - run: git submodule update --init --depth 1
+      - run: pip install pytest psycopg2
+      - run:
+          name: Run pytest
+          working_directory: test_runner
+          environment:
+            - ZENITH_BIN: /tmp/zenith/bin
+            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
+            - TEST_OUTPUT: /tmp/test_output
+          command: |
+            TEST_SELECTION="<< parameters.test_selection >>"
+            EXTRA_PARAMS="<< parameters.extra_params >>"
+            if [ -z "$TEST_SELECTION" ]; then
+              echo "test_selection must be set"
+              exit 1
+            fi
+            # Run the tests.
+            #
+            # The junit.xml file allows CircleCI to display more fine-grained test information
+            # in its "Tests" tab in the results page.
+            pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short $TEST_SELECTION $EXTRA_PARAMS
+      - run:
+          # CircleCI artifacts are preserved one file at a time, so skipping
+          # this step isn't a good idea. If you want to extract the
+          # pageserver state, perhaps a tarball would be a better idea.
+          #
+          # For every per-test directory under /tmp/test_output this keeps the
+          # pageserver log and the postgres logs (moved next to the repo) and
+          # then removes the bulky repo directory itself.
+          name: Delete pageserver data
+          when: always
+          command: |
+            # This step runs even when an earlier step failed, in which case the
+            # tests may not have produced any output. Don't fail a second time.
+            if [ ! -d /tmp/test_output ]; then
+              echo "/tmp/test_output does not exist, nothing to clean up"
+              exit 0
+            fi
+            du -sh /tmp/test_output/* || true # ignore errors (e.g. empty directory)
+            for DIR in /tmp/test_output/*; do
+              # Only directories can contain a repo; skip plain files like junit.xml.
+              [ -d "$DIR" ] || continue
+              mv "$DIR/repo/pageserver.log" "$DIR/" || true # ignore errors
+              for PGDIR in "$DIR"/repo/pgdatadirs/pg?; do
+                echo "PGDIR: $PGDIR"
+                # e.g. .../pgdatadirs/pg1/log is saved as $DIR/pg1_log
+                NEW_LOG="${PGDIR##*/}_log"
+                mv "$PGDIR/log" "$DIR/$NEW_LOG" || true # ignore errors
+              done
+              echo "rm $DIR/repo"
+              rm -rf "$DIR/repo"
+            done
+            du -sh /tmp/test_output/* || true # ignore errors (e.g. empty directory)
+      - store_artifacts:
+          path: /tmp/test_output
+      # The store_test_results step tells CircleCI where to find the junit.xml file.
+      - store_test_results:
+          path: /tmp/test_output
+
+workflows:
+  build_and_test:
+    jobs:
+      - build-postgres
+      - build-zenith:
+          name: build-zenith-<< matrix.build_type >>
+          matrix:
+            parameters:
+              build_type: ["debug", "release"]
+          requires:
+            - build-postgres
+      - run-pytest:
+          name: pg_regress tests << matrix.build_type >>
+          matrix:
+            parameters:
+              build_type: ["debug", "release"]
+          test_selection: batch_pg_regress
+          needs_postgres_source: true
+          requires:
+            - build-zenith-<< matrix.build_type >>
+      - run-pytest:
+          name: other tests << matrix.build_type >>
+          matrix:
+            parameters:
+              build_type: ["debug", "release"]
+          test_selection: batch_others
+          requires:
+            - build-zenith-<< matrix.build_type >>
--- a/.gitignore
+++ b/.gitignore
@@ -2,5 +2,8 @@
 /tmp_check
 /tmp_install
 /tmp_check_cli
+__pycache__/
+test_output/
 .vscode
-.zenith
+/.zenith
+/integration_tests/.zenith
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,4 +7,5 @@ members = [
    "control_plane",
    "postgres_ffi",
    "zenith_utils",
+    "workspace_hack",
 ]
--- a/6
+++ b/6
@@ -32,10 +32,14 @@ postgres-headers: postgres-configure
 	+@echo "Installing PostgreSQL headers"
 	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install

-# Compile and install PostgreSQL
+
+# Compile and install PostgreSQL and contrib/zenith
 postgres: postgres-configure
 	+@echo "Compiling PostgreSQL"
 	$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+	+@echo "Compiling contrib/zenith"
+	(cd vendor/postgres/contrib/zenith && \
+	$(MAKE) PG_CONFIG=$(abspath tmp_install)/bin/pg_config install USE_PGXS=1)

 postgres-clean:
 	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
--- a/README.md
+++ b/README.md
@@ -4,49 +4,96 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus

 ## Running local installation

-1. Build zenith and patched postgres
+1. Install build dependencies and other useful packages
+
+On Ubuntu or Debian this set of packages should be sufficient to build the code:
+```text
+apt install build-essential libtool libreadline-dev zlib1g-dev flex bison \
+libxml2-dev libcurl4-openssl-dev libssl-dev clang
+```
+
+[Rust] 1.48 or later is also required.
+
+To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
+
+To run the integration tests (not required to use the code), install
+Python (3.6 or higher), and install python3 packages with `pip` (called `pip3` on some systems):
+```
+pip install pytest psycopg2
+```
+
+2. Build zenith and patched postgres
 ```sh
 git clone --recursive https://github.com/libzenith/zenith.git
 cd zenith
-make
+make -j5
 ```

-2. Start pageserver and postggres on top of it (should be called from repo root):
+3. Start pageserver and postgres on top of it (should be called from repo root):
 ```sh
-# Create ~/.zenith with proper paths to binaries and data
+# Create repository in .zenith with proper paths to binaries and data
 # Later that would be responsibility of a package install script
->./target/debug/zenith init
+> ./target/debug/zenith init
+<...>
+new zenith repository was created in .zenith

 # start pageserver
-> ./target/debug/zenith pageserver start
-Starting pageserver at '127.0.0.1:64000'
+> ./target/debug/zenith start
+Starting pageserver at '127.0.0.1:64000' in .zenith
+Pageserver started

-# create and configure postgres data dir
-> ./target/debug/zenith pg create
-Creating new postgres: path=/Users/user/code/zenith/tmp_check_cli/compute/pg1 port=55432
-Database initialized
+# start postgres on top of the pageserver
+> ./target/debug/zenith pg start main
+Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
+waiting for server to start.... done

-# start it
-> ./target/debug/zenith pg start pg1
-
-# look up status and connection info
-> ./target/debug/zenith pg list     
-NODE		ADDRESS				STATUS
-pg1			127.0.0.1:55432		running
+# check list of running postgres instances
+> ./target/debug/zenith pg list
+BRANCH	ADDRESS		LSN		STATUS
+main	127.0.0.1:55432	0/1609610	running
 ```

-3. Now it is possible to connect to postgres and run some queries:
-```
+4. Now it is possible to connect to postgres and run some queries:
+```text
 > psql -p55432 -h 127.0.0.1 postgres
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
 postgres=# insert into t values(1,1);
 INSERT 0 1
+postgres=# select * from t;
+ key | value
+-----+-------
+   1 | 1
+(1 row)
+```
+
+5. And create branches and run postgres on them:
+```sh
+# create branch named migration_check
+> ./target/debug/zenith branch migration_check main
+Created branch 'migration_check' at 0/1609610
+
+# check branches tree
+> ./target/debug/zenith branch
+ main
+ ┗━ @0/1609610: migration_check
+
+# start postgres on that branch
+> ./target/debug/zenith pg start migration_check
+Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
+waiting for server to start.... done
+
+# this new postgres instance will have all the data from the 'main' postgres,
+# but modifications made here will not affect the data in the original postgres
+> psql -p55433 -h 127.0.0.1 postgres
 postgres=# select * from t;
 key | value 
 -----+-------
   1 | 1
 (1 row)
+
+postgres=# insert into t values(2,2);
+INSERT 0 1
 ```

 ## Running tests
@@ -54,38 +101,34 @@ postgres=# select * from t;
 ```sh
 git clone --recursive https://github.com/libzenith/zenith.git
 make # builds also postgres and installs it to ./tmp_install
-cargo test -- --test-threads=1
+cd test_runner
+pytest
 ```

 ## Source tree layout

-/walkeeper:
+`/walkeeper`:

 WAL safekeeper. Written in Rust.

-/pageserver:
+`/pageserver`:

 Page Server. Written in Rust.

 Depends on the modified 'postgres' binary for WAL redo.

-/integration_tests:
-
-Tests with different combinations of a Postgres compute node, WAL safekeeper and Page Server.
-
-/mgmt-console:
-
-Web UI to launch (modified) Postgres servers, using S3 as the backing store. Written in Python.
-This is somewhat outdated, as it doesn't use the WAL safekeeper or Page Servers.
-
-/vendor/postgres:
+`/vendor/postgres`:

 PostgreSQL source tree, with the modifications needed for Zenith.

-/vendor/postgres/src/bin/safekeeper:
+`/vendor/postgres/src/bin/safekeeper`:

 Extension (safekeeper_proxy) that runs in the compute node, and connects to the WAL safekeepers
 and streams the WAL

+`/test_runner`:
+
+Integration tests, written in Python using the `pytest` framework.


+[Rust]: https://www.rust-lang.org/learn/get-started
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,19 +9,22 @@ edition = "2018"
 [dependencies]
 rand = "0.8.3"
 tar = "0.4.33"
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-
-serde = ""
-serde_derive = ""
-toml = ""
-lazy_static = ""
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1"
+toml = "0.5"
+lazy_static = "1.4"
 regex = "1"
 anyhow = "1.0"
-hex = "0.4.3"
+# hex = "0.4.3"
 bytes = "1.0.1"
-fs_extra = "1.2.0"
+# fs_extra = "1.2.0"
+nix = "0.20"
+# thiserror = "1"
+url = "2.2.2"

 pageserver = { path = "../pageserver" }
 walkeeper = { path = "../walkeeper" }
 postgres_ffi = { path = "../postgres_ffi" }
+zenith_utils = { path = "../zenith_utils" }
+workspace_hack = { path = "../workspace_hack" }
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -1,23 +1,24 @@
-use std::fs::{self, File, OpenOptions};
-use std::io::{Read, Write};
+use std::io::Write;
 use std::net::SocketAddr;
 use std::net::TcpStream;
 use std::os::unix::fs::PermissionsExt;
-use std::path::Path;
-use std::process::{Command, ExitStatus};
+use std::process::Command;
 use std::sync::Arc;
 use std::time::Duration;
 use std::{collections::BTreeMap, path::PathBuf};
+use std::{
+    fs::{self, OpenOptions},
+    io::Read,
+};

 use anyhow::{Context, Result};
 use lazy_static::lazy_static;
 use regex::Regex;

-use postgres::{Client, NoTls};
-
 use crate::local_env::LocalEnv;
-use crate::storage::{PageServerNode, WalProposerNode};
-use pageserver::{zenith_repo_dir, ZTimelineId};
+use pageserver::ZTimelineId;
+
+use crate::storage::PageServerNode;

 //
 // ComputeControlPlane
@@ -36,7 +37,7 @@ impl ComputeControlPlane {
        // it is running on default port. Change that when pageserver will have config.
        let pageserver = Arc::new(PageServerNode::from_env(&env));

-        let pgdatadirspath = env.repo_path.join("pgdatadirs");
+        let pgdatadirspath = &env.pg_data_dirs_path();
        let nodes: Result<BTreeMap<_, _>> = fs::read_dir(&pgdatadirspath)
            .with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
            .into_iter()
@@ -79,11 +80,10 @@ impl ComputeControlPlane {
        &mut self,
        is_test: bool,
        timelineid: ZTimelineId,
+        name: &str,
    ) -> Result<Arc<PostgresNode>> {
-        let node_id = self.nodes.len() as u32 + 1;
-
        let node = Arc::new(PostgresNode {
-            name: format!("pg{}", node_id),
+            name: name.to_owned(),
            address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
@@ -97,47 +97,66 @@ impl ComputeControlPlane {
        Ok(node)
    }

-    pub fn new_test_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
-        let node = self.new_from_page_server(true, timelineid);
-        assert!(node.is_ok());
+    pub fn new_test_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
+        let timeline_id = self
+            .pageserver
+            .branch_get_by_name(branch_name)
+            .expect("failed to get timeline_id")
+            .timeline_id;
+
+        let node = self.new_from_page_server(true, timeline_id, branch_name);
        let node = node.unwrap();

        // Configure the node to stream WAL directly to the pageserver
        node.append_conf(
            "postgresql.conf",
            format!(
-                "callmemaybe_connstring = '{}'\n", // FIXME escaping
+                "shared_preload_libraries = zenith\n\
+                zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
                node.connstr()
            )
            .as_str(),
-        );
+        )
+        .unwrap();

        node
    }

-    pub fn new_test_master_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
-        let node = self.new_from_page_server(true, timelineid).unwrap();
+    pub fn new_test_master_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
+        let timeline_id = self
+            .pageserver
+            .branch_get_by_name(branch_name)
+            .expect("failed to get timeline_id")
+            .timeline_id;
+
+        let node = self
+            .new_from_page_server(true, timeline_id, branch_name)
+            .unwrap();

        node.append_conf(
            "postgresql.conf",
            "synchronous_standby_names = 'safekeeper_proxy'\n",
-        );
+        )
+        .unwrap();

        node
    }

-    pub fn new_node(&mut self, timelineid: ZTimelineId) -> Result<Arc<PostgresNode>> {
-        let node = self.new_from_page_server(false, timelineid).unwrap();
+    pub fn new_node(&mut self, branch_name: &str) -> Result<Arc<PostgresNode>> {
+        let timeline_id = self.pageserver.branch_get_by_name(branch_name)?.timeline_id;
+
+        let node = self.new_from_page_server(false, timeline_id, branch_name)?;

        // Configure the node to stream WAL directly to the pageserver
        node.append_conf(
            "postgresql.conf",
            format!(
-                "callmemaybe_connstring = '{}'\n", // FIXME escaping
+                "shared_preload_libraries = zenith\n\
+                zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
                node.connstr()
            )
            .as_str(),
-        );
+        )?;

        Ok(node)
    }
@@ -151,7 +170,7 @@ pub struct PostgresNode {
    pub env: LocalEnv,
    pageserver: Arc<PageServerNode>,
    is_test: bool,
-    timelineid: ZTimelineId,
+    pub timelineid: ZTimelineId,
 }

 impl PostgresNode {
@@ -169,6 +188,8 @@ impl PostgresNode {

        lazy_static! {
            static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
+            // Matches a `zenith.zenith_timeline = '<id>'` line and captures the id.
+            // The dot is escaped so that it only matches a literal '.', not any character.
+            static ref CONF_TIMELINE_RE: Regex =
+                Regex::new(r"(?m)^\s*zenith\.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap();
        }

        // parse data directory name
@@ -184,6 +205,7 @@ impl PostgresNode {
            )
        })?;

+        // parse port
        let err_msg = format!(
            "failed to find port definition in config file {}",
            cfg_path.to_str().unwrap()
@@ -199,11 +221,21 @@ impl PostgresNode {
            .parse()
            .with_context(|| err_msg)?;

-        // FIXME: What timeline is this server on? Would have to parse the postgresql.conf
-        // file for that, too. It's currently not needed for anything, but it would be
-        // nice to list the timeline in "zenith pg list"
-        let timelineid_buf = [0u8; 16];
-        let timelineid = ZTimelineId::from(timelineid_buf);
+        // parse timeline
+        let err_msg = format!(
+            "failed to find timeline definition in config file {}",
+            cfg_path.to_str().unwrap()
+        );
+        let timelineid: ZTimelineId = CONF_TIMELINE_RE
+            .captures(config.as_str())
+            .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
+            .iter()
+            .last()
+            .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
+            .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
+            .as_str()
+            .parse()
+            .with_context(|| err_msg)?;

        // ok now
        Ok(PostgresNode {
@@ -277,22 +309,22 @@ impl PostgresNode {
                 max_replication_slots = 10\n\
                 hot_standby = on\n\
                 shared_buffers = 1MB\n\
-				 fsync = off\n\
+                 fsync = off\n\
                 max_connections = 100\n\
-				 wal_sender_timeout = 0\n\
+                 wal_sender_timeout = 0\n\
                 wal_level = replica\n\
                 listen_addresses = '{address}'\n\
                 port = {port}\n",
                address = self.address.ip(),
                port = self.address.port()
            ),
-        );
+        )?;

        // Never clean up old WAL. TODO: We should use a replication
        // slot or something proper, to prevent the compute node
        // from removing WAL that hasn't been streamed to the safekeeper or
        // page server yet. But this will do for now.
-        self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n");
+        self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;

        // Connect it to the page server.

@@ -300,19 +332,20 @@ impl PostgresNode {
        self.append_conf(
            "postgresql.conf",
            &format!(
-                "page_server_connstring = 'host={} port={}'\n\
-                      zenith_timeline='{}'\n",
+                "shared_preload_libraries = zenith \n\
+                 zenith.page_server_connstring = 'host={} port={}'\n\
+                 zenith.zenith_timeline='{}'\n",
                self.pageserver.address().ip(),
                self.pageserver.address().port(),
                self.timelineid
            ),
-        );
+        )?;

        Ok(())
    }

-    fn pgdata(&self) -> PathBuf {
-        self.env.repo_path.join("pgdatadirs").join(&self.name)
+    pub fn pgdata(&self) -> PathBuf {
+        self.env.pg_data_dir(&self.name)
    }

    pub fn status(&self) -> &str {
@@ -328,13 +361,12 @@ impl PostgresNode {
        }
    }

-    pub fn append_conf(&self, config: &str, opts: &str) {
+    pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> {
        OpenOptions::new()
            .append(true)
-            .open(self.pgdata().join(config).to_str().unwrap())
-            .unwrap()
-            .write_all(opts.as_bytes())
-            .unwrap();
+            .open(self.pgdata().join(config).to_str().unwrap())?
+            .write_all(opts.as_bytes())?;
+        Ok(())
    }

    fn pg_ctl(&self, args: &[&str]) -> Result<()> {
@@ -373,8 +405,16 @@ impl PostgresNode {
        self.pg_ctl(&["restart"])
    }

-    pub fn stop(&self) -> Result<()> {
-        self.pg_ctl(&["-m", "immediate", "stop"])
+    pub fn stop(&self, destroy: bool) -> Result<()> {
+        self.pg_ctl(&["-m", "immediate", "stop"])?;
+        if destroy {
+            println!(
+                "Destroying postgres data directory '{}'",
+                self.pgdata().to_str().unwrap()
+            );
+            fs::remove_dir_all(&self.pgdata())?;
+        }
+        Ok(())
    }

    pub fn connstr(&self) -> String {
@@ -398,152 +438,6 @@ impl PostgresNode {

        String::from_utf8(output.stdout).unwrap().trim().to_string()
    }
-
-    fn dump_log_file(&self) {
-        if let Ok(mut file) = File::open(self.env.repo_path.join("pageserver.log")) {
-            let mut buffer = String::new();
-            file.read_to_string(&mut buffer).unwrap();
-            println!("--------------- pageserver.log:\n{}", buffer);
-        }
-    }
-
-    pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
-        let connstring = format!(
-            "host={} port={} dbname={} user={}",
-            self.address.ip(),
-            self.address.port(),
-            db,
-            self.whoami()
-        );
-        let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
-
-        println!("Running {}", sql);
-        let result = client.query(sql, &[]);
-        if result.is_err() {
-            self.dump_log_file();
-        }
-        result.unwrap()
-    }
-
-    pub fn open_psql(&self, db: &str) -> Client {
-        let connstring = format!(
-            "host={} port={} dbname={} user={}",
-            self.address.ip(),
-            self.address.port(),
-            db,
-            self.whoami()
-        );
-        Client::connect(connstring.as_str(), NoTls).unwrap()
-    }
-
-    pub fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
-        let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
-        match Command::new(proxy_path.as_path())
-            .args(&["--ztimelineid", &self.timelineid.to_string()])
-            .args(&["-s", wal_acceptors])
-            .args(&["-h", &self.address.ip().to_string()])
-            .args(&["-p", &self.address.port().to_string()])
-            .arg("-v")
-            .stderr(
-                OpenOptions::new()
-                    .create(true)
-                    .append(true)
-                    .open(self.pgdata().join("safekeeper_proxy.log"))
-                    .unwrap(),
-            )
-            .spawn()
-        {
-            Ok(child) => WalProposerNode { pid: child.id() },
-            Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
-        }
-    }
-
-    pub fn pg_regress(&self) -> ExitStatus {
-        self.safe_psql("postgres", "CREATE DATABASE regression");
-        let data_dir = zenith_repo_dir();
-        let regress_run_path = data_dir.join("regress");
-        fs::create_dir_all(&regress_run_path).unwrap();
-        fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
-        std::env::set_current_dir(regress_run_path).unwrap();
-
-        let regress_build_path =
-            Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
-        let regress_src_path =
-            Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
-
-        let regress_check = Command::new(regress_build_path.join("pg_regress"))
-            .args(&[
-                "--bindir=''",
-                "--use-existing",
-                format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
-                format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
-                format!(
-                    "--schedule={}",
-                    regress_src_path.join("parallel_schedule").to_str().unwrap()
-                )
-                .as_str(),
-                format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
-            ])
-            .env_clear()
-            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("PGPORT", self.address.port().to_string())
-            .env("PGUSER", self.whoami())
-            .env("PGHOST", self.address.ip().to_string())
-            .status()
-            .expect("pg_regress failed");
-        if !regress_check.success() {
-            if let Ok(mut file) = File::open("regression.diffs") {
-                let mut buffer = String::new();
-                file.read_to_string(&mut buffer).unwrap();
-                println!("--------------- regression.diffs:\n{}", buffer);
-            }
-            self.dump_log_file();
-            if let Ok(mut file) = File::open(
-                self.env
-                    .repo_path
-                    .join("pgdatadirs")
-                    .join("pg1")
-                    .join("log"),
-            ) {
-                let mut buffer = String::new();
-                file.read_to_string(&mut buffer).unwrap();
-                println!("--------------- pgdatadirs/pg1/log:\n{}", buffer);
-            }
-        }
-        regress_check
-    }
-
-    pub fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus {
-        let port = self.address.port().to_string();
-        let clients = clients.to_string();
-        let seconds = seconds.to_string();
-        let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench"))
-            .args(&["-i", "-p", port.as_str(), "postgres"])
-            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .status()
-            .expect("pgbench -i");
-        let pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench"))
-            .args(&[
-                "-p",
-                port.as_str(),
-                "-T",
-                seconds.as_str(),
-                "-P",
-                "1",
-                "-c",
-                clients.as_str(),
-                "-M",
-                "prepared",
-                "postgres",
-            ])
-            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .status()
-            .expect("pgbench run");
-        pg_bench_run
-    }
 }

 impl Drop for PostgresNode {
@@ -552,7 +446,7 @@ impl Drop for PostgresNode {
    // and checking it here. But let just clean datadirs on start.
    fn drop(&mut self) {
        if self.is_test {
-            let _ = self.stop();
+            let _ = self.stop(true);
        }
    }
 }
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -6,7 +6,26 @@
 // Intended to be used in integration tests and in CLI tools for
 // local installations.
 //
+use anyhow::{anyhow, bail, Context, Result};
+use std::fs;
+use std::path::Path;

 pub mod compute;
 pub mod local_env;
 pub mod storage;
+
+/// Read a PID file
+///
+/// Expects one integer (surrounding whitespace is ignored); returns i32 for libc/nix compat.
+pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
+    let pid_str = fs::read_to_string(pidfile)
+        .with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
+    let pid: i32 = pid_str
+        .trim()
+        .parse()
+        .map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
+    if pid < 1 {
+        bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
+    }
+    Ok(pid)
+}
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -4,34 +4,25 @@
 // Now it also provides init method which acts like a stub for proper installation
 // script which will use local paths.
 //
-use anyhow::Context;
-use bytes::Bytes;
-use rand::Rng;
-use std::env;
+use anyhow::{anyhow, Result};
+use serde::{Deserialize, Serialize};
 use std::fs;
-use std::path::{Path, PathBuf};
-use std::process::{Command, Stdio};
+use std::path::PathBuf;
+use std::{collections::BTreeMap, env};
+use url::Url;

-use anyhow::Result;
-use serde_derive::{Deserialize, Serialize};
-
-use pageserver::zenith_repo_dir;
-use pageserver::ZTimelineId;
-use postgres_ffi::xlog_utils;
+pub type Remotes = BTreeMap<String, String>;

 //
-// This data structure represents deserialized zenith config, which should be
-// located in ~/.zenith
-//
-// TODO: should we also support ZENITH_CONF env var?
+// This data structure represents the deserialized zenith CLI config
 //
 #[derive(Serialize, Deserialize, Clone)]
 pub struct LocalEnv {
-    // Path to the Repository. Here page server and compute nodes will create and store their data.
-    pub repo_path: PathBuf,
+    // Pageserver connection string
+    pub pageserver_connstring: String,

-    // System identifier, from the PostgreSQL control file
-    pub systemid: u64,
+    // Base directory for both pageserver and compute nodes
+    pub base_data_dir: PathBuf,

    // Path to postgres distribution. It's expected that "bin", "include",
    // "lib", "share" from postgres distribution are there. If at some point
@@ -39,191 +30,118 @@ pub struct LocalEnv {
    // to four separate paths and match OS-specific installation layout.
    pub pg_distrib_dir: PathBuf,

-    // Path to pageserver binary.
-    pub zenith_distrib_dir: PathBuf,
+    // Directory containing the pageserver binary. None for a remote pageserver.
+    pub zenith_distrib_dir: Option<PathBuf>,
+
+    pub remotes: Remotes,
 }

 impl LocalEnv {
-    // postgres installation
+    // postgres installation paths
    pub fn pg_bin_dir(&self) -> PathBuf {
        self.pg_distrib_dir.join("bin")
    }
    pub fn pg_lib_dir(&self) -> PathBuf {
        self.pg_distrib_dir.join("lib")
    }
+
+    // Path to the pageserver binary; errors when no local distribution is configured.
+    pub fn pageserver_bin(&self) -> Result<PathBuf> {
+        match &self.zenith_distrib_dir {
+            Some(distrib_dir) => Ok(distrib_dir.join("pageserver")),
+            None => Err(anyhow!("Can not manage remote pageserver")),
+        }
+    }
+
+    pub fn pg_data_dirs_path(&self) -> PathBuf {
+        self.base_data_dir.join("pgdatadirs")
+    }
+
+    pub fn pg_data_dir(&self, name: &str) -> PathBuf {
+        self.pg_data_dirs_path().join(name)
+    }
+
+    // TODO: move pageserver files into ./pageserver
+    pub fn pageserver_data_dir(&self) -> PathBuf {
+        self.base_data_dir.clone()
+    }
+}
+
+fn base_path() -> PathBuf {
+    match std::env::var_os("ZENITH_REPO_DIR") {
+        Some(val) => PathBuf::from(val), // OsString converts directly; no UTF-8 needed
+        None => ".zenith".into(),        // default: relative to the working directory
+    }
 }

 //
 // Initialize a new Zenith repository
 //
-pub fn init() -> Result<()> {
+pub fn init(remote_pageserver: Option<&str>) -> Result<()> {
    // check if config already exists
-    let repo_path = zenith_repo_dir();
-    if repo_path.exists() {
+    let base_path = base_path();
+    if base_path.exists() {
        anyhow::bail!(
            "{} already exists. Perhaps already initialized?",
-            repo_path.to_str().unwrap()
-        );
-    }
-
-    // Now we can run init only from crate directory, so check that current dir is our crate.
-    // Use 'pageserver/Cargo.toml' existence as evidendce.
-    let cargo_path = env::current_dir()?;
-    if !cargo_path.join("pageserver/Cargo.toml").exists() {
-        anyhow::bail!(
-            "Current directory does not look like a zenith repo. \
-            Please, run 'init' from zenith repo root."
+            base_path.to_str().unwrap()
        );
    }

    // ok, now check that expected binaries are present

-    // check postgres
-    let pg_distrib_dir = cargo_path.join("tmp_install");
-    let pg_path = pg_distrib_dir.join("bin/postgres");
-    if !pg_path.exists() {
-        anyhow::bail!(
-            "Can't find postres binary at {}. \
-                       Perhaps 'make postgres' is needed to build it first.",
-            pg_path.to_str().unwrap()
-        );
-    }
-
-    // check pageserver
-    let zenith_distrib_dir = cargo_path.join("target/debug/");
-    let pageserver_path = zenith_distrib_dir.join("pageserver");
-    if !pageserver_path.exists() {
-        anyhow::bail!(
-            "Can't find pageserver binary at {}. Please build it.",
-            pageserver_path.to_str().unwrap()
-        );
-    }
-
-    // ok, we are good to go
-    let mut conf = LocalEnv {
-        repo_path,
-        pg_distrib_dir,
-        zenith_distrib_dir,
-        systemid: 0,
+    // Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
+    let pg_distrib_dir: PathBuf = {
+        if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
+            postgres_bin.into()
+        } else {
+            let cwd = env::current_dir()?;
+            cwd.join("tmp_install")
+        }
    };
-    init_repo(&mut conf)?;
-
-    Ok(())
-}
-
-pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
-    let repopath = &local_env.repo_path;
-    fs::create_dir(&repopath)
-        .with_context(|| format!("could not create directory {}", repopath.display()))?;
-    fs::create_dir(repopath.join("pgdatadirs"))?;
-    fs::create_dir(repopath.join("timelines"))?;
-    fs::create_dir(repopath.join("refs"))?;
-    fs::create_dir(repopath.join("refs").join("branches"))?;
-    fs::create_dir(repopath.join("refs").join("tags"))?;
-    println!("created directory structure in {}", repopath.display());
-
-    // Create initial timeline
-    let tli = create_timeline(&local_env, None)?;
-    let timelinedir = repopath.join("timelines").join(tli.to_string());
-    println!("created initial timeline {}", timelinedir.display());
-
-    // Run initdb
-    //
-    // We create the cluster temporarily in a "tmp" directory inside the repository,
-    // and move it to the right location from there.
-    let tmppath = repopath.join("tmp");
-
-    let initdb_path = local_env.pg_bin_dir().join("initdb");
-    let initdb = Command::new(initdb_path)
-        .args(&["-D", tmppath.to_str().unwrap()])
-        .arg("--no-instructions")
-        .env_clear()
-        .env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap())
-        .env(
-            "DYLD_LIBRARY_PATH",
-            local_env.pg_lib_dir().to_str().unwrap(),
-        )
-        .stdout(Stdio::null())
-        .status()
-        .with_context(|| "failed to execute initdb")?;
-    if !initdb.success() {
-        anyhow::bail!("initdb failed");
+    if !pg_distrib_dir.join("bin/postgres").exists() {
+        anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
    }
-    println!("initdb succeeded");

-    // Read control file to extract the LSN and system id
-    let controlfile_path = tmppath.join("global").join("pg_control");
-    let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
-    let systemid = controlfile.system_identifier;
-    let lsn = controlfile.checkPoint;
-    let lsnstr = format!("{:016X}", lsn);
+    fs::create_dir(&base_path)?;
+    fs::create_dir(base_path.join("pgdatadirs"))?;

-    // Move the initial WAL file
-    fs::rename(
-        tmppath.join("pg_wal").join("000000010000000000000001"),
-        timelinedir
-            .join("wal")
-            .join("000000010000000000000001.partial"),
-    )?;
-    println!("moved initial WAL file");
+    let conf = if let Some(addr) = remote_pageserver {
+        // check that addr is parsable
+        let _uri = Url::parse(addr).map_err(|e| anyhow!("{}: {}", addr, e))?;

-    // Remove pg_wal
-    fs::remove_dir_all(tmppath.join("pg_wal"))?;
+        LocalEnv {
+            pageserver_connstring: format!("postgresql://{}/", addr),
+            pg_distrib_dir,
+            zenith_distrib_dir: None,
+            base_data_dir: base_path,
+            remotes: BTreeMap::default(),
+        }
+    } else {
+        // Find zenith binaries.
+        let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
+        if !zenith_distrib_dir.join("pageserver").exists() {
+            anyhow::bail!("Can't find pageserver binary.",);
+        }

-    force_crash_recovery(&tmppath)?;
-    println!("updated pg_control");
+        LocalEnv {
+            pageserver_connstring: "postgresql://127.0.0.1:6400".to_string(),
+            pg_distrib_dir,
+            zenith_distrib_dir: Some(zenith_distrib_dir),
+            base_data_dir: base_path,
+            remotes: BTreeMap::default(),
+        }
+    };

-    let target = timelinedir.join("snapshots").join(&lsnstr);
-    fs::rename(tmppath, &target)?;
-    println!("moved 'tmp' to {}", target.display());
-
-    // Create 'main' branch to refer to the initial timeline
-    let data = tli.to_string();
-    fs::write(repopath.join("refs").join("branches").join("main"), data)?;
-    println!("created main branch");
-
-    // Also update the system id in the LocalEnv
-    local_env.systemid = systemid;
-
-    // write config
-    let toml = toml::to_string(&local_env)?;
-    fs::write(repopath.join("config"), toml)?;
-
-    println!(
-        "new zenith repository was created in {}",
-        repopath.display()
-    );
+    let toml = toml::to_string_pretty(&conf)?;
+    fs::write(conf.base_data_dir.join("config"), toml)?;

    Ok(())
 }

-// If control file says the cluster was shut down cleanly, modify it, to mark
-// it as crashed. That forces crash recovery when you start the cluster.
-//
-// FIXME:
-// We currently do this to the initial snapshot in "zenith init". It would
-// be more natural to do this when the snapshot is restored instead, but we
-// currently don't have any code to create new snapshots, so it doesn't matter
-// Or better yet, use a less hacky way of putting the cluster into recovery.
-// Perhaps create a backup label file in the data directory when it's restored.
-fn force_crash_recovery(datadir: &Path) -> Result<()> {
-    // Read in the control file
-    let controlfilepath = datadir.to_path_buf().join("global").join("pg_control");
-    let mut controlfile =
-        postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?;
+// Locate and load config
+pub fn load_config() -> Result<LocalEnv> {
+    let repopath = base_path();

-    controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION;
-
-    fs::write(
-        controlfilepath.as_path(),
-        postgres_ffi::encode_pg_control(controlfile),
-    )?;
-
-    Ok(())
-}
-
-// check that config file is present
-pub fn load_config(repopath: &Path) -> Result<LocalEnv> {
    if !repopath.exists() {
        anyhow::bail!(
            "Zenith config is not found in {}. You need to run 'zenith init' first",
@@ -231,159 +149,18 @@ pub fn load_config(repopath: &Path) -> Result<LocalEnv> {
        );
    }

+    // TODO: check that it looks like a zenith repository
+
    // load and parse file
    let config = fs::read_to_string(repopath.join("config"))?;
    toml::from_str(config.as_str()).map_err(|e| e.into())
 }

-// local env for tests
-pub fn test_env(testname: &str) -> LocalEnv {
-    fs::create_dir_all("../tmp_check").expect("could not create directory ../tmp_check");
-
-    let repo_path = Path::new(env!("CARGO_MANIFEST_DIR"))
-        .join("../tmp_check/")
-        .join(testname);
-
-    // Remove remnants of old test repo
-    let _ = fs::remove_dir_all(&repo_path);
-
-    let mut local_env = LocalEnv {
-        repo_path,
-        pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
-        zenith_distrib_dir: cargo_bin_dir(),
-        systemid: 0,
-    };
-    init_repo(&mut local_env).expect("could not initialize zenith repository");
-    local_env
-}
-
-// Find the directory where the binaries were put (i.e. target/debug/)
-pub fn cargo_bin_dir() -> PathBuf {
-    let mut pathbuf = std::env::current_exe().unwrap();
-
-    pathbuf.pop();
-    if pathbuf.ends_with("deps") {
-        pathbuf.pop();
-    }
-
-    pathbuf
-}
-
-#[derive(Debug, Clone, Copy)]
-pub struct PointInTime {
-    pub timelineid: ZTimelineId,
-    pub lsn: u64,
-}
-
-fn create_timeline(local_env: &LocalEnv, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
-    let repopath = &local_env.repo_path;
-
-    // Create initial timeline
-    let mut tli_buf = [0u8; 16];
-    rand::thread_rng().fill(&mut tli_buf);
-    let timelineid = ZTimelineId::from(tli_buf);
-
-    let timelinedir = repopath.join("timelines").join(timelineid.to_string());
-
-    fs::create_dir(&timelinedir)?;
-    fs::create_dir(&timelinedir.join("snapshots"))?;
-    fs::create_dir(&timelinedir.join("wal"))?;
-
-    if let Some(ancestor) = ancestor {
-        let data = format!(
-            "{}@{:X}/{:X}",
-            ancestor.timelineid,
-            ancestor.lsn >> 32,
-            ancestor.lsn & 0xffffffff
-        );
-        fs::write(timelinedir.join("ancestor"), data)?;
-    }
-
-    Ok(timelineid)
-}
-
-// Parse an LSN in the format used in filenames
-//
-// For example: 00000000015D3DD8
-//
-fn parse_lsn(s: &str) -> std::result::Result<u64, std::num::ParseIntError> {
-    u64::from_str_radix(s, 16)
-}
-
-// Create a new branch in the repository (for the "zenith branch" subcommand)
-pub fn create_branch(
-    local_env: &LocalEnv,
-    branchname: &str,
-    startpoint: PointInTime,
-) -> Result<()> {
-    let repopath = &local_env.repo_path;
-
-    // create a new timeline for it
-    let newtli = create_timeline(local_env, Some(startpoint))?;
-    let newtimelinedir = repopath.join("timelines").join(newtli.to_string());
-
-    let data = newtli.to_string();
-    fs::write(
-        repopath.join("refs").join("branches").join(branchname),
-        data,
-    )?;
-
-    // Copy the latest snapshot (TODO: before the startpoint) and all WAL
-    // TODO: be smarter and avoid the copying...
-    let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?;
-    let copy_opts = fs_extra::dir::CopyOptions::new();
-    fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), &copy_opts)?;
-
-    let oldtimelinedir = repopath
-        .join("timelines")
-        .join(startpoint.timelineid.to_string());
-    let mut copy_opts = fs_extra::dir::CopyOptions::new();
-    copy_opts.content_only = true;
-    fs_extra::dir::copy(
-        oldtimelinedir.join("wal"),
-        newtimelinedir.join("wal"),
-        &copy_opts,
-    )?;
+// Save the config. We use this to change the set of remotes from the CLI itself.
+pub fn save_config(conf: &LocalEnv) -> Result<()> {
+    let config_path = base_path().join("config");
+    let conf_str = toml::to_string_pretty(conf)?;

+    fs::write(config_path, conf_str)?;
    Ok(())
 }
-
-// Find the end of valid WAL in a wal directory
-pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u64> {
-    let repopath = &local_env.repo_path;
-    let waldir = repopath
-        .join("timelines")
-        .join(timeline.to_string())
-        .join("wal");
-
-    let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true);
-
-    Ok(lsn)
-}
-
-// Find the latest snapshot for a timeline
-fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<(u64, PathBuf)> {
-    let repopath = &local_env.repo_path;
-
-    let snapshotsdir = repopath
-        .join("timelines")
-        .join(timeline.to_string())
-        .join("snapshots");
-    let paths = fs::read_dir(&snapshotsdir)?;
-    let mut maxsnapshot: u64 = 0;
-    let mut snapshotdir: Option<PathBuf> = None;
-    for path in paths {
-        let path = path?;
-        let filename = path.file_name().to_str().unwrap().to_owned();
-        if let Ok(lsn) = parse_lsn(&filename) {
-            maxsnapshot = std::cmp::max(lsn, maxsnapshot);
-            snapshotdir = Some(path.path());
-        }
-    }
-    if maxsnapshot == 0 {
-        // TODO: check ancestor timeline
-        anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
-    }
-
-    Ok((maxsnapshot, snapshotdir.unwrap()))
-}
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -1,122 +1,18 @@
-use anyhow::Result;
-use std::fs;
-use std::io;
-use std::net::SocketAddr;
-use std::net::TcpStream;
-use std::path::{Path, PathBuf};
+use std::collections::HashMap;
+use std::net::{SocketAddr, TcpStream};
+use std::path::PathBuf;
 use std::process::Command;
-use std::str::FromStr;
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::Arc;
 use std::thread;
 use std::time::Duration;

+use anyhow::{anyhow, bail, Result};
+use nix::sys::signal::{kill, Signal};
+use nix::unistd::Pid;
 use postgres::{Client, NoTls};

 use crate::local_env::LocalEnv;
-use pageserver::ZTimelineId;
-
-//
-// Collection of several example deployments useful for tests.
-//
-// I'm intendedly modelling storage and compute control planes as a separate entities
-// as it is closer to the actual setup.
-//
-pub struct TestStorageControlPlane {
-    pub wal_acceptors: Vec<WalAcceptorNode>,
-    pub pageserver: Arc<PageServerNode>,
-    pub test_done: AtomicBool,
-    pub repopath: PathBuf,
-}
-
-impl TestStorageControlPlane {
-    // Peek into the repository, to grab the timeline ID of given branch
-    pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId {
-        let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname);
-
-        ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap()
-    }
-
-    // postgres <-> page_server
-    //
-    // Initialize a new repository and configure a page server to run in it
-    //
-    pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
-        let repopath = local_env.repo_path.clone();
-
-        let pserver = Arc::new(PageServerNode {
-            env: local_env.clone(),
-            kill_on_exit: true,
-            listen_address: None,
-        });
-        pserver.start().unwrap();
-
-        TestStorageControlPlane {
-            wal_acceptors: Vec::new(),
-            pageserver: pserver,
-            test_done: AtomicBool::new(false),
-            repopath,
-        }
-    }
-
-    // postgres <-> {wal_acceptor1, wal_acceptor2, ...}
-    pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
-        let repopath = local_env.repo_path.clone();
-
-        let mut cplane = TestStorageControlPlane {
-            wal_acceptors: Vec::new(),
-            pageserver: Arc::new(PageServerNode {
-                env: local_env.clone(),
-                kill_on_exit: true,
-                listen_address: None,
-            }),
-            test_done: AtomicBool::new(false),
-            repopath,
-        };
-        cplane.pageserver.start().unwrap();
-
-        const WAL_ACCEPTOR_PORT: usize = 54321;
-
-        for i in 0..redundancy {
-            let wal_acceptor = WalAcceptorNode {
-                listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
-                    .parse()
-                    .unwrap(),
-                data_dir: local_env.repo_path.join(format!("wal_acceptor_{}", i)),
-                env: local_env.clone(),
-            };
-            wal_acceptor.init();
-            wal_acceptor.start();
-            cplane.wal_acceptors.push(wal_acceptor);
-        }
-        cplane
-    }
-
-    pub fn stop(&self) {
-        for wa in self.wal_acceptors.iter() {
-            let _ = wa.stop();
-        }
-        self.test_done.store(true, Ordering::Relaxed);
-    }
-
-    pub fn get_wal_acceptor_conn_info(&self) -> String {
-        self.wal_acceptors
-            .iter()
-            .map(|wa| wa.listen.to_string())
-            .collect::<Vec<String>>()
-            .join(",")
-    }
-
-    pub fn is_running(&self) -> bool {
-        self.test_done.load(Ordering::Relaxed)
-    }
-}
-
-impl Drop for TestStorageControlPlane {
-    fn drop(&mut self) {
-        self.stop();
-    }
-}
+use crate::read_pidfile;
+use pageserver::branches::BranchInfo;

 //
 // Control routines for pageserver.
@@ -124,8 +20,8 @@ impl Drop for TestStorageControlPlane {
 // Used in CLI and tests.
 //
 pub struct PageServerNode {
-    kill_on_exit: bool,
-    listen_address: Option<SocketAddr>,
+    pub kill_on_exit: bool,
+    pub listen_address: Option<SocketAddr>,
    pub env: LocalEnv,
 }

@@ -145,12 +41,36 @@ impl PageServerNode {
        }
    }

+    pub fn init(&self) -> Result<()> {
+        let mut cmd = Command::new(self.env.pageserver_bin()?);
+        let status = cmd
+            .args(&["--init", "-D", self.repo_path().to_str().unwrap()]) // same dir as start()
+            .env_clear() // the child sees only the variables set below
+            .env("RUST_BACKTRACE", "1")
+            .env(
+                "POSTGRES_DISTRIB_DIR",
+                self.env.pg_distrib_dir.to_str().unwrap(),
+            )
+            .env("ZENITH_REPO_DIR", self.repo_path())
+            .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postgres-wal-redo binary
+            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
+            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
+            .status() // a spawn failure is returned to the caller instead of panicking
+            .map_err(|e| anyhow!("failed to run pageserver init: {}", e))?;
+
+        if status.success() {
+            Ok(())
+        } else {
+            Err(anyhow!("pageserver init failed"))
+        }
+    }
+
    pub fn repo_path(&self) -> PathBuf {
-        self.env.repo_path.clone()
+        self.env.pageserver_data_dir()
    }

    pub fn pid_file(&self) -> PathBuf {
-        self.env.repo_path.join("pageserver.pid")
+        self.repo_path().join("pageserver.pid")
    }

    pub fn start(&self) -> Result<()> {
@@ -160,18 +80,27 @@ impl PageServerNode {
            self.repo_path().display()
        );

-        let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver"));
-        cmd.args(&["-l", self.address().to_string().as_str()])
-            .arg("-d")
-            .env_clear()
-            .env("RUST_BACKTRACE", "1")
-            .env("ZENITH_REPO_DIR", self.repo_path())
-            .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
-            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
+        let mut cmd = Command::new(self.env.pageserver_bin()?);
+        cmd.args(&[
+            "-l",
+            self.address().to_string().as_str(),
+            "-D",
+            self.repo_path().to_str().unwrap(),
+        ])
+        .arg("-d")
+        .env_clear()
+        .env("RUST_BACKTRACE", "1")
+        .env(
+            "POSTGRES_DISTRIB_DIR",
+            self.env.pg_distrib_dir.to_str().unwrap(),
+        )
+        .env("ZENITH_REPO_DIR", self.repo_path())
+        .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postgres-wal-redo binary
+        .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
+        .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());

        if !cmd.status()?.success() {
-            anyhow::bail!(
+            bail!(
                "Pageserver failed to start. See '{}' for details.",
                self.repo_path().join("pageserver.log").display()
            );
@@ -184,43 +113,35 @@ impl PageServerNode {
            if client.is_ok() {
                break;
            } else {
-                println!("page server not responding yet, retrying ({})...", retries);
+                println!("Pageserver not responding yet, retrying ({})...", retries);
                thread::sleep(Duration::from_secs(1));
            }
        }
+
+        println!("Pageserver started");
+
        Ok(())
    }

    pub fn stop(&self) -> Result<()> {
-        let pidfile = self.pid_file();
-        let pid = read_pidfile(&pidfile)?;
-
-        let status = Command::new("kill")
-            .arg(&pid)
-            .env_clear()
-            .status()
-            .expect("failed to execute kill");
-
-        if !status.success() {
-            anyhow::bail!("Failed to kill pageserver with pid {}", pid);
+        let pid = read_pidfile(&self.pid_file())?;
+        let pid = Pid::from_raw(pid);
+        if kill(pid, Signal::SIGTERM).is_err() {
+            bail!("Failed to kill pageserver with pid {}", pid);
        }

-        // await for pageserver stop
+        // wait for pageserver stop
        for _ in 0..5 {
            let stream = TcpStream::connect(self.address());
+            thread::sleep(Duration::from_secs(1));
            if let Err(_e) = stream {
+                println!("Pageserver stopped");
                return Ok(());
            }
            println!("Stopping pageserver on {}", self.address());
-            thread::sleep(Duration::from_secs(1));
        }

-        // ok, we failed to stop pageserver, let's panic
-        if !status.success() {
-            anyhow::bail!("Failed to stop pageserver with pid {}", pid);
-        } else {
-            Ok(())
-        }
+        bail!("Failed to stop pageserver with pid {}", pid);
    }

    pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
@@ -237,9 +158,7 @@ impl PageServerNode {
        client.simple_query(sql).unwrap()
    }

-    pub fn page_server_psql_client(
-        &self,
-    ) -> std::result::Result<postgres::Client, postgres::Error> {
+    pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
        let connstring = format!(
            "host={} port={} dbname={} user={}",
            self.address().ip(),
@@ -249,6 +168,79 @@ impl PageServerNode {
        );
        Client::connect(connstring.as_str(), NoTls)
    }
+
+    // Ask the pageserver for its branches ("branch_list" query; JSON in the first row).
+    pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {
+        let mut client = self.page_server_psql_client()?;
+        let query_result = client.simple_query("branch_list")?;
+        let branches_json = query_result
+            .first()
+            .and_then(|msg| match msg {
+                postgres::SimpleQueryMessage::Row(row) => row.get(0),
+                _ => None,
+            })
+            .ok_or_else(|| anyhow!("missing branches"))?;
+
+        let res: Vec<BranchInfo> = serde_json::from_str(branches_json)?;
+        Ok(res)
+    }
+
+    // Send "branch_create <name> <startpoint>" to the pageserver and parse the JSON reply.
+    pub fn branch_create(&self, name: &str, startpoint: &str) -> Result<BranchInfo> {
+        let mut client = self.page_server_psql_client()?;
+        let query_result =
+            client.simple_query(format!("branch_create {} {}", name, startpoint).as_str())?;
+
+        let branch_json = query_result
+            .first()
+            .and_then(|msg| match msg {
+                postgres::SimpleQueryMessage::Row(row) => row.get(0),
+                _ => None,
+            })
+            .ok_or_else(|| anyhow!("missing branch"))?;
+
+        let res: BranchInfo = serde_json::from_str(branch_json).map_err(|e| {
+            anyhow!(
+                "failed to parse branch_create response: {}: {}",
+                branch_json,
+                e
+            )
+        })?;
+
+        Ok(res)
+    }
+
+    // Look up one branch by name; fails with "Branch <name> not found" when absent.
+    // TODO: make this a separate request type and avoid loading all the branches
+    pub fn branch_get_by_name(&self, name: &str) -> Result<BranchInfo> {
+        let branch_infos = self.branches_list()?;
+        // Index by name. Building the map cannot fail, so no Result wrapper is needed.
+        let mut branches_by_name: HashMap<String, BranchInfo> = branch_infos
+            .into_iter()
+            .map(|branch_info| (branch_info.name.clone(), branch_info))
+            .collect();
+
+        // Taking the entry out of the map hands it back without cloning it.
+        branches_by_name
+            .remove(name)
+            .ok_or_else(|| anyhow!("Branch {} not found", name))
+    }
+
+    // Fetch the system id via "identify_system" (first column of the first row, as u64).
+    pub fn system_id_get(&self) -> Result<u64> {
+        let mut client = self.page_server_psql_client()?;
+        let system_id = client
+            .simple_query("identify_system")?
+            .first()
+            .and_then(|msg| match msg {
+                postgres::SimpleQueryMessage::Row(row) => row.get(0),
+                _ => None,
+            })
+            .ok_or_else(|| anyhow!("failed to get system_id"))?
+            .parse::<u64>()?;
+
+        Ok(system_id)
+    }
 }

 impl Drop for PageServerNode {
@@ -258,106 +250,3 @@ impl Drop for PageServerNode {
        }
    }
 }
-
-//
-// Control routines for WalAcceptor.
-//
-// Now used only in test setups.
-//
-pub struct WalAcceptorNode {
-    listen: SocketAddr,
-    data_dir: PathBuf,
-    env: LocalEnv,
-}
-
-impl WalAcceptorNode {
-    pub fn init(&self) {
-        if self.data_dir.exists() {
-            fs::remove_dir_all(self.data_dir.clone()).unwrap();
-        }
-        fs::create_dir_all(self.data_dir.clone()).unwrap();
-    }
-
-    pub fn start(&self) {
-        println!(
-            "Starting wal_acceptor in {} listening '{}'",
-            self.data_dir.to_str().unwrap(),
-            self.listen
-        );
-
-        let status = Command::new(self.env.zenith_distrib_dir.join("wal_acceptor"))
-            .args(&["-D", self.data_dir.to_str().unwrap()])
-            .args(&["-l", self.listen.to_string().as_str()])
-            .args(&["--systemid", &self.env.systemid.to_string()])
-            // Tell page server it can receive WAL from this WAL safekeeper
-            // FIXME: If there are multiple safekeepers, they will all inform
-            // the page server. Only the last "notification" will stay in effect.
-            // So it's pretty random which safekeeper the page server will connect to
-            .args(&["--pageserver", "127.0.0.1:64000"])
-            .arg("-d")
-            .arg("-n")
-            .status()
-            .expect("failed to start wal_acceptor");
-
-        if !status.success() {
-            panic!("wal_acceptor start failed");
-        }
-    }
-
-    pub fn stop(&self) -> std::result::Result<(), io::Error> {
-        println!("Stopping wal acceptor on {}", self.listen);
-        let pidfile = self.data_dir.join("wal_acceptor.pid");
-        let pid = read_pidfile(&pidfile)?;
-        // Ignores any failures when running this command
-        let _status = Command::new("kill")
-            .arg(pid)
-            .env_clear()
-            .status()
-            .expect("failed to execute kill");
-
-        Ok(())
-    }
-}
-
-impl Drop for WalAcceptorNode {
-    fn drop(&mut self) {
-        self.stop().unwrap();
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-pub struct WalProposerNode {
-    pub pid: u32,
-}
-
-impl WalProposerNode {
-    pub fn stop(&self) {
-        let status = Command::new("kill")
-            .arg(self.pid.to_string())
-            .env_clear()
-            .status()
-            .expect("failed to execute kill");
-
-        if !status.success() {
-            panic!("kill start failed");
-        }
-    }
-}
-
-impl Drop for WalProposerNode {
-    fn drop(&mut self) {
-        self.stop();
-    }
-}
-
-/// Read a PID file
-///
-/// This should contain an unsigned integer, but we return it as a String
-/// because our callers only want to pass it back into a subcommand.
-fn read_pidfile(pidfile: &Path) -> std::result::Result<String, io::Error> {
-    fs::read_to_string(pidfile).map_err(|err| {
-        eprintln!("failed to read pidfile {:?}: {:?}", pidfile, err);
-        err
-    })
-}
--- a/integration_tests/Cargo.toml
+++ b/integration_tests/Cargo.toml
@@ -9,8 +9,9 @@ edition = "2018"
 [dependencies]
 lazy_static = "1.4.0"
 rand = "0.8.3"
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
+anyhow = "1.0"
+nix = "0.20"
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }

 pageserver = { path = "../pageserver" }
 walkeeper = { path = "../walkeeper" }
--- a/integration_tests/src/lib.rs
+++ b/integration_tests/src/lib.rs
@@ -0,0 +1,414 @@
+use anyhow::{bail, Result};
+use nix::sys::signal::{kill, Signal};
+use nix::unistd::Pid;
+use std::collections::BTreeMap;
+use std::convert::TryInto;
+use std::fs::{self, File, OpenOptions};
+use std::io::Read;
+use std::net::SocketAddr;
+use std::path::{Path, PathBuf};
+use std::process::{Command, ExitStatus};
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+
+use control_plane::compute::PostgresNode;
+use control_plane::read_pidfile;
+use control_plane::{local_env::LocalEnv, storage::PageServerNode};
+
+// Find the directory where the binaries were put (i.e. target/debug/)
+fn cargo_bin_dir() -> PathBuf {
+    let mut pathbuf = std::env::current_exe().unwrap();
+
+    pathbuf.pop();
+    if pathbuf.ends_with("deps") {
+        pathbuf.pop();
+    }
+
+    pathbuf
+}
+
+// local compute env for tests
+pub fn create_test_env(testname: &str) -> LocalEnv {
+    let base_path = Path::new(env!("CARGO_MANIFEST_DIR"))
+        .join("../tmp_check/")
+        .join(testname);
+
+    let base_path_str = base_path.to_str().unwrap();
+
+    // Remove remnants of old test repo
+    let _ = fs::remove_dir_all(&base_path);
+
+    fs::create_dir_all(&base_path)
+        .unwrap_or_else(|_| panic!("could not create directory for {}", base_path_str));
+
+    let pgdatadirs_path = base_path.join("pgdatadirs");
+    fs::create_dir(&pgdatadirs_path)
+        .unwrap_or_else(|_| panic!("could not create directory {:?}", pgdatadirs_path));
+
+    LocalEnv {
+        pageserver_connstring: "postgresql://127.0.0.1:64000".to_string(),
+        pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
+        zenith_distrib_dir: Some(cargo_bin_dir()),
+        base_data_dir: base_path,
+        remotes: BTreeMap::default(),
+    }
+}
+
+//
+// Collection of several example deployments useful for tests.
+//
+// I'm intentionally modelling storage and compute control planes as separate entities
+// as it is closer to the actual setup.
+//
+pub struct TestStorageControlPlane {
+    pub wal_acceptors: Vec<WalAcceptorNode>,
+    pub pageserver: Arc<PageServerNode>,
+    pub test_done: AtomicBool,
+}
+
+impl TestStorageControlPlane {
+    // postgres <-> page_server
+    //
+    // Initialize a new repository and configure a page server to run in it
+    //
+    pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
+        let pserver = Arc::new(PageServerNode {
+            env: local_env.clone(),
+            kill_on_exit: true,
+            listen_address: None,
+        });
+        pserver.init().unwrap();
+        pserver.start().unwrap();
+
+        TestStorageControlPlane {
+            wal_acceptors: Vec::new(),
+            pageserver: pserver,
+            test_done: AtomicBool::new(false),
+        }
+    }
+
+    // postgres <-> {wal_acceptor1, wal_acceptor2, ...}
+    pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
+        let mut cplane = TestStorageControlPlane {
+            wal_acceptors: Vec::new(),
+            pageserver: Arc::new(PageServerNode {
+                env: local_env.clone(),
+                kill_on_exit: true,
+                listen_address: None,
+            }),
+            test_done: AtomicBool::new(false),
+            // repopath,
+        };
+        cplane.pageserver.init().unwrap();
+        cplane.pageserver.start().unwrap();
+
+        let systemid = cplane.pageserver.system_id_get().unwrap();
+
+        const WAL_ACCEPTOR_PORT: usize = 54321;
+
+        let datadir_base = local_env.base_data_dir.join("safekeepers");
+        fs::create_dir_all(&datadir_base).unwrap();
+
+        for i in 0..redundancy {
+            let wal_acceptor = WalAcceptorNode {
+                listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
+                    .parse()
+                    .unwrap(),
+                data_dir: datadir_base.join(format!("wal_acceptor_{}", i)),
+                systemid,
+                env: local_env.clone(),
+                pass_to_pageserver: true,
+            };
+            wal_acceptor.init();
+            wal_acceptor.start();
+            cplane.wal_acceptors.push(wal_acceptor);
+        }
+        cplane
+    }
+
+    pub fn stop(&self) {
+        for wa in self.wal_acceptors.iter() {
+            let _ = wa.stop();
+        }
+        self.test_done.store(true, Ordering::Relaxed);
+    }
+
+    pub fn get_wal_acceptor_conn_info(&self) -> String {
+        self.wal_acceptors
+            .iter()
+            .map(|wa| wa.listen.to_string())
+            .collect::<Vec<String>>()
+            .join(",")
+    }
+
+    pub fn is_running(&self) -> bool {
+        !self.test_done.load(Ordering::Relaxed) // running until stop() sets `test_done`
+    }
+}
+
+impl Drop for TestStorageControlPlane {
+    fn drop(&mut self) {
+        self.stop();
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// PostgresNodeExt
+//
+///////////////////////////////////////////////////////////////////////////////
+
+///
+/// Testing utilities for PostgresNode type
+///
+pub trait PostgresNodeExt {
+    fn pg_regress(&self) -> ExitStatus;
+    fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus;
+    fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode;
+    fn open_psql(&self, db: &str) -> postgres::Client;
+    fn dump_log_file(&self);
+    fn safe_psql(&self, db: &str, sql: &str) -> Vec<postgres::Row>;
+}
+
+impl PostgresNodeExt for PostgresNode {
+    // Run the PostgreSQL regression suite against this node; prints diffs and logs on failure.
+    fn pg_regress(&self) -> ExitStatus {
+        self.safe_psql("postgres", "CREATE DATABASE regression");
+
+        let regress_run_path = self.env.base_data_dir.join("regress");
+        fs::create_dir_all(&regress_run_path).unwrap();
+        fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
+
+        let regress_build_path =
+            Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
+        let regress_src_path =
+            Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
+
+        let regress_check = Command::new(regress_build_path.join("pg_regress"))
+            .args(&[
+                "--use-existing",
+                format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
+                format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
+                format!(
+                    "--schedule={}",
+                    regress_src_path.join("parallel_schedule").to_str().unwrap()
+                )
+                .as_str(),
+                format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
+            ])
+            // Change directory for the child only: the cwd of the test process itself is
+            // global state shared by every test thread, so it must not be modified here.
+            .current_dir(&regress_run_path)
+            .env_clear()
+            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
+            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
+            .env("PGPORT", self.address.port().to_string())
+            .env("PGUSER", self.whoami())
+            .env("PGHOST", self.address.ip().to_string())
+            .status()
+            .expect("pg_regress failed");
+        if !regress_check.success() {
+            if let Ok(diffs) = fs::read_to_string(regress_run_path.join("regression.diffs")) {
+                println!("--------------- regression.diffs:\n{}", diffs);
+            }
+            self.dump_log_file();
+        }
+        regress_check
+    }
+
+    fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus {
+        let port = self.address.port().to_string();
+        let clients = clients.to_string();
+        let seconds = seconds.to_string();
+        let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench"))
+            .args(&["-i", "-p", port.as_str(), "postgres"])
+            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
+            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
+            .status()
+            .expect("pgbench -i");
+        let pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench"))
+            .args(&[
+                "-p",
+                port.as_str(),
+                "-T",
+                seconds.as_str(),
+                "-P",
+                "1",
+                "-c",
+                clients.as_str(),
+                "-M",
+                "prepared",
+                "postgres",
+            ])
+            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
+            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
+            .status()
+            .expect("pgbench run");
+        pg_bench_run
+    }
+
+    fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
+        let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
+        match Command::new(proxy_path.as_path())
+            .args(&["--ztimelineid", &self.timelineid.to_string()])
+            .args(&["-s", wal_acceptors])
+            .args(&["-h", &self.address.ip().to_string()])
+            .args(&["-p", &self.address.port().to_string()])
+            .arg("-v")
+            .stderr(
+                OpenOptions::new()
+                    .create(true)
+                    .append(true)
+                    .open(self.pgdata().join("safekeeper_proxy.log"))
+                    .unwrap(),
+            )
+            .spawn()
+        {
+            Ok(child) => WalProposerNode { pid: child.id() },
+            Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
+        }
+    }
+
+    fn dump_log_file(&self) {
+        if let Ok(mut file) = File::open(self.env.pageserver_data_dir().join("pageserver.log")) {
+            let mut buffer = String::new();
+            file.read_to_string(&mut buffer).unwrap();
+            println!("--------------- pageserver.log:\n{}", buffer);
+        }
+    }
+
+    fn safe_psql(&self, db: &str, sql: &str) -> Vec<postgres::Row> {
+        let connstring = format!(
+            "host={} port={} dbname={} user={}",
+            self.address.ip(),
+            self.address.port(),
+            db,
+            self.whoami()
+        );
+        let mut client = postgres::Client::connect(connstring.as_str(), postgres::NoTls).unwrap();
+
+        println!("Running {}", sql);
+        let result = client.query(sql, &[]);
+        if result.is_err() {
+            self.dump_log_file();
+        }
+        result.unwrap()
+    }
+
+    fn open_psql(&self, db: &str) -> postgres::Client {
+        let connstring = format!(
+            "host={} port={} dbname={} user={}",
+            self.address.ip(),
+            self.address.port(),
+            db,
+            self.whoami()
+        );
+        postgres::Client::connect(connstring.as_str(), postgres::NoTls).unwrap()
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// WalAcceptorNode
+//
+///////////////////////////////////////////////////////////////////////////////
+
+//
+// Control routines for WalAcceptor.
+//
+// Now used only in test setups.
+//
+pub struct WalAcceptorNode {
+    listen: SocketAddr,
+    data_dir: PathBuf,
+    systemid: u64,
+    env: LocalEnv,
+    pass_to_pageserver: bool,
+}
+
+impl WalAcceptorNode {
+    pub fn init(&self) {
+        if self.data_dir.exists() {
+            fs::remove_dir_all(self.data_dir.clone()).unwrap();
+        }
+        fs::create_dir_all(self.data_dir.clone()).unwrap();
+    }
+
+    pub fn start(&self) {
+        println!(
+            "Starting wal_acceptor in {} listening '{}'",
+            self.data_dir.to_str().unwrap(),
+            self.listen
+        );
+
+        let ps_arg = if self.pass_to_pageserver {
+            // Tell page server it can receive WAL from this WAL safekeeper
+            ["--pageserver", "127.0.0.1:64000"].to_vec()
+        } else {
+            [].to_vec()
+        };
+
+        let status = Command::new(
+            self.env
+                .zenith_distrib_dir
+                .as_ref()
+                .unwrap()
+                .join("wal_acceptor"),
+        )
+        .args(&["-D", self.data_dir.to_str().unwrap()])
+        .args(&["-l", self.listen.to_string().as_str()])
+        .args(&["--systemid", self.systemid.to_string().as_str()])
+        .args(&ps_arg)
+        .arg("-d")
+        .arg("-n")
+        .status()
+        .expect("failed to start wal_acceptor");
+
+        if !status.success() {
+            panic!("wal_acceptor start failed");
+        }
+    }
+
+    /// Send SIGTERM to the wal_acceptor whose pid is recorded in `wal_acceptor.pid`.
+    pub fn stop(&self) -> Result<()> {
+        println!("Stopping wal acceptor on {}", self.listen);
+        let pidfile = self.data_dir.join("wal_acceptor.pid");
+        let pid = Pid::from_raw(read_pidfile(&pidfile)?);
+        if let Err(err) = kill(pid, Signal::SIGTERM) {
+            bail!("Failed to kill wal_acceptor with pid {}: {}", pid, err);
+        }
+        Ok(())
+    }
+}
+
+impl Drop for WalAcceptorNode {
+    fn drop(&mut self) {
+        // Ignore errors.
+        let _ = self.stop();
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// WalProposerNode
+//
+///////////////////////////////////////////////////////////////////////////////
+
+pub struct WalProposerNode {
+    pub pid: u32,
+}
+
+impl WalProposerNode {
+    /// Send SIGTERM to the proxy (`Child::id()` is u32, `Pid` wants i32); panics on failure.
+    pub fn stop(&self) {
+        let pid = Pid::from_raw(self.pid.try_into().unwrap());
+        kill(pid, Signal::SIGTERM).expect("failed to execute kill");
+    }
+}
+
+impl Drop for WalProposerNode {
+    fn drop(&mut self) {
+        // Best effort, as for WalAcceptorNode: the proxy may be gone; never panic in drop.
+        let _ = self.pid.try_into().map(|p: i32| kill(Pid::from_raw(p), Signal::SIGTERM));
+    }
+}
--- a/integration_tests/tests/test_compute.rs
+++ b/integration_tests/tests/test_compute.rs
@@ -1,11 +0,0 @@
-// test node resettlement to an empty datadir
-
-// TODO
-/*
-#[test]
-fn test_resettlement() {}
-
-// test seq scan of everythin after restart
-#[test]
-fn test_cold_seqscan() {}
-*/
--- a/integration_tests/tests/test_control_plane.rs
+++ b/integration_tests/tests/test_control_plane.rs
@@ -1,8 +0,0 @@
-// TODO
-/*
-#[test]
-fn test_actions() {}
-
-#[test]
-fn test_regress() {}
-*/
--- a/integration_tests/tests/test_pageserver.rs
+++ b/integration_tests/tests/test_pageserver.rs
@@ -1,150 +0,0 @@
-// mod control_plane;
-use control_plane::compute::ComputeControlPlane;
-use control_plane::local_env;
-use control_plane::local_env::PointInTime;
-use control_plane::storage::TestStorageControlPlane;
-
-// XXX: force all redo at the end
-// -- restart + seqscan won't read deleted stuff
-// -- pageserver api endpoint to check all rels
-#[test]
-fn test_redo_cases() {
-    let local_env = local_env::test_env("test_redo_cases");
-
-    // Start pageserver that reads WAL directly from that postgres
-    let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_node(maintli);
-    node.start().unwrap();
-
-    // check basic work with table
-    node.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-    node.safe_psql(
-        "postgres",
-        "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
-    );
-    let count: i64 = node
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    assert_eq!(count, 5000050000);
-
-    // check 'create table as'
-    node.safe_psql("postgres", "CREATE TABLE t2 AS SELECT * FROM t");
-    let count: i64 = node
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    assert_eq!(count, 5000050000);
-}
-
-// Runs pg_regress on a compute node
-#[test]
-fn test_regress() {
-    let local_env = local_env::test_env("test_regress");
-
-    // Start pageserver that reads WAL directly from that postgres
-    let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_node(maintli);
-    node.start().unwrap();
-
-    let status = node.pg_regress();
-    assert!(status.success());
-}
-
-// Runs pg_bench on a compute node
-#[test]
-fn pgbench() {
-    let local_env = local_env::test_env("pgbench");
-
-    // Start pageserver that reads WAL directly from that postgres
-    let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_node(maintli);
-    node.start().unwrap();
-
-    let status = node.pg_bench(10, 100);
-    assert!(status.success());
-}
-
-// Run two postgres instances on one pageserver, on different timelines
-#[test]
-fn test_pageserver_two_timelines() {
-    let local_env = local_env::test_env("test_pageserver_two_timelines");
-
-    // Start pageserver that reads WAL directly from that postgres
-    let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-
-    let maintli = storage_cplane.get_branch_timeline("main");
-
-    // Create new branch at the end of 'main'
-    let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
-    local_env::create_branch(
-        &local_env,
-        "experimental",
-        PointInTime {
-            timelineid: maintli,
-            lsn: startpoint,
-        },
-    )
-    .unwrap();
-    let experimentaltli = storage_cplane.get_branch_timeline("experimental");
-
-    // Launch postgres instances on both branches
-    let node1 = compute_cplane.new_test_node(maintli);
-    let node2 = compute_cplane.new_test_node(experimentaltli);
-    node1.start().unwrap();
-    node2.start().unwrap();
-
-    // check node1
-    node1.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-    node1.safe_psql(
-        "postgres",
-        "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
-    );
-    let count: i64 = node1
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    assert_eq!(count, 5000050000);
-
-    // check node2
-    node2.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-    node2.safe_psql(
-        "postgres",
-        "INSERT INTO t SELECT generate_series(100000,200000), 'payload'",
-    );
-    let count: i64 = node2
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    assert_eq!(count, 15000150000);
-}
--- a/integration_tests/tests/test_wal_acceptor.rs
+++ b/integration_tests/tests/test_wal_acceptor.rs
@@ -1,21 +1,32 @@
-// Restart acceptors one by one while compute is under the load.
-use control_plane::compute::ComputeControlPlane;
-use control_plane::local_env;
-use control_plane::local_env::PointInTime;
-use control_plane::storage::TestStorageControlPlane;
-use pageserver::ZTimelineId;
-
 use rand::Rng;
 use std::sync::Arc;
 use std::time::SystemTime;
 use std::{thread, time};

+use control_plane::compute::{ComputeControlPlane, PostgresNode};
+
+use integration_tests::PostgresNodeExt;
+use integration_tests::TestStorageControlPlane;
+
 const DOWNTIME: u64 = 2;

+fn start_node_with_wal_proposer(
+    timeline: &str,
+    compute_cplane: &mut ComputeControlPlane,
+    wal_acceptors: &str,
+) -> Arc<PostgresNode> {
+    let node = compute_cplane.new_test_master_node(timeline);
+    let _node = node.append_conf(
+        "postgresql.conf",
+        &format!("wal_acceptors='{}'\n", wal_acceptors),
+    );
+    node.start().unwrap();
+    node
+}
+
 #[test]
-//#[ignore]
 fn test_embedded_wal_proposer() {
-    let local_env = local_env::test_env("test_embedded_wal_proposer");
+    let local_env = integration_tests::create_test_env("test_embedded_wal_proposer");

    const REDUNDANCY: usize = 3;
    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
@@ -23,13 +34,7 @@ fn test_embedded_wal_proposer() {
    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();

    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_master_node(maintli);
-    node.append_conf(
-        "postgresql.conf",
-        &format!("wal_acceptors='{}'\n", wal_acceptors),
-    );
-    node.start().unwrap();
+    let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);

    // check basic work with table
    node.safe_psql(
@@ -52,7 +57,7 @@ fn test_embedded_wal_proposer() {

 #[test]
 fn test_acceptors_normal_work() {
-    let local_env = local_env::test_env("test_acceptors_normal_work");
+    let local_env = integration_tests::create_test_env("test_acceptors_normal_work");

    const REDUNDANCY: usize = 3;
    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
@@ -60,12 +65,7 @@ fn test_acceptors_normal_work() {
    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();

    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_master_node(maintli);
-    node.start().unwrap();
-
-    // start proxy
-    let _proxy = node.start_proxy(&wal_acceptors);
+    let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);

    // check basic work with table
    node.safe_psql(
@@ -93,39 +93,28 @@ fn test_many_timelines() {
    // Initialize a new repository, and set up WAL safekeepers and page server.
    const REDUNDANCY: usize = 3;
    const N_TIMELINES: usize = 5;
-    let local_env = local_env::test_env("test_many_timelines");
+    let local_env = integration_tests::create_test_env("test_many_timelines");
    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();

    // Create branches
-    let mut timelines: Vec<ZTimelineId> = Vec::new();
-    let maintli = storage_cplane.get_branch_timeline("main"); // main branch
-    timelines.push(maintli);
-    let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
+    let mut timelines: Vec<String> = vec!["main".to_string()];
+
    for i in 1..N_TIMELINES {
-        // additional branches
        let branchname = format!("experimental{}", i);
-        local_env::create_branch(
-            &local_env,
-            &branchname,
-            PointInTime {
-                timelineid: maintli,
-                lsn: startpoint,
-            },
-        )
-        .unwrap();
-        let tli = storage_cplane.get_branch_timeline(&branchname);
-        timelines.push(tli);
+        storage_cplane
+            .pageserver
+            .branch_create(&branchname, "main")
+            .unwrap();
+        timelines.push(branchname);
    }

    // start postgres on each timeline
    let mut nodes = Vec::new();
-    for tli in timelines {
-        let node = compute_cplane.new_test_node(tli);
+    for tli_name in timelines {
+        let node = start_node_with_wal_proposer(&tli_name, &mut compute_cplane, &wal_acceptors);
        nodes.push(node.clone());
-        node.start().unwrap();
-        node.start_proxy(&wal_acceptors);
    }

    // create schema
@@ -159,7 +148,7 @@ fn test_many_timelines() {
 // Majority is always alive
 #[test]
 fn test_acceptors_restarts() {
-    let local_env = local_env::test_env("test_acceptors_restarts");
+    let local_env = integration_tests::create_test_env("test_acceptors_restarts");

    // Start pageserver that reads WAL directly from that postgres
    const REDUNDANCY: usize = 3;
@@ -171,12 +160,8 @@ fn test_acceptors_restarts() {
    let mut rng = rand::thread_rng();

    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_master_node(maintli);
-    node.start().unwrap();
+    let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);

-    // start proxy
-    let _proxy = node.start_proxy(&wal_acceptors);
    let mut failed_node: Option<usize> = None;

    // check basic work with table
@@ -222,7 +207,7 @@ fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
 // N_CRASHES env var
 #[test]
 fn test_acceptors_unavailability() {
-    let local_env = local_env::test_env("test_acceptors_unavailability");
+    let local_env = integration_tests::create_test_env("test_acceptors_unavailability");

    // Start pageserver that reads WAL directly from that postgres
    const REDUNDANCY: usize = 2;
@@ -232,12 +217,7 @@ fn test_acceptors_unavailability() {
    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();

    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_master_node(maintli);
-    node.start().unwrap();
-
-    // start proxy
-    let _proxy = node.start_proxy(&wal_acceptors);
+    let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);

    // check basic work with table
    node.safe_psql(
@@ -307,7 +287,7 @@ fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
 // Race condition test
 #[test]
 fn test_race_conditions() {
-    let local_env = local_env::test_env("test_race_conditions");
+    let local_env = integration_tests::create_test_env("test_race_conditions");

    // Start pageserver that reads WAL directly from that postgres
    const REDUNDANCY: usize = 3;
@@ -319,12 +299,7 @@ fn test_race_conditions() {
    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();

    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_master_node(maintli);
-    node.start().unwrap();
-
-    // start proxy
-    let _proxy = node.start_proxy(&wal_acceptors);
+    let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);

    // check basic work with table
    node.safe_psql(
--- a/mgmt-console/.gitignore
+++ b/mgmt-console/.gitignore
@@ -1,23 +0,0 @@
-# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
-
-# dependencies
-/node_modules
-/.pnp
-.pnp.js
-
-# testing
-/coverage
-
-# production
-/build
-
-# misc
-.DS_Store
-.env.local
-.env.development.local
-.env.test.local
-.env.production.local
-
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
--- a/mgmt-console/README
+++ b/mgmt-console/README
@@ -1,55 +0,0 @@
-Mock implementation of a management console.
-
-See demo-howto.txt for usage.
-
-Building and Installation
-------------------------
-
-To compile Postgres:
-  sudo apt build-dep postgresql
-  sudo apt install bison flex libz-dev libssl-dev
-  sudo apt install ccache
-  sudo apt install libcurl4-openssl-dev libxml2-dev
-
-For the webapp:
-  # NOTE: This requires at least version 1.1.0 of python3-flask. That's not
-  # available in Debian Buster, need at least Bullseye.
-
-  sudo apt install python3 python3-flask python3-pip npm webpack
-  pip3 install Flask-BasicAuth
-  pip3 install boto3
-
-git clone and compile and install patched version of Postgres:
-
-  git clone https://github.com/libzenith/postgres.git
-  cd postgres
-  git checkout zenith-experiments
-  ./configure --enable-debug --enable-cassert --with-openssl --prefix=/home/heikki/pgsql-install --with-libxml CC="ccache gcc" CFLAGS="-O0"
-  make -j4 -s install
-
-Get the webapp:
-  cd ~
-  git clone https://github.com/libzenith/zenith-mgmt-console.git
-  cd zenith-mgmt-console
-  mkdir pgdatadirs
-
-
-  openssl req -new -x509 -days 365 -nodes -text -out server.crt \
-    -keyout server.key -subj "/CN=zenith-demo"
-
-For Mock S3 server (unless you want to test against a real cloud service):
-  sudo apt install python3-tornado
-
-  cd ~/zenith-mgmt-console
-  git clone https://github.com/hlinnaka/ms3.git
-
-Compile & run it:
-  npm install
-  webpack # compile React app
-
-  BASIC_AUTH_PASSWORD=<password> ./launch-local.sh
-
-
-You can view the contents of the S3 bucket with browser:
-
-http://<server>/list_bucket
--- a/mgmt-console/app.py
+++ b/mgmt-console/app.py
@@ -1,340 +0,0 @@
-from flask import request
-from flask_basicauth import BasicAuth
-from flask import render_template
-from subprocess import PIPE, STDOUT, run, Popen
-import html
-import os
-import re
-import shutil
-import logging
-import time
-
-import boto3
-from boto3.session import Session
-from botocore.client import Config
-from botocore.handlers import set_list_objects_encoding_type_url
-
-from flask import Flask
-
-import waldump
-
-
-app = Flask(__name__)
-
-app.config['BASIC_AUTH_USERNAME'] = 'zenith'
-app.config['BASIC_AUTH_PASSWORD'] = os.getenv('BASIC_AUTH_PASSWORD')
-app.config['BASIC_AUTH_FORCE'] = True
-
-basic_auth = BasicAuth(app)
-
-# S3 configuration:
-
-ENDPOINT = os.getenv('S3_ENDPOINT', 'https://localhost:9000')
-ACCESS_KEY = os.getenv('S3_ACCESSKEY', 'minioadmin')
-SECRET = os.getenv('S3_SECRET', '')
-BUCKET = os.getenv('S3_BUCKET', 'foobucket')
-
-print("Using bucket at " + ENDPOINT);
-
-#boto3.set_stream_logger('botocore', logging.DEBUG)
-
-session = Session(aws_access_key_id=ACCESS_KEY,
-                  aws_secret_access_key=SECRET,
-                  region_name=os.getenv('S3_REGION', 'auto'))
-
-# needed for google cloud?
-session.events.unregister('before-parameter-build.s3.ListObjects',
-                          set_list_objects_encoding_type_url)
-
-s3resource = session.resource('s3',
-                              endpoint_url=ENDPOINT,
-                              verify=False,
-                              config=Config(signature_version='s3v4'))
-s3bucket = s3resource.Bucket(BUCKET)
-
-s3_client = boto3.client('s3',
-                         endpoint_url=ENDPOINT,
-                         verify=False,
-                         config=Config(signature_version='s3v4'),
-                         aws_access_key_id=ACCESS_KEY,
-                         aws_secret_access_key=SECRET)
-
-
-@app.route("/")
-def index():
-    return render_template("index.html")
-
-
-@app.route("/api/waldump")
-def render_waldump():
-    return render_template("waldump.html")
-
-@app.route('/api/fetch_wal')
-def fetch_wal():
-    return waldump.fetch_wal(request, s3bucket);
-
-@app.route("/api/server_status")
-def server_status():
-    dirs = os.listdir("pgdatadirs")
-    dirs.sort()
-
-    primary = None
-    standbys = []
-
-    for dirname in dirs:
-        
-        result = run("pg_ctl status -D pgdatadirs/" + dirname, stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-
-        srv = {
-            'datadir': dirname,
-            'status': result.stdout,
-            'port': None
-        }
-
-        if dirname == 'primary':
-            primary = srv;
-            primary['port'] = 5432;
-        else:
-            standby_match = re.search('standby_([0-9]+)', dirname)
-            if standby_match:
-                srv['port'] = int(standby_match.group(1))
-
-            standbys.append(srv);
-
-    return {'primary': primary, 'standbys': standbys}
-
-@app.route('/api/list_bucket')
-def list_bucket():
-
-    response = 'cloud bucket contents:<br>\n'
-
-    for file in s3bucket.objects.all():
-        response = response + html.escape(file.key) + '<br>\n'
-
-    return response
-
-def walpos_str(walpos):
-    return '{:X}/{:X}'.format(walpos >> 32, walpos & 0xFFFFFFFF)
-
-@app.route('/api/bucket_summary')
-def bucket_summary():
-
-    nonrelimages = []
-    minwal = int(0)
-    maxwal = int(0)
-    minseqwal = int(0)
-    maxseqwal = int(0)
-
-    for file in s3bucket.objects.all():
-        path = file.key
-        match = re.search('nonreldata/nonrel_([0-9A-F]+).tar', path)
-        if match:
-            walpos = int(match.group(1), 16)
-            nonrelimages.append(walpos_str(walpos))
-
-        match = re.search('nonreldata/nonrel_([0-9A-F]+)-([0-9A-F]+)', path)
-        if match:
-            endwal = int(match.group(2), 16)
-            if endwal > maxwal:
-                maxwal = endwal
-
-        match = re.search('walarchive/([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', path)
-        if match:
-            tli = int(match.group(1), 16)
-            logno = int(match.group(2), 16)
-            segno = int(match.group(3), 16)
-            # FIXME: this assumes default 16 MB wal segment size
-            logsegno = logno * (0x100000000 / (16*1024*1024)) + segno
-
-            seqwal = int((logsegno + 1) * (16*1024*1024))
-
-            if seqwal > maxseqwal:
-                maxseqwal = seqwal;
-            if minseqwal == 0 or seqwal < minseqwal:
-                minseqwal = seqwal;
-
-    return {
-        'nonrelimages': nonrelimages,
-        'minwal': walpos_str(minwal),
-        'maxwal': walpos_str(maxwal),
-        'minseqwal': walpos_str(minseqwal),
-        'maxseqwal': walpos_str(maxseqwal)
-        }
-
-def print_cmd_result(cmd_result):
-    return print_cmd_result_ex(cmd_result.args, cmd_result.returncode, cmd_result.stdout)
-
-def print_cmd_result_ex(cmd, returncode, stdout):
-    res = ''
-    res += 'ran command:\n' + str(cmd) + '\n'
-    res += 'It returned code ' + str(returncode) + '\n'
-    res += '\n'
-    res += 'stdout/stderr:\n'
-    res += stdout
-
-    return res
-
-@app.route('/api/init_primary', methods=['GET', 'POST'])
-def init_primary():
-    
-    initdb_result = run("initdb -D pgdatadirs/primary --username=zenith --pwfile=pg-password.txt", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-    if initdb_result.returncode != 0:
-        return print_cmd_result(initdb_result)
-    
-    # Append archive_mode and archive_command and port to postgresql.conf
-    f=open("pgdatadirs/primary/postgresql.conf", "a+")
-    f.write("listen_addresses='*'\n")
-    f.write("archive_mode=on\n")
-    f.write("archive_command='zenith_push --archive-wal-path=%p --archive-wal-fname=%f'\n")
-    f.write("ssl=on\n")
-    f.close()
-
-    f=open("pgdatadirs/primary/pg_hba.conf", "a+")
-    f.write("# allow SSL connections with password from anywhere\n")
-    f.write("hostssl    all             all             0.0.0.0/0           md5\n")
-    f.write("hostssl    all             all             ::0/0               md5\n")
-    f.close()
-
-    shutil.copyfile("server.crt", "pgdatadirs/primary/server.crt")
-    shutil.copyfile("server.key", "pgdatadirs/primary/server.key")
-    os.chmod("pgdatadirs/primary/server.key", 0o0600)
-    
-    start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-    start_rc = start_proc.wait()
-    start_stdout, start_stderr = start_proc.communicate()
-
-    responsestr = print_cmd_result(initdb_result) + '\n'
-    responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
-
-    return responsestr
-
-@app.route('/api/zenith_push', methods=['GET', 'POST'])
-def zenith_push():
-    # Stop the primary if it's running
-    stop_result = run(args=["pg_ctl", "stop", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-    
-    # Call zenith_push
-    push_result = run("zenith_push -D pgdatadirs/primary", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-
-    # Restart the primary
-    start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-    start_rc = start_proc.wait()
-    start_stdout, start_stderr = start_proc.communicate()
-    
-    responsestr = print_cmd_result(stop_result) + '\n'
-    responsestr += print_cmd_result(push_result) + '\n'
-    responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout) + '\n'
-
-    return responsestr
-
-@app.route('/api/create_standby', methods=['GET', 'POST'])
-def create_standby():
-
-    walpos = request.form.get('walpos')
-    if not walpos:
-        return 'no walpos'
-    
-    dirs = os.listdir("pgdatadirs")
-
-    last_port = 5432
-
-    for dirname in dirs:
-
-        standby_match = re.search('standby_([0-9]+)', dirname)
-        if standby_match:
-            port = int(standby_match.group(1))
-            if port > last_port:
-                last_port = port
-
-    standby_port = last_port + 1
-
-    standby_dir = "pgdatadirs/standby_" + str(standby_port)
-
-    # Call zenith_restore
-    restore_result = run(["zenith_restore", "--end=" + walpos, "-D", standby_dir], stdout=PIPE, stderr=STDOUT, encoding='latin1')
-    responsestr = print_cmd_result(restore_result)
-
-    if restore_result.returncode == 0:
-        # Append hot_standby and port to postgresql.conf
-        f=open(standby_dir + "/postgresql.conf", "a+")
-        f.write("hot_standby=on\n")
-        f.write("port=" + str(standby_port) + "\n")
-        f.close()
-
-        start_proc = Popen(args=["pg_ctl", "start", "-D", standby_dir, "-l", standby_dir + "/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-        start_rc = start_proc.wait()
-        start_stdout, start_stderr = start_proc.communicate()
-        responsestr += '\n\n' + print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
-
-    return responsestr
-
-@app.route('/api/destroy_server', methods=['GET', 'POST'])
-def destroy_primary():
-
-    datadir = request.form.get('datadir')
-
-    # Check that the datadir parameter doesn't contain anything funny.
-    if not re.match("^[A-Za-z0-9_-]+$", datadir):
-        raise Exception('invalid datadir: ' + datadir)
-    
-    # Stop the server if it's running
-    stop_result = run(args=["pg_ctl", "stop", "-m", "immediate", "-D", "pgdatadirs/" + datadir], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-
-    shutil.rmtree('pgdatadirs/' + datadir, ignore_errors=True)
-
-    responsestr = print_cmd_result(stop_result) + '\n'
-    responsestr += 'Deleted datadir ' + datadir + '.\n'
-
-    return responsestr
-
-@app.route('/api/restore_primary', methods=['GET', 'POST'])
-def restore_primary():
-
-    # Call zenith_restore
-    restore_result = run(["zenith_restore", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, encoding='latin1')
-    responsestr = print_cmd_result(restore_result)
-
-    # Append restore_command to postgresql.conf, so that it can find the last raw WAL segments
-    f=open("pgdatadirs/primary/postgresql.conf", "a+")
-    f.write("listen_addresses='*'\n")
-    f.write("restore_command='zenith_restore --archive-wal-path=%p --archive-wal-fname=%f'\n")
-    f.write("ssl=on\n")
-    f.close()
-    
-    if restore_result.returncode == 0:
-        start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-        start_rc = start_proc.wait()
-        start_stdout, start_stderr = start_proc.communicate()
-        responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
-
-    return responsestr
-
-@app.route('/api/slicedice', methods=['GET', 'POST'])
-def run_slicedice():
-    result = run("zenith_slicedice", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-    
-    responsestr = print_cmd_result(result)
-
-    return responsestr
-
-@app.route('/api/reset_demo', methods=['POST'])
-def reset_all():
-    result = run("pkill -9 postgres", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-
-    dirs = os.listdir("pgdatadirs")
-    for dirname in dirs:
-        shutil.rmtree('pgdatadirs/' + dirname)
-        
-    for file in s3bucket.objects.all():
-        s3_client.delete_object(Bucket = BUCKET, Key = file.key)
-
-    responsestr = print_cmd_result(result) + '\n'
-    responsestr += '''
-Deleted all Postgres datadirs.
-Deleted all files in object storage bucket.
-'''
-
-    return responsestr
-
-if __name__ == '__main__':
-    app.run()
--- a/mgmt-console/babel.config.js
+++ b/mgmt-console/babel.config.js
@@ -1,3 +0,0 @@
-module.exports = {
-    presets: ["@babel/preset-env", "@babel/preset-react"],
-};
--- a/mgmt-console/demo-howto.txt
+++ b/mgmt-console/demo-howto.txt
@@ -1,67 +0,0 @@
-Mock implementation of a management console.
-
-This isn't very different from a "normal" PostgreSQL installation with
-a base backup and WAL archive. The main user-visible difference is
-that when you create a standby server, we don't restore the whole data
-directory, but only the "non-relation" files. Relation files are
-restored on demand, when they're accessed the first time. That makes
-the "create standby" operation very fast, but with some delay when
-you connect and start running queries instead.  Most visible if you
-have a large database. (However, see note below about large databases)
-
-Note: lots of things are broken/unsafe. Things will fail if a table is
-larger than 1 GB. Or if there are more than 1000 files in the cloud
-bucket.
-
-How to use this demo:
-
-1. If there are any leftovers from previous runs, reset by clicking
-   the RESET DEMO button.  This kills and deletes all Postgres servers,
-   and empties the cloud storage bucket
-
-2. Create primary server by clicking on the "Init primary" button
-
-3. Push a base image of the primary to cloud storage, by clicking the
-   "push base image" button.  (This takes about 30 seconds, be
-   patient)
-
-4. Connect to primary with psql, and create a test table with a little data.
-
-      psql postgres  -p5432 -U zenith -h<host>
-
-      create table mytable (i int4);
-
-      insert into mytable values (1);
-      select pg_switch_wal();
-
-   The Postgres password is the same as for the management console.
-
-5. Now that there's a new WAL segment in the archive, we can "slice &
-   dice" it. Click on the "Slice & dice" button.
-
-6. Perform more updates on the primary, to generate more WAL.
-
-      insert into mytable values (2); select pg_switch_wal();
-      insert into mytable values (3); select pg_switch_wal();
-      insert into mytable values (4); select pg_switch_wal();
-      insert into mytable values (5); select pg_switch_wal();
-
-7. Slice & Dice the WAL again
-
-8. Now you can create read-only standby servers at any point in the
-   WAL. Type a WAL position in the text box (or use the slider), and
-   click "Create new standby". The first standby is created at port 5433,
-   the second at port 5434, and so forth.
-
-9. Connect to the standby with "psql -p 5433". Note that it takes a
-   few seconds until the connection is established. That's because the
-   standby has to restore the basic system catalogs, like pg_database and
-   pg_authid from the backup. After connecting, you can do "\d" to list
-   tables, this will also take a few seconds, as more catalog tables are
-   restored from backup.  Subsequent commands will be faster.
-
-   Run queries in the standby:
-
-      select * from mytable;
-
-   the result depends on the LSN that you picked when you created the server.
--- a/mgmt-console/js/app.js
+++ b/mgmt-console/js/app.js
@@ -1,463 +0,0 @@
-import React, { useState, useEffect } from 'react';
-import ReactDOM from 'react-dom';
-import Loader from "react-loader-spinner";
-import { Router, Route, Link, IndexRoute, hashHistory, browserHistory } from 'react-router';
-
-function ServerStatus(props) {
-    const datadir = props.server.datadir;
-    const status = props.server.status;
-    const port = props.server.port;
-
-    return (
-	<div>
-	    <h2>{ datadir == 'primary' ? 'Primary' : datadir }</h2>
-	    status: <div className='status'>{status}</div><br/>
-	    to connect: <span className='shellcommand'>psql -h { window.location.hostname } -p { port } -U zenith postgres</span><br/>
-	</div>
-    );
-}
-
-function StandbyList(props) {
-    const bucketSummary = props.bucketSummary;
-    const standbys = props.standbys;
-    const maxwalpos = bucketSummary.maxwal ? walpos_to_int(bucketSummary.maxwal) : 0;
-
-    const [walposInput, setWalposInput] = useState({ src: 'text', value: '0/0'});
-
-    // find earliest base image
-    const minwalpos = bucketSummary.nonrelimages ? bucketSummary.nonrelimages.reduce((minpos, imgpos_str, index, array) => {
-	const imgpos = walpos_to_int(imgpos_str);
-	return (minpos == 0 || imgpos < minpos) ? imgpos : minpos;
-    }, 0) : 0;
-
-    const can_create_standby = minwalpos > 0 && maxwalpos > 0 && maxwalpos >= minwalpos;
-    var walpos_valid = true;
-
-    function create_standby() {
-	const formdata = new FormData();
-	formdata.append("walpos", walposStr);
-
-	props.startOperation('Creating new standby at ' + walposStr + '...',
-			     fetch("/api/create_standby", { method: 'POST', body: formdata }));
-    }
-
-    function destroy_standby(datadir) {
-	const formdata = new FormData();
-	formdata.append("datadir", datadir);
-	props.startOperation('Destroying ' + datadir + '...',
-			     fetch("/api/destroy_server", { method: 'POST', body: formdata }));
-    }
-
-    const handleSliderChange = (event) => {
-	setWalposInput({ src: 'slider', value: event.target.value });
-    }    
-
-    const handleWalposChange = (event) => {
-	setWalposInput({ src: 'text', value: event.target.value });
-    }
-
-    var sliderValue;
-    var walposStr;
-    if (walposInput.src == 'text')
-    {
-	const walpos = walpos_to_int(walposInput.value);
-
-	if (walpos >= minwalpos && walpos <= maxwalpos)
-	    walpos_valid = true;
-	else
-	    walpos_valid = false;
-	
-	sliderValue = Math.round((walpos - minwalpos) / (maxwalpos - minwalpos) * 100);
-	walposStr = walposInput.value;
-    }
-    else
-    {
-	const slider = walposInput.value;
-	const new_walpos = minwalpos + slider / 100 * (maxwalpos - minwalpos);
-
-	console.log('minwalpos: '+ minwalpos);
-	console.log('maxwalpos: '+ maxwalpos);
-
-	walposStr = int_to_walpos(Math.round(new_walpos));
-	walpos_valid = true;
-	console.log(walposStr);
-    }
-
-    var standbystatus = ''
-    if (standbys)
-    {
-	standbystatus = 
-	    <div>
-		{
-		    standbys.length > 0 ? 
- 			standbys.map((server) =>
-			    <>
-				<ServerStatus key={ 'status_' + server.datadir} server={server}/>
-				<button key={ 'destroy_' + server.datadir} onClick={e => destroy_standby(server.datadir)}>Destroy standby</button>
-			    </>
-			) : "no standby servers"
-		}
-	    </div>
-    }
-
-    return (
-	<div>
-	    <h2>Standbys</h2>
-	    <button onClick={create_standby} disabled={!can_create_standby || !walpos_valid}>Create new Standby</button> at LSN 
-            <input type="text" id="walpos_input" value={ walposStr } onChange={handleWalposChange} disabled={!can_create_standby}/>
-	    <input type="range" id="walpos_slider" min="0" max="100" steps="1" value={sliderValue}  onChange={handleSliderChange} disabled={!can_create_standby}/>
-	    <br/>
-	    { standbystatus }
-	</div>
-    );
-}
-
-function ServerList(props) {
-    const primary = props.serverStatus ? props.serverStatus.primary : null;
-    const standbys = props.serverStatus ? props.serverStatus.standbys : [];
-    const bucketSummary = props.bucketSummary;
-
-    var primarystatus = '';
-
-    function destroy_primary() {
-	const formdata = new FormData();
-	formdata.append("datadir", 'primary');
-	props.startOperation('Destroying primary...',
-			     fetch("/api/destroy_server", { method: 'POST', body: formdata }));
-    }    
-
-    function restore_primary() {
-	props.startOperation('Restoring primary...',
-			     fetch("/api/restore_primary", { method: 'POST' }));
-    }    
-    
-    if (primary)
-    {
-	primarystatus =
-	    <div>
-		<ServerStatus server={primary}/>
-		<button onClick={destroy_primary}>Destroy primary</button>
-	    </div>
-    }
-    else
-    {
-	primarystatus =
-	    <div>
-		no primary server<br/>
-		<button onClick={restore_primary}>Restore primary</button>
-	    </div>
-    }
-
-    return (
-	<>
-	    { primarystatus }
-	    <StandbyList standbys={standbys} startOperation={props.startOperation} bucketSummary={props.bucketSummary}/>
-	    <p className="todo">
-		Should we list the WAL safekeeper nodes here? Or are they part of the Storage? Or not visible to users at all?
-	    </p>
-	</>
-    );
-}
-
-function BucketSummary(props) {
-    const bucketSummary = props.bucketSummary;
-    const startOperation = props.startOperation;
-
-    function slicedice() {
-	startOperation('Slicing sequential WAL to per-relation WAL...',
-		       fetch("/api/slicedice", { method: 'POST' }));
-    }
-    
-    if (!bucketSummary.nonrelimages)
-    {
-	return <>loading...</>
-    }
-
-    return (
-	<div>
-	    <div>Base images at following WAL positions:
-		<ul>
-		    {bucketSummary.nonrelimages.map((img) => (
-			<li key={img}>{img}</li>
-		    ))}
-		</ul>
-	    </div>
-            Sliced WAL is available up to { bucketSummary.maxwal }<br/>
-	    Raw WAL is available up to { bucketSummary.maxseqwal }<br/>
-
-	    <br/>
-	    <button onClick={slicedice}>Slice & Dice WAL</button>
-	    <p className="todo">
-		Currently, the slicing or "sharding" of the WAL needs to be triggered manually, by clicking the above button.
-		<br/>
-		TODO: make it a continuous process that runs in the WAL safekeepers, or in the Page Servers, or as a standalone service.
-	    </p>
-	</div>
-    );
-}
-
-function ProgressIndicator()
-{
-    return (
-	<div>
-	    <Loader
-		type="Puff"
-		color="#00BFFF"
-		height={100}
-		width={100}
-	    />
-	</div>
-    )
-}
-
-function walpos_to_int(walpos)
-{
-    const [hi, lo] = walpos.split('/');
-
-    return parseInt(hi, 16) + parseInt(lo, 16);
-}
-
-function int_to_walpos(x)
-{
-    console.log('converting ' + x);
-    return (Math.floor((x / 0x100000000)).toString(16) + '/' + (x % 0x100000000).toString(16)).toUpperCase();
-}
-
-function OperationStatus(props) {
-    const lastOperation = props.lastOperation;
-    const inProgress = props.inProgress;
-    const operationResult = props.operationResult;
-
-    if (lastOperation)
-    {
-	return (
-	    <div><h2>Last operation:</h2>
-		<div>{lastOperation} { (!inProgress && lastOperation) ? 'done!' : '' }</div>
-		<div className='result'>
-		    {inProgress ? <ProgressIndicator/> : <pre>{operationResult}</pre>}
-		</div>
-	    </div>
-	);
-    }
-    else
-	return '';
-}
-
-function ActionButtons(props) {
-
-    const startOperation = props.startOperation;
-    const bucketSummary = props.bucketSummary;
-    
-    function reset_demo() {
-	startOperation('resetting everything...',
-		       fetch("/api/reset_demo", { method: 'POST' }));
-    }
-
-    function init_primary() {
-	startOperation('Initializing new primary...',
-		       fetch("/api/init_primary", { method: 'POST' }));
-    }
-
-    function zenith_push() {
-	startOperation('Pushing new base image...',
-		       fetch("/api/zenith_push", { method: 'POST' }));
-    }
-	
-    return (
-	<div>
-	    <p className="todo">
-		RESET DEMO deletes everything in the storage bucket, and stops and destroys all servers. This resets the whole demo environment to the initial state.
-	    </p>
-	    <button onClick={reset_demo}>RESET DEMO</button>
-	    <p className="todo">
-		Init Primary runs initdb to create a new primary server. Click this after Resetting the demo.
-	    </p>
-
-	    <button onClick={init_primary}>Init primary</button>
-
-	    <p className="todo">
-		Push Base Image stops the primary, copies the current state of the primary to the storage bucket as a new base backup, and restarts the primary.
-		<br/>
-		TODO: This should be handled by a continuous background process, probably running in the storage nodes. And without having to shut down the cluster, of course.
-	    </p>
-
-	    <button onClick={zenith_push}>Push base image</button>
-
-	</div>
-    );
-}
-
-function Sidenav(props)
-{
-    const toPage = (page) => (event) => {
-	//event.preventDefault()
-	props.switchPage(page);
-    };
-    return (
-	<div>
-	    <h3 className="sidenav-item">Menu</h3>
-	    <a href="#servers" onClick={toPage('servers')} className="sidenav-item">Servers</a>
-	    <a href="#storage" onClick={toPage('storage')} className="sidenav-item">Storage</a>
-	    <a href="#snapshots" onClick={toPage('snapshots')} className="sidenav-item">Snapshots</a>
-	    <a href="#demo" onClick={toPage('demo')} className="sidenav-item">Demo</a>
-	    <a href="#import" onClick={toPage('import')}  className="sidenav-item">Import / Export</a>
-	    <a href="#jobs" onClick={toPage('jobs')} className="sidenav-item">Jobs</a>
-	</div>
-    );
-}
-
-function App()
-{
-    const [page, setPage] = useState('servers');
-    const [serverStatus, setServerStatus] = useState({});
-    const [bucketSummary, setBucketSummary] = useState({});
-    const [lastOperation, setLastOperation] = useState('');
-    const [inProgress, setInProgress] = useState('');
-    const [operationResult, setOperationResult] = useState('');
-
-    useEffect(() => {
-	reloadStatus();
-    }, []);
-
-    function startOperation(operation, promise)
-    {
-	promise.then(result => result.text()).then(resultText => {
-	    operationFinished(resultText);
-	});
-	
-	setLastOperation(operation);
-	setInProgress(true);
-	setOperationResult('');
-    }
-
-    function operationFinished(result)
-    {
-	setInProgress(false);
-	setOperationResult(result);
-	reloadStatus();
-    }
-
-    function clearOperation()
-    {
-	setLastOperation('')
-	setInProgress('');
-	setOperationResult('');
-	console.log("cleared");
-    }
-    
-    function reloadStatus()
-    {
-	fetch('/api/server_status').then(res => res.json()).then(data => {
-	    setServerStatus(data);
-	});
-
-	fetch('/api/bucket_summary').then(res => res.json()).then(data => {
-	    setBucketSummary(data);
-	});
-    }
-
-    const content = () => {
-	console.log(page);
-	if (page === 'servers') {
-	    return (
-		<>
-		    <h1>Server status</h1>
-		    <ServerList startOperation={ startOperation }
-				serverStatus={ serverStatus }
-				bucketSummary={ bucketSummary }/>
-		</>
-	    );
-	} else if (page === 'storage') {
-	    return (
-		<>
-		    <h1>Storage bucket status</h1>
-		    <BucketSummary startOperation={ startOperation }
-				   bucketSummary={ bucketSummary }/>
-		</>
-	    );
-	} else if (page === 'snapshots') {
-	    return (
-		<>
-		    <h1>Snapshots</h1>
-		    <p className="todo">
-			In Zenith, snapshots are just specific points (LSNs) in the WAL history, with a label. A snapshot prevents garbage collecting old data that's still needed to reconstruct the database at that LSN.
-		    </p>
-		    <p className="todo">
-			TODO:
-			<ul>
-			    <li>List existing snapshots</li>
-			    <li>Create new snapshot manually, from current state or from a given LSN</li>
-			    <li>Drill into the WAL stream to see what have happened. Provide tools for e.g. finding point where a table was dropped</li>
-			    <li>Create snapshots automatically based on events in the WAL, like if you call pg_create_restore_point(() in the primary</li>
-			    <li>Launch new reader instance at a snapshot</li>
-			    <li>Export snapshot</li>
-			    <li>Rollback cluster to a snapshot</li>
-			</ul>
-		    </p>
-		</>
-	    );
-	} else if (page === 'demo') {
-	    return (
-		<>
-		    <h1>Misc actions</h1>
-		    <ActionButtons startOperation={ startOperation }
-				   bucketSummary={ bucketSummary }/>
-		</>
-	    );
-	} else if (page === 'import') {
-	    return (
-		<>
-		    <h1>Import & Export tools</h1>
-		    <p className="TODO">TODO:
-			<ul>
-			    <li>Initialize database from existing backup (pg_basebackup, WAL-G, pgbackrest)</li>
-			    <li>Initialize from a pg_dump or other SQL script</li>
-			    <li>Launch batch job to import data files from S3</li>
-			    <li>Launch batch job to export database with pg_dump to S3</li>
-			</ul>
-			These jobs can be run in against reader processing nodes. We can even
-			spawn a new reader node dedicated to a job, and destry it when the job is done.
-		    </p>
-		</>
-	    );
-	} else if (page === 'jobs') {
-	    return (
-		<>
-		    <h1>Batch jobs</h1>
-		    <p className="TODO">TODO:
-			<ul>
-			    <li>List running jobs launched from Import & Export tools</li>
-			    <li>List other batch jobs launched by the user</li>
-			    <li>Launch new batch jobs</li>
-			</ul>
-		    </p>
-		</>
-	    );
-	}
-    }
-
-    function switchPage(page)
-    {
-	console.log("topage " + page);
-	setPage(page)
-	clearOperation();
-    };
-
-    return (
-	<div className="row">
-	    <div className="sidenav">
-		<Sidenav switchPage={switchPage} className="column"/>
-	    </div>
-	    <div className="column">
-		<div>
-		    { content() }
-		</div>
-		<OperationStatus lastOperation={ lastOperation }
-				 inProgress = { inProgress }
-				 operationResult = { operationResult }/>
-	    </div>
-	</div>
-    );
-}
-
-ReactDOM.render(<App/>, document.getElementById('reactApp'));
--- a/mgmt-console/js/waldump.js
+++ b/mgmt-console/js/waldump.js
@@ -1,105 +0,0 @@
-import React, { useState, useEffect } from 'react';
-import ReactDOM from 'react-dom';
-import Loader from "react-loader-spinner";
-
-function walpos_to_int(walpos)
-{
-    const [hi, lo] = walpos.split('/');
-
-    return parseInt(hi, 16) + parseInt(lo, 16);
-}
-
-const palette = [
-    "#003f5c",
-    "#2f4b7c",
-    "#665191",
-    "#a05195",
-    "#d45087",
-    "#f95d6a",
-    "#ff7c43",
-    "#ffa600"];
-
-function WalRecord(props)
-{
-    const firstwalpos = props.firstwalpos;
-    const endwalpos = props.endwalpos;
-    const record = props.record;
-    const index = props.index;
-    const xidmap = props.xidmap;
-
-    const startpos = walpos_to_int(record.start)
-    const endpos = walpos_to_int(record.end)
-
-    const scale = 1000 / (16*1024*1024)
-    const startx = (startpos - firstwalpos) * scale;
-    const endx = (endpos - firstwalpos) * scale;
-
-    const xidindex = xidmap[record.xid];
-    const color = palette[index % palette.length];
-
-    const y = 5 + (xidindex) * 20 + (index % 2) * 2;
-    
-    return (
-	<line x1={ startx } y1={y} x2={endx} y2={y} stroke={ color } strokeWidth="5">
-	    <title>
-		start: { record.start } end: { record.end }
-	    </title>
-	</line>
-    )
-}
-
-function WalFile(props)
-{
-    const walContent = props.walContent;
-    const firstwalpos = props.firstwalpos;
-    const xidmap = props.xidmap;
-   
-    return <svg width="1000" height="200">
-	       {
-		   walContent.records ? 
- 		       walContent.records.map((record, index) =>
-			   <WalRecord key={record.start} firstwalpos={firstwalpos} record={record} index={index} xidmap={xidmap}/>
-		       ) : "no records"
-	       }
-	   </svg>
-}
-
-function WalDumpApp()
-{
-    const [walContent, setWalContent] = useState({});
-
-    const filename = '00000001000000000000000C';
-
-    useEffect(() => {
-	fetch('/fetch_wal?filename='+filename).then(res => res.json()).then(data => {
-	    setWalContent(data);
-	});
-    }, []);
-
-    var firstwalpos = 0;
-    var endwalpos = 0;
-    var numxids = 0;
-    var xidmap = {};
-    if (walContent.records && walContent.records.length > 0)
-    {
-	firstwalpos = walpos_to_int(walContent.records[0].start);
-	endwalpos = firstwalpos + 16*1024*1024;
-
-	walContent.records.forEach(rec => {
-	    if (!xidmap[rec.xid])
-	    {
-		xidmap[rec.xid] = ++numxids;
-	    }
-	});
-    }
-
-    return (
-	<>
-	    <h2>{filename}</h2>
-	    <WalFile walContent={walContent} firstwalpos={firstwalpos} endwalpos={endwalpos} xidmap={xidmap}/>
-	</>
-    );
-}
-
-console.log('hey there');
-ReactDOM.render(<WalDumpApp/>, document.getElementById('waldump'));
--- a/mgmt-console/launch-google-cloud.sh
+++ b/mgmt-console/launch-google-cloud.sh
@@ -1,9 +0,0 @@
-#!/bin/bash
-#
-# NOTE: You must set the following environment variables before running this:
-#  BASIC_AUTH_PASSWORD - basic http auth password
-#  S3_ACCESSKEY
-#  S3_SECRET
-
-
-S3_ENDPOINT=https://storage.googleapis.com S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql-install/bin:$PATH flask run --host=0.0.0.0
--- a/mgmt-console/launch-local.sh
+++ b/mgmt-console/launch-local.sh
@@ -1,8 +0,0 @@
-#!/bin/bash
-#
-# NOTE: You should set the BASIC_AUTH_PASSWORD environment variable before calling
-
-# Launch S3 server
-(cd ms3 && python3 -m ms3.app --listen-address=localhost) &
-
-FLASK_ENV=development S3_REGION=auto S3_ENDPOINT=http://localhost:9009 S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql.fsmfork/bin:$PATH flask run --host=0.0.0.0
--- a/mgmt-console/package-lock.json
+++ b/mgmt-console/package-lock.json
--- a/mgmt-console/package.json
+++ b/mgmt-console/package.json
@@ -1,27 +0,0 @@
-{
-  "name": "starter-kit",
-  "version": "1.1.0",
-  "description": "",
-  "main": "index.js",
-  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1",
-    "build": "webpack",
-    "start": "python app.py"
-  },
-  "author": "",
-  "license": "ISC",
-  "dependencies": {
-    "react": "^17.0.1",
-    "react-dom": "^17.0.1",
-    "react-loader-spinner": "^4.0.0",
-    "react-router": "^5.2.0"
-  },
-  "devDependencies": {
-    "@babel/core": "^7.13.1",
-    "@babel/preset-env": "^7.13.5",
-    "@babel/preset-react": "^7.12.13",
-    "babel-loader": "^8.2.2",
-    "webpack": "^5.24.2",
-    "webpack-cli": "^4.5.0"
-  }
-}
--- a/mgmt-console/templates/index.html
+++ b/mgmt-console/templates/index.html
@@ -1,58 +0,0 @@
-<head>
-
-<style>
-  .status {
-      font-family: monospace;
-      background-color: lightgrey;
-  }
-  .shellcommand {
-      font-family: monospace;
-      background-color: lightgrey;
-  }
-  .result {
-      font-family: monospace;
-      background-color: lightgrey;
-      padding: 10px;
-  }
-
-
-  .todo   {font-style: italic;}
-
-
-  h1   {color: blue;}
-
-  .column {
-      float: left;
-      width: 50%;
-      padding: 10px;
-  }
-  /* Clear floats after the columns */
-  .row:after {
-      content: "";
-      display: table;
-      clear: both;
-  }
-
-  .sidenav {
-      float: left;
-      width: 150px;
-      padding: 10px;
-      background-color: pink;
-  }
-
-  .sidenav-item {
-      padding:10px 0px;
-      border:none;
-      display:block;
-  }
-
-</style>
-
-</head>
-
-<body>
-  <div id="reactApp"></div>
-
-  <!-- Attach React components -->
-  <script type="text/javascript" src="{{ url_for('static', filename='app_bundle.js') }}"></script>
-</body>
--- a/mgmt-console/templates/waldump.html
+++ b/mgmt-console/templates/waldump.html
@@ -1,46 +0,0 @@
-<head>
-
-<style>
-  .status {
-      font-family: monospace;
-      background-color: lightgrey;
-  }
-  .shellcommand {
-      font-family: monospace;
-      background-color: lightgrey;
-  }
-  .result {
-      font-family: monospace;
-      background-color: lightgrey;
-      padding: 10px;
-  }
-h1   {color: blue;}
-p    {color: red;}
-
-* {
-  box-sizing: border-box;
-}
-
-.row {
-  display: flex;
-}
-
-/* Create two equal columns that sits next to each other */
-.column1 {
-  flex: 30%;
-  padding: 10px;
-}
-.column2 {
-  flex: 70%;
-  padding: 10px;
-}
-</style>
-
-</head>
-
-<body>
-  <div id="waldump"></div>
-
-  <!-- Attach React components -->
-  <script type="text/javascript" src="{{ url_for('static', filename='waldump_bundle.js') }}"></script>
-</body>
--- a/mgmt-console/waldump.py
+++ b/mgmt-console/waldump.py
@@ -1,25 +0,0 @@
-#
-# This file contains work-in-progress code to visualize WAL contents.
-#
-# This is the API endpoint that calls a 'zenith_wal_to_json' executable,
-# which is a hacked version of pg_waldump that prints information about the
-# records in JSON format. The code in js/waldump.js displays it.
-#
-
-import os
-import re
-from subprocess import PIPE, STDOUT, run, Popen
-
-def fetch_wal(request, s3bucket):
-    filename = request.args.get('filename')
-    if not re.match("^[A-Za-z0-9_]+$", filename):
-        raise Exception('invalid WAL filename: ' + filename)
-
-    # FIXME: this downloads the WAL file to current dir. Use a temp dir? Pipe?
-    s3bucket.download_file('walarchive/' + filename, filename)
-
-    result = run("zenith_wal_to_json " + filename, stdout=PIPE, universal_newlines=True, shell=True)
-
-    os.unlink(filename);
-
-    return result.stdout
--- a/mgmt-console/webpack.config.js
+++ b/mgmt-console/webpack.config.js
@@ -1,27 +0,0 @@
-var webpack = require('webpack');  
-module.exports = {  
-    entry: {
-	app: './js/app.js',
-	waldump: './js/waldump.js'
-    },
-    output: {
-	filename: "[name]_bundle.js",
-	path: __dirname + '/static'
-    },
-    module: {
-	rules: [
-	    {
-		test: /\.js?$/,
-		exclude: /node_modules/,
-		use: {
-		    loader: 'babel-loader',
-		    options: {
-			presets: ['@babel/preset-env']
-		    }
-		}
-	    }
-	]
-    },
-    plugins: [
-    ]
-};
--- a/mgmt-console/zenith.py
+++ b/mgmt-console/zenith.py
@@ -1,179 +0,0 @@
-#zenith.py
-import click
-import testgres
-import os
-
-from testgres import PostgresNode
-from tabulate import tabulate
-
-zenith_base_dir = '/home/anastasia/zenith/basedir'
-
-@click.group()
-def main():
-    """Run the Zenith CLI."""
-
-@click.group()
-def pg():
-    """Db operations
-
-        NOTE: 'database' here means one postgresql node
-    """
-
-@click.command(name='create')
-@click.option('--name', required=True)
-@click.option('-s', '--storage-name', help='Name of the storage',
-                                 default='zenith-local',
-                                 show_default=True)
-@click.option('--snapshot', help='init from the snapshot. Snap is a name or URL')
-@click.option('--no-start', is_flag=True, help='Do not start created node',
-                            default=False, show_default=True)
-def pg_create(name, storage_name, snapshot, no_start):
-    """Initialize the database"""
-    node = PostgresNode()
-    base_dir = os.path.join(zenith_base_dir, 'pg', name)
-    node = testgres.get_new_node(name, base_dir=base_dir)
-    # TODO skip init, instead of that link node with storage or upload it from snapshot
-    node.init()
-    if(no_start==False):
-        node.start()
-
-@click.command(name='start')
-@click.option('--name', required=True)
-@click.option('--snapshot')
-@click.option('--read-only', is_flag=True, help='Start read-only node', show_default=True)
-def pg_start(name, snapshot, read_only):
-    """Start the database"""
-    node = PostgresNode()
-    base_dir = os.path.join(zenith_base_dir, 'pg', name)
-    node = testgres.get_new_node(name, base_dir=base_dir)
-    # TODO pass snapshot as a parameter
-    node.start()
-
-@click.command(name='stop')
-@click.option('--name', required=True)
-def pg_stop(name):
-    """Stop the database"""
-    node = PostgresNode()
-    base_dir = os.path.join(zenith_base_dir, 'pg', name)
-    node = testgres.get_new_node(name, base_dir=base_dir)
-    node.stop()
-
-@click.command(name='destroy')
-@click.option('--name', required=True)
-def pg_destroy(name):
-    """Drop the database"""
-    node = PostgresNode()
-    base_dir = os.path.join(zenith_base_dir, 'pg', name)
-    node = testgres.get_new_node(name, base_dir=base_dir)
-    node.cleanup()
-
-@click.command(name='list')
-def pg_list():
-    """List existing databases"""
-    dirs = os.listdir(os.path.join(zenith_base_dir, 'pg'))
-    path={}
-    status={}
-    data=[]
-
-    for dirname in dirs:
-        path[dirname] = os.path.join(zenith_base_dir, 'pg', dirname)
-        fname = os.path.join( path[dirname], 'data/postmaster.pid')
-        try:
-            f = open(fname,'r')
-            status[dirname] = f.readlines()[-1]
-        except OSError as err:
-            status[dirname]='inactive'
-        data.append([dirname , status[dirname], path[dirname]])
-
-    print(tabulate(data, headers=['Name', 'Status', 'Path']))
-
-pg.add_command(pg_create)
-pg.add_command(pg_destroy)
-pg.add_command(pg_start)   
-pg.add_command(pg_stop)   
-pg.add_command(pg_list)
-
-
-
-@click.group()
-def storage():
-    """Storage operations"""
-
-@click.command(name='attach')
-@click.option('--name')
-def storage_attach(name):
-    """Attach the storage"""
-
-@click.command(name='detach')
-@click.option('--name')
-@click.option('--force', is_flag=True, show_default=True)
-def storage_detach(name):
-    """Detach the storage"""
-
-@click.command(name='list')
-def storage_list():
-    """List existing storages"""
-
-storage.add_command(storage_attach)
-storage.add_command(storage_detach)
-storage.add_command(storage_list)
-
-@click.group()
-def snapshot():
-    """Snapshot operations"""
-
-@click.command(name='create')
-def snapshot_create():
-    """Create new snapshot"""
-
-@click.command(name='destroy')
-def snapshot_destroy():
-    """Destroy the snapshot"""
-
-@click.command(name='pull')
-def snapshot_pull():
-    """Pull remote snapshot"""
-
-@click.command(name='push')
-def snapshot_push():
-    """Push snapshot to remote"""
-
-@click.command(name='import')
-def snapshot_import():
-    """Convert given format to zenith snapshot"""
-
-@click.command(name='export')
-def snapshot_export():
-    """Convert zenith snapshot to PostgreSQL compatible format"""
-
-snapshot.add_command(snapshot_create)
-snapshot.add_command(snapshot_destroy)
-snapshot.add_command(snapshot_pull)
-snapshot.add_command(snapshot_push)
-snapshot.add_command(snapshot_import)
-snapshot.add_command(snapshot_export)
-
-@click.group()
-def wal():
-    """WAL operations"""
-
-@click.command()
-def wallist(name="list"):
-    """List WAL files"""
-
-wal.add_command(wallist)
-
-
-@click.command()
-def console():
-    """Open web console"""
-
-main.add_command(pg)
-main.add_command(storage)
-main.add_command(snapshot)
-main.add_command(wal)
-main.add_command(console)
-
-
-if __name__ == '__main__':
-    main()
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -24,13 +24,12 @@ clap = "2.33.0"
 termion = "1.5.6"
 tui = "0.14.0"
 daemonize = "0.4.1"
-rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
+rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
 tokio = { version = "1.3.0", features = ["full"] }
 tokio-stream = { version = "0.1.4" }
-tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
+postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
 rocksdb = "0.16.0"
 anyhow = "1.0"
 crc32c = "0.6.0"
@@ -38,7 +37,11 @@ walkdir = "2"
 thiserror = "1.0"
 hex = "0.4.3"
 tar = "0.4.33"
-parse_duration = "*"
+parse_duration = "2.1.1"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1"
+fs_extra = "1.2.0"

 postgres_ffi = { path = "../postgres_ffi" }
 zenith_utils = { path = "../zenith_utils" }
+workspace_hack = { path = "../workspace_hack" }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -1,20 +1,105 @@
+use crate::ZTimelineId;
 use log::*;
-use regex::Regex;
-use std::fmt;
 use std::io::Write;
+use std::sync::Arc;
 use tar::Builder;
 use walkdir::WalkDir;

-use crate::ZTimelineId;
+use crate::repository::Timeline;
+use postgres_ffi::relfile_utils::*;
+use zenith_utils::lsn::Lsn;

+///
+/// Generate a tarball of a snapshot's non-relation files and shared catalogs, plus all WAL
+///
+pub fn send_tarball_at_lsn(
+    write: &mut dyn Write,
+    timelineid: ZTimelineId,
+    _timeline: &Arc<dyn Timeline>,
+    _lsn: Lsn,
+    snapshot_lsn: Lsn,
+) -> anyhow::Result<()> {
+    let mut ar = Builder::new(write);
+
+    let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshot_lsn.0);
+    let walpath = format!("timelines/{}/wal", timelineid);
+
+    debug!("sending tarball of snapshot in {}", snappath);
+    for entry in WalkDir::new(&snappath) {
+        let entry = entry?;
+        let fullpath = entry.path();
+        let relpath = entry.path().strip_prefix(&snappath).unwrap();
+
+        if relpath.to_str().unwrap() == "" {
+            continue;
+        }
+
+        if entry.file_type().is_dir() {
+            trace!(
+                "sending dir {} as {}",
+                fullpath.display(),
+                relpath.display()
+            );
+            ar.append_dir(relpath, fullpath)?;
+        } else if entry.file_type().is_symlink() {
+            error!("ignoring symlink in snapshot dir");
+        } else if entry.file_type().is_file() {
+            // Shared catalogs (global/) are always sent, even if they are relation files
+            if relpath.starts_with("global/") {
+                trace!("sending shared catalog {}", relpath.display());
+                ar.append_path_with_name(fullpath, relpath)?;
+            } else if !is_rel_file_path(relpath.to_str().unwrap()) {
+                trace!("sending {}", relpath.display());
+                ar.append_path_with_name(fullpath, relpath)?;
+            } else {
+                trace!("not sending {}", relpath.display());
+            }
+        } else {
+            error!("unknown file type: {}", fullpath.display());
+        }
+    }
+
+    // FIXME: For now, also send all the WAL. The compute node only needs
+    // the WAL that applies to non-relation files, because the page
+    // server handles all the relation files, but we don't have a
+    // mechanism for separating relation and non-relation WAL at the
+    // moment.
+    for entry in std::fs::read_dir(&walpath)? {
+        let entry = entry?;
+        let fullpath = &entry.path();
+        let relpath = fullpath.strip_prefix(&walpath).unwrap();
+
+        if !entry.path().is_file() {
+            continue;
+        }
+
+        let archive_fname = relpath.to_str().unwrap();
+        let archive_fname = archive_fname
+            .strip_suffix(".partial")
+            .unwrap_or(&archive_fname);
+        let archive_path = "pg_wal/".to_owned() + archive_fname;
+        ar.append_path_with_name(fullpath, archive_path)?;
+    }
+    ar.finish()?;
+    debug!("all tarred up!");
+    Ok(())
+}
+
+///
+/// Send a tarball containing a snapshot of all non-relation files in the
+/// PostgreSQL data directory, at given LSN
+///
+/// There must be a snapshot at the given LSN in the snapshots directory; we cannot
+/// reconstruct the state at an arbitrary LSN at the moment.
+///
 pub fn send_snapshot_tarball(
    write: &mut dyn Write,
    timelineid: ZTimelineId,
-    snapshotlsn: u64,
+    snapshotlsn: Lsn,
 ) -> Result<(), std::io::Error> {
    let mut ar = Builder::new(write);

-    let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn);
+    let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn.0);
    let walpath = format!("timelines/{}/wal", timelineid);

    debug!("sending tarball of snapshot in {}", snappath);
@@ -48,7 +133,14 @@ pub fn send_snapshot_tarball(
                ar.append_path_with_name(fullpath, relpath)?;
            } else {
                trace!("not sending {}", relpath.display());
-                // FIXME: send all files for now
+
+                // FIXME: For now, also send all the relation files.
+                // This really shouldn't be necessary, and kind of
+                // defeats the point of having a page server in the
+                // first place. But it is useful at least when
+                // debugging with the DEBUG_COMPARE_LOCAL option (see
+                // vendor/postgres/src/backend/storage/smgr/pagestore_smgr.c)
+
                ar.append_path_with_name(fullpath, relpath)?;
            }
        } else {
@@ -56,7 +148,11 @@ pub fn send_snapshot_tarball(
        }
    }

-    // FIXME: also send all the WAL
+    // FIXME: For now, also send all the WAL. The compute node only needs
+    // the WAL that applies to non-relation files, because the page
+    // server handles all the relation files, but we don't have a
+    // mechanism for separating relation and non-relation WAL at the
+    // moment.
    for entry in std::fs::read_dir(&walpath)? {
        let entry = entry?;
        let fullpath = &entry.path();
@@ -79,73 +175,10 @@ pub fn send_snapshot_tarball(
    Ok(())
 }

-// formats:
-// <oid>
-// <oid>_<fork name>
-// <oid>.<segment number>
-// <oid>_<fork name>.<segment number>
-
-#[derive(Debug)]
-struct FilePathError {
-    msg: String,
-}
-
-impl FilePathError {
-    fn new(msg: &str) -> FilePathError {
-        FilePathError {
-            msg: msg.to_string(),
-        }
-    }
-}
-
-impl From<core::num::ParseIntError> for FilePathError {
-    fn from(e: core::num::ParseIntError) -> Self {
-        return FilePathError {
-            msg: format!("invalid filename: {}", e),
-        };
-    }
-}
-
-impl fmt::Display for FilePathError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "invalid filename")
-    }
-}
-
-fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
-    match forkname {
-        // "main" is not in filenames, it's implicit if the fork name is not present
-        None => Ok(0),
-        Some("fsm") => Ok(1),
-        Some("vm") => Ok(2),
-        Some("init") => Ok(3),
-        Some(_) => Err(FilePathError::new("invalid forkname")),
-    }
-}
-
-fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
-    let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
-
-    let caps = re
-        .captures(fname)
-        .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-
-    let relnode_str = caps.name("relnode").unwrap().as_str();
-    let relnode = u32::from_str_radix(relnode_str, 10)?;
-
-    let forkname = caps.name("forkname").map(|f| f.as_str());
-    let forknum = forkname_to_forknum(forkname)?;
-
-    let segno_match = caps.name("segno");
-    let segno = if segno_match.is_none() {
-        0
-    } else {
-        u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
-    };
-
-    Ok((relnode, forknum, segno))
-}
-
+///
+/// Parse a path, relative to the root of PostgreSQL data directory, as
+/// a PostgreSQL relation data file.
+///
 fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
    /*
     * Relation data files can be in one of the following directories:
@@ -165,30 +198,27 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
     * <oid>.<segment number>
     */
    if let Some(fname) = path.strip_prefix("global/") {
-        let (_relnode, _forknum, _segno) = parse_filename(fname)?;
+        let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;

        Ok(())
    } else if let Some(dbpath) = path.strip_prefix("base/") {
        let mut s = dbpath.split('/');
-        let dbnode_str = s
-            .next()
-            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-        let _dbnode = u32::from_str_radix(dbnode_str, 10)?;
-        let fname = s
-            .next()
-            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
+        let dbnode_str = s.next().ok_or(FilePathError::InvalidFileName)?;
+        let _dbnode = dbnode_str.parse::<u32>()?;
+        let fname = s.next().ok_or(FilePathError::InvalidFileName)?;
        if s.next().is_some() {
-            return Err(FilePathError::new("invalid relation data file name"));
+            return Err(FilePathError::InvalidFileName);
        };

-        let (_relnode, _forknum, _segno) = parse_filename(fname)?;
+        let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;

        Ok(())
-    } else if let Some(_) = path.strip_prefix("pg_tblspc/") {
+    } else if path.strip_prefix("pg_tblspc/").is_some() {
        // TODO
-        Err(FilePathError::new("tablespaces not supported"))
+        error!("tablespaces not implemented yet");
+        Err(FilePathError::InvalidFileName)
    } else {
-        Err(FilePathError::new("invalid relation data file name"))
+        Err(FilePathError::InvalidFileName)
    }
 }

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -4,19 +4,23 @@

 use log::*;
 use parse_duration::parse;
-use std::fs::{self, OpenOptions};
 use std::io;
 use std::process::exit;
 use std::thread;
 use std::time::Duration;
+use std::{env, path::PathBuf};
+use std::{
+    fs::{File, OpenOptions},
+    net::TcpListener,
+};

 use anyhow::{Context, Result};
 use clap::{App, Arg};
 use daemonize::Daemonize;

-use slog::Drain;
+use slog::{Drain, FnValue};

-use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf};
+use pageserver::{branches, page_cache, page_service, tui, PageServerConf};

 const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
 const DEFAULT_GC_PERIOD_SEC: u64 = 10;
@@ -47,6 +51,12 @@ fn main() -> Result<()> {
                .takes_value(false)
                .help("Run in the background"),
        )
+        .arg(
+            Arg::with_name("init")
+                .long("init")
+                .takes_value(false)
+                .help("Initialize pageserver repo"),
+        )
        .arg(
            Arg::with_name("gc_horizon")
                .long("gc_horizon")
@@ -59,14 +69,46 @@ fn main() -> Result<()> {
                .takes_value(true)
                .help("Interval between garbage collector iterations"),
        )
+        .arg(
+            Arg::with_name("workdir")
+                .short("D")
+                .long("workdir")
+                .takes_value(true)
+                .help("Working directory for the pageserver"),
+        )
        .get_matches();

+    let workdir = if let Some(workdir_arg) = arg_matches.value_of("workdir") {
+        PathBuf::from(workdir_arg)
+    } else if let Some(workdir_arg) = std::env::var_os("ZENITH_REPO_DIR") {
+        PathBuf::from(workdir_arg.to_str().unwrap())
+    } else {
+        PathBuf::from(".zenith")
+    };
+
+    let pg_distrib_dir: PathBuf = {
+        if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
+            postgres_bin.into()
+        } else {
+            let cwd = env::current_dir()?;
+            cwd.join("tmp_install")
+        }
+    };
+
+    if !pg_distrib_dir.join("bin/postgres").exists() {
+        anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
+    }
+
    let mut conf = PageServerConf {
        daemonize: false,
        interactive: false,
        gc_horizon: DEFAULT_GC_HORIZON,
        gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC),
-        listen_addr: "127.0.0.1:5430".parse().unwrap(),
+        listen_addr: "127.0.0.1:64000".parse().unwrap(),
+        // we will change the current working directory to the repository below,
+        // so always set 'workdir' to '.'
+        workdir: PathBuf::from("."),
+        pg_distrib_dir,
    };

    if arg_matches.is_present("daemonize") {
@@ -94,55 +136,68 @@ fn main() -> Result<()> {
        conf.gc_period = parse(period)?;
    }

-    start_pageserver(&conf)
+    // The configuration is all set up now. Turn it into a 'static
+    // that can be freely stored in structs and passed across threads
+    // as a ref.
+    let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+
+    // Create repo and exit if init was requested
+    if arg_matches.is_present("init") {
+        branches::init_repo(conf, &workdir)?;
+        return Ok(());
+    }
+
+    // Set CWD to workdir. This is done in all modes, before any daemonizing.
+    env::set_current_dir(&workdir)?;
+
+    start_pageserver(conf)
 }

-fn start_pageserver(conf: &PageServerConf) -> Result<()> {
+fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
+    let log_filename = "pageserver.log";
+    // Don't open the same file for output multiple times;
+    // the different fds could overwrite each other's output.
+    let log_file = OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(&log_filename)
+        .with_context(|| format!("failed to open {:?}", &log_filename))?;
+
    // Initialize logger
-    let _scope_guard = init_logging(&conf)?;
+    let logger_file = log_file.try_clone().unwrap();
+    let _scope_guard = init_logging(&conf, logger_file)?;
    let _log_guard = slog_stdlog::init()?;

    // Note: this `info!(...)` macro comes from `log` crate
    info!("standard logging redirected to slog");

-    let tui_thread: Option<thread::JoinHandle<()>>;
-    if conf.interactive {
+    let tui_thread = if conf.interactive {
        // Initialize the UI
-        tui_thread = Some(
+        Some(
            thread::Builder::new()
                .name("UI thread".into())
                .spawn(|| {
                    let _ = tui::ui_main();
                })
                .unwrap(),
-        );
-        //threads.push(tui_thread);
+        )
    } else {
-        tui_thread = None;
-    }
+        None
+    };
+
+    // TODO: Check that it looks like a valid repository before going further

    if conf.daemonize {
        info!("daemonizing...");

-        let repodir = zenith_repo_dir();
-
        // There should'n be any logging to stdin/stdout. Redirect it to the main log so
        // that we will see any accidental manual fprintf's or backtraces.
-        let log_filename = repodir.join("pageserver.log");
-        let stdout = OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(&log_filename)
-            .with_context(|| format!("failed to open {:?}", &log_filename))?;
-        let stderr = OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(&log_filename)
-            .with_context(|| format!("failed to open {:?}", &log_filename))?;
+        let stdout = log_file.try_clone().unwrap();
+        let stderr = log_file;

        let daemonize = Daemonize::new()
-            .pid_file(repodir.join("pageserver.pid"))
-            .working_directory(repodir)
+            .pid_file("pageserver.pid")
+            .working_directory(".")
            .stdout(stdout)
            .stderr(stderr);

@@ -150,69 +205,42 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
            Ok(_) => info!("Success, daemonized"),
            Err(e) => error!("Error, {}", e),
        }
-    } else {
-        // change into the repository directory. In daemon mode, Daemonize
-        // does this for us.
-        let repodir = zenith_repo_dir();
-        std::env::set_current_dir(&repodir)?;
-        info!("Changed current directory to repository in {:?}", &repodir);
    }

-    let mut threads = Vec::new();
+    // Check that we can bind to address before further initialization
+    info!("Starting pageserver on {}", conf.listen_addr);
+    let pageserver_listener = TcpListener::bind(conf.listen_addr)?;

-    // TODO: Check that it looks like a valid repository before going further
+    // Initialize page cache, this will spawn walredo_thread
+    page_cache::init(conf);

-    // Create directory for wal-redo datadirs
-    match fs::create_dir("wal-redo") {
-        Ok(_) => {}
-        Err(e) => match e.kind() {
-            io::ErrorKind::AlreadyExists => {}
-            _ => {
-                anyhow::bail!("Failed to create wal-redo data directory: {}", e);
-            }
-        },
-    }
-
-    // GetPage@LSN requests are served by another thread. (It uses async I/O,
-    // but the code in page_service sets up it own thread pool for that)
-    let conf_copy = conf.clone();
-    let page_server_thread = thread::Builder::new()
+    // Spawn a thread to listen for connections. It will spawn further threads
+    // for each connection.
+    let page_service_thread = thread::Builder::new()
        .name("Page Service thread".into())
-        .spawn(move || {
-            // thread code
-            page_service::thread_main(&conf_copy);
-        })
-        .unwrap();
-    threads.push(page_server_thread);
+        .spawn(move || page_service::thread_main(conf, pageserver_listener))?;

    if let Some(tui_thread) = tui_thread {
        // The TUI thread exits when the user asks to Quit.
        tui_thread.join().unwrap();
    } else {
-        // In non-interactive mode, wait forever.
-        for t in threads {
-            t.join().unwrap()
-        }
+        page_service_thread
+            .join()
+            .expect("Page service thread has panicked")?
    }
+
    Ok(())
 }

-fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
+fn init_logging(
+    conf: &PageServerConf,
+    log_file: File,
+) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
    if conf.interactive {
        Ok(tui::init_logging())
    } else if conf.daemonize {
-        let log = zenith_repo_dir().join("pageserver.log");
-        let log_file = OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(&log)
-            .map_err(|err| {
-                // We failed to initialize logging, so we can't log this message with error!
-                eprintln!("Could not create log file {:?}: {}", log, err);
-                err
-            })?;
        let decorator = slog_term::PlainSyncDecorator::new(log_file);
-        let drain = slog_term::CompactFormat::new(decorator).build();
+        let drain = slog_term::FullFormat::new(decorator).build();
        let drain = slog::Filter::new(drain, |record: &slog::Record| {
            if record.level().is_at_least(slog::Level::Info) {
                return true;
@@ -220,7 +248,20 @@ fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard,
            false
        });
        let drain = std::sync::Mutex::new(drain).fuse();
-        let logger = slog::Logger::root(drain, slog::o!());
+        let logger = slog::Logger::root(
+            drain,
+            slog::o!(
+                "location" =>
+                FnValue(move |record| {
+                    format!("{}, {}:{}",
+                            record.module(),
+                            record.file(),
+                            record.line()
+                            )
+                    }
+                )
+            ),
+        );
        Ok(slog_scope::set_global_logger(logger))
    } else {
        let decorator = slog_term::TermDecorator::new().build();
--- a/pageserver/src/branches.rs
+++ b/pageserver/src/branches.rs
@@ -0,0 +1,460 @@
+//
+// Branch management code
+//
+// TODO: move all paths construction to conf impl
+//
+
+use anyhow::{anyhow, bail, Context, Result};
+use bytes::Bytes;
+use fs::File;
+use postgres_ffi::{pg_constants, xlog_utils};
+use rand::Rng;
+use serde::{Deserialize, Serialize};
+use std::env;
+use std::io::{Read, Write};
+use std::{
+    collections::HashMap,
+    fs, io,
+    path::{Path, PathBuf},
+    process::{Command, Stdio},
+    str::FromStr,
+};
+use zenith_utils::lsn::Lsn;
+
+use crate::page_cache;
+use crate::restore_local_repo;
+use crate::{repository::Repository, PageServerConf, ZTimelineId};
+
+#[derive(Serialize, Deserialize, Clone)]
+pub struct BranchInfo {
+    pub name: String,
+    pub timeline_id: ZTimelineId,
+    pub latest_valid_lsn: Option<Lsn>,
+    pub ancestor_id: Option<String>,
+    pub ancestor_lsn: Option<String>,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct PointInTime {
+    pub timelineid: ZTimelineId,
+    pub lsn: Lsn,
+}
+
+/// Create a new repository in `repo_dir`: lay out the directory structure, run
+/// initdb, import the resulting cluster into a new timeline and register that
+/// timeline as the "main" branch. As a side effect, the process's current
+/// directory is changed to `repo_dir`.
+pub fn init_repo(conf: &'static PageServerConf, repo_dir: &Path) -> Result<()> {
+    // top-level dir may exist if we are creating it through CLI
+    fs::create_dir_all(repo_dir)
+        .with_context(|| format!("could not create directory {}", repo_dir.display()))?;
+
+    env::set_current_dir(repo_dir)?;
+
+    fs::create_dir(std::path::Path::new("timelines"))?;
+    fs::create_dir(std::path::Path::new("refs"))?;
+    fs::create_dir(std::path::Path::new("refs").join("branches"))?;
+    fs::create_dir(std::path::Path::new("refs").join("tags"))?;
+
+    println!("created directory structure in {}", repo_dir.display());
+
+    // Run initdb
+    //
+    // We create the cluster temporarily in a "tmp" directory inside the repository,
+    // and move it to the right location from there.
+    let tmppath = std::path::Path::new("tmp");
+
+    print!("running initdb... ");
+    io::stdout().flush()?;
+
+    let initdb_path = conf.pg_bin_dir().join("initdb");
+    let initdb_output = Command::new(initdb_path)
+        .args(&["-D", tmppath.to_str().unwrap()])
+        .arg("--no-instructions")
+        .env_clear()
+        .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+        .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+        .stdout(Stdio::null())
+        .output()
+        .with_context(|| "failed to execute initdb")?;
+    if !initdb_output.status.success() {
+        // stdout is discarded, but output() still captures stderr; report it.
+        bail!("initdb failed: '{}'", String::from_utf8_lossy(&initdb_output.stderr));
+    }
+    println!("initdb succeeded");
+
+    // Read control file to extract the checkpoint LSN
+    let controlfile_path = tmppath.join("global").join("pg_control");
+    let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
+    let lsn = controlfile.checkPoint;
+    let lsnstr = format!("{:016X}", lsn);
+
+    // Bootstrap the repository by loading the newly-initdb'd cluster into 'main' branch.
+    let tli = create_timeline(conf, None)?;
+    let timelinedir = conf.timeline_path(tli);
+
+    // We don't use page_cache here, because we don't want to spawn the WAL redo thread during
+    // repository initialization.
+    //
+    // FIXME: That caused trouble, because the WAL redo thread launched initdb in the background,
+    // and it kept running even after the "zenith init" had exited. In tests, we started the
+    // page server immediately after that, so that initdb was still running in the background,
+    // and we failed to run initdb again in the same directory. This has been solved for the
+    // rapid init+start case now, but the general race condition remains if you restart the
+    // server quickly.
+    let repo = crate::repository::rocksdb::RocksRepository::new(
+        conf,
+        std::sync::Arc::new(crate::walredo::DummyRedoManager {}),
+    );
+    let timeline = repo.create_empty_timeline(tli, Lsn(lsn))?;
+
+    restore_local_repo::import_timeline_from_postgres_datadir(&tmppath, &*timeline, Lsn(lsn))?;
+
+    // Move the initial WAL file
+    fs::rename(
+        tmppath.join("pg_wal").join("000000010000000000000001"),
+        timelinedir
+            .join("wal")
+            .join("000000010000000000000001.partial"),
+    )?;
+    println!("created initial timeline {}", tli);
+
+    fs::write(conf.branch_path("main"), tli.to_string())?;
+    println!("created main branch");
+
+    // Remove pg_wal
+    fs::remove_dir_all(tmppath.join("pg_wal"))?;
+
+    force_crash_recovery(&tmppath)?;
+    println!("updated pg_control");
+
+    // Move the data directory as an initial base backup.
+    // FIXME: It would be enough to only copy the non-relation files here, the relation
+    // data was already loaded into the repository.
+    let target = timelinedir.join("snapshots").join(&lsnstr);
+    fs::rename(tmppath, &target)?;
+
+    println!("new zenith repository was created in {}", repo_dir.display());
+
+    Ok(())
+}
+
+/// List the branches recorded under `<workdir>/refs/branches`: each file there is
+/// named after a branch and contains that branch's timeline id as text.
+pub(crate) fn get_branches(
+    conf: &PageServerConf,
+    repository: &dyn Repository,
+) -> Result<Vec<BranchInfo>> {
+    let branches_dir = conf.workdir.join("refs").join("branches");
+
+    std::fs::read_dir(&branches_dir)?
+        .map(|dir_entry_res| {
+            let dir_entry = dir_entry_res?;
+            let name = dir_entry.file_name().to_str().unwrap().to_string();
+            let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
+
+            let latest_valid_lsn = repository
+                .get_timeline(timeline_id)
+                .map(|timeline| timeline.get_last_valid_lsn())
+                .ok();
+
+            let ancestor_path = conf.ancestor_path(timeline_id);
+            let mut ancestor_id: Option<String> = None;
+            let mut ancestor_lsn: Option<String> = None;
+
+            if ancestor_path.exists() {
+                let ancestor = std::fs::read_to_string(ancestor_path)?;
+                let mut strings = ancestor.split('@');
+
+                ancestor_id = Some(
+                    strings
+                        .next()
+                        .with_context(|| "wrong branch ancestor point in time format")?
+                        .to_owned(),
+                );
+                ancestor_lsn = Some(
+                    strings
+                        .next()
+                        .with_context(|| "wrong branch ancestor point in time format")?
+                        .to_owned(),
+                );
+            }
+
+            Ok(BranchInfo {
+                name,
+                timeline_id,
+                latest_valid_lsn,
+                ancestor_id,
+                ancestor_lsn,
+            })
+        })
+        .collect()
+}
+
+/// Read the system identifier from pg_control in the latest snapshot of the main branch.
+pub(crate) fn get_system_id(conf: &PageServerConf) -> Result<u64> {
+    // Map each branch name under <workdir>/refs/branches to its timeline id.
+    let branches_dir = conf.workdir.join("refs").join("branches");
+    let branches = std::fs::read_dir(&branches_dir)?
+        .map(|dir_entry_res| {
+            let dir_entry = dir_entry_res?;
+            let name = dir_entry.file_name().to_str().unwrap().to_string();
+            let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
+            Ok((name, timeline_id))
+        })
+        .collect::<Result<HashMap<String, ZTimelineId>>>()?;
+
+    let main_tli = branches
+        .get("main")
+        .ok_or_else(|| anyhow!("Branch main not found"))?;
+
+    let (_, main_snap_dir) = find_latest_snapshot(conf, *main_tli)?;
+    let controlfile_path = main_snap_dir.join("global").join("pg_control");
+    let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
+    Ok(controlfile.system_identifier)
+}
+
+pub(crate) fn create_branch(
+    conf: &PageServerConf,
+    branchname: &str,
+    startpoint_str: &str,
+) -> Result<BranchInfo> {
+    if conf.branch_path(&branchname).exists() {
+        anyhow::bail!("branch {} already exists", branchname);
+    }
+
+    let mut startpoint = parse_point_in_time(conf, startpoint_str)?;
+
+    if startpoint.lsn == Lsn(0) {
+        // Find end of WAL on the old timeline
+        let end_of_wal = find_end_of_wal(conf, startpoint.timelineid)?;
+        println!("branching at end of WAL: {}", end_of_wal);
+        startpoint.lsn = end_of_wal;
+    }
+
+    // create a new timeline directory for it
+    let newtli = create_timeline(conf, Some(startpoint))?;
+    let newtimelinedir = conf.timeline_path(newtli);
+
+    // Let the Repository backend do its initialization
+    let repo = page_cache::get_repository();
+    repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;
+
+    // Copy the latest snapshot (TODO: before the startpoint) and all WAL
+    // TODO: be smarter and avoid the copying...
+    let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(conf, startpoint.timelineid)?;
+    let copy_opts = fs_extra::dir::CopyOptions::new();
+    fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), &copy_opts)?;
+
+    let oldtimelinedir = conf.timeline_path(startpoint.timelineid);
+    copy_wal(
+        &oldtimelinedir.join("wal"),
+        &newtimelinedir.join("wal"),
+        startpoint.lsn,
+        pg_constants::WAL_SEGMENT_SIZE,
+    )?;
+
+    // Remember the human-readable branch name for the new timeline.
+    // FIXME: there's a race condition, if you create a branch with the same
+    // name concurrently.
+    let data = newtli.to_string();
+    fs::write(conf.branch_path(&branchname), data)?;
+
+    Ok(BranchInfo {
+        name: branchname.to_string(),
+        timeline_id: newtli,
+        latest_valid_lsn: Some(startpoint.lsn),
+        ancestor_id: None,
+        ancestor_lsn: None,
+    })
+}
+
+//
+// Parse user-given string that represents a point-in-time.
+//
+// We support multiple variants:
+//
+// Raw timeline id in hex, meaning the end of that timeline:
+//    bc62e7d612d0e6fe8f99a6dd2f281f9d
+//
+// A specific LSN on a timeline:
+//    bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
+//
+// Same, with a human-friendly branch name:
+//    main
+//    main@2/15D3DD8
+//
+// Human-friendly tag name:
+//    mytag
+//
+//
+fn parse_point_in_time(conf: &PageServerConf, s: &str) -> Result<PointInTime> {
+    let mut strings = s.split('@');
+    let name = strings.next().unwrap();
+
+    let lsn: Option<Lsn>;
+    if let Some(lsnstr) = strings.next() {
+        lsn = Some(
+            Lsn::from_str(lsnstr).with_context(|| "invalid LSN in point-in-time specification")?,
+        );
+    } else {
+        lsn = None
+    }
+
+    // Check if it's a tag
+    if lsn.is_none() {
+        let tagpath = conf.tag_path(name);
+        if tagpath.exists() {
+            let pointstr = fs::read_to_string(tagpath)?;
+
+            return parse_point_in_time(conf, &pointstr);
+        }
+    }
+
+    // Check if it's a branch
+    // Check if it's branch @ LSN
+    let branchpath = conf.branch_path(name);
+    if branchpath.exists() {
+        let pointstr = fs::read_to_string(branchpath)?;
+
+        let mut result = parse_point_in_time(conf, &pointstr)?;
+
+        result.lsn = lsn.unwrap_or(Lsn(0));
+        return Ok(result);
+    }
+
+    // Check if it's a timelineid
+    // Check if it's timelineid @ LSN
+    if let Ok(timelineid) = ZTimelineId::from_str(name) {
+        let tlipath = conf.timeline_path(timelineid);
+        if tlipath.exists() {
+            return Ok(PointInTime {
+                timelineid,
+                lsn: lsn.unwrap_or(Lsn(0)),
+            });
+        }
+    }
+
+    bail!("could not parse point-in-time {}", s);
+}
+
+// If control file says the cluster was shut down cleanly, modify it, to mark
+// it as crashed. That forces crash recovery when you start the cluster.
+//
+// FIXME:
+// We currently do this to the initial snapshot in "zenith init". It would
+// be more natural to do this when the snapshot is restored instead, but we
+// currently don't have any code to create new snapshots, so it doesn't matter
+// Or better yet, use a less hacky way of putting the cluster into recovery.
+// Perhaps create a backup label file in the data directory when it's restored.
+fn force_crash_recovery(datadir: &Path) -> Result<()> {
+    // Read in the control file
+    let controlfilepath = datadir.to_path_buf().join("global").join("pg_control");
+    let mut controlfile =
+        postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?;
+
+    controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION;
+
+    fs::write(
+        controlfilepath.as_path(),
+        postgres_ffi::encode_pg_control(controlfile),
+    )?;
+
+    Ok(())
+}
+
+fn create_timeline(conf: &PageServerConf, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
+    // Generate a random 16-byte ID for the new timeline
+    let mut tli_buf = [0u8; 16];
+    rand::thread_rng().fill(&mut tli_buf);
+    let timelineid = ZTimelineId::from(tli_buf);
+
+    let timelinedir = conf.timeline_path(timelineid);
+
+    fs::create_dir(&timelinedir)?;
+    fs::create_dir(&timelinedir.join("snapshots"))?;
+    fs::create_dir(&timelinedir.join("wal"))?;
+
+    if let Some(ancestor) = ancestor {
+        let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
+        fs::write(timelinedir.join("ancestor"), data)?;
+    }
+
+    Ok(timelineid)
+}
+
+///
+/// Copy all WAL segments from one directory to another, up to given LSN.
+///
+/// If the given LSN is in the middle of a segment, the last segment containing it
+/// is written out as .partial, and padded with zeros.
+///
+fn copy_wal(src_dir: &Path, dst_dir: &Path, upto: Lsn, wal_seg_size: usize) -> Result<()> {
+    let last_segno = upto.segment_number(wal_seg_size);
+    let last_segoff = upto.segment_offset(wal_seg_size);
+
+    // Failing to open src_dir is an error; unreadable entries are skipped.
+    for entry in fs::read_dir(src_dir)?.flatten() {
+        let entry_name = entry.file_name();
+        let fname = entry_name.to_str().unwrap();
+        // Check if the filename looks like an xlog file, or a .partial file.
+        if !xlog_utils::IsXLogFileName(fname) && !xlog_utils::IsPartialXLogFileName(fname) {
+            continue;
+        }
+        let (segno, _tli) = xlog_utils::XLogFromFileName(fname, wal_seg_size);
+
+        let copylen;
+        let mut dst_fname = PathBuf::from(fname);
+        if segno > last_segno {
+            // future segment, skip
+            continue;
+        } else if segno < last_segno {
+            copylen = wal_seg_size;
+            dst_fname.set_extension("");
+        } else {
+            copylen = last_segoff;
+            dst_fname.set_extension("partial");
+        }
+
+        let src_file = File::open(entry.path())?;
+        let mut dst_file = File::create(dst_dir.join(&dst_fname))?;
+        std::io::copy(&mut src_file.take(copylen as u64), &mut dst_file)?;
+
+        if copylen < wal_seg_size {
+            std::io::copy(
+                &mut std::io::repeat(0).take((wal_seg_size - copylen) as u64),
+                &mut dst_file,
+            )?;
+        }
+    }
+    Ok(())
+}
+
+// Find the end of valid WAL in a wal directory
+pub fn find_end_of_wal(conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
+    let waldir = conf.timeline_path(timeline).join("wal");
+    let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, pg_constants::WAL_SEGMENT_SIZE, true);
+    Ok(Lsn(lsn))
+}
+
+// Find the latest snapshot for a timeline.
+//
+// Snapshot directories are named after their LSN in hex; the one with the
+// highest LSN wins. Returns that LSN together with the directory's path.
+fn find_latest_snapshot(conf: &PageServerConf, timeline: ZTimelineId) -> Result<(Lsn, PathBuf)> {
+    let snapshotsdir = conf.snapshots_path(timeline);
+    let mut latest: Option<(Lsn, PathBuf)> = None;
+    for path in fs::read_dir(&snapshotsdir)? {
+        let path = path?;
+        let filename = path.file_name().to_str().unwrap().to_owned();
+        if let Ok(lsn) = Lsn::from_hex(&filename) {
+            // read_dir order is arbitrary: only remember the path when its LSN
+            // beats the best one so far, so the returned path matches the LSN.
+            if lsn > latest.as_ref().map_or(Lsn(0), |(best, _)| *best) {
+                latest = Some((lsn, path.path()));
+            }
+        }
+    }
+    // TODO: check ancestor timeline
+    latest.ok_or_else(|| anyhow!("no snapshot found in {}", snapshotsdir.display()))
+}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,5 @@
+use serde::{Deserialize, Serialize};
+
 use std::fmt;
 use std::net::SocketAddr;
 use std::path::PathBuf;
@@ -5,8 +7,10 @@ use std::str::FromStr;
 use std::time::Duration;

 pub mod basebackup;
+pub mod branches;
 pub mod page_cache;
 pub mod page_service;
+pub mod repository;
 pub mod restore_local_repo;
 pub mod tui;
 pub mod tui_event;
@@ -22,6 +26,54 @@ pub struct PageServerConf {
    pub listen_addr: SocketAddr,
    pub gc_horizon: u64,
    pub gc_period: Duration,
+
+    // Repository directory, relative to current working directory.
+    // Normally, the page server changes the current working directory
+    // to the repository, and 'workdir' is always '.'. But we don't do
+    // that during unit testing, because the current directory is global
+    // to the process but different unit tests work on different
+    // repositories.
+    pub workdir: PathBuf,
+
+    pub pg_distrib_dir: PathBuf,
+}
+
+impl PageServerConf {
+    //
+    // Repository paths, relative to workdir.
+    //
+
+    fn tag_path(&self, name: &str) -> PathBuf {
+        self.workdir.join("refs").join("tags").join(name)
+    }
+
+    fn branch_path(&self, name: &str) -> PathBuf {
+        self.workdir.join("refs").join("branches").join(name)
+    }
+
+    fn timeline_path(&self, timelineid: ZTimelineId) -> PathBuf {
+        self.workdir.join("timelines").join(timelineid.to_string())
+    }
+
+    fn snapshots_path(&self, timelineid: ZTimelineId) -> PathBuf {
+        self.timeline_path(timelineid).join("snapshots")
+    }
+
+    fn ancestor_path(&self, timelineid: ZTimelineId) -> PathBuf {
+        self.timeline_path(timelineid).join("ancestor")
+    }
+
+    //
+    // Postgres distribution paths
+    //
+
+    pub fn pg_bin_dir(&self) -> PathBuf {
+        self.pg_distrib_dir.join("bin")
+    }
+
+    pub fn pg_lib_dir(&self) -> PathBuf {
+        self.pg_distrib_dir.join("lib")
+    }
 }

 /// Zenith Timeline ID is a 128-bit random ID.
@@ -48,7 +100,7 @@ pub struct PageServerConf {
 /// is separate from PostgreSQL timelines, and doesn't have those
 /// limitations. A zenith timeline is identified by a 128-bit ID, which
 /// is usually printed out as a hex string.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct ZTimelineId([u8; 16]);

 impl FromStr for ZTimelineId {
@@ -84,11 +136,3 @@ impl fmt::Display for ZTimelineId {
        f.write_str(&hex::encode(self.0))
    }
 }
-
-pub fn zenith_repo_dir() -> PathBuf {
-    // Find repository path
-    match std::env::var_os("ZENITH_REPO_DIR") {
-        Some(val) => PathBuf::from(val.to_str().unwrap()),
-        None => ".zenith".into(),
-    }
-}
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -1,917 +1,32 @@
-//
-// Page Cache holds all the different page versions and WAL records
-//
-// Currently, the page cache uses RocksDB to store WAL wal records and
-// full page images, keyed by the RelFileNode, blocknumber, and the
-// LSN.
+//! This module acts as a switchboard to access different repositories managed by this
+//! page server. Currently, a Page Server can only manage one repository, so there
+//! isn't much here. If we implement multi-tenancy, this will probably be changed into
+//! a hash map, keyed by the tenant ID.

-use crate::restore_local_repo::restore_timeline;
-use crate::waldecoder::Oid;
-use crate::walredo::WalRedoManager;
-use crate::ZTimelineId;
-use crate::{zenith_repo_dir, PageServerConf};
-use anyhow::{bail, Context};
-use bytes::{Buf, BufMut, Bytes, BytesMut};
+use crate::repository::rocksdb::RocksRepository;
+use crate::repository::Repository;
+use crate::walredo::PostgresRedoManager;
+use crate::PageServerConf;
 use lazy_static::lazy_static;
-use log::*;
-use std::cmp::min;
-use std::collections::HashMap;
-use std::sync::atomic::AtomicU64;
-use std::sync::atomic::Ordering;
 use std::sync::{Arc, Mutex};
-use std::thread;
-use std::time::{Duration, Instant};
-use std::{convert::TryInto, ops::AddAssign};
-use zenith_utils::lsn::{AtomicLsn, Lsn};
-use zenith_utils::seqwait::SeqWait;
-
-// Timeout when waiting or WAL receiver to catch up to an LSN given in a GetPage@LSN call.
-static TIMEOUT: Duration = Duration::from_secs(60);
-
-pub struct PageCache {
-    // RocksDB handle
-    db: rocksdb::DB,
-
-    // WAL redo manager
-    walredo_mgr: WalRedoManager,
-
-    // What page versions do we hold in the cache? If we get GetPage with
-    // LSN < first_valid_lsn, that's an error because we (no longer) hold that
-    // page version. If we get a request > last_valid_lsn, we need to wait until
-    // we receive all the WAL up to the request. The SeqWait provides functions
-    // for that.
-    //
-    // last_record_lsn points to the end of last processed WAL record.
-    // It can lag behind last_valid_lsn, if the WAL receiver has received some WAL
-    // after the end of last record, but not the whole next record yet. In the
-    // page cache, we care about last_valid_lsn, but if the WAL receiver needs to
-    // restart the streaming, it needs to restart at the end of last record, so
-    // we track them separately. last_record_lsn should perhaps be in
-    // walreceiver.rs instead of here, but it seems convenient to keep all three
-    // values together.
-    //
-    first_valid_lsn: AtomicLsn,
-    last_valid_lsn: SeqWait<Lsn>,
-    last_record_lsn: AtomicLsn,
-
-    // Counters, for metrics collection.
-    pub num_entries: AtomicU64,
-    pub num_page_images: AtomicU64,
-    pub num_wal_records: AtomicU64,
-    pub num_getpage_requests: AtomicU64,
-}
-
-#[derive(Clone)]
-pub struct PageCacheStats {
-    pub num_entries: u64,
-    pub num_page_images: u64,
-    pub num_wal_records: u64,
-    pub num_getpage_requests: u64,
-}
-
-impl AddAssign for PageCacheStats {
-    fn add_assign(&mut self, other: Self) {
-        self.num_entries += other.num_entries;
-        self.num_page_images += other.num_page_images;
-        self.num_wal_records += other.num_wal_records;
-        self.num_getpage_requests += other.num_getpage_requests;
-    }
-}

 lazy_static! {
-    pub static ref PAGECACHES: Mutex<HashMap<ZTimelineId, Arc<PageCache>>> =
-        Mutex::new(HashMap::new());
+    pub static ref REPOSITORY: Mutex<Option<Arc<dyn Repository + Send + Sync>>> = Mutex::new(None);
 }

-// Get Page Cache for given timeline. It is assumed to already exist.
-pub fn get_pagecache(_conf: &PageServerConf, timelineid: ZTimelineId) -> Option<Arc<PageCache>> {
-    let pcaches = PAGECACHES.lock().unwrap();
+pub fn init(conf: &'static PageServerConf) {
+    let mut m = REPOSITORY.lock().unwrap();

-    match pcaches.get(&timelineid) {
-        Some(pcache) => Some(pcache.clone()),
-        None => None,
-    }
+    // Set up a WAL redo manager, for applying WAL records.
+    let walredo_mgr = PostgresRedoManager::new(conf);
+
+    // we have already changed current dir to the repository.
+    let repo = RocksRepository::new(conf, Arc::new(walredo_mgr));
+
+    *m = Some(Arc::new(repo));
 }

-pub fn get_or_restore_pagecache(
-    conf: &PageServerConf,
-    timelineid: ZTimelineId,
-) -> anyhow::Result<Arc<PageCache>> {
-    let mut pcaches = PAGECACHES.lock().unwrap();
-    match pcaches.get(&timelineid) {
-        Some(pcache) => Ok(pcache.clone()),
-        None => {
-            let pcache = init_page_cache(conf, timelineid);
-
-            restore_timeline(conf, &pcache, timelineid)?;
-
-            let result = Arc::new(pcache);
-
-            pcaches.insert(timelineid, result.clone());
-
-            if conf.gc_horizon != 0 {
-                let conf_copy = conf.clone();
-                let _gc_thread = thread::Builder::new()
-                    .name("Garbage collection thread".into())
-                    .spawn(move || {
-                        gc_thread_main(&conf_copy, timelineid);
-                    })
-                    .unwrap();
-            }
-            Ok(result)
-        }
-    }
-}
-
-fn gc_thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
-    info!("Garbage collection thread started {}", timelineid);
-    let pcache = get_pagecache(conf, timelineid).unwrap();
-
-    pcache.do_gc(conf).unwrap();
-}
-
-fn open_rocksdb(_conf: &PageServerConf, timelineid: ZTimelineId) -> rocksdb::DB {
-    let path = zenith_repo_dir().join(timelineid.to_string());
-    let mut opts = rocksdb::Options::default();
-    opts.create_if_missing(true);
-    opts.set_use_fsync(true);
-    opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
-    opts.set_compaction_filter("ttl", move |_level: u32, _key: &[u8], val: &[u8]| {
-        if (val[0] & UNUSED_VERSION_FLAG) != 0 {
-            rocksdb::compaction_filter::Decision::Remove
-        } else {
-            rocksdb::compaction_filter::Decision::Keep
-        }
-    });
-    rocksdb::DB::open(&opts, &path).unwrap()
-}
-
-fn init_page_cache(conf: &PageServerConf, timelineid: ZTimelineId) -> PageCache {
-    PageCache {
-        db: open_rocksdb(&conf, timelineid),
-
-        walredo_mgr: WalRedoManager::new(conf, timelineid),
-
-        first_valid_lsn: AtomicLsn::new(0),
-        last_valid_lsn: SeqWait::new(Lsn(0)),
-        last_record_lsn: AtomicLsn::new(0),
-
-        num_entries: AtomicU64::new(0),
-        num_page_images: AtomicU64::new(0),
-        num_wal_records: AtomicU64::new(0),
-        num_getpage_requests: AtomicU64::new(0),
-    }
-}
-
-//
-// We store two kinds of entries in the page cache:
-//
-// 1. Ready-made images of the block
-// 2. WAL records, to be applied on top of the "previous" entry
-//
-// Some WAL records will initialize the page from scratch. For such records,
-// the 'will_init' flag is set. They don't need the previous page image before
-// applying. The 'will_init' flag is set for records containing a full-page image,
-// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
-// stored directly in the cache entry in that you still need to run the WAL redo
-// routine to generate the page image.
-//
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
-pub struct CacheKey {
-    pub tag: BufferTag,
-    pub lsn: Lsn,
-}
-
-impl CacheKey {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        self.tag.pack(buf);
-        buf.put_u64(self.lsn.0);
-    }
-    pub fn unpack(buf: &mut BytesMut) -> CacheKey {
-        CacheKey {
-            tag: BufferTag::unpack(buf),
-            lsn: Lsn::from(buf.get_u64()),
-        }
-    }
-}
-
-pub struct CacheEntryContent {
-    pub page_image: Option<Bytes>,
-    pub wal_record: Option<WALRecord>,
-}
-
-const PAGE_IMAGE_FLAG: u8 = 1u8;
-const UNUSED_VERSION_FLAG: u8 = 2u8;
-
-impl CacheEntryContent {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        if let Some(image) = &self.page_image {
-            buf.put_u8(PAGE_IMAGE_FLAG);
-            buf.put_u16(image.len() as u16);
-            buf.put_slice(&image[..]);
-        } else if let Some(rec) = &self.wal_record {
-            buf.put_u8(0);
-            rec.pack(buf);
-        }
-    }
-    pub fn unpack(buf: &mut BytesMut) -> CacheEntryContent {
-        if (buf.get_u8() & PAGE_IMAGE_FLAG) != 0 {
-            let mut dst = vec![0u8; buf.get_u16() as usize];
-            buf.copy_to_slice(&mut dst);
-            CacheEntryContent {
-                page_image: Some(Bytes::from(dst)),
-                wal_record: None,
-            }
-        } else {
-            CacheEntryContent {
-                page_image: None,
-                wal_record: Some(WALRecord::unpack(buf)),
-            }
-        }
-    }
-}
-
-#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
-pub struct RelTag {
-    pub spcnode: u32,
-    pub dbnode: u32,
-    pub relnode: u32,
-    pub forknum: u8,
-}
-
-impl RelTag {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        buf.put_u32(self.spcnode);
-        buf.put_u32(self.dbnode);
-        buf.put_u32(self.relnode);
-        buf.put_u32(self.forknum as u32); // encode forknum as u32 to provide compatibility with wal_redo_postgres
-    }
-    pub fn unpack(buf: &mut BytesMut) -> RelTag {
-        RelTag {
-            spcnode: buf.get_u32(),
-            dbnode: buf.get_u32(),
-            relnode: buf.get_u32(),
-            forknum: buf.get_u32() as u8,
-        }
-    }
-}
-
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
-pub struct BufferTag {
-    pub rel: RelTag,
-    pub blknum: u32,
-}
-
-impl BufferTag {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        self.rel.pack(buf);
-        buf.put_u32(self.blknum);
-    }
-    pub fn unpack(buf: &mut BytesMut) -> BufferTag {
-        BufferTag {
-            rel: RelTag::unpack(buf),
-            blknum: buf.get_u32(),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct WALRecord {
-    pub lsn: Lsn, // LSN at the *end* of the record
-    pub will_init: bool,
-    pub truncate: bool,
-    pub rec: Bytes,
-    // Remember the offset of main_data in rec,
-    // so that we don't have to parse the record again.
-    // If record has no main_data, this offset equals rec.len().
-    pub main_data_offset: u32,
-}
-
-impl WALRecord {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        buf.put_u64(self.lsn.0);
-        buf.put_u8(self.will_init as u8);
-        buf.put_u8(self.truncate as u8);
-        buf.put_u32(self.main_data_offset);
-        buf.put_u32(self.rec.len() as u32);
-        buf.put_slice(&self.rec[..]);
-    }
-    pub fn unpack(buf: &mut BytesMut) -> WALRecord {
-        let lsn = Lsn::from(buf.get_u64());
-        let will_init = buf.get_u8() != 0;
-        let truncate = buf.get_u8() != 0;
-        let main_data_offset = buf.get_u32();
-        let mut dst = vec![0u8; buf.get_u32() as usize];
-        buf.copy_to_slice(&mut dst);
-        WALRecord {
-            lsn,
-            will_init,
-            truncate,
-            rec: Bytes::from(dst),
-            main_data_offset,
-        }
-    }
-}
-
-impl PageCache {
-    // Public GET interface functions
-
-    ///
-    /// GetPage@LSN
-    ///
-    /// Returns an 8k page image
-    ///
-    pub fn get_page_at_lsn(&self, tag: BufferTag, req_lsn: Lsn) -> anyhow::Result<Bytes> {
-        self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
-
-        let lsn = self.wait_lsn(req_lsn)?;
-
-        // Look up cache entry. If it's a page image, return that. If it's a WAL record,
-        // ask the WAL redo service to reconstruct the page image from the WAL records.
-        let key = CacheKey { tag, lsn };
-
-        let mut buf = BytesMut::new();
-        key.pack(&mut buf);
-        let mut iter = self.db.raw_iterator();
-        iter.seek_for_prev(&buf[..]);
-
-        if iter.valid() {
-            let k = iter.key().unwrap();
-            buf.clear();
-            buf.extend_from_slice(&k);
-            let key = CacheKey::unpack(&mut buf);
-            if key.tag == tag {
-                let v = iter.value().unwrap();
-                buf.clear();
-                buf.extend_from_slice(&v);
-                let content = CacheEntryContent::unpack(&mut buf);
-                let page_img: Bytes;
-                if let Some(img) = &content.page_image {
-                    page_img = img.clone();
-                } else if content.wal_record.is_some() {
-                    // Request the WAL redo manager to apply the WAL records for us.
-                    let (base_img, records) = self.collect_records_for_apply(tag, lsn);
-                    page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
-
-                    self.put_page_image(tag, lsn, page_img.clone());
-                } else {
-                    // No base image, and no WAL record. Huh?
-                    bail!("no page image or WAL record for requested page");
-                }
-                // FIXME: assumes little-endian. Only used for the debugging log though
-                let page_lsn_hi =
-                    u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
-                let page_lsn_lo =
-                    u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
-                debug!(
-                    "Returning page with LSN {:X}/{:X} for {}/{}/{}.{} blk {}",
-                    page_lsn_hi,
-                    page_lsn_lo,
-                    tag.rel.spcnode,
-                    tag.rel.dbnode,
-                    tag.rel.relnode,
-                    tag.rel.forknum,
-                    tag.blknum
-                );
-                return Ok(page_img);
-            }
-        }
-        static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
-        debug!("Page {:?} at {}({}) not found", tag, req_lsn, lsn);
-        Ok(Bytes::from_static(&ZERO_PAGE))
-        /* return Err("could not find page image")?; */
-    }
-
-    ///
-    /// Get size of relation at given LSN.
-    ///
-    pub fn relsize_get(&self, rel: &RelTag, lsn: Lsn) -> anyhow::Result<u32> {
-        self.wait_lsn(lsn)?;
-        self.relsize_get_nowait(rel, lsn)
-    }
-
-    ///
-    /// Does relation exist at given LSN?
-    ///
-    pub fn relsize_exist(&self, rel: &RelTag, req_lsn: Lsn) -> anyhow::Result<bool> {
-        let lsn = self.wait_lsn(req_lsn)?;
-
-        let key = CacheKey {
-            tag: BufferTag {
-                rel: *rel,
-                blknum: u32::MAX,
-            },
-            lsn,
-        };
-        let mut buf = BytesMut::new();
-        key.pack(&mut buf);
-        let mut iter = self.db.raw_iterator();
-        iter.seek_for_prev(&buf[..]);
-        if iter.valid() {
-            let k = iter.key().unwrap();
-            buf.clear();
-            buf.extend_from_slice(&k);
-            let tag = BufferTag::unpack(&mut buf);
-            if tag.rel == *rel {
-                debug!("Relation {:?} exists at {}", rel, lsn);
-                return Ok(true);
-            }
-        }
-        debug!("Relation {:?} doesn't exist at {}", rel, lsn);
-        Ok(false)
-    }
-
-    // Other public functions, for updating the page cache.
-    // These are used by the WAL receiver and WAL redo.
-
-    ///
-    /// Collect all the WAL records that are needed to reconstruct a page
-    /// image for the given cache entry.
-    ///
-    /// Returns an old page image (if any), and a vector of WAL records to apply
-    /// over it.
-    ///
-    pub fn collect_records_for_apply(
-        &self,
-        tag: BufferTag,
-        lsn: Lsn,
-    ) -> (Option<Bytes>, Vec<WALRecord>) {
-        let mut buf = BytesMut::new();
-        let key = CacheKey { tag, lsn };
-        key.pack(&mut buf);
-
-        let mut base_img: Option<Bytes> = None;
-        let mut records: Vec<WALRecord> = Vec::new();
-
-        let mut iter = self.db.raw_iterator();
-        iter.seek_for_prev(&buf[..]);
-
-        // Scan backwards, collecting the WAL records, until we hit an
-        // old page image.
-        while iter.valid() {
-            let k = iter.key().unwrap();
-            buf.clear();
-            buf.extend_from_slice(&k);
-            let key = CacheKey::unpack(&mut buf);
-            if key.tag != tag {
-                break;
-            }
-            let v = iter.value().unwrap();
-            buf.clear();
-            buf.extend_from_slice(&v);
-            let content = CacheEntryContent::unpack(&mut buf);
-            if let Some(img) = &content.page_image {
-                // We have a base image. No need to dig deeper into the list of
-                // records
-                base_img = Some(img.clone());
-                break;
-            } else if let Some(rec) = &content.wal_record {
-                records.push(rec.clone());
-                // If this WAL record initializes the page, no need to dig deeper.
-                if rec.will_init {
-                    break;
-                }
-            } else {
-                panic!("no base image and no WAL record on cache entry");
-            }
-            iter.prev();
-        }
-        records.reverse();
-        (base_img, records)
-    }
-
-    ///
-    /// Adds a WAL record to the page cache
-    ///
-    pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
-        let lsn = rec.lsn;
-        let key = CacheKey { tag, lsn };
-
-        let content = CacheEntryContent {
-            page_image: None,
-            wal_record: Some(rec),
-        };
-
-        let mut key_buf = BytesMut::new();
-        key.pack(&mut key_buf);
-        let mut val_buf = BytesMut::new();
-        content.pack(&mut val_buf);
-
-        let _res = self.db.put(&key_buf[..], &val_buf[..]);
-        //trace!("put_wal_record lsn: {}", lsn);
-
-        self.num_entries.fetch_add(1, Ordering::Relaxed);
-        self.num_wal_records.fetch_add(1, Ordering::Relaxed);
-    }
-
-    ///
-    /// Adds a relation-wide WAL record (like truncate) to the page cache,
-    /// associating it with all pages started with specified block number
-    ///
-    pub fn put_rel_wal_record(&self, tag: BufferTag, rec: WALRecord) -> anyhow::Result<()> {
-        let mut key = CacheKey { tag, lsn: rec.lsn };
-
-        // What was the size of the relation before this record?
-        let last_lsn = self.last_valid_lsn.load();
-        let old_rel_size = self.relsize_get_nowait(&tag.rel, last_lsn)?;
-
-        let content = CacheEntryContent {
-            page_image: None,
-            wal_record: Some(rec),
-        };
-        // set new relation size
-        trace!("Truncate relation {:?}", tag);
-        let mut key_buf = BytesMut::new();
-        let mut val_buf = BytesMut::new();
-        content.pack(&mut val_buf);
-
-        for blknum in tag.blknum..old_rel_size {
-            key_buf.clear();
-            key.tag.blknum = blknum;
-            key.pack(&mut key_buf);
-            trace!("put_wal_record lsn: {}", key.lsn);
-            let _res = self.db.put(&key_buf[..], &val_buf[..]);
-        }
-        let n = (old_rel_size - tag.blknum) as u64;
-        self.num_entries.fetch_add(n, Ordering::Relaxed);
-        self.num_wal_records.fetch_add(n, Ordering::Relaxed);
-        Ok(())
-    }
-
-    ///
-    /// Memorize a full image of a page version
-    ///
-    pub fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes) {
-        let key = CacheKey { tag, lsn };
-        let content = CacheEntryContent {
-            page_image: Some(img),
-            wal_record: None,
-        };
-
-        let mut key_buf = BytesMut::new();
-        key.pack(&mut key_buf);
-        let mut val_buf = BytesMut::new();
-        content.pack(&mut val_buf);
-
-        trace!("put_wal_record lsn: {}", key.lsn);
-        let _res = self.db.put(&key_buf[..], &val_buf[..]);
-
-        //debug!("inserted page image for {}/{}/{}_{} blk {} at {}",
-        //        tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn);
-        self.num_page_images.fetch_add(1, Ordering::Relaxed);
-    }
-
-    pub fn create_database(
-        &self,
-        lsn: Lsn,
-        db_id: Oid,
-        tablespace_id: Oid,
-        src_db_id: Oid,
-        src_tablespace_id: Oid,
-    ) -> anyhow::Result<()> {
-        let mut buf = BytesMut::new();
-        let key = CacheKey {
-            tag: BufferTag {
-                rel: RelTag {
-                    spcnode: src_tablespace_id,
-                    dbnode: src_db_id,
-                    relnode: 0,
-                    forknum: 0u8,
-                },
-                blknum: 0,
-            },
-            lsn: Lsn(0),
-        };
-        key.pack(&mut buf);
-        let mut iter = self.db.raw_iterator();
-        iter.seek(&buf[..]);
-        let mut n = 0;
-        while iter.valid() {
-            let k = iter.key().unwrap();
-            let v = iter.value().unwrap();
-            buf.clear();
-            buf.extend_from_slice(&k);
-            let mut key = CacheKey::unpack(&mut buf);
-            if key.tag.rel.spcnode != src_tablespace_id || key.tag.rel.dbnode != src_db_id {
-                break;
-            }
-            key.tag.rel.spcnode = tablespace_id;
-            key.tag.rel.dbnode = db_id;
-            key.lsn = lsn;
-            buf.clear();
-            key.pack(&mut buf);
-
-            self.db.put(&buf[..], v)?;
-            n += 1;
-            iter.next();
-        }
-        info!(
-            "Create database {}/{}, copy {} entries",
-            tablespace_id, db_id, n
-        );
-        Ok(())
-    }
-
-    /// Remember that WAL has been received and added to the page cache up to the given LSN
-    pub fn advance_last_valid_lsn(&self, lsn: Lsn) {
-        let old = self.last_valid_lsn.advance(lsn);
-
-        // Can't move backwards.
-        if lsn < old {
-            warn!(
-                "attempted to move last valid LSN backwards (was {}, new {})",
-                old, lsn
-            );
-        }
-    }
-
-    ///
-    /// Remember the (end of) last valid WAL record remembered in the page cache.
-    ///
-    /// NOTE: this updates last_valid_lsn as well.
-    ///
-    pub fn advance_last_record_lsn(&self, lsn: Lsn) {
-        // Can't move backwards.
-        let old = self.last_record_lsn.fetch_max(lsn);
-        assert!(old <= lsn);
-
-        // Also advance last_valid_lsn
-        let old = self.last_valid_lsn.advance(lsn);
-        // Can't move backwards.
-        if lsn < old {
-            warn!(
-                "attempted to move last record LSN backwards (was {}, new {})",
-                old, lsn
-            );
-        }
-    }
-
-    ///
-    /// Remember the beginning of valid WAL.
-    ///
-    /// TODO: This should be called by garbage collection, so that if an older
-    /// page is requested, we will return an error to the requestor.
-    pub fn _advance_first_valid_lsn(&self, lsn: Lsn) {
-        // Can't overtake last_valid_lsn (except when we're
-        // initializing the system and last_valid_lsn hasn't been set yet.
-        let last_valid_lsn = self.last_valid_lsn.load();
-        assert!(last_valid_lsn == Lsn(0) || lsn < last_valid_lsn);
-
-        let old = self.first_valid_lsn.fetch_max(lsn);
-        // Can't move backwards.
-        assert!(lsn >= old);
-    }
-
-    pub fn init_valid_lsn(&self, lsn: Lsn) {
-        let old = self.last_valid_lsn.advance(lsn);
-        assert!(old == Lsn(0));
-        let old = self.last_record_lsn.fetch_max(lsn);
-        assert!(old == Lsn(0));
-        let old = self.first_valid_lsn.fetch_max(lsn);
-        assert!(old == Lsn(0));
-    }
-
-    pub fn get_last_valid_lsn(&self) -> Lsn {
-        self.last_valid_lsn.load()
-    }
-
-    //
-    // Get statistics to be displayed in the user interface.
-    //
-    pub fn get_stats(&self) -> PageCacheStats {
-        PageCacheStats {
-            num_entries: self.num_entries.load(Ordering::Relaxed),
-            num_page_images: self.num_page_images.load(Ordering::Relaxed),
-            num_wal_records: self.num_wal_records.load(Ordering::Relaxed),
-            num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed),
-        }
-    }
-
-    // Internal functions
-
-    //
-    // Internal function to get relation size at given LSN.
-    //
-    // The caller must ensure that WAL has been received up to 'lsn'.
-    //
-    fn relsize_get_nowait(&self, rel: &RelTag, lsn: Lsn) -> anyhow::Result<u32> {
-        assert!(lsn <= self.last_valid_lsn.load());
-
-        let mut key = CacheKey {
-            tag: BufferTag {
-                rel: *rel,
-                blknum: u32::MAX,
-            },
-            lsn,
-        };
-        let mut buf = BytesMut::new();
-        let mut iter = self.db.raw_iterator();
-
-        loop {
-            buf.clear();
-            key.pack(&mut buf);
-            iter.seek_for_prev(&buf[..]);
-            if iter.valid() {
-                let k = iter.key().unwrap();
-                let v = iter.value().unwrap();
-                buf.clear();
-                buf.extend_from_slice(&k);
-                let tag = BufferTag::unpack(&mut buf);
-                if tag.rel == *rel {
-                    buf.clear();
-                    buf.extend_from_slice(&v);
-                    let content = CacheEntryContent::unpack(&mut buf);
-                    if let Some(rec) = &content.wal_record {
-                        if rec.truncate {
-                            if tag.blknum > 0 {
-                                key.tag.blknum = tag.blknum - 1;
-                                continue;
-                            }
-                            break;
-                        }
-                    }
-                    let relsize = tag.blknum + 1;
-                    debug!("Size of relation {:?} at {} is {}", rel, lsn, relsize);
-                    return Ok(relsize);
-                }
-            }
-            break;
-        }
-        debug!("Size of relation {:?} at {} is zero", rel, lsn);
-        Ok(0)
-    }
-
-    fn do_gc(&self, conf: &PageServerConf) -> anyhow::Result<Bytes> {
-        let mut buf = BytesMut::new();
-        loop {
-            thread::sleep(conf.gc_period);
-            let last_lsn = self.get_last_valid_lsn();
-
-            // checked_sub() returns None on overflow.
-            if let Some(horizon) = last_lsn.checked_sub(conf.gc_horizon) {
-                let mut maxkey = CacheKey {
-                    tag: BufferTag {
-                        rel: RelTag {
-                            spcnode: u32::MAX,
-                            dbnode: u32::MAX,
-                            relnode: u32::MAX,
-                            forknum: u8::MAX,
-                        },
-                        blknum: u32::MAX,
-                    },
-                    lsn: Lsn::MAX,
-                };
-                let now = Instant::now();
-                let mut reconstructed = 0u64;
-                let mut truncated = 0u64;
-                let mut inspected = 0u64;
-                let mut deleted = 0u64;
-                loop {
-                    buf.clear();
-                    maxkey.pack(&mut buf);
-                    let mut iter = self.db.raw_iterator();
-                    iter.seek_for_prev(&buf[..]);
-                    if iter.valid() {
-                        let k = iter.key().unwrap();
-                        let v = iter.value().unwrap();
-
-                        inspected += 1;
-
-                        buf.clear();
-                        buf.extend_from_slice(&k);
-                        let key = CacheKey::unpack(&mut buf);
-
-                        // Construct boundaries for old records cleanup
-                        maxkey.tag = key.tag;
-                        let last_lsn = key.lsn;
-                        maxkey.lsn = min(horizon, last_lsn); // do not remove last version
-
-                        let mut minkey = maxkey.clone();
-                        minkey.lsn = Lsn(0); // first version
-
-                        // reconstruct most recent page version
-                        if (v[0] & PAGE_IMAGE_FLAG) == 0 {
-                            trace!("Reconstruct most recent page {:?}", key);
-                            // force reconstruction of most recent page version
-                            let (base_img, records) =
-                                self.collect_records_for_apply(key.tag, key.lsn);
-                            let new_img = self
-                                .walredo_mgr
-                                .request_redo(key.tag, key.lsn, base_img, records)?;
-
-                            self.put_page_image(key.tag, key.lsn, new_img.clone());
-
-                            reconstructed += 1;
-                        }
-
-                        buf.clear();
-                        maxkey.pack(&mut buf);
-
-                        iter.seek_for_prev(&buf[..]);
-                        if iter.valid() {
-                            // do not remove last version
-                            if last_lsn > horizon {
-                                // locate most recent record before horizon
-                                let k = iter.key().unwrap();
-                                buf.clear();
-                                buf.extend_from_slice(&k);
-                                let key = CacheKey::unpack(&mut buf);
-                                if key.tag == maxkey.tag {
-                                    let v = iter.value().unwrap();
-                                    if (v[0] & PAGE_IMAGE_FLAG) == 0 {
-                                        trace!("Reconstruct horizon page {:?}", key);
-                                        let (base_img, records) =
-                                            self.collect_records_for_apply(key.tag, key.lsn);
-                                        let new_img = self
-                                            .walredo_mgr
-                                            .request_redo(key.tag, key.lsn, base_img, records)?;
-                                        self.put_page_image(key.tag, key.lsn, new_img.clone());
-
-                                        truncated += 1;
-                                    }
-                                }
-                            }
-                            // remove records prior to horizon
-                            loop {
-                                iter.prev();
-                                if !iter.valid() {
-                                    break;
-                                }
-                                let k = iter.key().unwrap();
-                                buf.clear();
-                                buf.extend_from_slice(&k);
-                                let key = CacheKey::unpack(&mut buf);
-                                if key.tag != maxkey.tag {
-                                    break;
-                                }
-                                let v = iter.value().unwrap();
-                                if (v[0] & UNUSED_VERSION_FLAG) == 0 {
-                                    let mut v = v.to_owned();
-                                    v[0] |= UNUSED_VERSION_FLAG;
-                                    self.db.put(k, &v[..])?;
-                                    deleted += 1;
-                                } else {
-                                    break;
-                                }
-                            }
-                        }
-                        maxkey = minkey;
-                    } else {
-                        break;
-                    }
-                }
-                info!("Garbage collection completed in {:?}:\n{} version chains inspected, {} pages reconstructed, {} version histories truncated, {} versions deleted",
-					  now.elapsed(), inspected, reconstructed, truncated, deleted);
-            }
-        }
-    }
-
-    //
-    // Wait until WAL has been received up to the given LSN.
-    //
-    fn wait_lsn(&self, mut lsn: Lsn) -> anyhow::Result<Lsn> {
-        // When invalid LSN is requested, it means "don't wait, return latest version of the page"
-        // This is necessary for bootstrap.
-        if lsn == Lsn(0) {
-            let last_valid_lsn = self.last_valid_lsn.load();
-            trace!(
-                "walreceiver doesn't work yet last_valid_lsn {}, requested {}",
-                last_valid_lsn,
-                lsn
-            );
-            lsn = last_valid_lsn;
-        }
-
-        self.last_valid_lsn
-            .wait_for_timeout(lsn, TIMEOUT)
-            .with_context(|| {
-                format!(
-                    "Timed out while waiting for WAL record at LSN {} to arrive",
-                    lsn
-                )
-            })?;
-
-        Ok(lsn)
-    }
-}
-
-//
-// Get statistics to be displayed in the user interface.
-//
-// This combines the stats from all PageCache instances
-//
-pub fn get_stats() -> PageCacheStats {
-    let pcaches = PAGECACHES.lock().unwrap();
-
-    let mut stats = PageCacheStats {
-        num_entries: 0,
-        num_page_images: 0,
-        num_wal_records: 0,
-        num_getpage_requests: 0,
-    };
-
-    pcaches.iter().for_each(|(_sys_id, pcache)| {
-        stats += pcache.get_stats();
-    });
-    stats
+pub fn get_repository() -> Arc<dyn Repository + Send + Sync> {
+    let guard = REPOSITORY.lock().unwrap(); // panics if the REPOSITORY mutex is poisoned
+    guard.as_ref().map(Arc::clone).unwrap() // panics if REPOSITORY currently holds None
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,6 +10,7 @@
 //     *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
 //

+use anyhow::{anyhow, bail};
 use byteorder::{ReadBytesExt, WriteBytesExt, BE};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use log::*;
@@ -19,18 +20,17 @@ use std::io::{BufReader, BufWriter, Read, Write};
 use std::net::{TcpListener, TcpStream};
 use std::str::FromStr;
 use std::thread;
-use std::time::Duration;
 use zenith_utils::lsn::Lsn;

 use crate::basebackup;
+use crate::branches;
 use crate::page_cache;
+use crate::repository::{BufferTag, RelTag};
 use crate::restore_local_repo;
 use crate::walreceiver;
 use crate::PageServerConf;
 use crate::ZTimelineId;

-type Result<T> = std::result::Result<T, io::Error>;
-
 #[derive(Debug)]
 enum FeMessage {
    StartupMessage(FeStartupMessage),
@@ -42,18 +42,13 @@ enum FeMessage {
    Close(FeCloseMessage),
    Sync,
    Terminate,
-
-    //
-    // All that messages are actually CopyData from libpq point of view.
-    //
-    ZenithExistsRequest(ZenithRequest),
-    ZenithNblocksRequest(ZenithRequest),
-    ZenithReadRequest(ZenithRequest),
+    CopyData(Bytes),
 }

 #[derive(Debug)]
 enum BeMessage {
    AuthenticationOk,
+    ParameterStatus,
    ReadyForQuery,
    RowDescription,
    ParseComplete,
@@ -61,20 +56,31 @@ enum BeMessage {
    NoData,
    BindComplete,
    CloseComplete,
-    DataRow,
+    DataRow(Bytes),
    CommandComplete,
    ControlFile,
-
-    //
-    // All that messages are actually CopyData from libpq point of view.
-    //
-    ZenithStatusResponse(ZenithStatusResponse),
-    ZenithNblocksResponse(ZenithStatusResponse),
-    ZenithReadResponse(ZenithReadResponse),
+    CopyData(Bytes),
+    ErrorResponse(String),
 }

+// Page-stream request messages; each one is wrapped in a libpq CopyData message.
+enum PagestreamFeMessage {
+    Exists(PagestreamRequest),
+    Nblocks(PagestreamRequest),
+    Read(PagestreamRequest),
+}
+
+// Page-stream response messages; each one is wrapped in a libpq CopyData message.
+enum PagestreamBeMessage {
+    Status(PagestreamStatusResponse),
+    Nblocks(PagestreamStatusResponse),
+    Read(PagestreamReadResponse),
+}
+
+static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(Bytes::from_static(b"hello world"));
+
 #[derive(Debug)]
-struct ZenithRequest {
+struct PagestreamRequest {
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
@@ -84,13 +90,13 @@ struct ZenithRequest {
 }

 #[derive(Debug)]
-struct ZenithStatusResponse {
+struct PagestreamStatusResponse {
    ok: bool,
    n_blocks: u32,
 }

 #[derive(Debug)]
-struct ZenithReadResponse {
+struct PagestreamReadResponse {
    ok: bool,
    n_blocks: u32,
    page: Bytes,
@@ -111,7 +117,7 @@ enum StartupRequestCode {
 }

 impl FeStartupMessage {
-    pub fn read(stream: &mut dyn std::io::Read) -> Result<Option<FeMessage>> {
+    pub fn read(stream: &mut dyn std::io::Read) -> anyhow::Result<Option<FeMessage>> {
        const MAX_STARTUP_PACKET_LENGTH: u32 = 10000;
        const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678;
        const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679;
@@ -123,19 +129,12 @@ impl FeStartupMessage {
        // in the log if the client opens connection but closes it immediately.
        let len = match stream.read_u32::<BE>() {
            Ok(len) => len,
-            Err(err) => {
-                if err.kind() == std::io::ErrorKind::UnexpectedEof {
-                    return Ok(None);
-                } else {
-                    return Err(err);
-                }
-            }
+            Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
+            Err(e) => return Err(e.into()),
        };
-        if len < 4 || len as u32 > MAX_STARTUP_PACKET_LENGTH {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidData,
-                "invalid message length",
-            ));
+
+        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
+            bail!("invalid message length");
        }
        let bodylen = len - 4;

@@ -180,15 +179,12 @@ struct FeParseMessage {
    query_string: Bytes,
 }

-fn read_null_terminated(buf: &mut Bytes) -> Result<Bytes> {
+fn read_null_terminated(buf: &mut Bytes) -> anyhow::Result<Bytes> {
    let mut result = BytesMut::new();

    loop {
        if !buf.has_remaining() {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "no null-terminator in string",
-            ));
+            bail!("no null-terminator in string");
        }

        let byte = buf.get_u8();
@@ -202,7 +198,7 @@ fn read_null_terminated(buf: &mut Bytes) -> Result<Bytes> {
 }

 impl FeParseMessage {
-    pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
+    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
        let _pstmt_name = read_null_terminated(&mut buf)?;
        let query_string = read_null_terminated(&mut buf)?;
        let nparams = buf.get_i16();
@@ -221,10 +217,7 @@ impl FeParseMessage {
         */

        if nparams != 0 {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "query params not implemented",
-            ));
+            bail!("query params not implemented");
        }

        Ok(FeMessage::Parse(FeParseMessage { query_string }))
@@ -238,7 +231,7 @@ struct FeDescribeMessage {
 }

 impl FeDescribeMessage {
-    pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
+    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
        let kind = buf.get_u8();
        let _pstmt_name = read_null_terminated(&mut buf)?;

@@ -253,10 +246,7 @@ impl FeDescribeMessage {
        */

        if kind != b'S' {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "only prepared statmement Describe is implemented",
-            ));
+            bail!("only prepared statmement Describe is implemented");
        }

        Ok(FeMessage::Describe(FeDescribeMessage { kind }))
@@ -271,22 +261,16 @@ struct FeExecuteMessage {
 }

 impl FeExecuteMessage {
-    pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
+    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
        let portal_name = read_null_terminated(&mut buf)?;
        let maxrows = buf.get_i32();

        if !portal_name.is_empty() {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "named portals not implemented",
-            ));
+            bail!("named portals not implemented");
        }

        if maxrows != 0 {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "row limit in Execute message not supported",
-            ));
+            bail!("row limit in Execute message not supported");
        }

        Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
@@ -298,15 +282,12 @@ impl FeExecuteMessage {
 struct FeBindMessage {}

 impl FeBindMessage {
-    pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
+    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
        let portal_name = read_null_terminated(&mut buf)?;
        let _pstmt_name = read_null_terminated(&mut buf)?;

        if !portal_name.is_empty() {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "named portals not implemented",
-            ));
+            bail!("named portals not implemented");
        }

        // FIXME: see FeParseMessage::parse
@@ -328,7 +309,7 @@ impl FeBindMessage {
 struct FeCloseMessage {}

 impl FeCloseMessage {
-    pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
+    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
        let _kind = buf.get_u8();
        let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;

@@ -339,28 +320,20 @@ impl FeCloseMessage {
 }

 impl FeMessage {
-    pub fn read(stream: &mut dyn Read) -> Result<Option<FeMessage>> {
+    pub fn read(stream: &mut dyn Read) -> anyhow::Result<Option<FeMessage>> {
        // Each libpq message begins with a message type byte, followed by message length
        // If the client closes the connection, return None. But if the client closes the
        // connection in the middle of a message, we will return an error.
        let tag = match stream.read_u8() {
            Ok(b) => b,
-            Err(err) => {
-                if err.kind() == std::io::ErrorKind::UnexpectedEof {
-                    return Ok(None);
-                } else {
-                    return Err(err);
-                }
-            }
+            Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
+            Err(e) => return Err(e.into()),
        };
        let len = stream.read_u32::<BE>()?;

        // The message length includes itself, so it better be at least 4
        if len < 4 {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "invalid message length: parsing u32",
-            ));
+            bail!("invalid message length: parsing u32");
        }
        let bodylen = len - 4;

@@ -368,7 +341,7 @@ impl FeMessage {
        let mut body_buf: Vec<u8> = vec![0; bodylen as usize];
        stream.read_exact(&mut body_buf)?;

-        let mut body = Bytes::from(body_buf);
+        let body = Bytes::from(body_buf);

        // Parse it
        match tag {
@@ -380,37 +353,70 @@ impl FeMessage {
            b'C' => Ok(Some(FeCloseMessage::parse(body)?)),
            b'S' => Ok(Some(FeMessage::Sync)),
            b'X' => Ok(Some(FeMessage::Terminate)),
-            b'd' => {
-                let smgr_tag = body.get_u8();
-                let zreq = ZenithRequest {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
-                    blkno: body.get_u32(),
-                    lsn: Lsn::from(body.get_u64()),
-                };
+            b'd' => Ok(Some(FeMessage::CopyData(body))),
+            tag => Err(anyhow!("unknown message tag: {},'{:?}'", tag, body)),
+        }
+    }
+}

-                // TODO: consider using protobuf or serde bincode for less error prone
-                // serialization.
-                match smgr_tag {
-                    0 => Ok(Some(FeMessage::ZenithExistsRequest(zreq))),
-                    1 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))),
-                    2 => Ok(Some(FeMessage::ZenithReadRequest(zreq))),
-                    _ => Err(io::Error::new(
-                        io::ErrorKind::InvalidInput,
-                        format!("unknown smgr message tag: {},'{:?}'", smgr_tag, body),
-                    )),
-                }
-            }
-            tag => Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                format!("unknown message tag: {},'{:?}'", tag, body),
+impl PagestreamFeMessage {
+    /// Decode one request: tag(u8), spcnode/dbnode/relnode(u32), forknum(u8), blkno(u32), lsn(u64).
+    fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
+        // The fixed fields total 1+4+4+4+1+4+8 = 26 bytes; `get_*` panics on short input.
+        anyhow::ensure!(body.remaining() >= 26, "pagestream request too short");
+        let smgr_tag = body.get_u8();
+        let zreq = PagestreamRequest {
+            spcnode: body.get_u32(),
+            dbnode: body.get_u32(),
+            relnode: body.get_u32(),
+            forknum: body.get_u8(),
+            blkno: body.get_u32(),
+            lsn: Lsn::from(body.get_u64()),
+        };
+        // TODO: consider using protobuf or serde bincode for less error prone
+        // serialization.
+        match smgr_tag {
+            0 => Ok(PagestreamFeMessage::Exists(zreq)),
+            1 => Ok(PagestreamFeMessage::Nblocks(zreq)),
+            2 => Ok(PagestreamFeMessage::Read(zreq)),
+            _ => Err(anyhow!(
+                "unknown smgr message tag: {},'{:?}'",
+                smgr_tag,
+                body
             )),
         }
     }
 }

+impl PagestreamBeMessage {
+    /// Encode this response into its byte representation.
+    ///
+    /// Layout: a one-byte tag, the `ok` flag as one byte, `n_blocks` written
+    /// with `put_u32`, and (for `Read` only) the page contents.
+    fn serialize(&self) -> Bytes {
+        // Every variant shares the same fixed header; only `Read` carries a
+        // page. Tag values are taken from pagestore_client.h.
+        let (tag, ok, n_blocks, page) = match self {
+            Self::Status(status) => (100, status.ok, status.n_blocks, None),
+            Self::Nblocks(size) => (101, size.ok, size.n_blocks, None),
+            Self::Read(read) => (102, read.ok, read.n_blocks, Some(&read.page)),
+        };
+
+        // Emit the header fields in order.
+        let mut out = BytesMut::new();
+        out.put_u8(tag);
+        out.put_u8(ok as u8);
+        out.put_u32(n_blocks);
+
+        // The page image, when present, follows the header.
+        if let Some(page) = page {
+            out.put(&page[..]);
+        }
+
+        out.freeze()
+    }
+}
+
 ///////////////////////////////////////////////////////////////////////////////

 ///
@@ -418,16 +424,12 @@ impl FeMessage {
 ///
 /// Listens for connections, and launches a new handler thread for each.
 ///
-pub fn thread_main(conf: &PageServerConf) {
-    info!("Starting page server on {}", conf.listen_addr);
-
-    let listener = TcpListener::bind(conf.listen_addr).unwrap();
-
+pub fn thread_main(conf: &'static PageServerConf, listener: TcpListener) -> anyhow::Result<()> {
    loop {
-        let (socket, peer_addr) = listener.accept().unwrap();
+        let (socket, peer_addr) = listener.accept()?;
        debug!("accepted connection from {}", peer_addr);
        socket.set_nodelay(true).unwrap();
-        let mut conn_handler = Connection::new(conf.clone(), socket);
+        let mut conn_handler = Connection::new(conf, socket);

        thread::spawn(move || {
            if let Err(err) = conn_handler.run() {
@@ -441,17 +443,15 @@ pub fn thread_main(conf: &PageServerConf) {
 struct Connection {
    stream_in: BufReader<TcpStream>,
    stream: BufWriter<TcpStream>,
-    buffer: BytesMut,
    init_done: bool,
-    conf: PageServerConf,
+    conf: &'static PageServerConf,
 }

 impl Connection {
-    pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection {
+    pub fn new(conf: &'static PageServerConf, socket: TcpStream) -> Connection {
        Connection {
            stream_in: BufReader::new(socket.try_clone().unwrap()),
            stream: BufWriter::new(socket),
-            buffer: BytesMut::with_capacity(10 * 1024),
            init_done: false,
            conf,
        }
@@ -460,7 +460,7 @@ impl Connection {
    //
    // Read full message or return None if connection is closed
    //
-    fn read_message(&mut self) -> Result<Option<FeMessage>> {
+    fn read_message(&mut self) -> anyhow::Result<Option<FeMessage>> {
        if !self.init_done {
            FeStartupMessage::read(&mut self.stream_in)
        } else {
@@ -476,6 +476,16 @@ impl Connection {
                self.stream.write_i32::<BE>(0)?;
            }

+            BeMessage::ParameterStatus => {
+                self.stream.write_u8(b'S')?;
+                // parameter names and values are specified by null terminated strings
+                const PARAM_NAME_VALUE: &[u8] = b"client_encoding\0UTF8\0";
+                // length of this i32 + rest of data in message
+                self.stream
+                    .write_i32::<BE>(4 + PARAM_NAME_VALUE.len() as i32)?;
+                self.stream.write_all(PARAM_NAME_VALUE)?;
+            }
+
            BeMessage::ReadyForQuery => {
                self.stream.write_u8(b'Z')?;
                self.stream.write_i32::<BE>(4 + 1)?;
@@ -528,10 +538,7 @@ impl Connection {
            }

            // XXX: accept some text data
-            BeMessage::DataRow => {
-                // XXX
-                let b = Bytes::from("hello world");
-
+            BeMessage::DataRow(b) => {
                self.stream.write_u8(b'D')?;
                self.stream.write_i32::<BE>(4 + 2 + 4 + b.len() as i32)?;

@@ -560,30 +567,42 @@ impl Connection {
                self.stream.write_all(&b)?;
            }

-            BeMessage::ZenithStatusResponse(resp) => {
+            BeMessage::CopyData(data) => {
                self.stream.write_u8(b'd')?;
-                self.stream.write_u32::<BE>(4 + 1 + 1 + 4)?;
-                self.stream.write_u8(100)?; /* tag from pagestore_client.h */
-                self.stream.write_u8(resp.ok as u8)?;
-                self.stream.write_u32::<BE>(resp.n_blocks)?;
+                self.stream.write_u32::<BE>(4 + data.len() as u32)?;
+                self.stream.write_all(&data)?;
            }

-            BeMessage::ZenithNblocksResponse(resp) => {
-                self.stream.write_u8(b'd')?;
-                self.stream.write_u32::<BE>(4 + 1 + 1 + 4)?;
-                self.stream.write_u8(101)?; /* tag from pagestore_client.h */
-                self.stream.write_u8(resp.ok as u8)?;
-                self.stream.write_u32::<BE>(resp.n_blocks)?;
-            }
+            // ErrorResponse is a zero-terminated array of zero-terminated fields.
+            // First byte of each field represents type of this field. Set just enough fields
+            // to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error
+            // message text.
+            BeMessage::ErrorResponse(error_msg) => {
+                // For all the errors set Severity to Error and error code to
+                // 'internal error'.
+                let severity = Bytes::from("SERROR\0");
+                let code = Bytes::from("CXX000\0");

-            BeMessage::ZenithReadResponse(resp) => {
-                self.stream.write_u8(b'd')?;
-                self.stream
-                    .write_u32::<BE>(4 + 1 + 1 + 4 + resp.page.len() as u32)?;
-                self.stream.write_u8(102)?; /* tag from pagestore_client.h */
-                self.stream.write_u8(resp.ok as u8)?;
-                self.stream.write_u32::<BE>(resp.n_blocks)?;
-                self.stream.write_all(&resp.page.clone())?;
+                // 'E' is the message type byte of an ErrorResponse
+                self.stream.write_u8(b'E')?;
+                self.stream.write_u32::<BE>(
+                    4 + severity.len() as u32
+                        + code.len() as u32
+                        + (1 + error_msg.len() as u32 + 1)
+                        + 1,
+                )?;
+
+                // Send severity and code fields
+                self.stream.write_all(&severity)?;
+                self.stream.write_all(&code)?;
+
+                // Send error message field
+                self.stream.write_u8(b'M')?;
+                self.stream.write_all(error_msg.as_bytes())?;
+                self.stream.write_u8(0)?;
+
+                // Terminate fields
+                self.stream.write_u8(0)?;
            }
        }

@@ -595,7 +614,7 @@ impl Connection {
        self.stream.flush()
    }

-    fn run(&mut self) -> Result<()> {
+    fn run(&mut self) -> anyhow::Result<()> {
        let mut unnamed_query_string = Bytes::new();
        loop {
            let msg = self.read_message()?;
@@ -612,6 +631,9 @@ impl Connection {
                        }
                        StartupRequestCode::Normal => {
                            self.write_message_noflush(&BeMessage::AuthenticationOk)?;
+                            // psycopg2 will not connect if client_encoding is not
+                            // specified by the server
+                            self.write_message_noflush(&BeMessage::ParameterStatus)?;
                            self.write_message(&BeMessage::ReadyForQuery)?;
                            self.init_done = true;
                        }
@@ -619,7 +641,11 @@ impl Connection {
                    }
                }
                Some(FeMessage::Query(m)) => {
-                    self.process_query(m.body)?;
+                    if let Err(e) = self.process_query(m.body) {
+                        let errmsg = format!("{}", e);
+                        self.write_message_noflush(&BeMessage::ErrorResponse(errmsg))?;
+                    }
+                    self.write_message(&BeMessage::ReadyForQuery)?;
                }
                Some(FeMessage::Parse(m)) => {
                    unnamed_query_string = m.query_string;
@@ -637,6 +663,7 @@ impl Connection {
                }
                Some(FeMessage::Execute(_)) => {
                    self.process_query(unnamed_query_string.clone())?;
+                    self.stream.flush()?;
                }
                Some(FeMessage::Sync) => {
                    self.write_message(&BeMessage::ReadyForQuery)?;
@@ -649,8 +676,7 @@ impl Connection {
                    break;
                }
                x => {
-                    error!("unexpected message type : {:?}", x);
-                    return Err(io::Error::new(io::ErrorKind::Other, "unexpected message"));
+                    bail!("unexpected message type : {:?}", x);
                }
            }
        }
@@ -658,86 +684,130 @@ impl Connection {
        Ok(())
    }

-    fn process_query(&mut self, query_string: Bytes) -> Result<()> {
+    fn process_query(&mut self, query_string: Bytes) -> anyhow::Result<()> {
        debug!("process query {:?}", query_string);

        // remove null terminator, if any
-        let mut query_string = query_string.clone();
+        let mut query_string = query_string;
        if query_string.last() == Some(&0) {
            query_string.truncate(query_string.len() - 1);
        }

        if query_string.starts_with(b"controlfile") {
-            self.handle_controlfile()
+            self.handle_controlfile()?;
        } else if query_string.starts_with(b"pagestream ") {
            let (_l, r) = query_string.split_at("pagestream ".len());
-            let timelineid_str = String::from_utf8(r.to_vec()).unwrap();
-            let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
+            let timelineid_str = String::from_utf8(r.to_vec())?;
+            let timelineid = ZTimelineId::from_str(&timelineid_str)?;

-            self.handle_pagerequests(timelineid)
+            self.handle_pagerequests(timelineid)?;
        } else if query_string.starts_with(b"basebackup ") {
            let (_l, r) = query_string.split_at("basebackup ".len());
            let r = r.to_vec();
-            let timelineid_str = String::from(String::from_utf8(r).unwrap().trim_end());
+            let basebackup_args = String::from(String::from_utf8(r)?.trim_end());
+            let args: Vec<&str> = basebackup_args.rsplit(' ').collect();
+            let timelineid_str = args[0];
            info!("got basebackup command: \"{}\"", timelineid_str);
-            let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
-
+            let timelineid = ZTimelineId::from_str(&timelineid_str)?;
+            let lsn = if args.len() > 1 {
+                Some(Lsn::from_str(args[1])?)
+            } else {
+                None
+            };
            // Check that the timeline exists
-            self.handle_basebackup_request(timelineid)?;
+            self.handle_basebackup_request(timelineid, lsn)?;
            self.write_message_noflush(&BeMessage::CommandComplete)?;
-            self.write_message(&BeMessage::ReadyForQuery)
        } else if query_string.starts_with(b"callmemaybe ") {
-            let query_str = String::from_utf8(query_string.to_vec()).unwrap();
+            let query_str = String::from_utf8(query_string.to_vec())?;

            // callmemaybe <zenith timelineid as hex string> <connstr>
+            // TODO lazy static
            let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap();
-            let caps = re.captures(&query_str);
-            let caps = caps.unwrap();
+            let caps = re
+                .captures(&query_str)
+                .ok_or_else(|| anyhow!("invalid callmemaybe: '{}'", query_str))?;

-            let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str()).unwrap();
+            let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str())?;
            let connstr: String = String::from(caps.get(2).unwrap().as_str());

            // Check that the timeline exists
-            let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
-            if pcache.is_err() {
-                return Err(io::Error::new(
-                    io::ErrorKind::InvalidInput,
-                    format!("client requested callmemaybe on timeline {} which does not exist in page server", timelineid)));
+            let repository = page_cache::get_repository();
+            if repository.get_timeline(timelineid).is_err() {
+                bail!("client requested callmemaybe on timeline {} which does not exist in page server", timelineid);
            }

            walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr);

            self.write_message_noflush(&BeMessage::CommandComplete)?;
-            self.write_message(&BeMessage::ReadyForQuery)
+        } else if query_string.starts_with(b"branch_create ") {
+            let query_str = String::from_utf8(query_string.to_vec())?;
+            let err = || anyhow!("invalid branch_create: '{}'", query_str);
+
+            // branch_create <branchname> <startpoint>
+            // TODO lazy static
+            // TODO: escaping, to allow branch names with spaces
+            let re = Regex::new(r"^branch_create (\S+) ([^\r\n\s;]+)[\r\n\s;]*;?$").unwrap();
+            let caps = re.captures(&query_str).ok_or_else(err)?;
+
+            let branchname: String = String::from(caps.get(1).ok_or_else(err)?.as_str());
+            let startpoint_str: String = String::from(caps.get(2).ok_or_else(err)?.as_str());
+
+            let branch = branches::create_branch(&self.conf, &branchname, &startpoint_str)?;
+            let branch = serde_json::to_vec(&branch)?;
+
+            self.write_message_noflush(&BeMessage::RowDescription)?;
+            self.write_message_noflush(&BeMessage::DataRow(Bytes::from(branch)))?;
+            self.write_message_noflush(&BeMessage::CommandComplete)?;
+        } else if query_string.starts_with(b"branch_list") {
+            let branches =
+                crate::branches::get_branches(&self.conf, &*page_cache::get_repository())?;
+            let branches_buf = serde_json::to_vec(&branches)?;
+
+            self.write_message_noflush(&BeMessage::RowDescription)?;
+            self.write_message_noflush(&BeMessage::DataRow(Bytes::from(branches_buf)))?;
+            self.write_message_noflush(&BeMessage::CommandComplete)?;
        } else if query_string.starts_with(b"status") {
            self.write_message_noflush(&BeMessage::RowDescription)?;
-            self.write_message_noflush(&BeMessage::DataRow)?;
+            self.write_message_noflush(&HELLO_WORLD_ROW)?;
            self.write_message_noflush(&BeMessage::CommandComplete)?;
-            self.write_message(&BeMessage::ReadyForQuery)
-        } else {
+        } else if query_string.to_ascii_lowercase().starts_with(b"set ") {
+            // important because psycopg2 executes "SET datestyle TO 'ISO'"
+            // on connect
+            self.write_message_noflush(&BeMessage::CommandComplete)?;
+        } else if query_string
+            .to_ascii_lowercase()
+            .starts_with(b"identify_system")
+        {
+            // TODO: match postgres response format for 'identify_system'
+            let system_id = crate::branches::get_system_id(&self.conf)?.to_string();
+
            self.write_message_noflush(&BeMessage::RowDescription)?;
-            self.write_message_noflush(&BeMessage::DataRow)?;
+            self.write_message_noflush(&BeMessage::DataRow(Bytes::from(system_id)))?;
            self.write_message_noflush(&BeMessage::CommandComplete)?;
-            self.write_message(&BeMessage::ReadyForQuery)
+        } else {
+            bail!("unknown command");
        }
+
+        Ok(())
    }

-    fn handle_controlfile(&mut self) -> Result<()> {
+    fn handle_controlfile(&mut self) -> io::Result<()> {
        self.write_message_noflush(&BeMessage::RowDescription)?;
        self.write_message_noflush(&BeMessage::ControlFile)?;
-        self.write_message_noflush(&BeMessage::CommandComplete)?;
-        self.write_message(&BeMessage::ReadyForQuery)
+        self.write_message(&BeMessage::CommandComplete)?;
+
+        Ok(())
    }

-    fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> {
+    fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> anyhow::Result<()> {
        // Check that the timeline exists
-        let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
-        if pcache.is_err() {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                format!("client requested pagestream on timeline {} which does not exist in page server", timelineid)));
-        }
-        let pcache = pcache.unwrap();
+        let repository = page_cache::get_repository();
+        let timeline = repository.get_timeline(timelineid).map_err(|_| {
+            anyhow!(
+                "client requested pagestream on timeline {} which does not exist in page server",
+                timelineid
+            )
+        })?;

        /* switch client to COPYBOTH */
        self.stream.write_u8(b'W')?;
@@ -746,52 +816,47 @@ impl Connection {
        self.stream.write_i16::<BE>(0)?; /* numAttributes */
        self.stream.flush()?;

-        loop {
-            let message = self.read_message()?;
+        while let Some(message) = self.read_message()? {
+            trace!("query({:?}): {:?}", timelineid, message);

-            if let Some(m) = &message {
-                trace!("query({:?}): {:?}", timelineid, m);
+            let copy_data_bytes = match message {
+                FeMessage::CopyData(bytes) => bytes,
+                _ => continue,
            };

-            if message.is_none() {
-                // connection was closed
-                return Ok(());
-            }
+            let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;

-            match message {
-                Some(FeMessage::ZenithExistsRequest(req)) => {
-                    let tag = page_cache::RelTag {
+            let response = match zenith_fe_msg {
+                PagestreamFeMessage::Exists(req) => {
+                    let tag = RelTag {
                        spcnode: req.spcnode,
                        dbnode: req.dbnode,
                        relnode: req.relnode,
                        forknum: req.forknum,
                    };

-                    let exist = pcache.relsize_exist(&tag, req.lsn).unwrap_or(false);
+                    let exist = timeline.get_relsize_exists(tag, req.lsn).unwrap_or(false);

-                    self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
+                    PagestreamBeMessage::Status(PagestreamStatusResponse {
                        ok: exist,
                        n_blocks: 0,
-                    }))?
+                    })
                }
-                Some(FeMessage::ZenithNblocksRequest(req)) => {
-                    let tag = page_cache::RelTag {
+                PagestreamFeMessage::Nblocks(req) => {
+                    let tag = RelTag {
                        spcnode: req.spcnode,
                        dbnode: req.dbnode,
                        relnode: req.relnode,
                        forknum: req.forknum,
                    };

-                    let n_blocks = pcache.relsize_get(&tag, req.lsn).unwrap_or(0);
+                    let n_blocks = timeline.get_relsize(tag, req.lsn).unwrap_or(0);

-                    self.write_message(&BeMessage::ZenithNblocksResponse(ZenithStatusResponse {
-                        ok: true,
-                        n_blocks,
-                    }))?
+                    PagestreamBeMessage::Nblocks(PagestreamStatusResponse { ok: true, n_blocks })
                }
-                Some(FeMessage::ZenithReadRequest(req)) => {
-                    let buf_tag = page_cache::BufferTag {
-                        rel: page_cache::RelTag {
+                PagestreamFeMessage::Read(req) => {
+                    let buf_tag = BufferTag {
+                        rel: RelTag {
                            spcnode: req.spcnode,
                            dbnode: req.dbnode,
                            relnode: req.relnode,
@@ -800,39 +865,47 @@ impl Connection {
                        blknum: req.blkno,
                    };

-                    let msg = match pcache.get_page_at_lsn(buf_tag, req.lsn) {
-                        Ok(p) => BeMessage::ZenithReadResponse(ZenithReadResponse {
+                    let read_response = match timeline.get_page_at_lsn(buf_tag, req.lsn) {
+                        Ok(p) => PagestreamReadResponse {
                            ok: true,
                            n_blocks: 0,
                            page: p,
-                        }),
+                        },
                        Err(e) => {
                            const ZERO_PAGE: [u8; 8192] = [0; 8192];
                            error!("get_page_at_lsn: {}", e);
-                            BeMessage::ZenithReadResponse(ZenithReadResponse {
+                            PagestreamReadResponse {
                                ok: false,
                                n_blocks: 0,
                                page: Bytes::from_static(&ZERO_PAGE),
-                            })
+                            }
                        }
                    };

-                    self.write_message(&msg)?
+                    PagestreamBeMessage::Read(read_response)
                }
-                _ => {}
-            }
+            };
+
+            self.write_message(&BeMessage::CopyData(response.serialize()))?;
        }
+
+        Ok(())
    }

-    fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> {
+    fn handle_basebackup_request(
+        &mut self,
+        timelineid: ZTimelineId,
+        lsn: Option<Lsn>,
+    ) -> anyhow::Result<()> {
        // check that the timeline exists
-        let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
-        if pcache.is_err() {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                format!("client requested basebackup on timeline {} which does not exist in page server", timelineid)));
-        }
-
+        let repository = page_cache::get_repository();
+        let timeline = repository.get_timeline(timelineid).map_err(|e| {
+            error!("error fetching timeline: {:?}", e);
+            anyhow!(
+                "client requested basebackup on timeline {} which does not exist in page server",
+                timelineid
+            )
+        })?;
        /* switch client to COPYOUT */
        let stream = &mut self.stream;
        stream.write_u8(b'H')?;
@@ -845,21 +918,22 @@ impl Connection {
        /* Send a tarball of the latest snapshot on the timeline */

        // find latest snapshot
-        let snapshotlsn = restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap();
-
-        basebackup::send_snapshot_tarball(&mut CopyDataSink { stream }, timelineid, snapshotlsn)?;
-
+        let snapshot_lsn =
+            restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap();
+        let req_lsn = lsn.unwrap_or(snapshot_lsn);
+        basebackup::send_tarball_at_lsn(
+            &mut CopyDataSink { stream },
+            timelineid,
+            &timeline,
+            req_lsn,
+            snapshot_lsn,
+        )?;
        // CopyDone
        self.stream.write_u8(b'c')?;
        self.stream.write_u32::<BE>(4)?;
        self.stream.flush()?;
        debug!("CopyDone sent!");

-        // FIXME: I'm getting an error from the tokio copyout driver without this.
-        // I think it happens when the CommandComplete, CloseComplete and ReadyForQuery
-        // are sent in the same TCP packet as the CopyDone. I don't understand why.
-        thread::sleep(Duration::from_secs(1));
-
        Ok(())
    }
 }
@@ -872,8 +946,8 @@ struct CopyDataSink<'a> {
    stream: &'a mut BufWriter<TcpStream>,
 }

-impl<'a> std::io::Write for CopyDataSink<'a> {
-    fn write(&mut self, data: &[u8]) -> std::result::Result<usize, std::io::Error> {
+impl<'a> io::Write for CopyDataSink<'a> {
+    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
        // CopyData
        // FIXME: if the input is large, we should split it into multiple messages.
        // Not sure what the threshold should be, but the ultimate hard limit is that
@@ -889,7 +963,7 @@ impl<'a> std::io::Write for CopyDataSink<'a> {

        Ok(data.len())
    }
-    fn flush(&mut self) -> std::result::Result<(), std::io::Error> {
+    fn flush(&mut self) -> io::Result<()> {
        // no-op
        Ok(())
    }
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -0,0 +1,532 @@
+pub mod rocksdb;
+
+use crate::waldecoder::{DecodedWALRecord, Oid, XlCreateDatabase, XlSmgrTruncate};
+use crate::ZTimelineId;
+use anyhow::Result;
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use log::*;
+use postgres_ffi::pg_constants;
+use postgres_ffi::relfile_utils::forknumber_to_name;
+use serde::{Deserialize, Serialize};
+use std::fmt;
+use std::sync::Arc;
+use zenith_utils::lsn::Lsn;
+
+///
+/// A repository corresponds to one .zenith directory. One repository holds multiple
+/// timelines, forked off from the same initial call to 'initdb'.
+///
+/// Timelines are handed out as shared `Arc<dyn Timeline>` handles.
+///
+pub trait Repository {
+    /// Get the `Timeline` handle for the given zenith timeline ID.
+    fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
+
+    /// Create a new, empty timeline and return a handle to it. The caller is
+    /// responsible for loading data into it.
+    fn create_empty_timeline(
+        &self,
+        timelineid: ZTimelineId,
+        start_lsn: Lsn,
+    ) -> Result<Arc<dyn Timeline>>;
+
+    /// Branch a timeline.
+    ///
+    /// NOTE(review): presumably `dst` is created as a branch of `src`, forking off
+    /// at `start_lsn` -- confirm against the implementation.
+    fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
+
+    // Not part of the trait (yet):
+    //fn get_stats(&self) -> RepositoryStats;
+}
+
+pub trait Timeline {
+    //------------------------------------------------------------------------------
+    // Public GET functions
+    //------------------------------------------------------------------------------
+
+    /// Look up the given page in the cache, as of the given LSN.
+    fn get_page_at_lsn(&self, tag: BufferTag, lsn: Lsn) -> Result<Bytes>;
+
+    /// Get the size of a relation, in blocks, as of the given LSN.
+    fn get_relsize(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
+
+    /// Does the relation exist as of the given LSN?
+    fn get_relsize_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
+
+    //------------------------------------------------------------------------------
+    // Public PUT functions, to update the repository with new page versions.
+    //
+    // These are called by the WAL receiver to digest WAL records.
+    //------------------------------------------------------------------------------
+
+    /// Put a new page version that can be constructed from a WAL record.
+    ///
+    /// This will implicitly extend the relation, if the page is beyond the
+    /// current end-of-file.
+    fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) -> Result<()>;
+
+    /// Like `put_wal_record`, but with a ready-made image of the page.
+    fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes) -> Result<()>;
+
+    /// Truncate a relation to `nblocks` blocks.
+    fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
+
+    /// Drop a relation or file segment.
+    fn put_drop(&self, tag: BufferTag, lsn: Lsn) -> Result<()>;
+
+    /// Create a new database from a template database.
+    ///
+    /// In PostgreSQL, CREATE DATABASE works by scanning the data directory and
+    /// copying all relation files from the template database. This is the equivalent
+    /// of that.
+    fn put_create_database(
+        &self,
+        lsn: Lsn,
+        db_id: Oid,
+        tablespace_id: Oid,
+        src_db_id: Oid,
+        src_tablespace_id: Oid,
+    ) -> Result<()>;
+
+    ///
+    /// Digest one decoded WAL record: call the put_* functions above for every
+    /// relation/page that the record affects, then advance the last record LSN
+    /// to `lsn`.
+    ///
+    /// If any of the put_* calls fails, its error is returned right away and the
+    /// last record LSN is not advanced.
+    ///
+    fn save_decoded_record(
+        &self,
+        decoded: DecodedWALRecord,
+        recdata: Bytes,
+        lsn: Lsn,
+    ) -> Result<()> {
+        // Every block referenced by the record gets its own copy of the record,
+        // unless the block is flagged 'will_drop', in which case it is dropped.
+        for blk in decoded.blocks.iter() {
+            let tag = BufferTag {
+                rel: RelTag {
+                    forknum: blk.forknum as u8,
+                    spcnode: blk.rnode_spcnode,
+                    dbnode: blk.rnode_dbnode,
+                    relnode: blk.rnode_relnode,
+                },
+                blknum: blk.blkno,
+            };
+
+            if blk.will_drop {
+                self.put_drop(tag, lsn)?;
+                continue;
+            }
+
+            self.put_wal_record(
+                tag,
+                WALRecord {
+                    lsn,
+                    will_init: blk.will_init || blk.apply_image,
+                    rec: recdata.clone(),
+                    main_data_offset: decoded.main_data_offset as u32,
+                },
+            )?;
+        }
+
+        // A few record types need handling beyond their block references.
+        let rmgr_info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+        let is_smgr_truncate = decoded.xl_rmid == pg_constants::RM_SMGR_ID
+            && rmgr_info == pg_constants::XLOG_SMGR_TRUNCATE;
+        let is_dbase_create = decoded.xl_rmid == pg_constants::RM_DBASE_ID
+            && rmgr_info == pg_constants::XLOG_DBASE_CREATE;
+
+        if is_smgr_truncate {
+            let truncate = XlSmgrTruncate::decode(&decoded);
+            // Tag of the given fork of the relation that is being truncated
+            let fork_of_rel = |forknum: u8| RelTag {
+                forknum,
+                spcnode: truncate.rnode.spcnode,
+                dbnode: truncate.rnode.dbnode,
+                relnode: truncate.rnode.relnode,
+            };
+
+            if (truncate.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 {
+                let heap = fork_of_rel(pg_constants::MAIN_FORKNUM);
+                self.put_truncation(heap, lsn, truncate.blkno)?;
+            }
+            if (truncate.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 {
+                if truncate.blkno != 0 {
+                    // TODO: handle partial truncation of FSM:
+                    // need to map heap block number to FSM block number
+                    // and clear bits in the tail block
+                    info!("Partial truncation of FSM is not supported");
+                } else {
+                    let fsm = fork_of_rel(pg_constants::FSM_FORKNUM);
+                    self.put_truncation(fsm, lsn, truncate.blkno)?;
+                }
+            }
+            if (truncate.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
+                if truncate.blkno != 0 {
+                    // TODO: handle partial truncation of VM:
+                    // need to map heap block number to VM block number
+                    // and clear bits in the tail block
+                    info!("Partial truncation of VM is not supported");
+                } else {
+                    let vm = fork_of_rel(pg_constants::VISIBILITYMAP_FORKNUM);
+                    self.put_truncation(vm, lsn, truncate.blkno)?;
+                }
+            }
+        } else if is_dbase_create {
+            let createdb = XlCreateDatabase::decode(&decoded);
+            self.put_create_database(
+                lsn,
+                createdb.db_id,
+                createdb.tablespace_id,
+                createdb.src_db_id,
+                createdb.src_tablespace_id,
+            )?;
+        }
+
+        // The record has been fully handled: tell the repository that it is
+        // up-to-date to this LSN.
+        self.advance_last_record_lsn(lsn);
+        Ok(())
+    }
+
+    /// Remember that all WAL before the given LSN has been processed.
+    ///
+    /// The WAL receiver calls this after the put_* functions, to indicate that
+    /// all WAL before this point has been digested. Before that, if you call
+    /// GET on an earlier LSN, it will block.
+    fn advance_last_valid_lsn(&self, lsn: Lsn);
+
+    /// Return the current last valid LSN.
+    fn get_last_valid_lsn(&self) -> Lsn;
+
+    /// Set the starting value of the last valid LSN. The tests in this file call
+    /// it once on a freshly created timeline, before the first put_* call.
+    fn init_valid_lsn(&self, lsn: Lsn);
+
+    /// Like `advance_last_valid_lsn`, but this always points to the end of
+    /// a WAL record, not in the middle of one.
+    ///
+    /// This must be <= last valid LSN. This is tracked separately from last
+    /// valid LSN, so that the WAL receiver knows where to restart streaming.
+    fn advance_last_record_lsn(&self, lsn: Lsn);
+
+    /// Return the current last record LSN.
+    fn get_last_record_lsn(&self) -> Lsn;
+
+    ///
+    /// Flush to disk all data that was written with the put_* functions.
+    ///
+    /// NOTE: This has nothing to do with checkpoints in PostgreSQL. We don't
+    /// know anything about them here in the repository.
+    fn checkpoint(&self) -> Result<()>;
+}
+
+/// Statistics about a repository; the return type of the (currently commented-out)
+/// `Repository::get_stats`.
+///
+/// NOTE(review): every counter is typed as `Lsn`, although the field names suggest
+/// plain counts -- presumably these were meant to be integers; confirm.
+#[derive(Clone)]
+pub struct RepositoryStats {
+    pub num_entries: Lsn,
+    pub num_page_images: Lsn,
+    pub num_wal_records: Lsn,
+    pub num_getpage_requests: Lsn,
+}
+
+///
+/// Relation data file segment id throughout the Postgres cluster.
+///
+/// Every data file in Postgres is uniquely identified by 4 numbers:
+/// - relation id / node (`relnode`)
+/// - database id (`dbnode`)
+/// - tablespace id (`spcnode`), in short this is a unique id of a separate
+///   directory to store data files.
+/// - forknumber (`forknum`) is used to split different kinds of data of the same relation
+///   between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`).
+///
+/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value
+/// are used for the same purpose.
+/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
+///
+/// We use additional fork numbers to logically separate relational and
+/// non-relational data inside pageserver key-value storage.
+/// See, e.g., `ROCKSDB_SPECIAL_FORKNUM`.
+///
+/// The derived `PartialOrd`/`Ord` compare fields in declaration order, so tags
+/// sort by `forknum` first, then `spcnode`, `dbnode` and `relnode`. Reordering
+/// the fields changes that ordering.
+///
+#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
+pub struct RelTag {
+    // Fork number; deliberately or not, this is the most significant sort key
+    pub forknum: u8,
+    // Tablespace id
+    pub spcnode: u32,
+    // Database id
+    pub dbnode: u32,
+    // Relation id / node
+    pub relnode: u32,
+}
+
+/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
+///
+/// `<spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]`
+///
+/// The `_<forkname>` suffix is only printed when `forknumber_to_name` knows a
+/// name for the fork number.
+///
+impl fmt::Display for RelTag {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // RelTag is Copy, so take it apart once instead of going through `self`
+        let RelTag {
+            forknum,
+            spcnode,
+            dbnode,
+            relnode,
+        } = *self;
+
+        match forknumber_to_name(forknum) {
+            None => write!(f, "{}/{}/{}", spcnode, dbnode, relnode),
+            Some(forkname) => write!(f, "{}/{}/{}_{}", spcnode, dbnode, relnode, forkname),
+        }
+    }
+}
+
+///
+/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
+/// This is used as a part of the key inside key-value storage (RocksDB currently).
+///
+/// In Postgres `BufferTag` structure is used for exactly the same purpose.
+/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
+///
+/// The derived ordering compares `rel` first and `blknum` second, so the pages
+/// of one relation sort together, in block number order.
+///
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
+pub struct BufferTag {
+    // Relation (and fork) that the page belongs to
+    pub rel: RelTag,
+    // Block number of the page within that relation fork
+    pub blknum: u32,
+}
+
+/// One WAL record, as stored for a single page that it affects.
+///
+/// `Timeline::save_decoded_record` creates one of these per block referenced by a
+/// record; `rec` is a `Bytes`, so the copies are made with a cheap `clone()`.
+#[derive(Debug, Clone)]
+pub struct WALRecord {
+    pub lsn: Lsn, // LSN at the *end* of the record
+    // Set by save_decoded_record when the block reference is flagged
+    // 'will_init' or 'apply_image'.
+    pub will_init: bool,
+    // The raw bytes of the WAL record
+    pub rec: Bytes,
+    // Remember the offset of main_data in rec,
+    // so that we don't have to parse the record again.
+    // If record has no main_data, this offset equals rec.len().
+    pub main_data_offset: u32,
+}
+
+impl WALRecord {
+    /// Serialize this record, appending it to `buf`.
+    ///
+    /// Layout, with all integers in big-endian byte order (as written by the
+    /// `BufMut::put_*` functions):
+    /// - `lsn` (u64)
+    /// - `will_init` (u8, 0 or 1)
+    /// - `main_data_offset` (u32)
+    /// - length of `rec` in bytes (u32)
+    /// - the bytes of `rec`
+    pub fn pack(&self, buf: &mut BytesMut) {
+        buf.put_u64(self.lsn.0);
+        buf.put_u8(self.will_init as u8);
+        buf.put_u32(self.main_data_offset);
+        // NOTE: the length field is 32 bits wide; the cast would silently wrap
+        // for a record of 4 GiB or more.
+        buf.put_u32(self.rec.len() as u32);
+        buf.put_slice(&self.rec[..]);
+    }
+
+    /// Deserialize a record written by `pack`, consuming it from the front of `buf`.
+    ///
+    /// The returned `rec` shares its storage with `buf` instead of being copied
+    /// out of it.
+    ///
+    /// Panics if `buf` contains fewer bytes than the encoded record requires.
+    pub fn unpack(buf: &mut Bytes) -> WALRecord {
+        let lsn = Lsn::from(buf.get_u64());
+        let will_init = buf.get_u8() != 0;
+        let main_data_offset = buf.get_u32();
+        let rec_len = buf.get_u32() as usize;
+        // split_to() checks the length against the bytes that are left before
+        // doing anything, so a corrupt length prefix cannot trigger a huge
+        // allocation first. It also avoids copying the record.
+        let rec = buf.split_to(rec_len);
+        WALRecord {
+            lsn,
+            will_init,
+            rec,
+            main_data_offset,
+        }
+    }
+}
+
+///
+/// Tests that should work the same with any Repository/Timeline implementation.
+///
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::walredo::{WalRedoError, WalRedoManager};
+    use crate::PageServerConf;
+    use postgres_ffi::pg_constants;
+    use std::fs;
+    use std::path::PathBuf;
+    use std::str::FromStr;
+    use std::time::Duration;
+
+    /// Arbitrary relation tag, for testing. All tests in this module operate on
+    /// this one relation.
+    const TESTREL_A: RelTag = RelTag {
+        spcnode: 0,
+        dbnode: 111,
+        relnode: 1000,
+        forknum: 0,
+    };
+
+    /// Convenience function to create a BufferTag for block `blknum` of `TESTREL_A`.
+    /// Helps to keep the tests shorter.
+    #[allow(non_snake_case)]
+    fn TEST_BUF(blknum: u32) -> BufferTag {
+        BufferTag {
+            rel: TESTREL_A,
+            blknum,
+        }
+    }
+
+    /// Convenience function to create a page image with the given string as the
+    /// only content. The image is always 8192 bytes long: the string's bytes,
+    /// followed by zero padding.
+    #[allow(non_snake_case)]
+    fn TEST_IMG(s: &str) -> Bytes {
+        let mut page = Vec::from(s.as_bytes());
+        page.resize(8192, 0);
+        Bytes::from(page)
+    }
+
+    /// Create a fresh RocksDB-backed repository for one test.
+    ///
+    /// The repository's work directory is `../tmp_check/test_<test_name>`; anything
+    /// left there by a previous run is removed first. WAL redo is handled by the
+    /// mock `TestRedoManager`.
+    fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
+        let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
+        // Best effort: the result is ignored, e.g. the directory may not exist yet
+        let _ = fs::remove_dir_all(&repo_dir);
+        fs::create_dir_all(&repo_dir)?;
+
+        let conf = PageServerConf {
+            daemonize: false,
+            interactive: false,
+            gc_horizon: 64 * 1024 * 1024,
+            gc_period: Duration::from_secs(10),
+            listen_addr: "127.0.0.1:5430".parse().unwrap(),
+            workdir: repo_dir,
+            pg_distrib_dir: "".into(),
+        };
+        // Make a static copy of the config. This can never be free'd, but that's
+        // OK in a test.
+        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+
+        let walredo_mgr = TestRedoManager {};
+
+        let repo = rocksdb::RocksRepository::new(conf, Arc::new(walredo_mgr));
+
+        Ok(Box::new(repo))
+    }
+
+    /// Test get_relsize() and truncation.
+    ///
+    /// Writes three blocks of `TESTREL_A` at increasing LSNs, checks relation size
+    /// and page contents at each LSN, then truncates the last block and checks
+    /// that the older LSNs still show the pre-truncation state.
+    #[test]
+    fn test_relsize() -> Result<()> {
+        // get_timeline() with non-existent timeline id should fail
+        //repo.get_timeline("11223344556677881122334455667788");
+
+        // Create timeline to work on
+        let repo = get_test_repo("test_relsize")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
+
+        tline.init_valid_lsn(Lsn(1));
+        // NOTE(review): the same image is put twice at Lsn(2); presumably this is
+        // intentional, to exercise a repeated put -- confirm.
+        tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
+        tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
+        tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"))?;
+        tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"))?;
+        tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"))?;
+
+        tline.advance_last_valid_lsn(Lsn(5));
+
+        // FIXME: The rocksdb implementation erroneously returns 'true' here, even
+        // though the relation was created only at a later LSN.
+        assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(1))?, true); // CORRECT: false
+
+        // And this probably should throw an error, because the relation doesn't
+        // exist at Lsn(1) yet.
+        assert_eq!(tline.get_relsize(TESTREL_A, Lsn(1))?, 0); // CORRECT: throw error
+
+        assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(2))?, true);
+        assert_eq!(tline.get_relsize(TESTREL_A, Lsn(2))?, 1);
+        assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 3);
+
+        // Check page contents at each LSN
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(2))?,
+            TEST_IMG("foo blk 0 at 2")
+        );
+
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(3))?,
+            TEST_IMG("foo blk 0 at 3")
+        );
+
+        // Block 0 was last written at Lsn(3), so later LSNs still see that version
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
+            TEST_IMG("foo blk 0 at 3")
+        );
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(1), Lsn(4))?,
+            TEST_IMG("foo blk 1 at 4")
+        );
+
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(5))?,
+            TEST_IMG("foo blk 0 at 3")
+        );
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(1), Lsn(5))?,
+            TEST_IMG("foo blk 1 at 4")
+        );
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
+            TEST_IMG("foo blk 2 at 5")
+        );
+
+        // Truncate last block
+        tline.put_truncation(TESTREL_A, Lsn(6), 2)?;
+        tline.advance_last_valid_lsn(Lsn(6));
+
+        // Check reported size and contents after truncation
+        assert_eq!(tline.get_relsize(TESTREL_A, Lsn(6))?, 2);
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(6))?,
+            TEST_IMG("foo blk 0 at 3")
+        );
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(1), Lsn(6))?,
+            TEST_IMG("foo blk 1 at 4")
+        );
+
+        // should still see the truncated block with older LSN
+        assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 3);
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
+            TEST_IMG("foo blk 2 at 5")
+        );
+
+        Ok(())
+    }
+
+    /// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
+    /// split into multiple 1 GB segments in Postgres.
+    ///
+    /// This isn't very interesting with the RocksDb implementation, as we don't pay
+    /// any attention to Postgres segment boundaries there.
+    #[test]
+    fn test_large_rel() -> Result<()> {
+        let repo = get_test_repo("test_large_rel")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
+
+        tline.init_valid_lsn(Lsn(1));
+
+        // Write RELSEG_SIZE + 1 blocks, each one at its own LSN (1, 2, 3, ...)
+        let mut lsn = 0;
+        for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
+            lsn += 1;
+            // Label the page with the LSN that it is actually written at, like
+            // test_relsize does.
+            let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
+            tline.put_page_image(TEST_BUF(blknum), Lsn(lsn), img)?;
+        }
+        tline.advance_last_valid_lsn(Lsn(lsn));
+
+        assert_eq!(
+            tline.get_relsize(TESTREL_A, Lsn(lsn))?,
+            pg_constants::RELSEG_SIZE + 1
+        );
+
+        // Truncate one block
+        lsn += 1;
+        tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
+        tline.advance_last_valid_lsn(Lsn(lsn));
+        assert_eq!(
+            tline.get_relsize(TESTREL_A, Lsn(lsn))?,
+            pg_constants::RELSEG_SIZE
+        );
+
+        // Truncate another block
+        lsn += 1;
+        tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
+        tline.advance_last_valid_lsn(Lsn(lsn));
+        assert_eq!(
+            tline.get_relsize(TESTREL_A, Lsn(lsn))?,
+            pg_constants::RELSEG_SIZE - 1
+        );
+
+        Ok(())
+    }
+
+    /// Mock WAL redo manager. It doesn't replay anything: the "redone" page is
+    /// just a page image holding a textual description of the request, which is
+    /// also printed to stdout.
+    struct TestRedoManager {}
+
+    impl WalRedoManager for TestRedoManager {
+        fn request_redo(
+            &self,
+            tag: BufferTag,
+            lsn: Lsn,
+            base_img: Option<Bytes>,
+            records: Vec<WALRecord>,
+        ) -> Result<Bytes, WalRedoError> {
+            let base_desc = match base_img {
+                Some(_) => "base image",
+                None => "no base image",
+            };
+            let summary = format!(
+                "redo for rel {} blk {} to get to {}, with {} and {} records",
+                tag.rel,
+                tag.blknum,
+                lsn,
+                base_desc,
+                records.len()
+            );
+            println!("{}", summary);
+            Ok(TEST_IMG(&summary))
+        }
+    }
+}
--- a/pageserver/src/repository/rocksdb.rs
+++ b/pageserver/src/repository/rocksdb.rs
--- a/pageserver/src/restore_local_repo.rs
+++ b/pageserver/src/restore_local_repo.rs
@@ -1,21 +1,9 @@
-//
-// Restore chunks from local Zenith repository
-//
-// This runs once at Page Server startup. It loads all the "snapshots" and all
-// WAL from all timelines from the local zenith repository into the in-memory page
-// cache.
-//
-// This also initializes the "last valid LSN" in the page cache to the last LSN
-// seen in the WAL, so that when the WAL receiver is started, it starts
-// streaming from that LSN.
-//
-
+//!
+//! Import data and WAL from a PostgreSQL data directory and WAL segments into
+//! zenith repository
+//!
 use log::*;
-use regex::Regex;
-use std::fmt;
-
 use std::cmp::max;
-use std::error::Error;
 use std::fs;
 use std::fs::File;
 use std::io::Read;
@@ -26,101 +14,49 @@ use std::path::{Path, PathBuf};
 use anyhow::Result;
 use bytes::Bytes;

-use crate::page_cache;
-use crate::page_cache::BufferTag;
-use crate::page_cache::PageCache;
-use crate::page_cache::RelTag;
-use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
+use crate::repository::{BufferTag, RelTag, Timeline};
+use crate::waldecoder::{decode_wal_record, Oid, WalStreamDecoder};
 use crate::PageServerConf;
 use crate::ZTimelineId;
 use postgres_ffi::pg_constants;
+use postgres_ffi::relfile_utils::*;
 use postgres_ffi::xlog_utils::*;
 use zenith_utils::lsn::Lsn;

-// From pg_tablespace_d.h
-//
-// FIXME: we'll probably need these elsewhere too, move to some common location
-const DEFAULTTABLESPACE_OID: u32 = 1663;
-const GLOBALTABLESPACE_OID: u32 = 1664;
-
-//
-// Load it all into the page cache.
-//
-pub fn restore_timeline(
-    conf: &PageServerConf,
-    pcache: &PageCache,
-    timeline: ZTimelineId,
-) -> Result<()> {
-    let timelinepath = PathBuf::from("timelines").join(timeline.to_string());
-
-    if !timelinepath.exists() {
-        anyhow::bail!("timeline {} does not exist in the page server's repository");
-    }
-
-    // Scan .zenith/timelines/<timeline>/snapshots
-    let snapshotspath = PathBuf::from("timelines")
-        .join(timeline.to_string())
-        .join("snapshots");
-
-    let mut last_snapshot_lsn: Lsn = Lsn(0);
+///
+/// Find latest snapshot in a timeline's 'snapshots' directory
+///
+pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
+    let snapshotspath = format!("timelines/{}/snapshots", timeline);

+    let mut last_snapshot_lsn = Lsn(0);
    for direntry in fs::read_dir(&snapshotspath).unwrap() {
-        let direntry = direntry?;
-        let filename = direntry.file_name();
-        let lsn = Lsn::from_filename(&filename)?;
-        last_snapshot_lsn = max(lsn, last_snapshot_lsn);
+        let filename = direntry.unwrap().file_name();

-        // FIXME: pass filename as Path instead of str?
-        let filename_str = filename.into_string().unwrap();
-        restore_snapshot(conf, pcache, timeline, &filename_str)?;
-        info!("restored snapshot at {:?}", filename_str);
+        if let Ok(lsn) = Lsn::from_filename(&filename) {
+            last_snapshot_lsn = max(lsn, last_snapshot_lsn);
+        } else {
+            error!("unrecognized file in snapshots directory: {:?}", filename);
+        }
    }

    if last_snapshot_lsn == Lsn(0) {
-        error!(
-            "could not find valid snapshot in {}",
-            snapshotspath.display()
-        );
-        // TODO return error?
-    }
-    pcache.init_valid_lsn(last_snapshot_lsn);
-
-    restore_wal(conf, pcache, timeline, last_snapshot_lsn)?;
-
-    Ok(())
-}
-
-pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<u64> {
-    let snapshotspath = format!("timelines/{}/snapshots", timeline);
-
-    let mut last_snapshot_lsn = 0;
-    for direntry in fs::read_dir(&snapshotspath).unwrap() {
-        let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned();
-
-        let lsn = u64::from_str_radix(&filename, 16)?;
-        last_snapshot_lsn = max(lsn, last_snapshot_lsn);
-    }
-
-    if last_snapshot_lsn == 0 {
        error!("could not find valid snapshot in {}", &snapshotspath);
        // TODO return error?
    }
    Ok(last_snapshot_lsn)
 }

-fn restore_snapshot(
-    conf: &PageServerConf,
-    pcache: &PageCache,
-    timeline: ZTimelineId,
-    snapshot: &str,
+///
+/// Import all relation data pages from local disk into the repository.
+///
+pub fn import_timeline_from_postgres_datadir(
+    path: &Path,
+    timeline: &dyn Timeline,
+    lsn: Lsn,
 ) -> Result<()> {
-    let snapshotpath = PathBuf::from("timelines")
-        .join(timeline.to_string())
-        .join("snapshots")
-        .join(snapshot);
-
    // Scan 'global'
-    for direntry in fs::read_dir(snapshotpath.join("global"))? {
+    for direntry in fs::read_dir(path.join("global"))? {
        let direntry = direntry?;
        match direntry.file_name().to_str() {
            None => continue,
@@ -130,24 +66,22 @@ fn restore_snapshot(
            Some("pg_filenode.map") => continue,

            // Load any relation files into the page server
-            _ => restore_relfile(
-                conf,
-                pcache,
-                timeline,
-                snapshot,
-                GLOBALTABLESPACE_OID,
-                0,
+            _ => import_relfile(
                &direntry.path(),
+                timeline,
+                lsn,
+                pg_constants::GLOBALTABLESPACE_OID,
+                0,
            )?,
        }
    }

    // Scan 'base'. It contains database dirs, the database OID is the filename.
    // E.g. 'base/12345', where 12345 is the database OID.
-    for direntry in fs::read_dir(snapshotpath.join("base"))? {
+    for direntry in fs::read_dir(path.join("base"))? {
        let direntry = direntry?;

-        let dboid = u32::from_str_radix(direntry.file_name().to_str().unwrap(), 10)?;
+        let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;

        for direntry in fs::read_dir(direntry.path())? {
            let direntry = direntry?;
@@ -159,45 +93,31 @@ fn restore_snapshot(
                Some("pg_filenode.map") => continue,

                // Load any relation files into the page server
-                _ => restore_relfile(
-                    conf,
-                    pcache,
-                    timeline,
-                    snapshot,
-                    DEFAULTTABLESPACE_OID,
-                    dboid,
+                _ => import_relfile(
                    &direntry.path(),
+                    timeline,
+                    lsn,
+                    pg_constants::DEFAULTTABLESPACE_OID,
+                    dboid,
                )?,
            }
        }
    }
-    for entry in fs::read_dir(snapshotpath.join("pg_xact"))? {
-        let entry = entry?;
-        restore_nonrelfile(
-            conf,
-            pcache,
-            timeline,
-            snapshot,
-            pg_constants::PG_XACT_FORKNUM,
-            &entry.path(),
-        )?;
-    }
    // TODO: Scan pg_tblspc

+    timeline.checkpoint()?;
+
    Ok(())
 }

-fn restore_relfile(
-    _conf: &PageServerConf,
-    pcache: &PageCache,
-    _timeline: ZTimelineId,
-    snapshot: &str,
-    spcoid: u32,
-    dboid: u32,
+// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
+fn import_relfile(
    path: &Path,
+    timeline: &dyn Timeline,
+    lsn: Lsn,
+    spcoid: Oid,
+    dboid: Oid,
 ) -> Result<()> {
-    let lsn = Lsn::from_hex(snapshot)?;
-
    // Does it look like a relation file?

    let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
@@ -210,8 +130,7 @@ fn restore_relfile(
    let mut file = File::open(path)?;
    let mut buf: [u8; 8192] = [0u8; 8192];

-    // FIXME: use constants (BLCKSZ)
-    let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192);
+    let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
    loop {
        let r = file.read_exact(&mut buf);
        match r {
@@ -221,11 +140,11 @@ fn restore_relfile(
                        spcnode: spcoid,
                        dbnode: dboid,
                        relnode,
-                        forknum: forknum as u8,
+                        forknum,
                    },
                    blknum,
                };
-                pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
+                timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf))?;
                /*
                if oldest_lsn == 0 || p.lsn < oldest_lsn {
                    oldest_lsn = p.lsn;
@@ -252,88 +171,22 @@ fn restore_relfile(
    Ok(())
 }

-fn restore_nonrelfile(
-    _conf: &PageServerConf,
-    pcache: &PageCache,
-    _timeline: ZTimelineId,
-    snapshot: &str,
-    forknum: u32,
-    path: &Path,
-) -> Result<()> {
-    let lsn = Lsn::from_hex(snapshot)?;
-
-    // Does it look like a relation file?
-
-    let mut file = File::open(path)?;
-    let mut buf: [u8; 8192] = [0u8; 8192];
-    let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
-
-    // FIXME: use constants (BLCKSZ)
-    let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
-    loop {
-        let r = file.read_exact(&mut buf);
-        match r {
-            Ok(_) => {
-                let tag = BufferTag {
-                    rel: RelTag {
-                        spcnode: 0,
-                        dbnode: 0,
-                        relnode: 0,
-                        forknum: forknum as u8,
-                    },
-                    blknum,
-                };
-                pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
-                /*
-                if oldest_lsn == 0 || p.lsn < oldest_lsn {
-                    oldest_lsn = p.lsn;
-                }
-                 */
-            }
-
-            // TODO: UnexpectedEof is expected
-            Err(e) => match e.kind() {
-                std::io::ErrorKind::UnexpectedEof => {
-                    // reached EOF. That's expected.
-                    // FIXME: maybe check that we read the full length of the file?
-                    break;
-                }
-                _ => {
-                    error!("error reading file: {:?} ({})", path, e);
-                    break;
-                }
-            },
-        };
-        blknum += 1;
-    }
-
-    Ok(())
-}
-
-// Scan WAL on a timeline, starting from gien LSN, and load all the records
-// into the page cache.
-fn restore_wal(
-    _conf: &PageServerConf,
-    pcache: &PageCache,
-    timeline: ZTimelineId,
-    startpoint: Lsn,
-) -> Result<()> {
-    let walpath = format!("timelines/{}/wal", timeline);
-
+/// Scan PostgreSQL WAL files in given directory, and load all records >= 'startpoint' into
+/// the repository.
+pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint: Lsn) -> Result<()> {
    let mut waldecoder = WalStreamDecoder::new(startpoint);

-    const SEG_SIZE: u64 = 16 * 1024 * 1024;
-    let mut segno = startpoint.segment_number(SEG_SIZE);
-    let mut offset = startpoint.segment_offset(SEG_SIZE);
-    let mut last_lsn = Lsn(0);
+    let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
+    let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
+    let mut last_lsn = startpoint;
    loop {
        // FIXME: assume postgresql tli 1 for now
-        let filename = XLogFileName(1, segno, 16 * 1024 * 1024);
-        let mut path = walpath.clone() + "/" + &filename;
+        let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
+        let mut path = walpath.join(&filename);

        // It could be as .partial
        if !PathBuf::from(&path).exists() {
-            path += ".partial";
+            path = walpath.join(filename + ".partial");
        }

        // Slurp the WAL file
@@ -351,7 +204,7 @@ fn restore_wal(

        let mut buf = Vec::new();
        let nread = file.read_to_end(&mut buf)?;
-        if nread != 16 * 1024 * 1024 - offset as usize {
+        if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
            // Maybe allow this for .partial files?
            error!("read only {} bytes from WAL file", nread);
        }
@@ -367,33 +220,7 @@ fn restore_wal(
            }
            if let Some((lsn, recdata)) = rec.unwrap() {
                let decoded = decode_wal_record(recdata.clone());
-                // Put the WAL record to the page cache. We make a separate copy of
-                // it for every block it modifies. (The actual WAL record is kept in
-                // a Bytes, which uses a reference counter for the underlying buffer,
-                // so having multiple copies of it doesn't cost that much)
-                for blk in decoded.blocks.iter() {
-                    let tag = BufferTag {
-                        rel: RelTag {
-                            spcnode: blk.rnode_spcnode,
-                            dbnode: blk.rnode_dbnode,
-                            relnode: blk.rnode_relnode,
-                            forknum: blk.forknum as u8,
-                        },
-                        blknum: blk.blkno,
-                    };
-                    let rec = page_cache::WALRecord {
-                        lsn,
-                        will_init: blk.will_init || blk.apply_image,
-                        truncate: false,
-                        rec: recdata.clone(),
-                        main_data_offset: decoded.main_data_offset as u32,
-                    };
-
-                    pcache.put_wal_record(tag, rec);
-                }
-                // Now that this record has been handled, let the page cache know that
-                // it is up-to-date to this LSN
-                pcache.advance_last_valid_lsn(lsn);
+                timeline.save_decoded_record(decoded, recdata, lsn)?;
                last_lsn = lsn;
            } else {
                break;
@@ -401,95 +228,16 @@ fn restore_wal(
            nrecords += 1;
        }

-        info!("restored {} records from WAL file {}", nrecords, filename);
+        info!(
+            "imported {} records from WAL file {} up to {}",
+            nrecords,
+            path.display(),
+            last_lsn
+        );

        segno += 1;
        offset = 0;
    }
    info!("reached end of WAL at {}", last_lsn);
-
    Ok(())
 }
-
-#[derive(Debug, Clone)]
-struct FilePathError {
-    msg: String,
-}
-
-impl Error for FilePathError {
-    fn description(&self) -> &str {
-        &self.msg
-    }
-}
-impl FilePathError {
-    fn new(msg: &str) -> FilePathError {
-        FilePathError {
-            msg: msg.to_string(),
-        }
-    }
-}
-
-impl From<core::num::ParseIntError> for FilePathError {
-    fn from(e: core::num::ParseIntError) -> Self {
-        return FilePathError {
-            msg: format!("invalid filename: {}", e),
-        };
-    }
-}
-
-impl fmt::Display for FilePathError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "invalid filename")
-    }
-}
-
-fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
-    match forkname {
-        // "main" is not in filenames, it's implicit if the fork name is not present
-        None => Ok(0),
-        Some("fsm") => Ok(1),
-        Some("vm") => Ok(2),
-        Some("init") => Ok(3),
-        Some(_) => Err(FilePathError::new("invalid forkname")),
-    }
-}
-
-#[derive(Debug)]
-struct ParsedBaseImageFileName {
-    pub spcnode: u32,
-    pub dbnode: u32,
-    pub relnode: u32,
-    pub forknum: u32,
-    pub segno: u32,
-
-    pub lsn: u64,
-}
-
-// formats:
-// <oid>
-// <oid>_<fork name>
-// <oid>.<segment number>
-// <oid>_<fork name>.<segment number>
-
-fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
-    let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
-
-    let caps = re
-        .captures(fname)
-        .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-
-    let relnode_str = caps.name("relnode").unwrap().as_str();
-    let relnode = u32::from_str_radix(relnode_str, 10)?;
-
-    let forkname = caps.name("forkname").map(|f| f.as_str());
-    let forknum = forkname_to_forknum(forkname)?;
-
-    let segno_match = caps.name("segno");
-    let segno = if segno_match.is_none() {
-        0
-    } else {
-        u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
-    };
-
-    Ok((relnode, forknum, segno))
-}
--- a/pageserver/src/restore_s3.rs
+++ b/pageserver/src/restore_s3.rs
@@ -23,6 +23,8 @@ use tokio::runtime;
 use futures::future;

 use crate::{page_cache, PageServerConf};
+use postgres_ffi::pg_constants;
+use postgres_ffi::relfile_utils::*;

 struct Storage {
    region: Region,
@@ -127,56 +129,12 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
    Ok(())
 }

-// From pg_tablespace_d.h
-//
-// FIXME: we'll probably need these elsewhere too, move to some common location
-const DEFAULTTABLESPACE_OID: u32 = 1663;
-const GLOBALTABLESPACE_OID: u32 = 1664;
-
-#[derive(Debug)]
-struct FilePathError {
-    msg: String,
-}
-
-impl FilePathError {
-    fn new(msg: &str) -> FilePathError {
-        FilePathError {
-            msg: msg.to_string(),
-        }
-    }
-}
-
-impl From<core::num::ParseIntError> for FilePathError {
-    fn from(e: core::num::ParseIntError) -> Self {
-        return FilePathError {
-            msg: format!("invalid filename: {}", e),
-        };
-    }
-}
-
-impl fmt::Display for FilePathError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "invalid filename")
-    }
-}
-
-fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
-    match forkname {
-        // "main" is not in filenames, it's implicit if the fork name is not present
-        None => Ok(0),
-        Some("fsm") => Ok(1),
-        Some("vm") => Ok(2),
-        Some("init") => Ok(3),
-        Some(_) => Err(FilePathError::new("invalid forkname")),
-    }
-}
-
 #[derive(Debug)]
 struct ParsedBaseImageFileName {
    pub spcnode: u32,
    pub dbnode: u32,
    pub relnode: u32,
-    pub forknum: u32,
+    pub forknum: u8,
    pub segno: u32,

    pub lsn: u64,
@@ -188,7 +146,7 @@ struct ParsedBaseImageFileName {
 // <oid>.<segment number>
 // <oid>_<fork name>.<segment number>

-fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
+fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
    let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();

    let caps = re
@@ -237,7 +195,7 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;

        Ok(ParsedBaseImageFileName {
-            spcnode: GLOBALTABLESPACE_OID,
+            spcnode: pg_constants::GLOBALTABLESPACE_OID,
            dbnode: 0,
            relnode,
            forknum,
@@ -260,7 +218,7 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;

        Ok(ParsedBaseImageFileName {
-            spcnode: DEFAULTTABLESPACE_OID,
+            spcnode: pg_constants::DEFAULTTABLESPACE_OID,
            dbnode,
            relnode,
            forknum,
@@ -294,8 +252,7 @@ async fn slurp_base_file(

    let mut bytes = BytesMut::from(data.as_slice()).freeze();

-    // FIXME: use constants (BLCKSZ)
-    let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
+    let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);

    let pcache = page_cache::get_pagecache(conf, sys_id);

@@ -305,7 +262,7 @@ async fn slurp_base_file(
                spcnode: parsed.spcnode,
                dbnode: parsed.dbnode,
                relnode: parsed.relnode,
-                forknum: parsed.forknum as u8,
+                forknum: parsed.forknum,
            },
            blknum,
        };
--- a/pageserver/src/tui.rs
+++ b/pageserver/src/tui.rs
@@ -171,6 +171,11 @@ pub fn ui_main() -> Result<(), Box<dyn Error>> {
        })?;

        // If ther user presses 'q', quit.
+
+        // silence clippy's suggestion to rewrite this as an if-statement. Match
+        // makes more sense as soon as we get another command than 'q'.
+        #[allow(clippy::single_match)]
+        #[allow(clippy::collapsible_match)]
        if let Event::Input(key) = events.next()? {
            match key {
                Key::Char('q') => {
@@ -229,7 +234,7 @@ impl<'a> Widget for LogWidget<'a> {
 // Render a widget to show some metrics
 struct MetricsWidget {}

-fn get_metric_u64(title: &str, value: u64) -> Spans {
+fn _get_metric_u64(title: &str, value: u64) -> Spans {
    Spans::from(vec![
        Span::styled(format!("{:<20}", title), Style::default()),
        Span::raw(": "),
@@ -260,9 +265,11 @@ impl tui::widgets::Widget for MetricsWidget {

        block.render(area, buf);

+        #[allow(unused_mut)]
        let mut lines: Vec<Spans> = Vec::new();

-        let page_cache_stats = crate::page_cache::get_stats();
+        // FIXME
+        //let page_cache_stats = crate::page_cache::get_stats();

        // This is not used since LSNs were removed from page cache stats.
        // Maybe it will be used in the future?
@@ -275,7 +282,7 @@ impl tui::widgets::Widget for MetricsWidget {
        lines.push(get_metric_str("Valid LSN range", &lsnrange));
        lines.push(get_metric_str("Last record LSN", &last_valid_recordlsn_str));
        */
-
+        /*
        lines.push(get_metric_u64(
            "# of cache entries",
            page_cache_stats.num_entries,
@@ -292,7 +299,7 @@ impl tui::widgets::Widget for MetricsWidget {
            "# of GetPage@LSN calls",
            page_cache_stats.num_getpage_requests,
        ));
-
+        */
        let text = Text::from(lines);

        Paragraph::new(text).render(inner_area, buf);
--- a/pageserver/src/waldecoder.rs
+++ b/pageserver/src/waldecoder.rs
@@ -7,8 +7,13 @@ use std::str;
 use thiserror::Error;
 use zenith_utils::lsn::Lsn;

-// FIXME: this is configurable in PostgreSQL, 16 MB is the default
-const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
+pub type Oid = u32;
+pub type TransactionId = u32;
+pub type BlockNumber = u32;
+pub type OffsetNumber = u16;
+pub type MultiXactId = TransactionId;
+pub type MultiXactOffset = u32;
+pub type MultiXactStatus = u32;

 // From PostgreSQL headers

@@ -92,7 +97,7 @@ impl WalStreamDecoder {
    pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
        loop {
            // parse and verify page boundaries as we go
-            if self.lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
+            if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
                // parse long header

                if self.inputbuf.remaining() < SizeOfXLogLongPHD {
@@ -185,7 +190,8 @@ impl WalStreamDecoder {
                    let xlogrec = XLogRecord::from_bytes(&mut buf);
                    if xlogrec.is_xlog_switch_record() {
                        trace!("saw xlog switch record at {}", self.lsn);
-                        self.padlen = self.lsn.calc_padding(WAL_SEGMENT_SIZE) as u32;
+                        self.padlen =
+                            self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
                    } else {
                        // Pad to an 8-byte boundary
                        self.padlen = self.lsn.calc_padding(8u32) as u32;
@@ -258,7 +264,8 @@ pub struct DecodedBkpBlock {
    /* Information on full-page image, if any */
    has_image: bool,       /* has image, even for consistency checking */
    pub apply_image: bool, /* has image that should be restored */
-    pub will_init: bool,
+    pub will_init: bool,   /* record initializes page content */
+    pub will_drop: bool,   /* record drops relation */
    //char	   *bkp_image;
    hole_offset: u16,
    hole_length: u16,
@@ -283,6 +290,7 @@ impl DecodedBkpBlock {
            has_image: false,
            apply_image: false,
            will_init: false,
+            will_drop: false,
            hole_offset: 0,
            hole_length: 0,
            bimg_len: 0,
@@ -306,12 +314,6 @@ pub struct DecodedWALRecord {
    pub main_data_offset: usize,
 }

-pub type Oid = u32;
-pub type BlockNumber = u32;
-
-pub const MAIN_FORKNUM: u8 = 0;
-pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
-
 #[repr(C)]
 #[derive(Debug, Clone, Copy)]
 pub struct RelFileNode {
@@ -320,6 +322,24 @@ pub struct RelFileNode {
    pub relnode: Oid, /* relation */
 }

+#[repr(C)]
+#[derive(Debug)]
+pub struct XlRelmapUpdate {
+    pub dbid: Oid,   /* database OID; 0 denotes the shared map */
+    pub tsid: Oid,   /* tablespace of that database, or pg_global */
+    pub nbytes: i32, /* length of the relmap data */
+}
+
+impl XlRelmapUpdate {
+    pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate {
+        // Consume the three fixed fields from `buf` in declaration order, little-endian.
+        let dbid = buf.get_u32_le();
+        let tsid = buf.get_u32_le();
+        let nbytes = buf.get_i32_le();
+        XlRelmapUpdate { dbid, tsid, nbytes }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug)]
 pub struct XlSmgrTruncate {
@@ -366,6 +386,150 @@ impl XlCreateDatabase {
    }
 }

+#[repr(C)]
+#[derive(Debug)]
+pub struct XlHeapInsert {
+    pub offnum: OffsetNumber, /* 2-byte offset number */
+    pub flags: u8,            /* bit flags; checked against pg_constants::XLH_INSERT_* */
+}
+
+impl XlHeapInsert {
+    pub fn decode(buf: &mut Bytes) -> XlHeapInsert {
+        // Read order matters: little-endian `offnum` first, then the single `flags` byte.
+        let offnum = buf.get_u16_le();
+        let flags = buf.get_u8();
+        XlHeapInsert { offnum, flags }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlHeapMultiInsert {
+    pub flags: u8,    /* bit flags; checked against pg_constants::XLH_INSERT_* */
+    pub ntuples: u16, /* tuple count; sits after one alignment padding byte on the wire */
+}
+
+impl XlHeapMultiInsert {
+    pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert {
+        let flags = buf.get_u8();
+        let _padding = buf.get_u8(); // C struct aligns `ntuples` to 2 bytes; skip the pad byte
+        let ntuples = buf.get_u16_le();
+        XlHeapMultiInsert { flags, ntuples }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlHeapDelete {
+    pub xmax: TransactionId,  /* 4-byte transaction id */
+    pub offnum: OffsetNumber, /* 2-byte offset number */
+    pub infobits_set: u8,     /* one byte of info bits */
+    pub flags: u8,            /* bit flags; checked against XLH_DELETE_ALL_VISIBLE_CLEARED */
+}
+
+impl XlHeapDelete {
+    pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
+        Self {
+            xmax: buf.get_u32_le(),     // bytes 0..4 of the input, little-endian
+            offnum: buf.get_u16_le(),   // bytes 4..6, little-endian
+            infobits_set: buf.get_u8(), // byte 6
+            flags: buf.get_u8(),        // byte 7
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlHeapUpdate {
+    pub old_xmax: TransactionId,  /* 4-byte transaction id */
+    pub old_offnum: OffsetNumber, /* 2-byte offset number */
+    pub old_infobits_set: u8,     /* one byte of info bits */
+    pub flags: u8,                /* bit flags; checked against XLH_UPDATE_*_ALL_VISIBLE_CLEARED */
+    pub new_xmax: TransactionId,  /* 4-byte transaction id */
+    pub new_offnum: OffsetNumber, /* 2-byte offset number */
+}
+
+impl XlHeapUpdate {
+    pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
+        Self {
+            old_xmax: buf.get_u32_le(),     // bytes 0..4 of the input, little-endian
+            old_offnum: buf.get_u16_le(),   // bytes 4..6, little-endian
+            old_infobits_set: buf.get_u8(), // byte 6
+            flags: buf.get_u8(),            // byte 7
+            new_xmax: buf.get_u32_le(),     // bytes 8..12, little-endian
+            new_offnum: buf.get_u16_le(),   // bytes 12..14, little-endian
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct MultiXactMember {
+    pub xid: TransactionId,      /* 4-byte transaction id */
+    pub status: MultiXactStatus, /* 4-byte status value */
+}
+
+impl MultiXactMember {
+    pub fn decode(buf: &mut Bytes) -> MultiXactMember {
+        // One member is two consecutive little-endian u32 values: xid, then status.
+        let xid = buf.get_u32_le();
+        let status = buf.get_u32_le();
+        MultiXactMember { xid, status }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlMultiXactCreate {
+    pub mid: MultiXactId,      /* ID of the new MultiXact */
+    pub moff: MultiXactOffset, /* starting offset in the members file */
+    pub nmembers: u32,         /* how many member XIDs follow */
+    pub members: Vec<MultiXactMember>,
+}
+
+impl XlMultiXactCreate {
+    pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate {
+        // Fixed part: three consecutive little-endian u32 values.
+        let mid = buf.get_u32_le();
+        let moff = buf.get_u32_le();
+        let nmembers = buf.get_u32_le();
+        // Variable part: `nmembers` entries, each decoded from the same buffer in order.
+        let mut members = Vec::new();
+        (0..nmembers).for_each(|_| members.push(MultiXactMember::decode(buf)));
+        XlMultiXactCreate {
+            mid,
+            moff,
+            nmembers,
+            members,
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlMultiXactTruncate {
+    pub oldest_multi_db: Oid, /* fields are pub, like the sibling record structs */
+    /* to-be-truncated range of multixact offsets */
+    pub start_trunc_off: MultiXactId, /* just for completeness' sake */
+    pub end_trunc_off: MultiXactId,
+
+    /* to-be-truncated range of multixact members */
+    pub start_trunc_memb: MultiXactOffset,
+    pub end_trunc_memb: MultiXactOffset,
+}
+
+impl XlMultiXactTruncate {
+    pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate {
+        XlMultiXactTruncate {
+            oldest_multi_db: buf.get_u32_le(), // five little-endian u32 values, in field order
+            start_trunc_off: buf.get_u32_le(),
+            end_trunc_off: buf.get_u32_le(),
+            start_trunc_memb: buf.get_u32_le(),
+            end_trunc_memb: buf.get_u32_le(),
+        }
+    }
+}
+
 //
 // Routines to decode a WAL record and figure out which blocks are modified
 //
@@ -614,30 +778,10 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
        assert_eq!(buf.remaining(), main_data_len as usize);
    }

-    //5. Handle special CLOG and XACT records
-    if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
-        let mut blk = DecodedBkpBlock::new();
-        blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
-        blk.blkno = buf.get_i32_le() as u32;
-        blk.will_init = true;
-        trace!("RM_CLOG_ID updates block {}", blk.blkno);
-        blocks.push(blk);
-    } else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
+    //5. Handle special XACT records
+    if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
        let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
        if info == pg_constants::XLOG_XACT_COMMIT {
-            let mut blk = DecodedBkpBlock::new();
-            blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
-            blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
-            trace!(
-                "XLOG_XACT_COMMIT xl_info {} xl_prev {:X}/{:X}  xid {} updates block {} main_data_len {}",
-                xlogrec.xl_info, (xlogrec.xl_prev >> 32),
-                xlogrec.xl_prev & 0xffffffff,
-                xlogrec.xl_xid,
-                blk.blkno,
-                main_data_len
-            );
-            blocks.push(blk);
-
            //parse commit record to extract subtrans entries
            // xl_xact_commit starts with time of commit
            let _xact_time = buf.get_i64_le();
@@ -652,17 +796,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
                let nsubxacts = buf.get_i32_le();
-                let mut prev_blkno = u32::MAX;
                for _i in 0..nsubxacts {
-                    let subxact = buf.get_u32_le();
-                    let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
-                    if prev_blkno != blkno {
-                        prev_blkno = blkno;
-                        let mut blk = DecodedBkpBlock::new();
-                        blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
-                        blk.blkno = blkno;
-                        blocks.push(blk);
-                    }
+                    let _subxact = buf.get_u32_le();
                }
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
@@ -671,7 +806,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                    let spcnode = buf.get_u32_le();
                    let dbnode = buf.get_u32_le();
                    let relnode = buf.get_u32_le();
-                    //TODO handle this too?
+                    let mut blk = DecodedBkpBlock::new();
+                    blk.forknum = pg_constants::MAIN_FORKNUM;
+                    blk.rnode_spcnode = spcnode;
+                    blk.rnode_dbnode = dbnode;
+                    blk.rnode_relnode = relnode;
+                    blk.will_drop = true;
+                    blocks.push(blk);
                    trace!(
                        "XLOG_XACT_COMMIT relfilenode {}/{}/{}",
                        spcnode,
@@ -693,18 +834,6 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                //TODO handle this to be able to restore pg_twophase on node start
            }
        } else if info == pg_constants::XLOG_XACT_ABORT {
-            let mut blk = DecodedBkpBlock::new();
-            blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
-            blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
-            trace!(
-                "XLOG_XACT_ABORT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
-                xlogrec.xl_info, (xlogrec.xl_prev >> 32),
-                xlogrec.xl_prev & 0xffffffff,
-                xlogrec.xl_xid,
-                blk.blkno,
-                main_data_len
-            );
-            blocks.push(blk);
            //parse abort record to extract subtrans entries
            // xl_xact_abort starts with time of commit
            let _xact_time = buf.get_i64_le();
@@ -719,17 +848,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
                let nsubxacts = buf.get_i32_le();
-                let mut prev_blkno = u32::MAX;
                for _i in 0..nsubxacts {
-                    let subxact = buf.get_u32_le();
-                    let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
-                    if prev_blkno != blkno {
-                        prev_blkno = blkno;
-                        let mut blk = DecodedBkpBlock::new();
-                        blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
-                        blk.blkno = blkno;
-                        blocks.push(blk);
-                    }
+                    let _subxact = buf.get_u32_le();
                }
            }
            if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
@@ -738,7 +858,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                    let spcnode = buf.get_u32_le();
                    let dbnode = buf.get_u32_le();
                    let relnode = buf.get_u32_le();
-                    //TODO save these too
+                    let mut blk = DecodedBkpBlock::new();
+                    blk.forknum = pg_constants::MAIN_FORKNUM;
+                    blk.rnode_spcnode = spcnode;
+                    blk.rnode_dbnode = dbnode;
+                    blk.rnode_relnode = relnode;
+                    blk.will_drop = true;
+                    blocks.push(blk);
                    trace!(
                        "XLOG_XACT_ABORT relfilenode {}/{}/{}",
                        spcnode,
@@ -782,6 +908,79 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
        } else {
            trace!("XLOG_TBLSPC_DROP is not handled yet");
        }
+    } else if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
+        let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
+        let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
+        if info == pg_constants::XLOG_HEAP_INSERT {
+            let xlrec = XlHeapInsert::decode(&mut buf);
+            if (xlrec.flags
+                & (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
+                    | pg_constants::XLH_INSERT_ALL_FROZEN_SET))
+                != 0
+            {
+                let mut blk = DecodedBkpBlock::new();
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blkno;
+                blk.rnode_spcnode = blocks[0].rnode_spcnode;
+                blk.rnode_dbnode = blocks[0].rnode_dbnode;
+                blk.rnode_relnode = blocks[0].rnode_relnode;
+                blocks.push(blk);
+            }
+        } else if info == pg_constants::XLOG_HEAP_DELETE {
+            let xlrec = XlHeapDelete::decode(&mut buf);
+            if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
+                let mut blk = DecodedBkpBlock::new();
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blkno;
+                blk.rnode_spcnode = blocks[0].rnode_spcnode;
+                blk.rnode_dbnode = blocks[0].rnode_dbnode;
+                blk.rnode_relnode = blocks[0].rnode_relnode;
+                blocks.push(blk);
+            }
+        } else if info == pg_constants::XLOG_HEAP_UPDATE
+            || info == pg_constants::XLOG_HEAP_HOT_UPDATE
+        {
+            let xlrec = XlHeapUpdate::decode(&mut buf);
+            if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                let mut blk = DecodedBkpBlock::new();
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blkno;
+                blk.rnode_spcnode = blocks[0].rnode_spcnode;
+                blk.rnode_dbnode = blocks[0].rnode_dbnode;
+                blk.rnode_relnode = blocks[0].rnode_relnode;
+                blocks.push(blk);
+            }
+            if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
+                && blocks.len() > 1
+            {
+                let mut blk = DecodedBkpBlock::new();
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
+                blk.rnode_spcnode = blocks[1].rnode_spcnode;
+                blk.rnode_dbnode = blocks[1].rnode_dbnode;
+                blk.rnode_relnode = blocks[1].rnode_relnode;
+                blocks.push(blk);
+            }
+        }
+    } else if xlogrec.xl_rmid == pg_constants::RM_HEAP2_ID {
+        let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
+        if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
+            let xlrec = XlHeapMultiInsert::decode(&mut buf);
+            if (xlrec.flags
+                & (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
+                    | pg_constants::XLH_INSERT_ALL_FROZEN_SET))
+                != 0
+            {
+                let mut blk = DecodedBkpBlock::new();
+                let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blkno;
+                blk.rnode_spcnode = blocks[0].rnode_spcnode;
+                blk.rnode_dbnode = blocks[0].rnode_dbnode;
+                blk.rnode_relnode = blocks[0].rnode_relnode;
+                blocks.push(blk);
+            }
+        }
    }

    DecodedWALRecord {
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -7,17 +7,20 @@
 //!

 use crate::page_cache;
-use crate::page_cache::{BufferTag, RelTag};
 use crate::waldecoder::*;
 use crate::PageServerConf;
 use crate::ZTimelineId;
-use anyhow::Error;
+use anyhow::{Error, Result};
 use lazy_static::lazy_static;
 use log::*;
+use postgres::fallible_iterator::FallibleIterator;
+use postgres::replication::ReplicationIter;
+use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
 use postgres_ffi::pg_constants;
 use postgres_ffi::xlog_utils::*;
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
+use std::cmp::{max, min};
 use std::collections::HashMap;
 use std::fs;
 use std::fs::{File, OpenOptions};
@@ -27,11 +30,7 @@ use std::str::FromStr;
 use std::sync::Mutex;
 use std::thread;
 use std::thread::sleep;
-use std::time::Duration;
-use tokio::runtime::Runtime;
-use tokio_postgres::replication::{PgTimestamp, ReplicationStream};
-use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow};
-use tokio_stream::StreamExt;
+use std::time::{Duration, SystemTime};
 use zenith_utils::lsn::Lsn;

 //
@@ -48,7 +47,7 @@ lazy_static! {

 // Launch a new WAL receiver, or tell one that's running about change in connection string
 pub fn launch_wal_receiver(
-    conf: &PageServerConf,
+    conf: &'static PageServerConf,
    timelineid: ZTimelineId,
    wal_producer_connstr: &str,
 ) {
@@ -65,11 +64,10 @@ pub fn launch_wal_receiver(
            receivers.insert(timelineid, receiver);

            // Also launch a new thread to handle this connection
-            let conf_copy = conf.clone();
            let _walreceiver_thread = thread::Builder::new()
                .name("WAL receiver thread".into())
                .spawn(move || {
-                    thread_main(&conf_copy, timelineid);
+                    thread_main(conf, timelineid);
                })
                .unwrap();
        }
@@ -90,22 +88,12 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
 //
 // This is the entry point for the WAL receiver thread.
 //
-fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
+fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId) {
    info!(
        "WAL receiver thread started for timeline : '{}'",
        timelineid
    );

-    // We need a tokio runtime to call the rust-postgres copy_both function.
-    // Most functions in the rust-postgres driver have a blocking wrapper,
-    // but copy_both does not (TODO: the copy_both support is still work-in-progress
-    // as of this writing. Check later if that has changed, or implement the
-    // wrapper ourselves in rust-postgres)
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .unwrap();
-
    //
    // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
    // and start streaming WAL from it. If the connection is lost, keep retrying.
@@ -114,7 +102,7 @@ fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
        // Look up the current WAL producer address
        let wal_producer_connstr = get_wal_producer_connstr(timelineid);

-        let res = walreceiver_main(&runtime, conf, timelineid, &wal_producer_connstr);
+        let res = walreceiver_main(conf, timelineid, &wal_producer_connstr);

        if let Err(e) = res {
            info!(
@@ -127,8 +115,7 @@ fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
 }

 fn walreceiver_main(
-    runtime: &Runtime,
-    conf: &PageServerConf,
+    _conf: &PageServerConf,
    timelineid: ZTimelineId,
    wal_producer_connstr: &str,
 ) -> Result<(), Error> {
@@ -136,154 +123,73 @@ fn walreceiver_main(
    info!("connecting to {:?}", wal_producer_connstr);
    let connect_cfg = format!("{} replication=true", wal_producer_connstr);

-    let (rclient, connection) = runtime.block_on(tokio_postgres::connect(&connect_cfg, NoTls))?;
+    let mut rclient = Client::connect(&connect_cfg, NoTls)?;
    info!("connected!");

-    // The connection object performs the actual communication with the database,
-    // so spawn it off to run on its own.
-    runtime.spawn(async move {
-        if let Err(e) = connection.await {
-            error!("connection error: {}", e);
-        }
-    });
-
-    let identify = identify_system(runtime, &rclient)?;
+    let identify = identify_system(&mut rclient)?;
    info!("{:?}", identify);
    let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
    let mut caught_up = false;

-    let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap();
+    let repository = page_cache::get_repository();
+    let timeline = repository.get_timeline(timelineid).unwrap();

    //
    // Start streaming the WAL, from where we left off previously.
    //
-    let mut startpoint = pcache.get_last_valid_lsn();
-    let last_valid_lsn = pcache.get_last_valid_lsn();
+    // If we had previously received WAL up to some point in the middle of a WAL record, we
+    // better start from the end of last full WAL record, not in the middle of one. Hence,
+    // use 'last_record_lsn' rather than 'last_valid_lsn' here.
+    let mut last_rec_lsn = timeline.get_last_record_lsn();
+    let mut startpoint = last_rec_lsn;
+
    if startpoint == Lsn(0) {
-        // If we start here with identify.xlogpos we will have race condition with
-        // postgres start: insert into postgres may request page that was modified with lsn
-        // smaller than identify.xlogpos.
-        //
-        // Current procedure for starting postgres will anyway be changed to something
-        // different like having 'initdb' method on a pageserver (or importing some shared
-        // empty database snapshot), so for now I just put start of first segment which
-        // seems to be a valid record.
-        pcache.init_valid_lsn(Lsn(0x0100_0000));
-        startpoint = Lsn(0x0100_0000);
-    } else {
-        // There might be some padding after the last full record, skip it.
-        //
-        // FIXME: It probably would be better to always start streaming from the beginning
-        // of the page, or the segment, so that we could check the page/segment headers
-        // too. Just for the sake of paranoia.
-        startpoint += startpoint.calc_padding(8u32);
+        error!("No previous WAL position");
    }
+
+    // There might be some padding after the last full record, skip it.
+    //
+    // FIXME: It probably would be better to always start streaming from the beginning
+    // of the page, or the segment, so that we could check the page/segment headers
+    // too. Just for the sake of paranoia.
+    startpoint += startpoint.calc_padding(8u32);
+
    debug!(
-        "last_valid_lsn {} starting replication from {}  for timeline {}, server is at {}...",
-        last_valid_lsn, startpoint, timelineid, end_of_wal
+        "last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
+        last_rec_lsn, startpoint, timelineid, end_of_wal
    );

    let query = format!("START_REPLICATION PHYSICAL {}", startpoint);

-    let copy_stream = runtime.block_on(rclient.copy_both_simple::<bytes::Bytes>(&query))?;
-
-    let physical_stream = ReplicationStream::new(copy_stream);
-    tokio::pin!(physical_stream);
+    let copy_stream = rclient.copy_both_simple(&query)?;
+    let mut physical_stream = ReplicationIter::new(copy_stream);

    let mut waldecoder = WalStreamDecoder::new(startpoint);

-    while let Some(replication_message) = runtime.block_on(physical_stream.next()) {
-        match replication_message? {
+    while let Some(replication_message) = physical_stream.next()? {
+        match replication_message {
            ReplicationMessage::XLogData(xlog_data) => {
                // Pass the WAL data to the decoder, and see if we can decode
                // more records as a result.
                let data = xlog_data.data();
                let startlsn = Lsn::from(xlog_data.wal_start());
                let endlsn = startlsn + data.len() as u64;
+                let prev_last_rec_lsn = last_rec_lsn;

-                write_wal_file(
-                    startlsn,
-                    timelineid,
-                    16 * 1024 * 1024, // FIXME
-                    data,
-                )?;
+                write_wal_file(startlsn, timelineid, pg_constants::WAL_SEGMENT_SIZE, data)?;

                trace!("received XLogData between {} and {}", startlsn, endlsn);

                waldecoder.feed_bytes(data);

-                loop {
-                    if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                        let decoded = decode_wal_record(recdata.clone());
-                        // Put the WAL record to the page cache. We make a separate copy of
-                        // it for every block it modifies. (The actual WAL record is kept in
-                        // a Bytes, which uses a reference counter for the underlying buffer,
-                        // so having multiple copies of it doesn't cost that much)
-                        for blk in decoded.blocks.iter() {
-                            let tag = BufferTag {
-                                rel: RelTag {
-                                    spcnode: blk.rnode_spcnode,
-                                    dbnode: blk.rnode_dbnode,
-                                    relnode: blk.rnode_relnode,
-                                    forknum: blk.forknum as u8,
-                                },
-                                blknum: blk.blkno,
-                            };
+                while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                    let decoded = decode_wal_record(recdata.clone());
+                    timeline.save_decoded_record(decoded, recdata, lsn)?;

-                            let rec = page_cache::WALRecord {
-                                lsn,
-                                will_init: blk.will_init || blk.apply_image,
-                                truncate: false,
-                                rec: recdata.clone(),
-                                main_data_offset: decoded.main_data_offset as u32,
-                            };
-
-                            pcache.put_wal_record(tag, rec);
-                        }
-                        // include truncate wal record in all pages
-                        if decoded.xl_rmid == pg_constants::RM_SMGR_ID
-                            && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                                == pg_constants::XLOG_SMGR_TRUNCATE
-                        {
-                            let truncate = XlSmgrTruncate::decode(&decoded);
-                            if (truncate.flags & SMGR_TRUNCATE_HEAP) != 0 {
-                                let tag = BufferTag {
-                                    rel: RelTag {
-                                        spcnode: truncate.rnode.spcnode,
-                                        dbnode: truncate.rnode.dbnode,
-                                        relnode: truncate.rnode.relnode,
-                                        forknum: MAIN_FORKNUM,
-                                    },
-                                    blknum: truncate.blkno,
-                                };
-                                let rec = page_cache::WALRecord {
-                                    lsn,
-                                    will_init: false,
-                                    truncate: true,
-                                    rec: recdata.clone(),
-                                    main_data_offset: decoded.main_data_offset as u32,
-                                };
-                                pcache.put_rel_wal_record(tag, rec)?;
-                            }
-                        } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
-                            && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                                == pg_constants::XLOG_DBASE_CREATE
-                        {
-                            let createdb = XlCreateDatabase::decode(&decoded);
-                            pcache.create_database(
-                                lsn,
-                                createdb.db_id,
-                                createdb.tablespace_id,
-                                createdb.src_db_id,
-                                createdb.src_tablespace_id,
-                            )?;
-                        }
-                        // Now that this record has been handled, let the page cache know that
-                        // it is up-to-date to this LSN
-                        pcache.advance_last_record_lsn(lsn);
-                    } else {
-                        break;
-                    }
+                    // Now that this record has been handled, let the page cache know that
+                    // it is up-to-date to this LSN
+                    timeline.advance_last_record_lsn(lsn);
+                    last_rec_lsn = lsn;
                }

                // Update the last_valid LSN value in the page cache one more time. We updated
@@ -292,7 +198,34 @@ fn walreceiver_main(
                // better reflect that, because GetPage@LSN requests might also point in the
                // middle of a record, if the request LSN was taken from the server's current
                // flush ptr.
-                pcache.advance_last_valid_lsn(endlsn);
+                timeline.advance_last_valid_lsn(endlsn);
+
+                // Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
+                // "checkpoint" the repository to flush all the changes from WAL we've processed
+                // so far to disk. After this, we don't need the original WAL anymore, and it
+                // can be removed.
+                //
+                // TODO: We don't actually dare to remove the WAL. It's useful for debugging,
+                // and we might need it for logical decoding or other things in the future. Although
+                // we should also be able to fetch it back from the WAL safekeepers or S3 if
+                // needed.
+                if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
+                    != last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
+                {
+                    info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
+                    let (oldest_segno, newest_segno) = find_wal_file_range(
+                        timelineid,
+                        pg_constants::WAL_SEGMENT_SIZE,
+                        last_rec_lsn,
+                    )?;
+
+                    if newest_segno - oldest_segno >= 10 {
+                        timeline.checkpoint()?;
+
+                        // TODO: This is where we could remove WAL older than last_rec_lsn.
+                        //remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
+                    }
+                }

                if !caught_up && endlsn >= end_of_wal {
                    info!("caught up at LSN {}", endlsn);
@@ -306,25 +239,22 @@ fn walreceiver_main(
                let reply_requested: bool = keepalive.reply() != 0;

                trace!(
-                    "received PrimaryKeepAlive(wal_end: {}, timestamp: {} reply: {})",
+                    "received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})",
                    wal_end,
                    timestamp,
                    reply_requested,
                );
                if reply_requested {
                    // TODO: More thought should go into what values are sent here.
-                    let last_lsn = PgLsn::from(u64::from(pcache.get_last_valid_lsn()));
+                    let last_lsn = PgLsn::from(u64::from(timeline.get_last_valid_lsn()));
                    let write_lsn = last_lsn;
                    let flush_lsn = last_lsn;
-                    let apply_lsn = PgLsn::INVALID;
-                    let ts = PgTimestamp::now()?;
+                    let apply_lsn = PgLsn::from(0);
+                    let ts = SystemTime::now();
                    const NO_REPLY: u8 = 0u8;

-                    runtime.block_on(
-                        physical_stream
-                            .as_mut()
-                            .standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY),
-                    )?;
+                    physical_stream
+                        .standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
                }
            }
            _ => (),
@@ -333,6 +263,45 @@ fn walreceiver_main(
    Ok(())
 }

+fn find_wal_file_range(
+    timeline: ZTimelineId,
+    wal_seg_size: usize,
+    written_upto: Lsn,
+) -> Result<(u64, u64)> {
+    let written_upto_segno = written_upto.segment_number(wal_seg_size);
+
+    let mut oldest_segno = written_upto_segno;
+    let mut newest_segno = written_upto_segno;
+    // Scan the wal directory, and count how many WAL files we could remove
+    let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
+    for entry in fs::read_dir(wal_dir)? {
+        let entry = entry?;
+        let path = entry.path();
+
+        if path.is_dir() {
+            continue;
+        }
+
+        let filename = path.file_name().unwrap().to_str().unwrap();
+
+        if IsXLogFileName(filename) {
+            let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
+
+            if segno > written_upto_segno {
+                // that's strange.
+                warn!("there is a WAL file from future at {}", path.display());
+                continue;
+            }
+
+            oldest_segno = min(oldest_segno, segno);
+            newest_segno = max(newest_segno, segno);
+        }
+    }
+    // FIXME: would be good to assert that there are no gaps in the WAL files
+
+    Ok((oldest_segno, newest_segno))
+}
+
 /// Data returned from the postgres `IDENTIFY_SYSTEM` command
 ///
 /// See the [postgres docs] for more details.
@@ -353,12 +322,9 @@ pub struct IdentifySystem {
 pub struct IdentifyError;

 /// Run the postgres `IDENTIFY_SYSTEM` command
-pub fn identify_system(
-    runtime: &Runtime,
-    client: &tokio_postgres::Client,
-) -> Result<IdentifySystem, Error> {
+pub fn identify_system(client: &mut Client) -> Result<IdentifySystem, Error> {
    let query_str = "IDENTIFY_SYSTEM";
-    let response = runtime.block_on(client.simple_query(query_str))?;
+    let response = client.simple_query(query_str)?;

    // get(N) from row, then parse it as some destination type.
    fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
@@ -379,7 +345,7 @@ pub fn identify_system(
            dbname: get_parse(first_row, 3).ok(),
        })
    } else {
-        Err(IdentifyError)?
+        Err(IdentifyError.into())
    }
 }

@@ -398,7 +364,7 @@ fn write_wal_file(
    let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));

    /* Extract WAL location for this block */
-    let mut xlogoff = start_pos.segment_offset(wal_seg_size as u64) as usize;
+    let mut xlogoff = start_pos.segment_offset(wal_seg_size);

    while bytes_left != 0 {
        let bytes_to_write;
@@ -414,7 +380,7 @@ fn write_wal_file(
        }

        /* Open file */
-        let segno = start_pos.segment_number(wal_seg_size as u64);
+        let segno = start_pos.segment_number(wal_seg_size);
        let wal_file_name = XLogFileName(
            1, // FIXME: always use Postgres timeline 1
            segno,
@@ -466,7 +432,7 @@ fn write_wal_file(
        xlogoff += bytes_to_write;

        /* Did we reach the end of a WAL segment? */
-        if start_pos.segment_offset(wal_seg_size as u64) == 0 {
+        if start_pos.segment_offset(wal_seg_size) == 0 {
            xlogoff = 0;
            if partial {
                fs::rename(&wal_file_partial_path, &wal_file_path)?;
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -1,28 +1,27 @@
-//
-// WAL redo
-//
-// We rely on Postgres to perform WAL redo for us. We launch a
-// postgres process in special "wal redo" mode that's similar to
-// single-user mode. We then pass the the previous page image, if any,
-// and all the WAL records we want to apply, to the postgress
-// process. Then we get the page image back. Communication with the
-// postgres process happens via stdin/stdout
-//
-// See src/backend/tcop/zenith_wal_redo.c for the other side of
-// this communication.
-//
-// TODO: Even though the postgres code runs in a separate process,
-// it's not a secure sandbox.
-//
-use bytes::{Buf, BufMut, Bytes, BytesMut};
+//!
+//! WAL redo
+//!
+//! We rely on Postgres to perform WAL redo for us. We launch a
+//! postgres process in special "wal redo" mode that's similar to
+//! single-user mode. We then pass the previous page image, if any,
+//! and all the WAL records we want to apply, to the postgres
+//! process. Then we get the page image back. Communication with the
+//! postgres process happens via stdin/stdout
+//!
+//! See src/backend/tcop/zenith_wal_redo.c for the other side of
+//! this communication.
+//!
+//! TODO: Even though the postgres code runs in a separate process,
+//! it's not a secure sandbox.
+//!
+use bytes::{BufMut, Bytes, BytesMut};
 use log::*;
-use std::assert;
 use std::cell::RefCell;
 use std::fs;
 use std::fs::OpenOptions;
 use std::io::prelude::*;
 use std::io::Error;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::process::Stdio;
 use std::sync::mpsc;
 use std::sync::Mutex;
@@ -32,30 +31,65 @@ use tokio::io::AsyncBufReadExt;
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
 use tokio::process::{ChildStdin, ChildStdout, Command};
 use tokio::time::timeout;
+use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;

-use crate::page_cache::BufferTag;
-use crate::page_cache::WALRecord;
+use crate::repository::BufferTag;
+use crate::repository::WALRecord;
 use crate::PageServerConf;
-use crate::ZTimelineId;
-use postgres_ffi::pg_constants;
-use postgres_ffi::xlog_utils::XLogRecord;
+
+///
+/// WAL Redo Manager is responsible for replaying WAL records.
+///
+/// Callers use the WAL redo manager through this abstract interface,
+/// which makes it easy to mock it in tests.
+pub trait WalRedoManager: Send + Sync {
+    /// Apply some WAL records.
+    ///
+    /// The caller passes an old page image, and WAL records that should be
+    /// applied over it. The return value is a new page image, after applying
+    /// the records.
+    fn request_redo(
+        &self,
+        tag: BufferTag,
+        lsn: Lsn,
+        base_img: Option<Bytes>,
+        records: Vec<WALRecord>,
+    ) -> Result<Bytes, WalRedoError>;
+}
+
+///
+/// A dummy WAL Redo Manager implementation that doesn't allow replaying
+/// anything. Currently used during bootstrapping (zenith init), to create
+/// a Repository object without launching the real WAL redo process.
+///
+pub struct DummyRedoManager {}
+impl crate::walredo::WalRedoManager for DummyRedoManager {
+    fn request_redo(
+        &self,
+        _tag: BufferTag,
+        _lsn: Lsn,
+        _base_img: Option<Bytes>,
+        _records: Vec<WALRecord>,
+    ) -> Result<Bytes, WalRedoError> {
+        Err(WalRedoError::InvalidState)
+    }
+}

 static TIMEOUT: Duration = Duration::from_secs(20);

 ///
-/// A WAL redo manager consists of two parts: WalRedoManager, and
-/// WalRedoManagerInternal. WalRedoManager is the public struct
+/// The implementation consists of two parts: PostgresRedoManager, and
+/// PostgresRedoManagerInternal. PostgresRedoManager is the public struct
 /// that can be used to send redo requests to the manager.
-/// WalRedoManagerInternal is used by the manager thread itself.
+/// PostgresRedoManagerInternal is used by the manager thread itself.
 ///
-pub struct WalRedoManager {
+pub struct PostgresRedoManager {
    request_tx: Mutex<mpsc::Sender<WalRedoRequest>>,
 }

-struct WalRedoManagerInternal {
-    _conf: PageServerConf,
-    timelineid: ZTimelineId,
+struct PostgresRedoManagerInternal {
+    conf: &'static PageServerConf,

    request_rx: mpsc::Receiver<WalRedoRequest>,
 }
@@ -76,18 +110,20 @@ struct WalRedoRequest {
 pub enum WalRedoError {
    #[error(transparent)]
    IoError(#[from] std::io::Error),
+
+    #[error("cannot perform WAL redo now")]
+    InvalidState,
 }

 ///
 /// Public interface of WAL redo manager
 ///
-impl WalRedoManager {
+impl PostgresRedoManager {
    ///
-    /// Create a new WalRedoManager.
+    /// Create a new PostgresRedoManager.
    ///
-    /// This only initializes the struct. You need to call WalRedoManager::launch to
-    /// start the thread that processes the requests.
-    pub fn new(conf: &PageServerConf, timelineid: ZTimelineId) -> WalRedoManager {
+    /// This launches a new thread to handle the requests.
+    pub fn new(conf: &'static PageServerConf) -> PostgresRedoManager {
        let (tx, rx) = mpsc::channel();

        //
@@ -96,32 +132,31 @@ impl WalRedoManager {
        // Get mutable references to the values that we need to pass to the
        // thread.
        let request_rx = rx;
-        let conf_copy = conf.clone();

        // Currently, the join handle is not saved anywhere and we
        // won't try restart the thread if it dies.
        let _walredo_thread = std::thread::Builder::new()
            .name("WAL redo thread".into())
            .spawn(move || {
-                let mut internal = WalRedoManagerInternal {
-                    _conf: conf_copy,
-                    timelineid,
-                    request_rx,
-                };
+                let mut internal = PostgresRedoManagerInternal { conf, request_rx };
                internal.wal_redo_main();
            })
            .unwrap();

-        WalRedoManager {
+        PostgresRedoManager {
            request_tx: Mutex::new(tx),
        }
    }
+}

+impl WalRedoManager for PostgresRedoManager {
    ///
-    /// Request the WAL redo manager to apply WAL records, to reconstruct the page image
-    /// of the given page version.
+    /// Request the WAL redo manager to apply some WAL records
    ///
-    pub fn request_redo(
+    /// The WAL redo is handled by a separate thread, so this just sends a request
+    /// to the thread and waits for response.
+    ///
+    fn request_redo(
        &self,
        tag: BufferTag,
        lsn: Lsn,
@@ -153,12 +188,12 @@ impl WalRedoManager {
 ///
 /// WAL redo thread
 ///
-impl WalRedoManagerInternal {
+impl PostgresRedoManagerInternal {
    //
    // Main entry point for the WAL applicator thread.
    //
    fn wal_redo_main(&mut self) {
-        info!("WAL redo thread started {}", self.timelineid);
+        info!("WAL redo thread started");

        // We block on waiting for requests on the walredo request channel, but
        // use async I/O to communicate with the child process. Initialize the
@@ -168,17 +203,25 @@ impl WalRedoManagerInternal {
            .build()
            .unwrap();

-        let process: WalRedoProcess;
-        let datadir = format!("wal-redo/{}", self.timelineid);
+        let process: PostgresRedoProcess;

-        info!("launching WAL redo postgres process {}", self.timelineid);
+        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
+        // just create one with constant name. That fails if you try to launch more than
+        // one WAL redo manager concurrently.
+        let datadir = self.conf.workdir.join("wal-redo-datadir");

-        process = runtime.block_on(WalRedoProcess::launch(&datadir)).unwrap();
-        info!("WAL redo postgres started");
+        info!("launching WAL redo postgres process");
+
+        process = runtime
+            .block_on(PostgresRedoProcess::launch(&datadir))
+            .unwrap();

        // Loop forever, handling requests as they come.
        loop {
-            let request = self.request_rx.recv().unwrap();
+            let request = self
+                .request_rx
+                .recv()
+                .expect("WAL redo request channel was closed");

            let result = runtime.block_on(self.handle_apply_request(&process, &request));
            let result_ok = result.is_ok();
@@ -187,33 +230,17 @@ impl WalRedoManagerInternal {
            let _ = request.response_channel.send(result);

            if !result_ok {
-                error!("wal-redo-postgres filed to apply request {:?}", request);
+                error!("wal-redo-postgres failed to apply request {:?}", request);
            }
        }
    }

-    fn transaction_id_set_status_bit(&self, xid: u32, status: u8, page: &mut BytesMut) {
-        trace!(
-            "handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
-            status
-        );
-
-        let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
-            / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
-
-        let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE)
-            * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
-
-        page[byteno] =
-            (page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
-    }
-
    ///
    /// Process one request for WAL redo.
    ///
    async fn handle_apply_request(
        &self,
-        process: &WalRedoProcess,
+        process: &PostgresRedoProcess,
        request: &WalRedoRequest,
    ) -> Result<Bytes, WalRedoError> {
        let tag = request.tag;
@@ -226,102 +253,7 @@ impl WalRedoManagerInternal {
        let start = Instant::now();

        let apply_result: Result<Bytes, Error>;
-        if tag.rel.forknum == pg_constants::PG_XACT_FORKNUM as u8 {
-            const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
-            let mut page = BytesMut::new();
-            if let Some(fpi) = base_img {
-                page.extend_from_slice(&fpi[..]);
-            } else {
-                page.extend_from_slice(&ZERO_PAGE);
-            }
-            for record in records {
-                let mut buf = record.rec.clone();
-
-                // 1. Parse XLogRecord struct
-                // FIXME: refactor to avoid code duplication.
-                let xlogrec = XLogRecord::from_bytes(&mut buf);
-
-                //move to main data
-                // TODO probably, we should store some records in our special format
-                // to avoid this weird parsing on replay
-                let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
-                if buf.remaining() > skip {
-                    buf.advance(skip);
-                }
-
-                if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
-                    let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
-                    if info == pg_constants::CLOG_ZEROPAGE {
-                        page.clone_from_slice(&ZERO_PAGE);
-                    }
-                } else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
-                    let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
-                    let mut status = 0;
-                    if info == pg_constants::XLOG_XACT_COMMIT {
-                        status = pg_constants::TRANSACTION_STATUS_COMMITTED;
-                        self.transaction_id_set_status_bit(xlogrec.xl_xid, status, &mut page);
-                        //handle subtrans
-                        let _xact_time = buf.get_i64_le();
-                        let mut xinfo = 0;
-                        if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
-                            xinfo = buf.get_u32_le();
-                            if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
-                                let _dbid = buf.get_u32_le();
-                                let _tsid = buf.get_u32_le();
-                            }
-                        }
-
-                        if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
-                            let nsubxacts = buf.get_i32_le();
-                            for _i in 0..nsubxacts {
-                                let subxact = buf.get_u32_le();
-                                let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                                // only update xids on the requested page
-                                if tag.blknum == blkno {
-                                    status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
-                                    self.transaction_id_set_status_bit(subxact, status, &mut page);
-                                }
-                            }
-                        }
-                    } else if info == pg_constants::XLOG_XACT_ABORT {
-                        status = pg_constants::TRANSACTION_STATUS_ABORTED;
-                        self.transaction_id_set_status_bit(xlogrec.xl_xid, status, &mut page);
-                        //handle subtrans
-                        let _xact_time = buf.get_i64_le();
-                        let mut xinfo = 0;
-                        if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
-                            xinfo = buf.get_u32_le();
-                            if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
-                                let _dbid = buf.get_u32_le();
-                                let _tsid = buf.get_u32_le();
-                            }
-                        }
-
-                        if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
-                            let nsubxacts = buf.get_i32_le();
-                            for _i in 0..nsubxacts {
-                                let subxact = buf.get_u32_le();
-                                let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                                // only update xids on the requested page
-                                if tag.blknum == blkno {
-                                    status = pg_constants::TRANSACTION_STATUS_ABORTED;
-                                    self.transaction_id_set_status_bit(subxact, status, &mut page);
-                                }
-                            }
-                        }
-                    } else {
-                        trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
-                               status,
-                               record.lsn,
-                               record.main_data_offset, record.rec.len());
-                    }
-                }
-            }
-
-            apply_result = Ok::<Bytes, Error>(page.freeze());
-        } else {
-            apply_result = process.apply_wal_records(tag, base_img, records).await;
-        }
+        apply_result = process.apply_wal_records(tag, base_img, records).await;

        let duration = start.elapsed();

@@ -348,23 +280,31 @@ impl WalRedoManagerInternal {
    }
 }

-struct WalRedoProcess {
+struct PostgresRedoProcess {
    stdin: RefCell<ChildStdin>,
    stdout: RefCell<ChildStdout>,
 }

-impl WalRedoProcess {
+impl PostgresRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
    // Tests who run pageserver binary are setting proper PG_BIN_DIR
-    // and PG_LIB_DIR so that WalRedo would start right postgres. We may later
+    // and PG_LIB_DIR so that WalRedo would start right postgres.
+
+    // TODO: We may later
    // switch to setting same things in pageserver config file.
-    async fn launch(datadir: &str) -> Result<WalRedoProcess, Error> {
-        // Create empty data directory for wal-redo postgres deleting old one.
-        fs::remove_dir_all(datadir).ok();
+    async fn launch(datadir: &Path) -> Result<PostgresRedoProcess, Error> {
+        // Create empty data directory for wal-redo postgres, deleting old one first.
+        if datadir.exists() {
+            info!("directory {:?} exists, removing", &datadir);
+            if let Err(e) = fs::remove_dir_all(&datadir) {
+                error!("could not remove old wal-redo-datadir: {:?}", e);
+            }
+        }
+        info!("running initdb in {:?}", datadir.display());
        let initdb = Command::new("initdb")
-            .args(&["-D", datadir])
+            .args(&["-D", datadir.to_str().unwrap()])
            .arg("-N")
            .output()
            .await
@@ -383,6 +323,8 @@ impl WalRedoProcess {
                .open(PathBuf::from(&datadir).join("postgresql.conf"))?;
            config.write_all(b"shared_buffers=128kB\n")?;
            config.write_all(b"fsync=off\n")?;
+            config.write_all(b"shared_preload_libraries=zenith\n")?;
+            config.write_all(b"zenith.wal_redo=on\n")?;
        }
        // Start postgres itself
        let mut child = Command::new("postgres")
@@ -394,7 +336,10 @@ impl WalRedoProcess {
            .spawn()
            .expect("postgres --wal-redo command failed to start");

-        info!("launched WAL redo postgres process on {}", datadir);
+        info!(
+            "launched WAL redo postgres process on {:?}",
+            datadir.display()
+        );

        let stdin = child.stdin.take().expect("failed to open child's stdin");
        let stderr = child.stderr.take().expect("failed to open child's stderr");
@@ -421,7 +366,7 @@ impl WalRedoProcess {
        };
        tokio::spawn(f_stderr);

-        Ok(WalRedoProcess {
+        Ok(PostgresRedoProcess {
            stdin: RefCell::new(stdin),
            stdout: RefCell::new(stdout),
        })
@@ -435,7 +380,7 @@ impl WalRedoProcess {
        &self,
        tag: BufferTag,
        base_img: Option<Bytes>,
-        records: &Vec<WALRecord>,
+        records: &[WALRecord],
    ) -> Result<Bytes, std::io::Error> {
        let mut stdin = self.stdin.borrow_mut();
        let mut stdout = self.stdout.borrow_mut();
@@ -493,7 +438,7 @@ impl WalRedoProcess {
            Ok::<[u8; 8192], Error>(buf)
        };

-        let res = futures::try_join!(f_stdout, f_stdin)?;
+        let res = tokio::try_join!(f_stdout, f_stdin)?;

        let buf = res.0;

@@ -506,14 +451,31 @@ impl WalRedoProcess {
 // explanation of the protocol.

 fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
-    let len = 4 + 5 * 4;
+    let len = 4 + 1 + 4 * 4;
    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'B');
    buf.put_u32(len as u32);
-    tag.pack(&mut buf);

-    assert!(buf.len() == 1 + len);
+    // FIXME: this is a temporary hack that should go away when we refactor
+    // the postgres protocol serialization + handlers.
+    //
+    // BytesMut is a dynamic growable buffer, used a lot in tokio code but
+    // not in the std library. To write to a BytesMut from a serde serializer,
+    // we need to either:
+    // - pre-allocate the required buffer space. This is annoying because we
+    //   shouldn't care what the exact serialized size is-- that's the
+    //   serializer's job.
+    // - Or, we need to create a temporary "writer" (which implements the
+    //   `Write` trait). It's a bit awkward, because the writer consumes the
+    //   underlying BytesMut, and we need to extract it later with
+    //   `into_inner`.
+    let mut writer = buf.writer();
+    tag.ser_into(&mut writer)
+        .expect("serialize BufferTag should always succeed");
+    let buf = writer.into_inner();
+
+    debug_assert!(buf.len() == 1 + len);

    buf.freeze()
 }
@@ -521,15 +483,18 @@ fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
 fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
    assert!(base_img.len() == 8192);

-    let len = 4 + 5 * 4 + base_img.len();
+    let len = 4 + 1 + 4 * 4 + base_img.len();
    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'P');
    buf.put_u32(len as u32);
-    tag.pack(&mut buf);
+    let mut writer = buf.writer();
+    tag.ser_into(&mut writer)
+        .expect("serialize BufferTag should always succeed");
+    let mut buf = writer.into_inner();
    buf.put(base_img);

-    assert!(buf.len() == 1 + len);
+    debug_assert!(buf.len() == 1 + len);

    buf.freeze()
 }
@@ -543,20 +508,23 @@ fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
    buf.put_u64(endlsn.0);
    buf.put(rec);

-    assert!(buf.len() == 1 + len);
+    debug_assert!(buf.len() == 1 + len);

    buf.freeze()
 }

 fn build_get_page_msg(tag: BufferTag) -> Bytes {
-    let len = 4 + 5 * 4;
+    let len = 4 + 1 + 4 * 4;
    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'G');
    buf.put_u32(len as u32);
-    tag.pack(&mut buf);
+    let mut writer = buf.writer();
+    tag.ser_into(&mut writer)
+        .expect("serialize BufferTag should always succeed");
+    let buf = writer.into_inner();

-    assert!(buf.len() == 1 + len);
+    debug_assert!(buf.len() == 1 + len);

    buf.freeze()
 }
--- a/postgres_ffi/Cargo.toml
+++ b/postgres_ffi/Cargo.toml
@@ -9,12 +9,16 @@ edition = "2018"
 [dependencies]
 chrono = "0.4.19"
 rand = "0.8.3"
+regex = "1.4.5"
 bytes = "1.0.1"
 byteorder = "1.4.3"
 anyhow = "1.0"
 crc32c = "0.6.0"
 hex = "0.4.3"
+lazy_static = "1.4"
 log = "0.4.14"
+thiserror = "1.0"
+workspace_hack = { path = "../workspace_hack" }

 [build-dependencies]
 bindgen = "0.57"
--- a/postgres_ffi/src/lib.rs
+++ b/postgres_ffi/src/lib.rs
@@ -4,6 +4,7 @@
 include!(concat!(env!("OUT_DIR"), "/bindings.rs"));

 pub mod pg_constants;
+pub mod relfile_utils;
 pub mod xlog_utils;

 use bytes::{Buf, Bytes, BytesMut};
--- a/postgres_ffi/src/nonrelfile_utils.rs
+++ b/postgres_ffi/src/nonrelfile_utils.rs
@@ -0,0 +1,32 @@
+//!
+//! Common utilities for dealing with PostgreSQL non-relation files.
+//!
+use crate::pg_constants;
+use bytes::BytesMut;
+use log::*;
+
+pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
+    trace!(
+        "handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
+        status
+    );
+
+    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
+        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+
+    let bshift: u8 =
+        ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
+
+    page[byteno] =
+        (page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
+}
+
+pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
+    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
+        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+
+    let bshift: u8 =
+        ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
+
+    ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8
+}
--- a/postgres_ffi/src/pg_constants.rs
+++ b/postgres_ffi/src/pg_constants.rs
@@ -1,37 +1,44 @@
+//!
+//! Misc constants, copied from PostgreSQL headers.
+//!
+
+//
 // From pg_tablespace_d.h
 //
 pub const DEFAULTTABLESPACE_OID: u32 = 1663;
 pub const GLOBALTABLESPACE_OID: u32 = 1664;
-//Special values for non-rel files' tags
-//TODO maybe use enum?
-pub const PG_CONTROLFILE_FORKNUM: u32 = 42;
-pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
-pub const PG_XACT_FORKNUM: u32 = 44;
-pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
-pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;

 //
-// constants from clog.h
+// Fork numbers, from relpath.h
 //
-pub const CLOG_XACTS_PER_BYTE: u32 = 4;
-pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
-pub const CLOG_BITS_PER_XACT: u8 = 2;
-pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
+pub const MAIN_FORKNUM: u8 = 0;
+pub const FSM_FORKNUM: u8 = 1;
+pub const VISIBILITYMAP_FORKNUM: u8 = 2;
+pub const INIT_FORKNUM: u8 = 3;
+
+pub const ROCKSDB_SPECIAL_FORKNUM: u8 = 50;
+
+// From storage_xlog.h
+pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
+pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
+pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
+
+//
+// Constants from visibilitymap.h
+//
+pub const SIZE_OF_PAGE_HEADER: u16 = 24;
+pub const BITS_PER_HEAPBLOCK: u16 = 2;
+pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;

 pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
 pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
 pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;

-pub const CLOG_ZEROPAGE: u8 = 0x00;
-pub const CLOG_TRUNCATE: u8 = 0x10;
-
 // From xact.h
 pub const XLOG_XACT_COMMIT: u8 = 0x00;
+pub const XLOG_XACT_PREPARE: u8 = 0x10;
 pub const XLOG_XACT_ABORT: u8 = 0x20;

-// From srlu.h
-pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
-
 /* mask for filtering opcodes out of xl_info */
 pub const XLOG_XACT_OPMASK: u8 = 0x70;
 /* does this record have a 'xinfo' field or not */
@@ -53,13 +60,31 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
 // From pg_control.h and rmgrlist.h
 pub const XLOG_SWITCH: u8 = 0x40;
 pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
+
+// From heapam_xlog.h
+pub const XLOG_HEAP_INSERT: u8 = 0x00;
+pub const XLOG_HEAP_DELETE: u8 = 0x10;
+pub const XLOG_HEAP_UPDATE: u8 = 0x20;
+pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
+pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
+pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
+pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
+pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
+pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
+pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
+pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
+
 pub const RM_XLOG_ID: u8 = 0;
 pub const RM_XACT_ID: u8 = 1;
 pub const RM_SMGR_ID: u8 = 2;
 pub const RM_CLOG_ID: u8 = 3;
 pub const RM_DBASE_ID: u8 = 4;
 pub const RM_TBLSPC_ID: u8 = 5;
-// pub const RM_MULTIXACT_ID:u8 = 6;
+pub const RM_MULTIXACT_ID: u8 = 6;
+pub const RM_RELMAP_ID: u8 = 7;
+pub const RM_STANDBY_ID: u8 = 8;
+pub const RM_HEAP2_ID: u8 = 9;
+pub const RM_HEAP_ID: u8 = 10;

 // from xlogreader.h
 pub const XLR_INFO_MASK: u8 = 0x0F;
@@ -74,8 +99,10 @@ pub const XLOG_TBLSPC_DROP: u8 = 0x10;

 pub const SIZEOF_XLOGRECORD: u32 = 24;

-// FIXME:
+// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
+// --with-segsize=SEGSIZE, but assume the defaults for now.
 pub const BLCKSZ: u16 = 8192;
+pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);

 //
 // from xlogrecord.h
@@ -98,3 +125,6 @@ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous
 pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
 pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
 pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
+
+/* FIXME: pageserver should request wal_seg_size from compute node */
+pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
--- a/postgres_ffi/src/relfile_utils.rs
+++ b/postgres_ffi/src/relfile_utils.rs
@@ -0,0 +1,141 @@
+//!
+//! Common utilities for dealing with PostgreSQL relation files.
+//!
+use crate::pg_constants;
+use lazy_static::lazy_static;
+use regex::Regex;
+
+#[derive(Debug, Clone, thiserror::Error, PartialEq)]
+pub enum FilePathError {
+    #[error("invalid relation fork name")]
+    InvalidForkName,
+    #[error("invalid relation data file name")]
+    InvalidFileName,
+}
+
+impl From<core::num::ParseIntError> for FilePathError {
+    fn from(_e: core::num::ParseIntError) -> Self {
+        FilePathError::InvalidFileName
+    }
+}
+
+/// Convert Postgres relation file's fork suffix to fork number.
+pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
+    match forkname {
+        // "main" is not in filenames, it's implicit if the fork name is not present
+        None => Ok(pg_constants::MAIN_FORKNUM),
+        Some("fsm") => Ok(pg_constants::FSM_FORKNUM),
+        Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM),
+        Some("init") => Ok(pg_constants::INIT_FORKNUM),
+        Some(_) => Err(FilePathError::InvalidForkName),
+    }
+}
+
+/// Convert Postgres fork number to the right suffix of the relation data file.
+pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
+    match forknum {
+        pg_constants::MAIN_FORKNUM => None,
+        pg_constants::FSM_FORKNUM => Some("fsm"),
+        pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"),
+        pg_constants::INIT_FORKNUM => Some("init"),
+        _ => Some("UNKNOWN FORKNUM"),
+    }
+}
+
+///
+/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
+///
+/// Formats:
+/// <oid>
+/// <oid>_<fork name>
+/// <oid>.<segment number>
+/// <oid>_<fork name>.<segment number>
+///
+/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
+///
+pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
+    lazy_static! {
+        static ref RELFILE_RE: Regex =
+            Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
+    }
+    let caps = RELFILE_RE
+        .captures(fname)
+        .ok_or(FilePathError::InvalidFileName)?;
+
+    let relnode_str = caps.name("relnode").unwrap().as_str();
+    let relnode = relnode_str.parse::<u32>()?;
+
+    let forkname = caps.name("forkname").map(|f| f.as_str());
+    let forknum = forkname_to_number(forkname)?;
+
+    let segno_match = caps.name("segno");
+    let segno = if segno_match.is_none() {
+        0
+    } else {
+        segno_match.unwrap().as_str().parse::<u32>()?
+    };
+
+    Ok((relnode, forknum, segno))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_valid_relfilenames() {
+        assert_eq!(parse_relfilename("1234"), Ok((1234, 0, 0)));
+        assert_eq!(parse_relfilename("1234_fsm"), Ok((1234, 1, 0)));
+        assert_eq!(parse_relfilename("1234_vm"), Ok((1234, 2, 0)));
+        assert_eq!(parse_relfilename("1234_init"), Ok((1234, 3, 0)));
+
+        assert_eq!(parse_relfilename("1234.12"), Ok((1234, 0, 12)));
+        assert_eq!(parse_relfilename("1234_fsm.12"), Ok((1234, 1, 12)));
+        assert_eq!(parse_relfilename("1234_vm.12"), Ok((1234, 2, 12)));
+        assert_eq!(parse_relfilename("1234_init.12"), Ok((1234, 3, 12)));
+
+        // relfilenode is unsigned, so it can go up to 2^32-1
+        assert_eq!(parse_relfilename("3147483648"), Ok((3147483648, 0, 0)));
+    }
+
+    #[test]
+    fn test_parse_invalid_relfilenames() {
+        assert_eq!(
+            parse_relfilename("foo"),
+            Err(FilePathError::InvalidFileName)
+        );
+        assert_eq!(
+            parse_relfilename("1.2.3"),
+            Err(FilePathError::InvalidFileName)
+        );
+        assert_eq!(
+            parse_relfilename("1234_invalid"),
+            Err(FilePathError::InvalidForkName)
+        );
+        assert_eq!(
+            parse_relfilename("1234_"),
+            Err(FilePathError::InvalidFileName)
+        );
+
+        // too large for u32
+        assert_eq!(
+            parse_relfilename("12345678901"),
+            Err(FilePathError::InvalidFileName)
+        );
+        assert_eq!(
+            parse_relfilename("-1234"),
+            Err(FilePathError::InvalidFileName)
+        );
+    }
+
+    #[test]
+    fn test_parse_weird_relfilenames() {
+        // we accept 0 for the relfilenode, but PostgreSQL should never do that.
+        assert_eq!(parse_relfilename("0"), Ok((0, 0, 0)));
+
+        // PostgreSQL has a limit of 2^32-2 blocks in a table. With 8k block size and
+        // 1 GB segments, the max segment number is 32767. But we accept larger values
+        // currently.
+        assert_eq!(parse_relfilename("1.123456"), Ok((1, 0, 123456)));
+    }
+}
--- a/postgres_ffi/src/xlog_utils.rs
+++ b/postgres_ffi/src/xlog_utils.rs
@@ -27,26 +27,18 @@ pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4;
 pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4;
 pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
 pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4;
+pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
+
 pub type XLogRecPtr = u64;
 pub type TimeLineID = u32;
 pub type TimestampTz = u64;
 pub type XLogSegNo = u64;

-#[allow(non_snake_case)]
-pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
-    (xlogptr as u32) & (wal_segsz_bytes as u32 - 1)
-}
-
 #[allow(non_snake_case)]
 pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
    (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
 }

-#[allow(non_snake_case)]
-pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
-    xlogptr / wal_segsz_bytes as u64
-}
-
 #[allow(non_snake_case)]
 pub fn XLogSegNoOffsetToRecPtr(
    segno: XLogSegNo,
@@ -207,33 +199,31 @@ pub fn find_end_of_wal(
    let mut high_tli: TimeLineID = 0;
    let mut high_ispartial = false;

-    for entry in fs::read_dir(data_dir).unwrap() {
-        if let Ok(entry) = entry {
-            let ispartial: bool;
-            let entry_name = entry.file_name();
-            let fname = entry_name.to_str().unwrap();
-            /*
-             * Check if the filename looks like an xlog file, or a .partial file.
-             */
-            if IsXLogFileName(fname) {
-                ispartial = false;
-            } else if IsPartialXLogFileName(fname) {
-                ispartial = true;
-            } else {
-                continue;
-            }
-            let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
-            if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
-                continue;
-            }
-            if segno > high_segno
-                || (segno == high_segno && tli > high_tli)
-                || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
-            {
-                high_segno = segno;
-                high_tli = tli;
-                high_ispartial = ispartial;
-            }
+    for entry in fs::read_dir(data_dir).unwrap().flatten() {
+        let ispartial: bool;
+        let entry_name = entry.file_name();
+        let fname = entry_name.to_str().unwrap();
+        /*
+         * Check if the filename looks like an xlog file, or a .partial file.
+         */
+        if IsXLogFileName(fname) {
+            ispartial = false;
+        } else if IsPartialXLogFileName(fname) {
+            ispartial = true;
+        } else {
+            continue;
+        }
+        let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
+        if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
+            continue;
+        }
+        if segno > high_segno
+            || (segno == high_segno && tli > high_tli)
+            || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
+        {
+            high_segno = segno;
+            high_tli = tli;
+            high_ispartial = ispartial;
        }
    }
    if high_segno > 0 {
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -0,0 +1,92 @@
+## Zenith test runner
+
+This directory contains integration tests.
+
+Prerequisites:
+- Python 3.6 or later
+- Python packages: pytest, psycopg2
+    - pytest 6.0 is required.
+    - __NOTE: `apt install` on Debian/Ubuntu won't work.__
+      They ship a much older version of pytest (and sometimes rename it to
+      `pytest-3`.)
+    - Install using something like this:
+        - `pip3 install pytest psycopg2` (Debian or Ubuntu)
+- Zenith and Postgres binaries
+    - See the root README.md for build directions
+    - Tests can be run from the git tree; or see the environment variables
+      below to run from other directories.
+- The zenith git repo, including the postgres submodule
+  (for some tests, e.g. pg_regress)
+
+### Test Organization
+
+The tests are divided into a few batches, such that each batch takes roughly
+the same amount of time. The batches can be run in parallel, to minimize total
+runtime. Currently, there are only two batches:
+
+- test_batch_pg_regress: Runs PostgreSQL regression tests
+- test_others: All other tests
+
+### Running the tests
+
+Because pytest will search all subdirectories for tests, it's easiest to
+run the tests from within the `test_runner` directory.
+
+Test state (postgres data, pageserver state, and log files) will
+be stored under a directory `test_output`.
+
+You can run all the tests with:
+
+`pytest`
+
+If you want to run all the tests in a particular file:
+
+`pytest test_pgbench.py`
+
+If you want to run all tests that have the string "bench" in their names:
+
+`pytest -k bench`
+
+Useful environment variables:
+
+`ZENITH_BIN`: The directory where zenith binaries can be found.
+`POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
+`TEST_OUTPUT`: Set the directory where test state and test output files
+should go.
+`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
+
+Let stdout and stderr go to the terminal instead of capturing them:
+`pytest -s ...`
+(Note many tests capture subprocess outputs separately, so this may not
+show much.)
+
+Exit after the first test failure:
+`pytest -x ...`
+(there are many more pytest options; run `pytest -h` to see them.)
+
+
+### Building new tests
+
+The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
+
+Essentially, this means that each time you see a fixture named as an input parameter, the fixture function with that name will be run and its return value passed as a parameter to the test function.
+
+So this code:
+```
+def test_something(zenith_cli, pg_bin):
+    pass
+```
+
+... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
+
+Fixtures can't be imported using the normal python syntax. Instead, use this:
+```
+pytest_plugins = ("fixtures.something")
+```
+That will make all the fixtures in the `fixtures/something.py` file available.
+
+Anything that's likely to be used in multiple tests should be built into a fixture.
+
+Note that fixtures can clean up after themselves if they use the `yield` syntax.
+Cleanup will happen even if the test fails (raises an unhandled exception).
+Python destructors, e.g. `__del__()` aren't recommended for cleanup.
--- a/test_runner/batch_others/test_branch_behind.py
+++ b/test_runner/batch_others/test_branch_behind.py
@@ -0,0 +1,67 @@
+import pytest
+import getpass
+import psycopg2
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+#
+# Create a couple of branches off the main branch, at a historical point in time.
+#
+def test_branch_behind(zenith_cli, pageserver, postgres, pg_bin):
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_branch_behind", "empty"]);
+
+    pgmain = postgres.create_start('test_branch_behind')
+    print("postgres is running on 'test_branch_behind' branch")
+
+    main_pg_conn = psycopg2.connect(pgmain.connstr());
+    main_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+    main_cur = main_pg_conn.cursor()
+
+    # Create table, and insert the first 100 rows
+    main_cur.execute('CREATE TABLE foo (t text)');
+    main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g");
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()');
+    lsn_a = main_cur.fetchone()[0]
+    print('LSN after 100 rows: ' + lsn_a)
+
+    # Insert some more rows. (This generates enough WAL to fill a few segments.)
+    main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()');
+    lsn_b = main_cur.fetchone()[0]
+    print('LSN after 100100 rows: ' + lsn_b)
+
+    # Branch at the point where only 100 rows were inserted
+    zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@"+lsn_a]);
+
+    # Insert many more rows. This generates enough WAL to fill a few segments.
+    main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()');
+
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()');
+    lsn_c = main_cur.fetchone()[0]
+    print('LSN after 200100 rows: ' + lsn_c)
+
+    # Branch at the point where only 100100 rows were inserted
+    zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@"+lsn_b]);
+
+    pg_hundred = postgres.create_start("test_branch_behind_hundred")
+    pg_more = postgres.create_start("test_branch_behind_more")
+
+    # On the 'hundred' branch, we should see only 100 rows
+    hundred_pg_conn = psycopg2.connect(pg_hundred.connstr())
+    hundred_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+    hundred_cur = hundred_pg_conn.cursor()
+    hundred_cur.execute('SELECT count(*) FROM foo');
+    assert(hundred_cur.fetchone()[0] == 100);
+
+    # On the 'more' branch, we should see 100100 rows
+    more_pg_conn = psycopg2.connect(pg_more.connstr())
+    more_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+    more_cur = more_pg_conn.cursor()
+    more_cur.execute('SELECT count(*) FROM foo');
+    assert(more_cur.fetchone()[0] == 100100);
+
+    # All the rows are visible on the main branch
+    main_cur.execute('SELECT count(*) FROM foo');
+    assert(main_cur.fetchone()[0] == 200100);
--- a/test_runner/batch_others/test_config.py
+++ b/test_runner/batch_others/test_config.py
@@ -0,0 +1,30 @@
+import pytest
+import os
+import getpass
+import psycopg2
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+#
+# Test starting Postgres with custom options
+#
+def test_config(zenith_cli, pageserver, postgres, pg_bin):
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_config", "empty"]);
+
+    # change config
+    pg = postgres.create_start('test_config', ['log_min_messages=debug1'])
+    print('postgres is running on test_config branch')
+
+    pg_conn = psycopg2.connect(pg.connstr())
+    pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+    cur = pg_conn.cursor()
+
+    # check that config change was applied
+    cur.execute('SELECT name, setting from pg_settings WHERE source!=%s and source!=%s', ("default","override",))
+    for record in cur:
+        if record[0] == 'log_min_messages':
+            assert(record[1] == 'debug1')
+
+    pg_conn.close()
--- a/test_runner/batch_others/test_createdb.py
+++ b/test_runner/batch_others/test_createdb.py
@@ -0,0 +1,37 @@
+import pytest
+import getpass
+import psycopg2
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+#
+# Test CREATE DATABASE when there have been relmapper changes
+#
+def test_createdb(zenith_cli, pageserver, postgres, pg_bin):
+    zenith_cli.run(["branch", "test_createdb", "empty"]);
+
+    pg = postgres.create_start('test_createdb')
+    print("postgres is running on 'test_createdb' branch")
+
+    conn = psycopg2.connect(pg.connstr());
+    conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+    cur = conn.cursor()
+
+    # Cause a 'relmapper' change in the original branch
+    cur.execute('VACUUM FULL pg_class');
+
+    cur.execute('CREATE DATABASE foodb');
+
+    cur.execute('SELECT pg_current_wal_insert_lsn()');
+    lsn = cur.fetchone()[0]
+
+    conn.close();
+
+    # Create a branch
+    zenith_cli.run(["branch", "test_createdb2", "test_createdb@"+lsn]);
+
+    pg2 = postgres.create_start('test_createdb2')
+
+    # Test that you can connect to the new database on both branches
+    conn = psycopg2.connect(pg.connstr('foodb'));
+    conn2 = psycopg2.connect(pg2.connstr('foodb'));
--- a/test_runner/batch_others/test_multixact.py
+++ b/test_runner/batch_others/test_multixact.py
@@ -0,0 +1,71 @@
+import pytest
+import os
+import psycopg2
+import multiprocessing
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+#
+# Test multixact state after branching
+# Now this test is very minimalistic -
+# it only checks next_multixact_id field in restored pg_control,
+# since we don't have functions to check multixact internals.
+#
+
+def runQuery(connstr):
+    con = psycopg2.connect(connstr) 
+    con.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+    cur = con.cursor()
+    cur.execute('select * from t1 for key share;') 
+
+
+def test_multixact(pageserver, postgres, pg_bin, zenith_cli, base_dir):
+
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_multixact", "empty"])
+    pg = postgres.create_start('test_multixact')
+
+    print("postgres is running on 'test_multixact' branch")
+    pg_conn = psycopg2.connect(pg.connstr())
+    pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+    cur = pg_conn.cursor()
+
+    cur.execute('CREATE TABLE t1(i int primary key);'
+    'INSERT INTO t1 select * from generate_series(1,100);')
+
+    cur.execute('SELECT next_multixact_id FROM pg_control_checkpoint();')
+    next_multixact_id_old = cur.fetchone()[0]
+
+    # Lock entries in parallel connections to set multixact
+    nclients = 3
+    pool = multiprocessing.Pool(nclients)
+    args = [pg.connstr()] * nclients
+    pool.map(runQuery, args)
+    pool.close()
+    pool.join()
+
+    # force wal flush
+    cur.execute('checkpoint')
+
+    cur.execute('SELECT next_multixact_id, pg_current_wal_flush_lsn() FROM pg_control_checkpoint();')
+    res = cur.fetchone()
+    next_multixact_id = res[0]
+    lsn = res[1]
+
+    # Ensure that we did lock some tuples
+    assert(int(next_multixact_id) > int(next_multixact_id_old))
+
+    # Branch at this point
+    zenith_cli.run(["branch", "test_multixact_new", "test_multixact@"+lsn]);
+    pg_new = postgres.create_start('test_multixact_new')
+
+    print("postgres is running on 'test_multixact_new' branch")
+    pg_new_conn = psycopg2.connect(pg_new.connstr())
+    pg_new_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+    cur_new = pg_new_conn.cursor()
+
+    cur_new.execute('SELECT next_multixact_id FROM pg_control_checkpoint();')
+    next_multixact_id_new = cur_new.fetchone()[0]
+
+    # Check that we restored pg_controlfile correctly
+    assert(next_multixact_id_new == next_multixact_id)
--- a/test_runner/batch_others/test_pageserver_api.py
+++ b/test_runner/batch_others/test_pageserver_api.py
@@ -0,0 +1,54 @@
+import pytest
+import psycopg2
+import getpass
+import json
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+def test_status(pageserver):
+    pg_conn = psycopg2.connect(pageserver.connstr())
+    pg_conn.autocommit = True
+    cur = pg_conn.cursor()
+    cur.execute('status;')
+    assert cur.fetchone() == ('hello world',)
+    pg_conn.close()
+
+def test_branch_list(pageserver, zenith_cli):
+
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_branch_list_main", "empty"]);
+
+    page_server_conn = psycopg2.connect(pageserver.connstr())
+    page_server_conn.autocommit = True
+    page_server_cur = page_server_conn.cursor()
+
+    page_server_cur.execute('branch_list;')
+    branches = json.loads(page_server_cur.fetchone()[0])
+    # Filter out branches created by other tests
+    branches = [x for x in branches if x['name'].startswith('test_branch_list')]
+
+    assert len(branches) == 1
+    assert branches[0]['name'] == 'test_branch_list_main'
+    assert 'timeline_id' in branches[0]
+    assert 'latest_valid_lsn' in branches[0]
+    assert 'ancestor_id' in branches[0]
+    assert 'ancestor_lsn' in branches[0]
+
+    # Create another branch, and start Postgres on it
+    zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
+    zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
+
+    page_server_cur.execute('branch_list;')
+    new_branches = json.loads(page_server_cur.fetchone()[0])
+    # Filter out branches created by other tests
+    new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
+    assert len(new_branches) == 2
+    new_branches.sort(key=lambda k: k['name'])
+
+    assert new_branches[0]['name'] == 'test_branch_list_experimental'
+    assert new_branches[0]['timeline_id'] != branches[0]['timeline_id']
+
+    # TODO: do the LSNs have to match here?
+    assert new_branches[1] == branches[0]
+
+    page_server_conn.close()
--- a/test_runner/batch_others/test_pgbench.py
+++ b/test_runner/batch_others/test_pgbench.py
@@ -0,0 +1,17 @@
+import pytest
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+def test_pgbench(pageserver, postgres, pg_bin, zenith_cli):
+    """ Smoke test: initialize pgbench and run it against a fresh branch. """
+
+    # Branch off 'empty' so that this test has a branch of its own
+    zenith_cli.run(["branch", "test_pgbench", "empty"])
+
+    node = postgres.create_start('test_pgbench')
+    print("postgres is running on 'test_pgbench' branch")
+
+    target = node.connstr()
+
+    # First the initialization run (-i), then the benchmark run itself.
+    # Output of both goes to capture files rather than the console.
+    init_cmd = ['pgbench', '-i', target]
+    bench_cmd = ['pgbench', '-c', '10', '-T', '5', '-P', '1', '-M', 'prepared', target]
+    pg_bin.run_capture(init_cmd)
+    pg_bin.run_capture(bench_cmd)
--- a/test_runner/batch_others/test_twophase.py
+++ b/test_runner/batch_others/test_twophase.py
@@ -0,0 +1,50 @@
+#
+# Test branching, when a transaction is in prepared state
+#
+import pytest
+import getpass
+import psycopg2
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+def test_twophase(zenith_cli, pageserver, postgres, pg_bin):
+    """ Test branching while transactions are in prepared state.
+
+    Two transactions are prepared on the original branch, then a new branch is
+    created. On the new branch one of them is committed and the other rolled
+    back; the original branch must still see neither row.
+    """
+    zenith_cli.run(["branch", "test_twophase", "empty"])
+
+    pg = postgres.create_start('test_twophase', ['max_prepared_transactions=5'])
+    print("postgres is running on 'test_twophase' branch")
+
+    conn = psycopg2.connect(pg.connstr())
+    conn2 = None
+    try:
+        conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+        cur = conn.cursor()
+
+        cur.execute('CREATE TABLE foo (t text)')
+
+        # Prepare a transaction that will insert a row
+        cur.execute('BEGIN')
+        cur.execute("INSERT INTO foo VALUES ('one')")
+        cur.execute("PREPARE TRANSACTION 'insert_one'")
+
+        # Prepare another transaction that will insert a row
+        cur.execute('BEGIN')
+        cur.execute("INSERT INTO foo VALUES ('two')")
+        cur.execute("PREPARE TRANSACTION 'insert_two'")
+
+        # Create a branch with the transaction in prepared state
+        zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
+
+        pg2 = postgres.create_start('test_twophase_prepared', ['max_prepared_transactions=5'])
+        conn2 = psycopg2.connect(pg2.connstr())
+        conn2.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+        cur2 = conn2.cursor()
+
+        # On the new branch, commit one of the prepared transactions, abort the other one.
+        cur2.execute("COMMIT PREPARED 'insert_one'")
+        cur2.execute("ROLLBACK PREPARED 'insert_two'")
+
+        cur2.execute('SELECT * FROM foo')
+        assert cur2.fetchall() == [('one',)]
+
+        # Neither insert is visible on the original branch, the transactions are still
+        # in prepared state there.
+        cur.execute('SELECT * FROM foo')
+        assert cur.fetchall() == []
+    finally:
+        # Release both client connections whether or not the test passed.
+        # conn2 is still None if we failed before the second server came up.
+        if conn2 is not None:
+            conn2.close()
+        conn.close()
--- a/test_runner/batch_others/test_zenith_cli.py
+++ b/test_runner/batch_others/test_zenith_cli.py
@@ -0,0 +1,49 @@
+import pytest
+import psycopg2
+import json
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+def helper_compare_branch_list(page_server_cur, zenith_cli):
+    """
+    Assert that the CLI and the page server API report the same branches.
+
+    Only 'empty', 'main' and names starting with 'test_cli_' are compared;
+    anything else is treated as belonging to other tests and ignored.
+    """
+
+    def is_ours(name):
+        return name.startswith('test_cli_') or name in ('empty', 'main')
+
+    # Branch names according to the API
+    page_server_cur.execute('branch_list;')
+    api_entries = json.loads(page_server_cur.fetchone()[0])
+    api_names = sorted(entry['name'] for entry in api_entries)
+    branches_api = [name for name in api_names if is_ours(name)]
+
+    # Branch names according to the CLI: on each output line, the name is
+    # whatever follows the last ':'.
+    res = zenith_cli.run(["branch"])
+    assert res.stderr == ''
+    cli_lines = res.stdout.strip().split("\n")
+    cli_names = sorted(line.split(':')[-1].strip() for line in cli_lines)
+    branches_cli = [name for name in cli_names if is_ours(name)]
+
+    assert branches_api == branches_cli
+
+def test_cli_branch_list(pageserver, zenith_cli):
+    """ Check that `zenith branch` and the 'branch_list' API stay in agreement.
+
+    The two listings are compared before any branch is created, after creating
+    a branch off 'main', and after creating a branch nested on top of that one.
+    """
+
+    page_server_conn = psycopg2.connect(pageserver.connstr())
+    try:
+        page_server_conn.autocommit = True
+        page_server_cur = page_server_conn.cursor()
+
+        # Initial sanity check
+        helper_compare_branch_list(page_server_cur, zenith_cli)
+
+        # Create a branch for us
+        res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"])
+        assert res.stderr == ''
+        helper_compare_branch_list(page_server_cur, zenith_cli)
+
+        # Create a nested branch
+        res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
+        assert res.stderr == ''
+        helper_compare_branch_list(page_server_cur, zenith_cli)
+
+        # Check that all new branches are visible via CLI
+        res = zenith_cli.run(["branch"])
+        assert res.stderr == ''
+        branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
+
+        assert 'test_cli_branch_list_main' in branches_cli
+        assert 'test_cli_branch_list_nested' in branches_cli
+    finally:
+        # The original never closed this connection; release it on every path.
+        page_server_conn.close()
--- a/test_runner/batch_pg_regress/test_pg_regress.py
+++ b/test_runner/batch_pg_regress/test_pg_regress.py
@@ -0,0 +1,61 @@
+import pytest
+from fixtures.utils import mkdir_if_needed
+import getpass
+import os
+import psycopg2
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+# FIXME: put host + port in a fixture
+HOST = 'localhost'
+PORT = 55432
+
+
+def test_pg_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):
+    """ Run the regression suite from vendor/postgres with pg_regress.
+
+    A postgres instance is started on a dedicated branch, a "regression"
+    database is created in it, and pg_regress is pointed at that server via
+    the PGPORT / PGUSER / PGHOST environment variables.
+    """
+
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_pg_regress", "empty"])
+
+    # Connect to postgres and create a database called "regression".
+    pg = postgres.create_start('test_pg_regress')
+    pg_conn = psycopg2.connect(pg.connstr())
+    try:
+        pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+        cur = pg_conn.cursor()
+        cur.execute('CREATE DATABASE regression')
+    finally:
+        # Don't leave the setup connection open if CREATE DATABASE fails.
+        pg_conn.close()
+
+    # Create some local directories for pg_regress to run in.
+    runpath = os.path.join(test_output_dir, 'regress')
+    mkdir_if_needed(runpath)
+    mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
+
+    # Compute all the file locations that pg_regress will need.
+    build_path = os.path.join(
+        pg_distrib_dir, 'build/src/test/regress')
+    src_path = os.path.join(
+        base_dir, 'vendor/postgres/src/test/regress')
+    bindir = os.path.join(pg_distrib_dir, 'bin')
+    schedule = os.path.join(src_path, 'parallel_schedule')
+    pg_regress = os.path.join(build_path, 'pg_regress')
+
+    # Only one --bindir is passed. The original list also carried a leftover
+    # '--bindir=""' placeholder ahead of the real value; it is dropped here to
+    # match the argument list used by test_zenith_regress.
+    pg_regress_command = [
+        pg_regress,
+        '--use-existing',
+        '--bindir={}'.format(bindir),
+        '--dlpath={}'.format(build_path),
+        '--schedule={}'.format(schedule),
+        '--inputdir={}'.format(src_path),
+    ]
+
+    env = {
+        'PGPORT': str(pg.port),
+        'PGUSER': pg.username,
+        'PGHOST': pg.host,
+    }
+
+    # Run the command.
+    # We don't capture the output. It's not too chatty, and it always
+    # logs the exact same data to `regression.out` anyway.
+    with capsys.disabled():
+        pg_bin.run(pg_regress_command, env=env, cwd=runpath)
--- a/test_runner/batch_pg_regress/test_zehith_regress.py
+++ b/test_runner/batch_pg_regress/test_zehith_regress.py
@@ -0,0 +1,62 @@
+import pytest
+from fixtures.utils import mkdir_if_needed
+import getpass
+import os
+import psycopg2
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+# FIXME: put host + port in a fixture
+HOST = 'localhost'
+PORT = 55432
+
+
+def test_zenith_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):
+    """ Run the zenith-specific regression tests with pg_regress.
+
+    Same procedure as the stock regression run, except that the schedule and
+    input files are taken from test_runner/zenith_regress.
+    """
+
+    # Give this test a branch of its own
+    zenith_cli.run(["branch", "test_zenith_regress", "empty"])
+
+    # Start postgres on the branch and create the "regression" database.
+    pg = postgres.create_start('test_zenith_regress')
+    setup_conn = psycopg2.connect(pg.connstr())
+    setup_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
+    setup_conn.cursor().execute('CREATE DATABASE regression')
+    setup_conn.close()
+
+    # Working directory for pg_regress, and a 'testtablespace' directory in it.
+    runpath = os.path.join(test_output_dir, 'regress')
+    for needed_dir in (runpath, os.path.join(runpath, 'testtablespace')):
+        mkdir_if_needed(needed_dir)
+
+    # The pg_regress binary comes from the postgres build tree; the tests
+    # themselves are the zenith-specific ones.
+    build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress')
+    src_path = os.path.join(base_dir, 'test_runner/zenith_regress')
+
+    pg_regress_command = [os.path.join(build_path, 'pg_regress'), '--use-existing']
+    pg_regress_command.append('--bindir={}'.format(os.path.join(pg_distrib_dir, 'bin')))
+    pg_regress_command.append('--dlpath={}'.format(build_path))
+    pg_regress_command.append('--schedule={}'.format(os.path.join(src_path, 'parallel_schedule')))
+    pg_regress_command.append('--inputdir={}'.format(src_path))
+
+    print(pg_regress_command)
+
+    # Tell pg_regress where the already-running server is.
+    env = {'PGPORT': str(pg.port), 'PGUSER': pg.username, 'PGHOST': pg.host}
+
+    # Output is deliberately not captured: it is short, and the same data
+    # ends up in `regression.out` anyway.
+    with capsys.disabled():
+        pg_bin.run(pg_regress_command, env=env, cwd=runpath)
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -0,0 +1 @@
+pytest_plugins = ("fixtures.zenith_fixtures")
--- a/test_runner/fixtures/init.py
+++ b/test_runner/fixtures/init.py
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -0,0 +1,53 @@
+
+import os
+import subprocess
+
+def get_self_dir():
+    """ Return the absolute path of the directory containing this file. """
+    this_file = os.path.abspath(__file__)
+    return os.path.dirname(this_file)
+
+
+def mkdir_if_needed(path):
+    """ Make sure `path` is a directory, creating it when it is missing.
+
+    Only the last path component is created; parent directories must already
+    exist. If something that is not a directory already lives at `path`, the
+    assertion below fails.
+    """
+    if not os.path.exists(path):
+        os.mkdir(path)
+        return
+    assert os.path.isdir(path)
+
+
+def subprocess_capture(capture_dir, cmd, **kwargs):
+    """ Run `cmd` with its stdout and stderr redirected into files.
+
+    The files are created in `capture_dir` and are named "cmd_NNN.stdout" and
+    "cmd_NNN.stderr": "cmd" is the basename of the program being run and NNN
+    comes from global_counter(). Existing files of the same name are
+    overwritten. Extra keyword arguments are passed on to subprocess.run().
+    """
+    assert type(cmd) is list
+    program = os.path.basename(cmd[0])
+    base = program + '_{}'.format(global_counter())
+    stdout_filename = os.path.join(capture_dir, base) + '.stdout'
+    stderr_filename = os.path.join(capture_dir, base) + '.stderr'
+
+    with open(stdout_filename, 'w') as out_file, open(stderr_filename, 'w') as err_file:
+        print('(capturing output to "{}.stdout")'.format(base))
+        subprocess.run(cmd, stdout=out_file, stderr=err_file, **kwargs)
+
+
+# Last value handed out by global_counter(); 0 means "none yet".
+_global_counter = 0
+
+
+def global_counter():
+    """ Return the next value of a process-wide counter (1, 2, 3, ...).
+
+    Handy for numbering output files, so that repeated runs of the same
+    command do not overwrite each other's output.
+    """
+    global _global_counter
+    next_value = _global_counter + 1
+    _global_counter = next_value
+    return next_value
--- a/test_runner/fixtures/zenith_fixtures.py
+++ b/test_runner/fixtures/zenith_fixtures.py
@@ -0,0 +1,357 @@
+import getpass
+import os
+import psycopg2
+import pytest
+import shutil
+import subprocess
+import sys
+from .utils import (get_self_dir, mkdir_if_needed,
+                    subprocess_capture, global_counter)
+
+"""
+This file contains pytest fixtures. A fixture is a test resource that can be
+summoned by placing its name in the test's arguments.
+
+A fixture is created with the decorator @zenfixture, which is a wrapper around
+the standard pytest.fixture with some extra behavior.
+
+There are several environment variables that can control the running of tests:
+ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information.
+
+To use fixtures in a test file, add this line of code:
+
+    pytest_plugins = ("fixtures.zenith_fixtures")
+
+Don't import functions from this file, or pytest will emit warnings. Instead
+put directly-importable functions into utils.py or another separate file.
+"""
+
+DEFAULT_OUTPUT_DIR = 'test_output'
+DEFAULT_POSTGRES_DIR = 'tmp_install'
+
+
+def determine_scope(fixture_name, config):
+    """ Return the fixture scope name; always 'session', whatever the arguments. """
+    scope = 'session'
+    return scope
+
+
+def zenfixture(func):
+    """ Decorator that turns `func` into a pytest fixture with a switchable scope.
+
+    Without TEST_SHARED_FIXTURES in the environment the fixture gets pytest's
+    "function" scope, so each test function sets up and tears down its own
+    resources.
+
+    With TEST_SHARED_FIXTURES set (to any value) the fixture gets "session"
+    scope and is shared by all tests; state, logs, etc. are then stored in a
+    directory called "shared".
+
+    """
+    shared = os.environ.get('TEST_SHARED_FIXTURES') is not None
+    scope = 'session' if shared else 'function'
+    return pytest.fixture(func, scope=scope)
+
+
+@pytest.fixture(autouse=True, scope='session')
+def safety_check():
+    """ Refuse to start testing while stray server processes are around.
+
+    Runs `pgrep` for pageserver, postgres and wal_acceptor processes and
+    raises if it reports a match.
+    """
+    # does not use -c as it is not supported on macOS
+    pattern = 'pageserver|postgres|wal_acceptor'
+    found = subprocess.run(['pgrep', pattern], stdout=subprocess.DEVNULL).returncode == 0
+    if not found:
+        return
+    # pgrep exited with 0, i.e. it found something. Such processes would
+    # pollute the result of the test, so bail out.
+    raise Exception('found interfering processes running')
+
+
+class ZenithCli:
+    """ Wrapper around the CLI binary named "zenith".
+
+    Each invocation runs with a copy of the environment taken at construction
+    time, with ZENITH_REPO_DIR and POSTGRES_DISTRIB_DIR set to the values
+    passed to the constructor.
+    """
+
+    def __init__(self, binpath, repo_dir, pg_distrib_dir):
+        assert os.path.isdir(binpath)
+        self.binpath = binpath
+        self.bin_zenith = os.path.join(binpath, 'zenith')
+        cli_env = os.environ.copy()
+        cli_env.update({
+            'ZENITH_REPO_DIR': repo_dir,
+            'POSTGRES_DISTRIB_DIR': pg_distrib_dir,
+        })
+        self.env = cli_env
+
+    def run(self, arguments):
+        """ Invoke "zenith" with the given arguments and wait for it to exit.
+
+        `arguments` must be a list, e.g. ['pg', 'create'].
+
+        The returned object carries the captured output as text:
+
+        result = zenith_cli.run(...)
+        assert(result.stderr == "")
+        print(result.stdout)
+
+        A non-zero exit status raises, because the process is run with
+        check=True.
+        """
+        assert type(arguments) == list
+        full_command = [self.bin_zenith]
+        full_command.extend(arguments)
+        print('Running command "{}"'.format(' '.join(full_command)))
+        return subprocess.run(
+            full_command,
+            env=self.env,
+            check=True,
+            universal_newlines=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+
+
+@zenfixture
+def zenith_cli(zenith_binpath, repo_dir, pg_distrib_dir):
+    """ Fixture: a ZenithCli bound to the test's binaries, repo dir and postgres install. """
+    cli = ZenithCli(binpath=zenith_binpath, repo_dir=repo_dir, pg_distrib_dir=pg_distrib_dir)
+    return cli
+
+
+class ZenithPageserver:
+    """ An object representing a running pageserver.
+
+    All operations are carried out through the given ZenithCli object.
+    `running` is True between a successful start() and the next stop().
+    """
+
+    def __init__(self, zenith_cli):
+        self.zenith_cli = zenith_cli
+        self.running = False
+
+    # Initialize the repository, i.e. run "zenith init"
+    def init(self):
+        self.zenith_cli.run(['init'])
+
+    # Start the page server
+    def start(self):
+        self.zenith_cli.run(['start'])
+        self.running = True
+
+    # Stop the page server
+    def stop(self):
+        self.zenith_cli.run(['stop'])
+        # The server is down now. (This used to set the flag to True, so it
+        # never went back to False once the server had been started.)
+        self.running = False
+
+    # The page server speaks the Postgres FE/BE protocol, so you can connect
+    # to it with any Postgres client, and run special commands. This function
+    # returns a libpq connection string for connecting to it.
+    def connstr(self):
+        username = getpass.getuser()
+        conn_str = 'host={} port={} dbname=postgres user={}'.format(
+            'localhost', 64000, username)
+        return conn_str
+
+# The 'pageserver' fixture provides a Page Server that's up and running.
+#
+# If TEST_SHARED_FIXTURES is set, the Page Server instance is shared by all
+# the tests. To avoid clashing with other tests, don't use the 'main' branch in
+# the tests directly. Instead, create a branch off the 'empty' branch and use
+# that.
+#
+# By convention, the test branches are named after the tests. For example,
+# test called 'test_foo' would create and use branches with the 'test_foo' prefix.
+@zenfixture
+def pageserver(zenith_cli):
+    """ Fixture: an initialized, running page server with an 'empty' branch.
+
+    The server is stopped again when the fixture is torn down, and also when
+    creating the 'empty' branch fails during setup.
+    """
+    ps = ZenithPageserver(zenith_cli)
+    ps.init()
+    ps.start()
+    # From here on the server is running. pytest skips the code after `yield`
+    # when a fixture raises before yielding, so use try/finally to make sure a
+    # failure in the branch command below does not leave the server behind.
+    try:
+        # For convenience in tests, create a branch from the freshly-initialized cluster.
+        zenith_cli.run(["branch", "empty", "main"])
+        yield ps
+    finally:
+        # Cleanup, both after the test and after a failed setup.
+        print('Starting pageserver cleanup')
+        ps.stop()
+
+class Postgres:
+    """ An object representing a running postgres daemon.
+
+    The daemon is managed through `zenith pg ...` commands. Its port is
+    derived from `instance_num` (55431 + instance_num).
+    """
+
+    def __init__(self, zenith_cli, repo_dir, instance_num):
+        self.zenith_cli = zenith_cli
+        self.instance_num = instance_num
+        self.running = False
+        self.username = getpass.getuser()
+        self.host = 'localhost'
+        self.port = 55431 + instance_num
+        self.repo_dir = repo_dir
+        self.branch = None
+        # path to conf is <repo_dir>/pgdatadirs/<branch_name>/postgresql.conf
+
+    def create_start(self, branch, config_lines=None):
+        """ create the pg data directory, and start the server
+
+        `config_lines`, if given, are appended to postgresql.conf after the
+        data directory has been created and before the server is started.
+        """
+        self.zenith_cli.run(['pg', 'create', branch])
+        self.branch = branch
+        if config_lines is None:
+            config_lines = []
+        self.config(config_lines)
+        self.zenith_cli.run(['pg', 'start', branch])
+        self.running = True
+
+    #lines should be an array of valid postgresql.conf rows
+    def config(self, lines):
+        """ Append `lines`, one per row, to this instance's postgresql.conf. """
+        filename = 'pgdatadirs/{}/postgresql.conf'.format(self.branch)
+        config_name = os.path.join(self.repo_dir, filename)
+        with open(config_name, 'a') as conf:
+            for line in lines:
+                conf.write(line)
+                conf.write('\n')
+
+    def stop(self):
+        """ Stop the server if it is running; otherwise do nothing. """
+        if self.running:
+            self.zenith_cli.run(['pg', 'stop', self.branch])
+            # Record that the server is down, so that calling stop() again
+            # does not run `pg stop` a second time.
+            self.running = False
+
+    # Return a libpq connection string to connect to the Postgres instance
+    def connstr(self, dbname='postgres'):
+        conn_str = 'host={} port={} dbname={} user={}'.format(
+            self.host, self.port, dbname, self.username)
+        return conn_str
+
+class PostgresFactory:
+    """ Creates postgres daemons and keeps track of them so all can be stopped. """
+    def __init__(self, zenith_cli, repo_dir):
+        self.zenith_cli = zenith_cli
+        self.host = 'localhost'
+        self.repo_dir = repo_dir
+        self.num_instances = 0
+        self.instances = []
+
+    def create_start(self, branch="main", config_lines=None):
+        """ Create and start a new postgres instance on `branch` and return it.
+
+        Instances are numbered from 1 in creation order.
+        """
+        instance_num = self.num_instances + 1
+        new_pg = Postgres(self.zenith_cli, self.repo_dir, instance_num)
+        self.num_instances = instance_num
+        self.instances.append(new_pg)
+        new_pg.create_start(branch, config_lines)
+        return new_pg
+
+    def stop_all(self):
+        """ Call stop() on every instance created so far, in creation order. """
+        for instance in self.instances:
+            instance.stop()
+
+@zenfixture
+def postgres(zenith_cli, repo_dir):
+    """ Fixture: a PostgresFactory; every instance it created is stopped at teardown. """
+    factory = PostgresFactory(zenith_cli, repo_dir)
+    yield factory
+    # Everything below the yield is teardown.
+    print('Starting postgres cleanup')
+    factory.stop_all()
+
+
+class PgBin:
+    """ A helper class for executing postgres binaries """
+
+    def __init__(self, log_dir, pg_distrib_dir):
+        self.log_dir = log_dir
+        self.pg_install_path = pg_distrib_dir
+        self.pg_bin_path = os.path.join(self.pg_install_path, 'bin')
+        self.env = os.environ.copy()
+        self.env['LD_LIBRARY_PATH'] = os.path.join(self.pg_install_path, 'lib')
+
+    def _fixpath(self, command):
+        """ Return a copy of `command` with the program name resolved.
+
+        If the first element contains no '/', it is replaced by the path of
+        that program inside the postgres bin directory. The list passed in by
+        the caller is left untouched.
+        """
+        fixed = list(command)
+        if '/' not in fixed[0]:
+            fixed[0] = os.path.join(self.pg_bin_path, fixed[0])
+        return fixed
+
+    def _build_env(self, env_add):
+        """ Return the base environment, extended with `env_add` if given. """
+        if env_add is None:
+            return self.env
+        env = self.env.copy()
+        env.update(env_add)
+        return env
+
+    def run(self, command, env=None, cwd=None):
+        """ Run one of the postgres binaries.
+
+        The command should be in list form, e.g. ['pgbench', '-p', '55432']
+
+        All the necessary environment variables will be set.
+
+        If the first argument (the command name) doesn't include a path (no '/'
+        characters present), then the binary from the postgres bin directory
+        is used. The caller's list is not modified.
+
+        A non-zero exit status raises, because the process is run with
+        check=True.
+
+        If you want stdout/stderr captured to files, use `run_capture` instead.
+
+        """
+        command = self._fixpath(command)
+        print('Running command "{}"'.format(' '.join(command)))
+        env = self._build_env(env)
+        subprocess.run(command, env=env, cwd=cwd, check=True)
+
+    def run_capture(self, command, env=None, cwd=None):
+        """ Run one of the postgres binaries, with stderr and stdout redirected to a file.
+
+        This is just like `run`, but for chatty programs.
+        """
+        command = self._fixpath(command)
+        print('Running command "{}"'.format(' '.join(command)))
+        env = self._build_env(env)
+        subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True)
+
+
+@zenfixture
+def pg_bin(test_output_dir, pg_distrib_dir):
+    """ Fixture: a PgBin that logs into the test output dir and uses the postgres install. """
+    helper = PgBin(log_dir=test_output_dir, pg_distrib_dir=pg_distrib_dir)
+    return helper
+
+
+@zenfixture
+def base_dir():
+    """ Locate the base directory: two levels above this file's directory (currently the git root). """
+    two_levels_up = os.path.join(get_self_dir(), '../..')
+    base_dir = os.path.normpath(two_levels_up)
+    print('base_dir is', base_dir)
+    return base_dir
+
+
+@zenfixture
+def top_output_dir(base_dir):
+    """ Determine (and create if missing) the top-level output directory for all tests.
+
+    The TEST_OUTPUT environment variable takes precedence; when it is not set,
+    DEFAULT_OUTPUT_DIR under the base directory is used.
+    """
+    output_dir = os.environ.get('TEST_OUTPUT')
+    if output_dir is None:
+        output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
+    mkdir_if_needed(output_dir)
+    return output_dir
+
+
+@zenfixture
+def test_output_dir(request, top_output_dir):
+    """ Prepare an empty working directory for an individual test.
+
+    Each test gets a directory named after it, unless TEST_SHARED_FIXTURES is
+    set, in which case a single directory called 'shared' is used. Whatever
+    was in the directory before is removed.
+    """
+    shared = os.environ.get('TEST_SHARED_FIXTURES') is not None
+    # Shared fixtures share a single directory; otherwise one directory per test.
+    test_name = 'shared' if shared else request.node.name
+
+    test_output_dir = os.path.join(top_output_dir, test_name)
+    print('test_output_dir is', test_output_dir)
+    # Start from a clean slate: drop leftovers, then recreate the directory.
+    shutil.rmtree(test_output_dir, ignore_errors=True)
+    mkdir_if_needed(test_output_dir)
+    return test_output_dir
+
+
+@zenfixture
+def repo_dir(request, test_output_dir):
+    """ Return the path of the test's repo_dir: 'repo' inside the test output directory.
+
+    "repo_dir" is where all of the pageserver files go; it is unrelated to
+    the git repository. The path is only computed here, not created.
+    """
+    return os.path.join(test_output_dir, 'repo')
+
+
+@zenfixture
+def zenith_binpath(base_dir):
+    """ Locate the directory holding the zenith binaries.
+
+    A non-empty ZENITH_BIN environment variable wins; otherwise 'target/debug'
+    under the base directory is used. Raises if no 'pageserver' is found there.
+    """
+    zenith_dir = os.environ.get('ZENITH_BIN') or os.path.join(base_dir, 'target/debug')
+    pageserver_binary = os.path.join(zenith_dir, 'pageserver')
+    if os.path.exists(pageserver_binary):
+        return zenith_dir
+    raise Exception('zenith binaries not found at "{}"'.format(zenith_dir))
+
+
+@zenfixture
+def pg_distrib_dir(base_dir):
+    """ Locate the postgres install.
+
+    A non-empty POSTGRES_DISTRIB_DIR environment variable wins; otherwise
+    DEFAULT_POSTGRES_DIR under the base directory is used. Raises if there is
+    no 'bin/postgres' below the chosen directory.
+    """
+    pg_dir = os.environ.get('POSTGRES_DISTRIB_DIR') or os.path.normpath(
+        os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
+    print('postgres dir is', pg_dir)
+    postgres_binary = os.path.join(pg_dir, 'bin/postgres')
+    if os.path.exists(postgres_binary):
+        return pg_dir
+    raise Exception('postgres not found at "{}"'.format(pg_dir))
--- a/test_runner/pytest.ini
+++ b/test_runner/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+minversion = 6.0
--- a/test_runner/test_broken.py
+++ b/test_runner/test_broken.py
@@ -0,0 +1,33 @@
+import pytest
+import os
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+"""
+
+Use this test to see what happens when tests fail.
+
+We should be able to clean up after ourselves, including stopping any
+postgres or pageserver processes.
+
+Set the environment variable RUN_BROKEN to see this test run (and fail,
+and hopefully not leave any server processes behind).
+
+"""
+
+
+# Marker that skips a test unless RUN_BROKEN is present in the environment.
+_run_broken_unset = os.environ.get('RUN_BROKEN') is None
+run_broken = pytest.mark.skipif(
+    _run_broken_unset,
+    reason="only used for testing the fixtures"
+)
+
+@run_broken
+def test_broken(zenith_cli, pageserver, postgres, pg_bin):
+    """ Deliberately failing test, used to check that the fixtures clean up. """
+    # Give this test a branch of its own
+    zenith_cli.run(["branch", "test_broken", "empty"])
+
+    postgres.create_start("test_broken")
+    print('postgres is running')
+
+    print('THIS NEXT COMMAND WILL FAIL:')
+    broken_command = ['pgbench', '-i_am_a_broken_test']
+    pg_bin.run(broken_command)
--- a/test_runner/zenith_regress/.gitignore
+++ b/test_runner/zenith_regress/.gitignore
@@ -0,0 +1,11 @@
+# Local binaries
+/pg_regress
+
+# Generated subdirectories
+/tmp_check/
+/results/
+/log/
+
+# Note: regression.* are only left behind on a failure; that's why they're not ignored
+#/regression.diffs
+#/regression.out
--- a/test_runner/zenith_regress/README.md
+++ b/test_runner/zenith_regress/README.md
@@ -0,0 +1,11 @@
+To add a new SQL test
+
+- add sql script to run to zenith_regress/sql/testname.sql
+- add expected output to zenith_regress/expected/testname.out
+- add testname to both parallel_schedule and serial_schedule files*
+
+That's it.
+For more complex tests, see the PostgreSQL regression tests. These work basically the same way.
+
+*it was changed recently in PostgreSQL upstream - no more separate serial_schedule.
+Someday we'll catch up with these changes.
--- a/test_runner/zenith_regress/expected/.gitignore
+++ b/test_runner/zenith_regress/expected/.gitignore
@@ -0,0 +1,9 @@
+/constraints.out
+/copy.out
+/create_function_1.out
+/create_function_2.out
+/largeobject.out
+/largeobject_1.out
+/misc.out
+/security_label.out
+/tablespace.out
--- a/test_runner/zenith_regress/expected/zenith-cid.out
+++ b/test_runner/zenith_regress/expected/zenith-cid.out
@@ -0,0 +1,34 @@
+BEGIN;
+SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
+CREATE TABLE cursor (a int);
+INSERT INTO cursor VALUES (1);
+DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
+UPDATE cursor SET a = 2;
+FETCH ALL FROM c1;
+ a 
+---
+(0 rows)
+
+COMMIT;
+DROP TABLE cursor;
+create table to_be_evicted(x bigint);
+begin;
+insert into to_be_evicted values (1);
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+select sum(x) from to_be_evicted;
+     sum     
+-------------
+ 25937424601
+(1 row)
+
+end;
+drop table to_be_evicted;
--- a/test_runner/zenith_regress/expected/zenith-clog.out
+++ b/test_runner/zenith_regress/expected/zenith-clog.out
@@ -0,0 +1,15 @@
+create or replace procedure do_commits() as $$
+declare
+    xid xid8;
+	i integer;
+begin
+    for i in 1..1000000 loop
+	    xid = txid_current();
+		commit;
+		if (pg_xact_status(xid) <> 'committed') then
+		   raise exception 'CLOG corruption';
+		end if;
+	end loop;
+end;
+$$ language plpgsql;
+call do_commits();
--- a/test_runner/zenith_regress/expected/zenith-rel-truncate.out
+++ b/test_runner/zenith_regress/expected/zenith-rel-truncate.out
@@ -0,0 +1,19 @@
+--
+-- Test that when a relation is truncated by VACUUM, the next smgrnblocks()
+-- query to get the relation's size returns the new size.
+-- (This isn't related to the TRUNCATE command, which works differently,
+-- by creating a new relation file)
+--
+CREATE TABLE truncatetest (i int);
+INSERT INTO truncatetest SELECT g FROM generate_series(1, 10000) g;
+-- Remove all the rows, and run VACUUM to remove the dead tuples and
+-- truncate the physical relation to 0 blocks.
+DELETE FROM truncatetest;
+VACUUM truncatetest;
+-- Check that a SeqScan sees correct relation size (which is now 0)
+SELECT * FROM truncatetest;
+ i 
+---
+(0 rows)
+
+DROP TABLE truncatetest;
--- a/test_runner/zenith_regress/expected/zenith-truncate.out
+++ b/test_runner/zenith_regress/expected/zenith-truncate.out
@@ -0,0 +1,9 @@
+create table tt(x integer);
+insert into tt values (generate_series(1,10000));
+delete from tt;
+vacuum tt;
+insert into tt values (generate_series(1,10000));
+delete from tt;
+vacuum tt;
+insert into tt values (generate_series(1,10000));
+drop table tt;
--- a/test_runner/zenith_regress/expected/zenith-vacuum-full.out
+++ b/test_runner/zenith_regress/expected/zenith-vacuum-full.out
@@ -0,0 +1,304 @@
+create table foo(a int primary key, b int, c int);
+insert into foo values (generate_series(1,10000), generate_series(1,10000), generate_series(1,10000));
+create index concurrently on foo(b);
+create index concurrently on foo(c);
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+vacuum full foo;
+\d foo
+                Table "public.foo"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           | not null | 
+ b      | integer |           |          | 
+ c      | integer |           |          | 
+Indexes:
+    "foo_pkey" PRIMARY KEY, btree (a)
+    "foo_b_idx" btree (b)
+    "foo_c_idx" btree (c)
+
+drop table foo;
--- a/test_runner/zenith_regress/parallel_schedule
+++ b/test_runner/zenith_regress/parallel_schedule
@@ -0,0 +1,12 @@
+# ----------
+# src/test/regress/parallel_schedule
+#
+# By convention, we put no more than twenty tests in any one parallel group;
+# this limits the number of connections needed to run the tests.
+# ----------
+
+test: zenith-cid
+test: zenith-rel-truncate
+test: zenith-clog
+test: zenith-vacuum-full
+test: zenith-truncate
--- a/test_runner/zenith_regress/serial_schedule
+++ b/test_runner/zenith_regress/serial_schedule
@@ -0,0 +1,7 @@
+# src/test/regress/serial_schedule
+# This should probably be in an order similar to parallel_schedule.
+test: zenith-cid
+test: zenith-rel-truncate
+test: zenith-clog
+test: zenith-vacuum-full
+test: zenith-truncate
--- a/test_runner/zenith_regress/sql/.gitignore
+++ b/test_runner/zenith_regress/sql/.gitignore
@@ -0,0 +1,8 @@
+/constraints.sql
+/copy.sql
+/create_function_1.sql
+/create_function_2.sql
+/largeobject.sql
+/misc.sql
+/security_label.sql
+/tablespace.sql
--- a/test_runner/zenith_regress/sql/zenith-cid.sql
+++ b/test_runner/zenith_regress/sql/zenith-cid.sql
@@ -0,0 +1,26 @@
+BEGIN;
+SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
+CREATE TABLE cursor (a int);
+INSERT INTO cursor VALUES (1);
+DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
+UPDATE cursor SET a = 2;
+FETCH ALL FROM c1;
+COMMIT;
+DROP TABLE cursor;
+
+create table to_be_evicted(x bigint);
+begin;
+insert into to_be_evicted values (1);
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+insert into to_be_evicted select x*10 from to_be_evicted;
+select sum(x) from to_be_evicted;
+end;
+drop table to_be_evicted;
--- a/test_runner/zenith_regress/sql/zenith-clog.sql
+++ b/test_runner/zenith_regress/sql/zenith-clog.sql
@@ -0,0 +1,16 @@
+create or replace procedure do_commits() as $$
+declare
+    xid xid8;
+	i integer;
+begin
+    for i in 1..1000000 loop
+	    xid = txid_current();
+		commit;
+		if (pg_xact_status(xid) <> 'committed') then
+		   raise exception 'CLOG corruption';
+		end if;
+	end loop;
+end;
+$$ language plpgsql;
+
+call do_commits();
--- a/test_runner/zenith_regress/sql/zenith-rel-truncate.sql
+++ b/test_runner/zenith_regress/sql/zenith-rel-truncate.sql
@@ -0,0 +1,18 @@
+--
+-- Test that when a relation is truncated by VACUUM, the next smgrnblocks()
+-- query to get the relation's size returns the new size.
+-- (This isn't related to the TRUNCATE command, which works differently,
+-- by creating a new relation file)
+--
+CREATE TABLE truncatetest (i int);
+INSERT INTO truncatetest SELECT g FROM generate_series(1, 10000) g;
+
+-- Remove all the rows, and run VACUUM to remove the dead tuples and
+-- truncate the physical relation to 0 blocks.
+DELETE FROM truncatetest;
+VACUUM truncatetest;
+
+-- Check that a SeqScan sees correct relation size (which is now 0)
+SELECT * FROM truncatetest;
+
+DROP TABLE truncatetest;
--- a/test_runner/zenith_regress/sql/zenith-truncate.sql
+++ b/test_runner/zenith_regress/sql/zenith-truncate.sql
@@ -0,0 +1,9 @@
+create table tt(x integer);
+insert into tt values (generate_series(1,10000));
+delete from tt;
+vacuum tt;
+insert into tt values (generate_series(1,10000));
+delete from tt;
+vacuum tt;
+insert into tt values (generate_series(1,10000));
+drop table tt;
--- a/test_runner/zenith_regress/sql/zenith-vacuum-full.sql
+++ b/test_runner/zenith_regress/sql/zenith-vacuum-full.sql
@@ -0,0 +1,51 @@
+create table foo(a int primary key, b int, c int);
+insert into foo values (generate_series(1,10000), generate_series(1,10000), generate_series(1,10000));
+create index concurrently on foo(b);
+create index concurrently on foo(c);
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+vacuum full foo;
+\d foo
+drop table foo;
--- a/vendor/postgres
+++ b/vendor/postgres
--- a/walkeeper/Cargo.toml
+++ b/walkeeper/Cargo.toml
@@ -20,17 +20,20 @@ slog = "2.7.0"
 log = "0.4.14"
 clap = "2.33.0"
 daemonize = "0.4.1"
-rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
+rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
 tokio = { version = "1.3.0", features = ["full"] }
 tokio-stream = { version = "0.1.4" }
-tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
+postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
 anyhow = "1.0"
 crc32c = "0.6.0"
-parse_duration = "*"
+parse_duration = "2.1.1"
 walkdir = "2"
+serde = { version = "1.0", features = ["derive"] }
+hex = "0.4.3"

 # FIXME: 'pageserver' is needed for ZTimelineId. Refactor
 pageserver = { path = "../pageserver" }
 postgres_ffi = { path = "../postgres_ffi" }
+workspace_hack = { path = "../workspace_hack" }
+zenith_utils = { path = "../zenith_utils" }
--- a/walkeeper/README.md
+++ b/walkeeper/README.md
@@ -0,0 +1,282 @@
+# Proxy-safekeeper communication consensus protocol.
+
+## General requirements and architecture
+
+There is single stateless master and several safekeepers. Number of safekeepers is determined by redundancy level.
+To minimize number of changes in Postgres core, we are using standard streaming replication from master (through WAL sender).
+This replication stream is initiated by `safekeeper_proxy` which receives data from the master and broadcasts it to safekeepers.
+To provide durability we use synchronous replication at master (response to the commit statement is sent to the client
+only when acknowledged by WAL receiver). `safekeeper_proxy` sends this acknowledgment only when LSN of commit record is confirmed by quorum of safekeepers.
+
+`Safekeeper_proxy` tries to establish connections with safekeepers.
+At any moment of time each safekeeper can serve exactly one proxy, but it can accept new connections.
+
+Any of safekeepers can be used as WAL server, producing replication stream. So both `Pagers` and `Replicas`
+(read-only computation nodes) can connect to a safekeeper to receive the WAL stream. A safekeeper streams WAL until
+it reaches min(`commitLSN`,`flushLSN`). Then replication is suspended until new data arrives from master.
+
+
+## Handshake
+The goal of handshake is to collect quorum (to be able to perform recovery)
+and avoid split-brains caused by simultaneous presence of old and new master.
+Procedure of handshake consists of the following steps:
+
+1. Broadcast information about server to all safekeepers (wal segment size, system_id,...)
+2. Receive responses with information about safekeepers.
+3. Once quorum of handshake responses are received, propose new `NodeId(max(term)+1, server.uuid)`
+to all of them.
+4. On receiving proposed nodeId, safekeeper compares it with locally stored nodeId and if it is greater or equal
+then accepts proposed nodeId and persists this choice in the local control file.
+5. If quorum of safekeepers approve proposed nodeId, then server assumes that handshake is successfully completed and switches to recovery stage.
+
+## Recovery
+Proxy computes max(`restartLSN`) and max(`flushLSN`) from quorum of attached safekeepers.
+`RestartLSN` - is position in WAL which is known to be delivered to all safekeepers.
+In other words: `restartLSN` can be also considered as cut-off horizon (all preceding WAL segments can be removed).
+`FlushLSN` is position flushed by safekeeper to the local persistent storage.
+
+If max(`restartLSN`) != max(`flushLSN`), then recovery has to be performed.
+Proxy creates replication channel with most advanced safekeeper (safekeeper with the largest `flushLSN`).
+Then it downloads all WAL messages between max(`restartLSN`)..max(`flushLSN`).
+Messages are inserted in L1-list (ordered by LSN). Then we locate position of each safekeeper in this list according
+to their `flushLSN`s. Safekeepers that are not yet connected (out of quorum) should start from the beginning of the list
+(corresponding to `restartLSN`).
+
+We need to choose max(`flushLSN`) because voting quorum may be different from quorum committed the last message.
+So we do not know whether records with max(`flushLSN`) were committed by quorum or not. So we have to consider them committed
+to avoid loss of committed data.
+
+Calculated max(`flushLSN`) is called `VCL` (Volume Complete LSN). Since it is chosen among a quorum, there may be some other offline safekeeper with larger
+`VCL`. Once it becomes online, we need to overwrite its WAL beyond `VCL`. To support it, each safekeeper maintains
+`epoch` number. `Epoch` plays almost the same role as `term`, but algorithm of `epoch` bumping is different.
+`VCL` and new epoch are received by safekeeper from proxy during voting.
+But safekeeper doesn't switch to new epoch immediately after voting.
+Instead, a safekeeper waits until a record with LSN > Max(`flushLSN`,`VCL`) is received.
+It means that we restore all records from old generation and switch to new generation.
+When proxy calculates max(`FlushLSN`), it first compares `Epoch`. So actually we compare (`Epoch`,`FlushLSN`) pairs.
+
+Let's look at some examples. Consider that we have three safekeepers: S1, S2, S3. Si(N) means that i-th safekeeper has epoch=N.
+Ri(x) - WAL record for resource X with LSN=i. Assume that we have the following state:
+
+```
+S1(1): R1(a)
+S2(1): R1(a),R2(b)
+S3(1): R1(a),R2(b),R3(c),R4(d)  - offline
+```
+
+Proxy chooses quorum (S1,S2). VCL for them is 2. We download S2 to proxy and schedule its write to S1.
+After receiving record R5 the picture can be:
+
+```
+S1(2): R1(a),R2(b),R3(e)
+S2(2): R1(a),R2(b),R3(e)
+S3(1): R1(a),R2(b),R3(c),R4(d)  - offline
+```
+
+Now if server is crashed or restarted, we perform new voting and
+it doesn't matter which quorum we choose: (S1,S2), (S2,S3)...
+in any case VCL=3, because S3 has smaller epoch.
+R3(c) will be overwritten with R3(e):
+
+```
+S1(3): R1(a),R2(b),R3(e)
+S2(3): R1(a),R2(b),R3(e)
+S3(1): R1(a),R2(b),R3(e),R4(d)
+```
+
+Epoch of S3 will be adjusted once it overwrites R4:
+
+```
+S1(3): R1(a),R2(b),R3(e),R4(f)
+S2(3): R1(a),R2(b),R3(e),R4(f)
+S3(3): R1(a),R2(b),R3(e),R4(f)
+```
+
+Crash can happen before epoch was bumped. Let's return back to the initial position:
+
+```
+S1(1): R1(a)
+S2(1): R1(a),R2(b)
+S3(1): R1(a),R2(b),R3(c),R4(d)  - offline
+```
+
+Assume that we start recovery:
+
+```
+S1(1): R1(a),R2(b)
+S2(1): R1(a),R2(b)
+S3(1): R1(a),R2(b),R3(c),R4(d)  - offline
+```
+
+and then crash happens. During voting we choose quorum (S2,S3).
+Now they belong to the same epoch and S3 is most advanced among them.
+So VCL is set to 4 and we recover S1 and S2 from S3:
+
+```
+S1(1): R1(a),R2(b),R3(c),R4(d)
+S2(1): R1(a),R2(b),R3(c),R4(d)
+S3(1): R1(a),R2(b),R3(c),R4(d)
+```
+
+## Main loop
+Once recovery is completed, proxy switches to normal processing loop: it receives WAL stream from master and appends WAL
+messages to the list. At the same time it tries to push messages to safekeepers. Each safekeeper is associated
+with some element in message list and once it acknowledged receiving of the message, position is moved forward.
+Each queue element contains an acknowledgment mask, whose bits correspond to safekeepers.
+Once all safekeepers acknowledged receiving of this message (by setting the corresponding bit),
+then element can be removed from queue and `restartLSN` is advanced forward.
+
+Proxy maintains `restartLSN` and `commitLSN` based on the responses received from safekeepers.
+`RestartLSN` equals the LSN of the head message in the list. `CommitLSN` is `flushLSN[nSafekeepers-Quorum]` element
+in ordered array with `flushLSN`s of safekeepers. `CommitLSN` and `RestartLSN` are included in requests
+sent from proxy to safekeepers and stored in safekeepers control file.
+To avoid overhead of extra fsync, this control file is not fsynced on each request. Flushing this file is performed
+periodically, which means that `restartLSN`/`commitLSN` stored by safekeeper may be slightly stale.
+It is not critical because it may only cause redundant processing of some WAL record.
+And `FlushLSN` is recalculated after node restart by scanning local WAL files.
+
+## Fault tolerance
+Once `safekeeper_proxy` loses connection to safekeeper it tries to reestablish this connection using the same nodeId.
+If `safekeeper_proxy` loses connection with master, it is terminated. Right now safekeeper is standalone process,
+which can be launched at any node, but it can be also spawned as master's background worker, so that it is automatically
+restarted in case of Postgres instance restart.
+
+Restart of `safekeeper_proxy` initiates new round of voting and switching to a new epoch.
+
+## Limitations
+Right now message queue is maintained in main memory and is not spilled to the disk.
+It can cause memory overflow in case of presence of lagging safekeepers.
+It is assumed that in case of losing local data by some safekeepers, it should be recovered using some external mechanism.
+
+
+## Glossary
+* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
+* `RestartLSN`: position in WAL confirmed by all safekeepers.
+* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
+* `NodeID`: pair (term,UUID)
+* `Pager`: Zenith component restoring pages from WAL stream
+* `Replica`: read-only computation node
+* `VCL`: the largest LSN for which we can guarantee availability of all prior records.
+
+## Algorithm
+
+```python
+process SafekeeperProxy(safekeepers,server,curr_epoch,restart_lsn=0,message_queue={},feedbacks={})
+    function do_recovery(epoch,restart_lsn,VCL)
+        leader = i:safekeepers[i].state.epoch=epoch and safekeepers[i].state.flushLsn=VCL
+        wal_stream = safekeepers[leader].start_replication(restart_lsn,VCL)
+        do
+            message = wal_stream.read()
+            message_queue.append(message)
+        while message.startPos < VCL
+
+        for i in 1..safekeepers.size()
+            for message in message_queue
+                if message.endLsn < safekeepers[i].state.flushLsn
+                    message.delivered += i
+                else
+                    send_message(i, message)
+                    break
+    end function
+
+    function send_message(i,msg)
+        msg.restartLsn = restart_lsn
+        msg.commitLsn = get_commit_lsn()
+        safekeepers[i].send(msg, response_handler)
+    end function
+
+    function do_broadcast(message)
+        for i in 1..safekeepers.size()
+            if not safekeepers[i].sending()
+                send_message(i, message)
+    end function
+
+    function get_commit_lsn()
+        sorted_feedbacks = feedbacks.sort()
+        return sorted_feedbacks[safekeepers.size() - quorum]
+    end function
+
+    function response_handler(i,message,response)
+        feedbacks[i] = if response.epoch=curr_epoch then response.flushLsn else VCL
+        server.write(get_commit_lsn())
+
+        message.delivered += i
+        next_message = message_queue.next(message)
+        if next_message
+            send_message(i, next_message)
+
+        while message_queue.head.delivered.size() = safekeepers.size()
+            if restart_lsn < message_queue.head.beginLsn
+                restart_lsn = message_queue.head.endLsn
+            message_queue.pop_head()
+    end function
+
+    server_info = server.read()
+
+    safekeepers.write(server_info)
+    safekeepers.state = safekeepers.read()
+    next_term = max(safekeepers.state.nodeId.term)+1
+    restart_lsn = max(safekeepers.state.restartLsn)
+    epoch,VCL = max(safekeepers.state.epoch,safekeepers.state.flushLsn)
+    curr_epoch = epoch + 1
+
+    proposal = Proposal(NodeId(next_term,server.id),curr_epoch,VCL)
+    safekeepers.send(proposal)
+    responses = safekeepers.read()
+    if any responses.is_rejected()
+        exit()
+
+    for i in 1..safekeepers.size()
+        feedbacks[i].flushLsn = if epoch=safekeepers[i].state.epoch then safekeepers[i].state.flushLsn else restart_lsn
+
+    if restart_lsn != VCL
+        do_recovery(epoch,restart_lsn,VCL)
+
+    wal_stream = server.start_replication(VCL)
+    for ever
+        message = wal_stream.read()
+        message_queue.append(message)
+        do_broadcast(message)
+end process
+
+process safekeeper(gateway,state)
+    function handshake()
+        proxy = gateway.accept()
+        server_info = proxy.read()
+        proxy.write(state)
+        proposal = proxy.read()
+        if proposal.nodeId < state.nodeId
+            proxy.write(rejected)
+            return null
+        else
+            state.nodeId = proposal.nodeId
+            state.proposed_epoch = proposal.epoch
+            state.VCL = proposal.VCL
+            write_control_file(state)
+            proxy.write(accepted)
+            return proxy
+    end function
+
+    state = read_control_file()
+    state.flushLsn = locate_end_of_wal()
+
+    for ever
+        proxy = handshake()
+        if not proxy
+            continue
+        for ever
+            req = proxy.read()
+            if req.nodeId != state.nodeId
+                break
+            save_wal_file(req.data)
+            state.restartLsn = req.restartLsn
+            if state.epoch < state.proposed_epoch and req.endPos > max(state.flushLsn,state.VCL)
+                state.epoch = state.proposed_epoch
+            if req.endPos > state.flushLsn
+                state.flushLsn = req.endPos
+            save_control_file(state)
+            resp = Response(state.epoch,req.endPos)
+            proxy.write(resp)
+            notify_wal_sender(Min(req.commitLsn,req.endPos))
+end process
+```
--- a/walkeeper/src/bin/wal_acceptor.rs
+++ b/walkeeper/src/bin/wal_acceptor.rs
@@ -1,20 +1,18 @@
 //
 // Main entry point for the wal_acceptor executable
 //
+use anyhow::{Context, Result};
+use clap::{App, Arg};
 use daemonize::Daemonize;
 use log::*;
 use parse_duration::parse;
+use slog::Drain;
 use std::io;
 use std::path::{Path, PathBuf};
 use std::thread;
 use std::time::Duration;
 use std::{fs::File, fs::OpenOptions};

-use anyhow::Result;
-use clap::{App, Arg};
-
-use slog::Drain;
-
 use walkeeper::s3_offload;
 use walkeeper::wal_service;
 use walkeeper::WalAcceptorConf;
@@ -115,8 +113,18 @@ fn main() -> Result<()> {
 }

 fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
+    let log_filename = conf.data_dir.join("wal_acceptor.log");
+    // Don't open the same file for output multiple times;
+    // the different fds could overwrite each other's output.
+    let log_file = OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(&log_filename)
+        .with_context(|| format!("failed to open {:?}", &log_filename))?;
+
    // Initialize logger
-    let _scope_guard = init_logging(&conf)?;
+    let logger_file = log_file.try_clone().unwrap();
+    let _scope_guard = init_logging(&conf, logger_file)?;
    let _log_guard = slog_stdlog::init().unwrap();
    // Note: this `info!(...)` macro comes from `log` crate
    info!("standard logging redirected to slog");
@@ -126,16 +134,8 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {

        // There should'n be any logging to stdin/stdout. Redirect it to the main log so
        // that we will see any accidental manual fprintf's or backtraces.
-        let stdout = OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open("wal_acceptor.log")
-            .unwrap();
-        let stderr = OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open("wal_acceptor.log")
-            .unwrap();
+        let stdout = log_file.try_clone().unwrap();
+        let stderr = log_file;

        let daemonize = Daemonize::new()
            .pid_file("wal_acceptor.pid")
@@ -167,7 +167,10 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
        .name("WAL acceptor thread".into())
        .spawn(|| {
            // thread code
-            wal_service::thread_main(conf);
+            let thread_result = wal_service::thread_main(conf);
+            if let Err(e) = thread_result {
+                info!("wal_service thread terminated: {}", e);
+            }
        })
        .unwrap();
    threads.push(wal_acceptor_thread);
@@ -178,14 +181,11 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
    Ok(())
 }

-fn init_logging(conf: &WalAcceptorConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
+fn init_logging(
+    conf: &WalAcceptorConf,
+    log_file: File,
+) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
    if conf.daemonize {
-        let log = conf.data_dir.join("wal_acceptor.log");
-        let log_file = File::create(&log).map_err(|err| {
-            // We failed to initialize logging, so we can't log this message with error!
-            eprintln!("Could not create log file {:?}: {}", log, err);
-            err
-        })?;
        let decorator = slog_term::PlainSyncDecorator::new(log_file);
        let drain = slog_term::CompactFormat::new(decorator).build();
        let drain = std::sync::Mutex::new(drain).fuse();
--- a/walkeeper/src/lib.rs
+++ b/walkeeper/src/lib.rs
@@ -3,8 +3,12 @@ use std::net::SocketAddr;
 use std::path::PathBuf;
 use std::time::Duration;

-mod pq_protocol;
+pub mod pq_protocol;
+pub mod receive_wal;
+pub mod replication;
 pub mod s3_offload;
+pub mod send_wal;
+pub mod timeline;
 pub mod wal_service;

 use crate::pq_protocol::SystemId;
--- a/walkeeper/src/pq_protocol.rs
+++ b/walkeeper/src/pq_protocol.rs
@@ -1,17 +1,15 @@
-use byteorder::{BigEndian, ByteOrder};
-use bytes::{Buf, BufMut, Bytes, BytesMut};
+use byteorder::{BigEndian, ReadBytesExt};
+use bytes::{BufMut, Bytes, BytesMut};
 use pageserver::ZTimelineId;
-use std::io;
+use std::io::{self, Read};
 use std::str;
 use std::str::FromStr;

 pub type Oid = u32;
 pub type SystemId = u64;
-pub type Result<T> = std::result::Result<T, io::Error>;

 #[derive(Debug)]
 pub enum FeMessage {
-    StartupMessage(FeStartupMessage),
    Query(FeQueryMessage),
    Terminate,
    CopyData(FeCopyData),
@@ -52,28 +50,22 @@ pub enum StartupRequestCode {
 }

 impl FeStartupMessage {
-    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
+    pub fn read_from(reader: &mut impl Read) -> io::Result<Self> {
        const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
        const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678;
        const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679;
        const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680;

-        if buf.len() < 4 {
-            return Ok(None);
-        }
-        let len = BigEndian::read_u32(&buf[0..4]) as usize;
+        let len = reader.read_u32::<BigEndian>()? as usize;

        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
-                "invalid message length",
+                "FeStartupMessage: invalid message length",
            ));
        }
-        if buf.len() < len {
-            return Ok(None);
-        }

-        let version = BigEndian::read_u32(&buf[4..8]);
+        let version = reader.read_u32::<BigEndian>()?;

        let kind = match version {
            CANCEL_REQUEST_CODE => StartupRequestCode::Cancel,
@@ -82,7 +74,10 @@ impl FeStartupMessage {
            _ => StartupRequestCode::Normal,
        };

-        let params_bytes = &buf[8..len];
+        let params_len = len - 8;
+        let mut params_bytes = vec![0u8; params_len];
+        reader.read_exact(params_bytes.as_mut())?;
+
        let params_str = str::from_utf8(&params_bytes).unwrap();
        let params = params_str.split('\0');
        let mut options = false;
@@ -110,13 +105,12 @@ impl FeStartupMessage {
            ));
        }

-        buf.advance(len as usize);
-        Ok(Some(FeMessage::StartupMessage(FeStartupMessage {
+        Ok(FeStartupMessage {
            version,
            kind,
            appname,
            timelineid: timelineid.unwrap(),
-        })))
+        })
    }
 }

@@ -202,44 +196,28 @@ impl<'a> BeMessage<'a> {
 }

 impl FeMessage {
-    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
-        if buf.len() < 5 {
-            let to_read = 5 - buf.len();
-            buf.reserve(to_read);
-            return Ok(None);
-        }
-
-        let tag = buf[0];
-        let len = BigEndian::read_u32(&buf[1..5]);
+    pub fn read_from(reader: &mut impl Read) -> io::Result<FeMessage> {
+        let tag = reader.read_u8()?;
+        let len = reader.read_u32::<BigEndian>()?;

        if len < 4 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
-                "invalid message length: parsing u32",
+                "FeMessage: invalid message length",
            ));
        }

-        let total_len = len as usize + 1;
-        if buf.len() < total_len {
-            let to_read = total_len - buf.len();
-            buf.reserve(to_read);
-            return Ok(None);
-        }
-
-        let mut body = buf.split_to(total_len);
-        body.advance(5);
+        let body_len = (len - 4) as usize;
+        let mut body = vec![0u8; body_len];
+        reader.read_exact(&mut body)?;

        match tag {
-            b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage {
-                body: body.freeze(),
-            }))),
-            b'd' => Ok(Some(FeMessage::CopyData(FeCopyData {
-                body: body.freeze(),
-            }))),
-            b'X' => Ok(Some(FeMessage::Terminate)),
+            b'Q' => Ok(FeMessage::Query(FeQueryMessage { body: body.into() })),
+            b'd' => Ok(FeMessage::CopyData(FeCopyData { body: body.into() })),
+            b'X' => Ok(FeMessage::Terminate),
            tag => Err(io::Error::new(
                io::ErrorKind::InvalidInput,
-                format!("unknown message tag: {},'{:?}'", tag, buf),
+                format!("unknown message tag: {},'{:?}'", tag, body),
            )),
        }
    }
--- a/walkeeper/src/receive_wal.rs
+++ b/walkeeper/src/receive_wal.rs
@@ -0,0 +1,447 @@
+//! This implements the Safekeeper protocol.
+//!
+//! FIXME: better description needed here
+
+use anyhow::{bail, Result};
+use log::*;
+use postgres::{Client, NoTls};
+use serde::{Deserialize, Serialize};
+use std::cmp::{max, min};
+use std::fs::{self, File, OpenOptions};
+use std::io::{BufReader, Read, Seek, SeekFrom, Write};
+use std::net::{SocketAddr, TcpStream};
+use std::str;
+use std::sync::Arc;
+use zenith_utils::bin_ser::LeSer;
+use zenith_utils::lsn::Lsn;
+
+use crate::pq_protocol::*;
+use crate::replication::HotStandbyFeedback;
+use crate::timeline::{Timeline, TimelineTools};
+use crate::WalAcceptorConf;
+use pageserver::ZTimelineId;
+use postgres_ffi::xlog_utils::{TimeLineID, XLogFileName, MAX_SEND_SIZE, XLOG_BLCKSZ};
+
+// Value stored in `SafeKeeperInfo::magic`.
+pub const SK_MAGIC: u32 = 0xcafeceefu32;
+// Value stored in `SafeKeeperInfo::format_version`.
+pub const SK_FORMAT_VERSION: u32 = 1;
+// Protocol version this safekeeper speaks; a proposer announcing a different
+// version is rejected during the handshake.
+const SK_PROTOCOL_VERSION: u32 = 1;
+// Placeholder `pg_version` used until a proposer has reported the real one.
+const UNKNOWN_SERVER_VERSION: u32 = 0;
+// A request whose `begin_lsn` equals this value ends the streaming loop.
+const END_OF_STREAM: Lsn = Lsn(0);
+// NOTE(review): presumably the file name of the persisted `SafeKeeperInfo`;
+// it is not referenced in this module — confirm against `timeline.rs`.
+pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
+
+/// Unique node identifier used by Paxos
+///
+/// The derived `Ord` compares fields in declaration order, i.e. by `term`
+/// first and by `uuid` only to break ties. The handshake relies on this when
+/// it rejects a proposer whose `NodeId` is smaller than the stored one.
+#[derive(Debug, Clone, Copy, Ord, PartialOrd, PartialEq, Eq, Serialize, Deserialize)]
+pub struct NodeId {
+    term: u64,
+    uuid: [u8; 16],
+}
+
+/// Description of the Postgres server behind a WAL proposer connection.
+///
+/// It is the first message read from the peer during the handshake and is
+/// kept inside `SafeKeeperInfo::server`.
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+pub struct ServerInfo {
+    /// proxy-safekeeper protocol version
+    pub protocol_version: u32,
+    /// Postgres server version
+    pub pg_version: u32,
+    /// Paxos node identifier
+    pub node_id: NodeId,
+    /// Postgres system identifier
+    pub system_id: SystemId,
+    /// Zenith timelineid
+    pub timeline_id: ZTimelineId,
+    // NOTE(review): presumably the end of WAL as known to the sender — confirm.
+    pub wal_end: Lsn,
+    /// Postgres timeline ID; used when building WAL segment file names
+    pub timeline: TimeLineID,
+    /// WAL segment size (presumably in bytes — confirm against the proposer)
+    pub wal_seg_size: u32,
+}
+
+/// Vote request sent from proxy to safekeepers
+///
+/// Read right after the safekeeper has reported its own state; the request is
+/// rejected when `node_id` is smaller than the locally stored one.
+#[derive(Debug, PartialEq, Serialize, Deserialize)]
+struct RequestVote {
+    /// identifier of the proposing node
+    node_id: NodeId,
+    /// volume commit LSN
+    vcl: Lsn,
+    /// new epoch when safekeeper reaches vcl
+    epoch: u64,
+}
+
+/// Information about a storage node (safekeeper).
+///
+/// This is the state that is reported to the proposer during the handshake.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct SafeKeeperInfo {
+    /// magic for verifying the content of the control file
+    pub magic: u32,
+    /// safekeeper format version
+    pub format_version: u32,
+    /// safekeeper's epoch
+    pub epoch: u64,
+    /// information about server
+    pub server: ServerInfo,
+    /// part of WAL acknowledged by quorum
+    pub commit_lsn: Lsn,
+    /// locally flushed part of WAL
+    pub flush_lsn: Lsn,
+    /// minimal LSN which may be needed for recovery of some safekeeper: min(commit_lsn) for all safekeepers
+    pub restart_lsn: Lsn,
+}
+
+impl SafeKeeperInfo {
+    /// Build the state of a safekeeper that has not talked to any proposer yet:
+    /// current magic/format/protocol constants, an unknown Postgres version and
+    /// every identifier, LSN and size zeroed.
+    pub fn new() -> SafeKeeperInfo {
+        // Zeroed Paxos identity; replaced once a vote request is accepted.
+        let node_id = NodeId {
+            term: 0,
+            uuid: [0; 16],
+        };
+        // Server description with nothing but the protocol version filled in.
+        let server = ServerInfo {
+            protocol_version: SK_PROTOCOL_VERSION,
+            pg_version: UNKNOWN_SERVER_VERSION,
+            node_id,
+            system_id: 0,
+            timeline_id: ZTimelineId::from([0u8; 16]),
+            wal_end: Lsn(0),
+            timeline: 0,
+            wal_seg_size: 0,
+        };
+        Self {
+            magic: SK_MAGIC,
+            format_version: SK_FORMAT_VERSION,
+            epoch: 0,
+            server,
+            // Nothing has been committed, flushed or retained yet.
+            commit_lsn: Lsn(0),
+            flush_lsn: Lsn(0),
+            restart_lsn: Lsn(0),
+        }
+    }
+}
+
+/// Request with WAL message sent from proxy to safekeeper.
+///
+/// This is only the header: the WAL bytes themselves (`end_lsn - begin_lsn`
+/// of them) follow on the stream and are read separately.
+#[derive(Debug, PartialEq, Serialize, Deserialize)]
+struct SafeKeeperRequest {
+    /// Sender's node identifier (looks like we do not need it for TCP streaming connection)
+    sender_id: NodeId,
+    /// start position of message in WAL
+    begin_lsn: Lsn,
+    /// end position of message in WAL
+    end_lsn: Lsn,
+    /// restart LSN position  (minimal LSN which may be needed by proxy to perform recovery)
+    restart_lsn: Lsn,
+    /// LSN committed by quorum of safekeepers
+    commit_lsn: Lsn,
+}
+
+/// Report safekeeper state to proxy
+///
+/// Sent once for every WAL message that has been written.
+#[derive(Debug, PartialEq, Serialize, Deserialize)]
+struct SafeKeeperResponse {
+    /// safekeeper's current epoch
+    epoch: u64,
+    /// end position of the WAL message that was just written
+    flush_lsn: Lsn,
+    /// hot standby feedback obtained from the timeline
+    hs_feedback: HotStandbyFeedback,
+}
+
+/// A connection from a WAL proposer that streams WAL to this safekeeper.
+#[derive(Debug)]
+pub struct ReceiveWalConn {
+    /// Timeline served by this connection; `None` until the handshake has
+    /// received the proposer's `ServerInfo`.
+    pub timeline: Option<Arc<Timeline>>,
+    /// Postgres connection, buffered input
+    pub stream_in: BufReader<TcpStream>,
+    /// Postgres connection, output
+    pub stream_out: TcpStream,
+    /// The cached result of socket.peer_addr()
+    pub peer_addr: SocketAddr,
+    /// wal acceptor configuration
+    pub conf: WalAcceptorConf,
+}
+
+impl ReceiveWalConn {
+    /// Wrap an accepted socket into a `ReceiveWalConn`.
+    ///
+    /// The peer address is looked up once and cached. Reads go through a
+    /// buffered clone of the socket, writes go to the socket itself. Fails if
+    /// the peer address cannot be obtained or the socket cannot be cloned.
+    pub fn new(socket: TcpStream, conf: WalAcceptorConf) -> Result<ReceiveWalConn> {
+        let peer_addr = socket.peer_addr()?;
+        let stream_in = BufReader::new(socket.try_clone()?);
+        Ok(ReceiveWalConn {
+            timeline: None,
+            stream_in,
+            stream_out: socket,
+            peer_addr,
+            conf,
+        })
+    }
+
+    /// Read one message of type `T` from the buffered input stream.
+    ///
+    /// The `LeSer` bound means the value is decoded as little-endian.
+    fn read_req<T: LeSer>(&mut self) -> Result<T> {
+        let decoded = T::des_from(&mut self.stream_in)?;
+        Ok(decoded)
+    }
+
+    /// Ask the page server to open a replication connection back to us.
+    ///
+    /// Does nothing when no page server address is configured. Otherwise it
+    /// connects to the page server and issues a `callmemaybe` query carrying
+    /// our listen address and the timeline id.
+    fn request_callback(&self) -> std::result::Result<(), postgres::error::Error> {
+        let addr = match self.conf.pageserver_addr {
+            Some(addr) => addr,
+            None => return Ok(()),
+        };
+
+        let timeline = self.timeline.get();
+        let listen_addr = &self.conf.listen_addr;
+
+        let ps_connstr = format!(
+            "host={} port={} dbname={} user={}",
+            addr.ip(),
+            addr.port(),
+            "no_db",
+            "no_user",
+        );
+        let callme = format!(
+            "callmemaybe {} host={} port={} options='-c ztimelineid={}'",
+            timeline.timelineid,
+            listen_addr.ip(),
+            listen_addr.port(),
+            timeline.timelineid
+        );
+        info!(
+            "requesting page server to connect to us: start {} {}",
+            ps_connstr, callme
+        );
+
+        let mut client = Client::connect(&ps_connstr, NoTls)?;
+        client.simple_query(&callme)?;
+        Ok(())
+    }
+
+    /// Receive WAL from wal_proposer
+    ///
+    /// Runs the whole conversation on this connection:
+    ///
+    /// 1. read the proposer's `ServerInfo`, attach the matching timeline and
+    ///    load its control file;
+    /// 2. report our own `SafeKeeperInfo`;
+    /// 3. read the `RequestVote`, reject it if its node id is smaller than the
+    ///    stored one, otherwise persist and acknowledge it;
+    /// 4. ask the page server to connect back (best effort);
+    /// 5. loop: read a `SafeKeeperRequest` header plus its WAL payload, write
+    ///    the payload to the segment files, answer with a `SafeKeeperResponse`
+    ///    and wake up the WAL senders.
+    ///
+    /// Returns `Ok(())` when the proposer signals the end of the stream, and
+    /// an error on protocol violations or I/O failures. Malformed LSN ranges
+    /// sent by the peer are reported as errors rather than panicking.
+    pub fn run(&mut self) -> Result<()> {
+        // Receive information about server
+        let server_info = self.read_req::<ServerInfo>()?;
+        info!(
+            "Start handshake with wal_proposer {} sysid {} timeline {}",
+            self.peer_addr, server_info.system_id, server_info.timeline_id,
+        );
+        // FIXME: also check that the system identifier matches
+        self.timeline.set(server_info.timeline_id)?;
+        self.timeline.get().load_control_file(&self.conf)?;
+
+        let mut my_info = self.timeline.get().get_info();
+
+        /* Check protocol compatibility */
+        if server_info.protocol_version != SK_PROTOCOL_VERSION {
+            bail!(
+                "Incompatible protocol version {}, expected {}",
+                server_info.protocol_version,
+                SK_PROTOCOL_VERSION
+            );
+        }
+        /* Postgres upgrade is not treated as fatal error */
+        if server_info.pg_version != my_info.server.pg_version
+            && my_info.server.pg_version != UNKNOWN_SERVER_VERSION
+        {
+            info!(
+                "Incompatible server version {}, expected {}",
+                server_info.pg_version, my_info.server.pg_version
+            );
+        }
+
+        /* Update information about server, but preserve locally stored node_id */
+        let node_id = my_info.server.node_id;
+        my_info.server = server_info;
+        my_info.server.node_id = node_id;
+
+        /* Calculate WAL end based on local data */
+        let (flush_lsn, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, true);
+        my_info.flush_lsn = flush_lsn;
+        my_info.server.timeline = timeline;
+
+        /* Report my identifier to proxy */
+        my_info.ser_into(&mut self.stream_out)?;
+
+        /* Wait for vote request */
+        let prop = self.read_req::<RequestVote>()?;
+        /* This is Paxos check which should ensure that only one master can perform commits */
+        if prop.node_id < my_info.server.node_id {
+            /* Send my node-id to inform proxy that its candidate was rejected */
+            my_info.server.node_id.ser_into(&mut self.stream_out)?;
+            bail!(
+                "Reject connection attempt with term {} because my term is {}",
+                prop.node_id.term,
+                my_info.server.node_id.term,
+            );
+        }
+        my_info.server.node_id = prop.node_id;
+        self.timeline.get().set_info(&my_info);
+        /* Need to persist our vote first */
+        self.timeline.get().save_control_file(true)?;
+
+        let mut flushed_restart_lsn = Lsn(0);
+        let wal_seg_size = server_info.wal_seg_size as usize;
+
+        /* Acknowledge the proposed candidate by returning it to the proxy */
+        prop.node_id.ser_into(&mut self.stream_out)?;
+
+        // Need to establish replication channel with page server.
+        // As replication in postgres is initiated by the receiver, we use the callme mechanism
+        if let Err(e) = self.request_callback() {
+            // Do not treat it as fatal error and continue work
+            // FIXME: we should retry after a while...
+            error!("Failed to send callme request to pageserver: {}", e);
+        }
+
+        info!(
+            "Start streaming from timeline {} address {:?}",
+            server_info.timeline_id, self.peer_addr,
+        );
+
+        // Main loop
+        loop {
+            let mut sync_control_file = false;
+
+            /* Receive message header */
+            let req = self.read_req::<SafeKeeperRequest>()?;
+            if req.sender_id != my_info.server.node_id {
+                bail!("Sender NodeId is changed");
+            }
+            if req.begin_lsn == END_OF_STREAM {
+                info!("Server stops streaming");
+                break;
+            }
+            let start_pos = req.begin_lsn;
+            let end_pos = req.end_lsn;
+
+            // Both LSNs come straight from the peer. A reversed or oversized
+            // range must terminate this connection with an error, not panic
+            // the thread that serves it.
+            let rec_size = match end_pos.checked_sub(start_pos) {
+                Some(len) => len.0 as usize,
+                None => bail!(
+                    "Invalid WAL message: end LSN {} is before begin LSN {}",
+                    end_pos,
+                    start_pos
+                ),
+            };
+            if rec_size > MAX_SEND_SIZE {
+                bail!(
+                    "WAL message of {} bytes exceeds the maximum of {} bytes",
+                    rec_size,
+                    MAX_SEND_SIZE
+                );
+            }
+
+            debug!(
+                "received for {} bytes between {} and {}",
+                rec_size, start_pos, end_pos,
+            );
+
+            /* Receive message body */
+            let mut inbuf = vec![0u8; rec_size];
+            self.stream_in.read_exact(&mut inbuf)?;
+
+            /* Save message in file */
+            self.write_wal_file(start_pos, timeline, wal_seg_size, &inbuf)?;
+
+            my_info.restart_lsn = req.restart_lsn;
+            my_info.commit_lsn = req.commit_lsn;
+
+            /*
+             * Epoch switch happen when written WAL record cross the boundary.
+             * The boundary is maximum of last WAL position at this node (FlushLSN) and global
+             * maximum (vcl) determined by safekeeper_proxy during handshake.
+             * Switching epoch means that node completes recovery and start writing in the WAL new data.
+             */
+            if my_info.epoch < prop.epoch && end_pos > max(my_info.flush_lsn, prop.vcl) {
+                info!("Switch to new epoch {}", prop.epoch);
+                my_info.epoch = prop.epoch; /* bump epoch */
+                sync_control_file = true;
+            }
+            if end_pos > my_info.flush_lsn {
+                my_info.flush_lsn = end_pos;
+            }
+            /*
+             * Update restart LSN in control file.
+             * To avoid negative impact on performance of extra fsync, do it only
+             * when restart_lsn delta exceeds WAL segment size.
+             */
+            sync_control_file |= flushed_restart_lsn + (wal_seg_size as u64) < my_info.restart_lsn;
+            // NOTE(review): `my_info` is a local copy and is not handed back to
+            // the timeline via `set_info` inside this loop, so it looks like the
+            // values updated above are not what gets saved here — confirm
+            // against `Timeline::save_control_file`.
+            self.timeline.get().save_control_file(sync_control_file)?;
+
+            if sync_control_file {
+                flushed_restart_lsn = my_info.restart_lsn;
+            }
+
+            /* Report flush position */
+            //info!("Confirm LSN: {:X}/{:>08X}", (end_pos>>32) as u32, end_pos as u32);
+            let resp = SafeKeeperResponse {
+                epoch: my_info.epoch,
+                flush_lsn: end_pos,
+                hs_feedback: self.timeline.get().get_hs_feedback(),
+            };
+            resp.ser_into(&mut self.stream_out)?;
+
+            /*
+             * Ping wal sender that new data is available.
+             * FlushLSN (end_pos) can be smaller than commitLSN in case we are at catching-up safekeeper.
+             */
+            self.timeline
+                .get()
+                .notify_wal_senders(min(req.commit_lsn, end_pos));
+        }
+        Ok(())
+    }
+
+    /// Write `buf` into the WAL segment files, starting at LSN `startpos`.
+    ///
+    /// The data is split at segment boundaries (`wal_seg_size`). For every
+    /// segment touched, the completed file is preferred, then an existing
+    /// `.partial` file; if neither can be opened a new zero-filled `.partial`
+    /// file is created. Each write is followed by `sync_all` unless
+    /// `conf.no_sync` is set. When a write ends exactly on a segment boundary
+    /// and the segment was a `.partial` file, it is renamed to its final name.
+    ///
+    /// `timeline` is the Postgres timeline ID used to build segment file names.
+    /// Returns an error if any file operation fails.
+    fn write_wal_file(
+        &self,
+        startpos: Lsn,
+        timeline: TimeLineID,
+        wal_seg_size: usize,
+        buf: &[u8],
+    ) -> Result<()> {
+        const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
+
+        let mut bytes_left: usize = buf.len();
+        let mut bytes_written: usize = 0;
+        let mut start_pos = startpos;
+
+        /* Extract WAL location for this block */
+        let mut xlogoff = start_pos.segment_offset(wal_seg_size) as usize;
+
+        while bytes_left != 0 {
+            /*
+             * If crossing a WAL boundary, only write up until we reach wal
+             * segment size.
+             */
+            let bytes_to_write = if xlogoff + bytes_left > wal_seg_size {
+                wal_seg_size - xlogoff
+            } else {
+                bytes_left
+            };
+
+            /* Work out the names of the completed and the partial segment file */
+            let segno = start_pos.segment_number(wal_seg_size);
+            let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
+            let timeline_dir = self
+                .conf
+                .data_dir
+                .join(self.timeline.get().timelineid.to_string());
+            let wal_file_path = timeline_dir.join(&wal_file_name);
+            let wal_file_partial_path = timeline_dir.join(wal_file_name + ".partial");
+
+            // The file handle lives only inside this block so that it is closed
+            // before a possible rename below.
+            let partial = {
+                let (mut wal_file, partial): (File, bool) =
+                    if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
+                        /* Already completed segment */
+                        (file, false)
+                    } else if let Ok(file) =
+                        OpenOptions::new().write(true).open(&wal_file_partial_path)
+                    {
+                        /* Existing partial file */
+                        (file, true)
+                    } else {
+                        /* Create and fill new partial file */
+                        let mut file = match OpenOptions::new()
+                            .create(true)
+                            .write(true)
+                            .open(&wal_file_partial_path)
+                        {
+                            Ok(file) => file,
+                            Err(e) => {
+                                // Report the path that was actually being opened.
+                                error!(
+                                    "Failed to open log file {:?}: {}",
+                                    &wal_file_partial_path, e
+                                );
+                                return Err(e.into());
+                            }
+                        };
+                        for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
+                            file.write_all(ZERO_BLOCK)?;
+                        }
+                        (file, true)
+                    };
+
+                wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
+                wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;
+
+                // Flush file if not prohibited
+                if !self.conf.no_sync {
+                    wal_file.sync_all()?;
+                }
+                partial
+            };
+
+            /* Write was successful, advance our position */
+            bytes_written += bytes_to_write;
+            bytes_left -= bytes_to_write;
+            start_pos += bytes_to_write as u64;
+            xlogoff += bytes_to_write;
+
+            /* Did we reach the end of a WAL segment? */
+            if start_pos.segment_offset(wal_seg_size) == 0 {
+                xlogoff = 0;
+                if partial {
+                    fs::rename(&wal_file_partial_path, &wal_file_path)?;
+                }
+            }
+        }
+        Ok(())
+    }
+}
--- a/walkeeper/src/replication.rs
+++ b/walkeeper/src/replication.rs
@@ -0,0 +1,238 @@
+//! This module implements the replication protocol, starting with the
+//! "START REPLICATION" message.
+
+use crate::pq_protocol::{BeMessage, FeMessage};
+use crate::send_wal::SendWalConn;
+use crate::timeline::{Timeline, TimelineTools};
+use crate::WalAcceptorConf;
+use anyhow::{anyhow, bail, Result};
+use bytes::{BufMut, Bytes, BytesMut};
+use log::*;
+use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, XLogFileName, MAX_SEND_SIZE};
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use std::cmp::min;
+use std::fs::File;
+use std::io::{BufReader, Read, Seek, SeekFrom, Write};
+use std::net::TcpStream;
+use std::path::Path;
+use std::sync::Arc;
+use std::{str, thread};
+use zenith_utils::bin_ser::BeSer;
+use zenith_utils::lsn::Lsn;
+
+// Size of the WAL-data header inside a CopyData message.
+const XLOG_HDR_SIZE: usize = 1 + 8 * 3; /* 'w' + startPos + walEnd + timestamp */
+// Size of the generic libpq message header.
+const LIBPQ_HDR_SIZE: usize = 5; /* 1 byte with message type + 4 bytes length */
+// Subtracted from the total message size to get the value of the length
+// field: the leading type byte is not counted.
+const LIBPQ_MSG_SIZE_OFFS: usize = 1;
+// When waiting for new WAL yields this LSN, the replication loop terminates.
+pub const END_REPLICATION_MARKER: Lsn = Lsn::MAX;
+
+// Transaction id type used in `HotStandbyFeedback`.
+type FullTransactionId = u64;
+
+/// Hot standby feedback received from replica
+///
+/// Decoded from the body of `CopyData` messages and handed to the timeline.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct HotStandbyFeedback {
+    // NOTE(review): presumably the time the feedback was produced — confirm.
+    pub ts: TimestampTz,
+    // NOTE(review): presumably the replica's oldest needed transaction id — confirm.
+    pub xmin: FullTransactionId,
+    // NOTE(review): presumably the same, restricted to catalog access — confirm.
+    pub catalog_xmin: FullTransactionId,
+}
+
+/// A network connection that's speaking the replication protocol.
+pub struct ReplicationConn {
+    /// Timeline whose WAL is streamed over this connection.
+    timeline: Option<Arc<Timeline>>,
+    /// Postgres connection, buffered input
+    ///
+    /// This is an `Option` because we will spawn a background thread that will
+    /// `take` it from us.
+    stream_in: Option<BufReader<TcpStream>>,
+    /// Postgres connection, output
+    stream_out: TcpStream,
+    /// wal acceptor configuration
+    conf: WalAcceptorConf,
+    /// assigned application name
+    // NOTE(review): within this file the field is only ever initialised to
+    // `None`, so comparisons against a concrete application name cannot
+    // succeed — confirm whether it should be copied from `SendWalConn`.
+    appname: Option<String>,
+}
+
+impl ReplicationConn {
+    /// Create a new `ReplicationConn`, consuming the `SendWalConn`.
+    ///
+    /// The timeline handle, both stream halves and the configuration are taken
+    /// over from `conn`; the application name starts out unset.
+    pub fn new(conn: SendWalConn) -> Self {
+        let timeline = conn.timeline;
+        // Wrapped in `Some` so that the feedback thread can take it later.
+        let stream_in = Some(conn.stream_in);
+        let stream_out = conn.stream_out;
+        let conf = conn.conf;
+        Self {
+            timeline,
+            stream_in,
+            stream_out,
+            conf,
+            appname: None,
+        }
+    }
+
+    /// Receive loop for messages coming from the replica.
+    ///
+    /// Spawned onto its own thread by `run`. Every `CopyData` message is decoded
+    /// as `HotStandbyFeedback` and passed to the timeline; any other message is
+    /// only logged. The loop never finishes normally: it ends with an error
+    /// when reading or decoding a message fails.
+    fn background_thread(mut stream_in: impl Read, timeline: Arc<Timeline>) -> Result<()> {
+        loop {
+            let incoming = FeMessage::read_from(&mut stream_in)?;
+            match incoming {
+                FeMessage::CopyData(copy_data) => {
+                    let feedback = HotStandbyFeedback::des(&copy_data.body)?;
+                    timeline.add_hs_feedback(feedback);
+                }
+                other => {
+                    info!("unexpected message {:?}", other);
+                }
+            }
+        }
+    }
+
+    /// Extract the start and (optional) stop LSN from a replication command.
+    ///
+    /// Scans `cmd` for `hex/hex` tokens. The first one is the start position
+    /// and is mandatory; the second one is the stop position and defaults to
+    /// `Lsn(0)` when absent. Fails if `cmd` is not UTF-8, if no LSN is found,
+    /// or if a found token does not parse as an `Lsn`.
+    fn parse_start_stop(cmd: &[u8]) -> Result<(Lsn, Lsn)> {
+        let lsn_re = Regex::new(r"([[:xdigit:]]+/[[:xdigit:]]+)").unwrap();
+        let text = str::from_utf8(cmd)?;
+        let mut found = lsn_re
+            .captures_iter(text)
+            .map(|cap| cap[1].parse::<Lsn>());
+
+        let start_pos = match found.next() {
+            Some(parsed) => parsed?,
+            None => return Err(anyhow!("failed to find start LSN")),
+        };
+        let stop_pos = match found.next() {
+            Some(parsed) => parsed?,
+            None => Lsn(0),
+        };
+        Ok((start_pos, stop_pos))
+    }
+
+    /// Open a WAL segment for reading.
+    ///
+    /// The `.partial` variant of `wal_file_path` is tried first; if it cannot
+    /// be opened, the path itself is opened. A failure of that second attempt
+    /// is logged and returned.
+    fn open_wal_file(wal_file_path: &Path) -> Result<File> {
+        let mut partial_path = wal_file_path.to_owned();
+        partial_path.set_extension("partial");
+
+        if let Ok(partial_file) = File::open(&partial_path) {
+            return Ok(partial_file);
+        }
+
+        // No usable .partial file, fall back to the completed segment.
+        File::open(&wal_file_path).map_err(|e| {
+            error!("Failed to open log file {:?}: {}", &wal_file_path, e);
+            e.into()
+        })
+    }
+
+    ///
+    /// Handle START_REPLICATION replication command
+    ///
+    /// `cmd` is the raw command text; the start and optional stop LSN are parsed
+    /// out of it. A background thread is spawned to consume feedback messages
+    /// from the input half of the connection, then WAL is read from the segment
+    /// files and sent as CopyData messages until the stop position is reached
+    /// (when one is set) or the timeline reports `END_REPLICATION_MARKER`.
+    ///
+    /// Returns an error if the command cannot be parsed, if no WAL segment size
+    /// is known yet, or on file/socket failures.
+    ///
+    /// NOTE(review): `stream_in.take().unwrap()` panics if `run` is called a
+    /// second time on the same connection — confirm it is only called once.
+    ///
+    pub fn run(&mut self, cmd: &Bytes) -> Result<()> {
+        // spawn the background thread which receives HotStandbyFeedback messages.
+        // NOTE(review): the thread is started before `cmd` is validated, so it
+        // keeps running even when this function returns early with an error.
+        let bg_timeline = Arc::clone(self.timeline.get());
+        let bg_stream_in = self.stream_in.take().unwrap();
+
+        thread::spawn(move || {
+            if let Err(err) = Self::background_thread(bg_stream_in, bg_timeline) {
+                error!("socket error: {}", err);
+            }
+        });
+
+        let (mut start_pos, mut stop_pos) = Self::parse_start_stop(&cmd)?;
+
+        // A zero segment size is what a freshly created `SafeKeeperInfo` holds.
+        let wal_seg_size = self.timeline.get().get_info().server.wal_seg_size as usize;
+        if wal_seg_size == 0 {
+            bail!("Can not start replication before connecting to wal_proposer");
+        }
+        let (wal_end, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, false);
+        // A zero start position means "start from the current end of WAL".
+        if start_pos == Lsn(0) {
+            start_pos = wal_end;
+        }
+        // NOTE(review): `self.appname` is set to `None` by `new` and not changed
+        // anywhere in this file, so this condition looks like it can never
+        // hold — confirm.
+        if stop_pos == Lsn(0) && self.appname == Some("wal_proposer_recovery".to_string()) {
+            stop_pos = wal_end;
+        }
+        info!("Start replication from {} till {}", start_pos, stop_pos);
+
+        // Tell the client that we are switching to copy mode.
+        let mut outbuf = BytesMut::new();
+        BeMessage::write(&mut outbuf, &BeMessage::Copy);
+        self.send(&outbuf)?;
+        outbuf.clear();
+
+        let mut end_pos: Lsn;
+        // Segment file kept open between iterations while we stay inside it.
+        let mut wal_file: Option<File> = None;
+
+        loop {
+            /* Wait until we have some data to stream */
+            if stop_pos != Lsn(0) {
+                /* recovery mode: stream up to the specified LSN (VCL) */
+                if start_pos >= stop_pos {
+                    /* recovery finished */
+                    break;
+                }
+                end_pos = stop_pos;
+            } else {
+                /* normal mode */
+                let timeline = self.timeline.get();
+                end_pos = timeline.wait_for_lsn(start_pos);
+            }
+            if end_pos == END_REPLICATION_MARKER {
+                break;
+            }
+
+            // Take the `File` from `wal_file`, or open a new file.
+            let mut file = match wal_file.take() {
+                Some(file) => file,
+                None => {
+                    // Open a new file.
+                    let segno = start_pos.segment_number(wal_seg_size);
+                    let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
+                    let timeline_id = self.timeline.get().timelineid.to_string();
+                    let wal_file_path = self.conf.data_dir.join(timeline_id).join(wal_file_name);
+                    Self::open_wal_file(&wal_file_path)?
+                }
+            };
+
+            let xlogoff = start_pos.segment_offset(wal_seg_size) as usize;
+
+            // How much to read and send in message? We cannot cross the WAL file
+            // boundary, and we don't want send more than MAX_SEND_SIZE.
+            // NOTE(review): the `unwrap` panics if `end_pos < start_pos`; the
+            // recovery branch above rules that out, for the normal branch it
+            // depends on what `wait_for_lsn` returns — confirm.
+            let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize;
+            let send_size = min(send_size, wal_seg_size - xlogoff);
+            let send_size = min(send_size, MAX_SEND_SIZE);
+
+            let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size;
+
+            // Read some data from the file.
+            let mut file_buf = vec![0u8; send_size];
+            file.seek(SeekFrom::Start(xlogoff as u64))?;
+            file.read_exact(&mut file_buf)?;
+
+            // Write some data to the network socket.
+            // FIXME: turn these into structs.
+            // 'd' is CopyData;
+            // 'w' is "WAL records"
+            // https://www.postgresql.org/docs/9.1/protocol-message-formats.html
+            // src/backend/replication/walreceiver.c
+            outbuf.clear();
+            outbuf.put_u8(b'd');
+            outbuf.put_u32((msg_size - LIBPQ_MSG_SIZE_OFFS) as u32);
+            outbuf.put_u8(b'w');
+            outbuf.put_u64(start_pos.0);
+            outbuf.put_u64(end_pos.0);
+            outbuf.put_u64(get_current_timestamp());
+
+            assert!(outbuf.len() + file_buf.len() == msg_size);
+            // This thread has exclusive access to the TcpStream, so it's fine
+            // to do this as two separate calls.
+            self.send(&outbuf)?;
+            self.send(&file_buf)?;
+            start_pos += send_size as u64;
+
+            // NOTE(review): this logs `end_pos`, but only data up to the new
+            // `start_pos` has been sent when `send_size` was capped above.
+            debug!("Sent WAL to page server up to {}", end_pos);
+
+            // Decide whether to reuse this file. If we don't set wal_file here
+            // a new file will be opened next time.
+            if start_pos.segment_offset(wal_seg_size) != 0 {
+                wal_file = Some(file);
+            }
+        }
+        Ok(())
+    }
+
+    /// Write the whole of `buf` to the output half of the connection.
+    ///
+    /// Returns the underlying I/O error if the write fails.
+    fn send(&mut self, buf: &[u8]) -> Result<()> {
+        let payload: &[u8] = buf.as_ref();
+        self.stream_out.write_all(payload)?;
+        Ok(())
+    }
+}
--- a/walkeeper/src/s3_offload.rs
+++ b/walkeeper/src/s3_offload.rs
@@ -12,8 +12,7 @@ use std::collections::HashSet;
 use std::env;
 use std::fs::{self, File};
 use std::io::prelude::*;
-use std::iter::FromIterator;
-use std::path::PathBuf;
+use std::path::Path;
 use std::time::SystemTime;
 use tokio::runtime;
 use tokio::time::sleep;
@@ -42,7 +41,7 @@ pub fn thread_main(conf: WalAcceptorConf) {
 async fn offload_files(
    bucket: &Bucket,
    listing: &HashSet<String>,
-    dir_path: &PathBuf,
+    dir_path: &Path,
    conf: &WalAcceptorConf,
 ) -> Result<u64> {
    let horizon = SystemTime::now() - conf.ttl.unwrap();
@@ -93,11 +92,10 @@ async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
        let results = bucket
            .list("walarchive/".to_string(), Some("".to_string()))
            .await?;
-        let listing = HashSet::from_iter(
-            results
-                .iter()
-                .flat_map(|b| b.contents.iter().map(|o| o.key.clone())),
-        );
+        let listing = results
+            .iter()
+            .flat_map(|b| b.contents.iter().map(|o| o.key.clone()))
+            .collect();

        let n = offload_files(&bucket, &listing, &conf.data_dir, conf).await?;
        info!("Offload {} files to S3", n);
--- a/walkeeper/src/send_wal.rs
+++ b/walkeeper/src/send_wal.rs
@@ -0,0 +1,155 @@
+//! This implements the libpq replication protocol between wal_acceptor
+//! and replicas/pagers
+//!
+
+use crate::pq_protocol::{
+    BeMessage, FeMessage, FeStartupMessage, RowDescriptor, StartupRequestCode,
+};
+use crate::replication::ReplicationConn;
+use crate::timeline::{Timeline, TimelineTools};
+use crate::WalAcceptorConf;
+use anyhow::{bail, Result};
+use bytes::BytesMut;
+use log::*;
+use std::io::{BufReader, Write};
+use std::net::{SocketAddr, TcpStream};
+use std::sync::Arc;
+
+/// A network connection that's speaking the libpq replication protocol.
+///
+/// The same socket is held twice: once wrapped in a `BufReader` for input
+/// and once directly for output (see `SendWalConn::new`).
+pub struct SendWalConn {
+    /// Timeline served by this connection. `None` until a normal startup
+    /// message has been processed in `run`.
+    pub timeline: Option<Arc<Timeline>>,
+    /// Postgres connection, buffered input
+    pub stream_in: BufReader<TcpStream>,
+    /// Postgres connection, output
+    pub stream_out: TcpStream,
+    /// The cached result of socket.peer_addr()
+    pub peer_addr: SocketAddr,
+    /// wal acceptor configuration
+    pub conf: WalAcceptorConf,
+    /// Application name taken from the startup message, if one was given
+    appname: Option<String>,
+}
+
+impl SendWalConn {
+    /// Create a new `SendWalConn`, taking ownership of `socket`.
+    ///
+    /// The socket is cloned so that input can be read through a `BufReader`
+    /// while output is written to the original handle.
+    ///
+    /// Returns an error if the peer address cannot be obtained or the socket
+    /// cannot be cloned.
+    pub fn new(socket: TcpStream, conf: WalAcceptorConf) -> Result<Self> {
+        let peer_addr = socket.peer_addr()?;
+        let conn = SendWalConn {
+            timeline: None,
+            stream_in: BufReader::new(socket.try_clone()?),
+            stream_out: socket,
+            peer_addr,
+            conf,
+            appname: None,
+        };
+        Ok(conn)
+    }
+
+    ///
+    /// Send WAL to replica or WAL receiver using standard libpq replication protocol
+    ///
+    /// Processes startup packets until a normal startup message arrives (or
+    /// the client sends a cancel request, which ends the session), then
+    /// serves queries until `START_REPLICATION` hands the connection over to
+    /// `ReplicationConn`, the client terminates, or an error occurs.
+    ///
+    pub fn run(mut self) -> Result<()> {
+        let peer_addr = self.peer_addr;
+        info!("WAL sender to {:?} is started", peer_addr);
+
+        // Handle the startup phase first.
+        //
+        // A client may open with an SSL/GSS negotiation request. After we
+        // answer it, the client still has to send its real startup message,
+        // so keep reading startup packets until we get a normal one. Falling
+        // through to the query loop right after the negotiation reply would
+        // leave `self.timeline` unset and would parse the following startup
+        // packet as a regular protocol message.
+        loop {
+            let m = FeStartupMessage::read_from(&mut self.stream_in)?;
+            trace!("got startup message {:?}", m);
+            match m.kind {
+                StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
+                    let mut buf = BytesMut::new();
+                    BeMessage::write(&mut buf, &BeMessage::Negotiate);
+                    info!("SSL requested");
+                    self.stream_out.write_all(&buf)?;
+                    // Go around again and wait for the next startup packet.
+                }
+                StartupRequestCode::Normal => {
+                    let mut buf = BytesMut::new();
+                    BeMessage::write(&mut buf, &BeMessage::AuthenticationOk);
+                    BeMessage::write(&mut buf, &BeMessage::ReadyForQuery);
+                    self.stream_out.write_all(&buf)?;
+                    self.timeline.set(m.timelineid)?;
+                    self.appname = m.appname;
+                    break;
+                }
+                StartupRequestCode::Cancel => return Ok(()),
+            }
+        }
+
+        loop {
+            let msg = FeMessage::read_from(&mut self.stream_in)?;
+            match msg {
+                FeMessage::Query(q) => {
+                    trace!("got query {:?}", q.body);
+
+                    if q.body.starts_with(b"IDENTIFY_SYSTEM") {
+                        self.handle_identify_system()?;
+                    } else if q.body.starts_with(b"START_REPLICATION") {
+                        // Create a new replication object, consuming `self`.
+                        ReplicationConn::new(self).run(&q.body)?;
+                        break;
+                    } else {
+                        bail!("Unexpected command {:?}", q.body);
+                    }
+                }
+                FeMessage::Terminate => {
+                    break;
+                }
+                _ => {
+                    bail!("unexpected message");
+                }
+            }
+        }
+        info!("WAL sender to {:?} is finished", peer_addr);
+        Ok(())
+    }
+
+    ///
+    /// Handle IDENTIFY_SYSTEM replication command
+    ///
+    /// Replies with a single row of four columns (`systemid`, `timeline`,
+    /// `xlogpos`, `dbname`), followed by command completion and
+    /// ready-for-query. `dbname` is always sent as NULL.
+    ///
+    /// Must only be called once the timeline has been set; `TimelineTools::get`
+    /// unwraps it.
+    ///
+    fn handle_identify_system(&mut self) -> Result<()> {
+        // `false`: a non-precise search, see `TimelineTools::find_end_of_wal`.
+        let (start_pos, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, false);
+        let lsn = start_pos.to_string();
+        let tli = timeline.to_string();
+        let sysid = self.timeline.get().get_info().server.system_id.to_string();
+        // All values are sent in their textual form.
+        let lsn_bytes = lsn.as_bytes();
+        let tli_bytes = tli.as_bytes();
+        let sysid_bytes = sysid.as_bytes();
+
+        let mut outbuf = BytesMut::new();
+        // Type OIDs below are PostgreSQL's built-in `text` (25) and `int4` (23).
+        BeMessage::write(
+            &mut outbuf,
+            &BeMessage::RowDescription(&[
+                RowDescriptor {
+                    name: b"systemid\0",
+                    typoid: 25,
+                    typlen: -1,
+                },
+                RowDescriptor {
+                    name: b"timeline\0",
+                    typoid: 23,
+                    typlen: 4,
+                },
+                RowDescriptor {
+                    name: b"xlogpos\0",
+                    typoid: 25,
+                    typlen: -1,
+                },
+                RowDescriptor {
+                    name: b"dbname\0",
+                    typoid: 25,
+                    typlen: -1,
+                },
+            ]),
+        );
+        BeMessage::write(
+            &mut outbuf,
+            &BeMessage::DataRow(&[Some(sysid_bytes), Some(tli_bytes), Some(lsn_bytes), None]),
+        );
+        BeMessage::write(
+            &mut outbuf,
+            &BeMessage::CommandComplete(b"IDENTIFY_SYSTEM\0"),
+        );
+        BeMessage::write(&mut outbuf, &BeMessage::ReadyForQuery);
+        // The whole response is sent with a single write.
+        self.stream_out.write_all(&outbuf)?;
+        Ok(())
+    }
+}
--- a/walkeeper/src/timeline.rs
+++ b/walkeeper/src/timeline.rs
@@ -0,0 +1,267 @@
+//! This module contains tools for managing timelines.
+//!
+
+use anyhow::{bail, Result};
+use fs2::FileExt;
+use lazy_static::lazy_static;
+use log::*;
+use pageserver::ZTimelineId;
+use postgres_ffi::xlog_utils::{find_end_of_wal, TimeLineID};
+use std::cmp::{max, min};
+use std::collections::HashMap;
+use std::fs::{self, File, OpenOptions};
+use std::io::{Seek, SeekFrom};
+use std::path::Path;
+use std::sync::{Arc, Condvar, Mutex};
+use zenith_utils::bin_ser::LeSer;
+use zenith_utils::lsn::Lsn;
+
+use crate::receive_wal::{SafeKeeperInfo, CONTROL_FILE_NAME, SK_FORMAT_VERSION, SK_MAGIC};
+use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER};
+use crate::WalAcceptorConf;
+
+/// Shared state associated with database instance (tenant)
+#[derive(Debug)]
+struct SharedState {
+    /// quorum commit LSN
+    commit_lsn: Lsn,
+    /// information about this safekeeper
+    info: SafeKeeperInfo,
+    /// open control file handle (kept open to hold the exclusive file lock);
+    /// `None` until `load_control_file` has been called
+    control_file: Option<File>,
+    /// combined hot standby feedback from all replicas
+    hs_feedback: HotStandbyFeedback,
+}
+
+impl SharedState {
+    /// Create the initial state: commit LSN zero, a fresh `SafeKeeperInfo`,
+    /// no control file, and empty hot standby feedback.
+    fn new() -> Self {
+        Self {
+            commit_lsn: Lsn(0),
+            info: SafeKeeperInfo::new(),
+            control_file: None,
+            // xmin values start at the maximum so that the `min` taken in
+            // `add_hs_feedback` adopts the first reported value.
+            hs_feedback: HotStandbyFeedback {
+                ts: 0,
+                xmin: u64::MAX,
+                catalog_xmin: u64::MAX,
+            },
+        }
+    }
+
+    /// Accumulate hot standby feedbacks from replicas
+    ///
+    /// Keeps the smallest `xmin` and `catalog_xmin` and the largest `ts`
+    /// seen so far.
+    pub fn add_hs_feedback(&mut self, feedback: HotStandbyFeedback) {
+        self.hs_feedback.xmin = min(self.hs_feedback.xmin, feedback.xmin);
+        self.hs_feedback.catalog_xmin = min(self.hs_feedback.catalog_xmin, feedback.catalog_xmin);
+        self.hs_feedback.ts = max(self.hs_feedback.ts, feedback.ts);
+    }
+
+    /// Load and lock control file (prevent running more than one instance of safekeeper)
+    ///
+    /// Does nothing if a control file is already held. Otherwise the file
+    /// `<data_dir>/<timelineid>/<CONTROL_FILE_NAME>` is opened (created if
+    /// missing), locked exclusively, and deserialized into `self.info`.
+    ///
+    /// A file that cannot be deserialized (for example a newly created,
+    /// empty one) only produces a warning and leaves `self.info` untouched.
+    ///
+    /// Returns an error if the file cannot be opened, is locked by another
+    /// process, or contains an unexpected magic or format version. In all of
+    /// these cases no handle is retained, so a later call starts from
+    /// scratch instead of reporting the file as already open.
+    pub fn load_control_file(
+        &mut self,
+        conf: &WalAcceptorConf,
+        timelineid: ZTimelineId,
+    ) -> Result<()> {
+        if self.control_file.is_some() {
+            info!("control file for timeline {} is already open", timelineid);
+            return Ok(());
+        }
+
+        let control_file_path = conf
+            .data_dir
+            .join(timelineid.to_string())
+            .join(CONTROL_FILE_NAME);
+        info!("loading control file {}", control_file_path.display());
+
+        // Report an open failure as an error rather than panicking: this
+        // function returns `Result`, and a panic here would unwind while the
+        // caller holds the timeline mutex, poisoning it.
+        let mut file = match OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .open(&control_file_path)
+        {
+            Ok(file) => file,
+            Err(e) => {
+                bail!(
+                    "Failed to open control file {:?}: {}",
+                    &control_file_path,
+                    e
+                );
+            }
+        };
+
+        // Lock file to prevent two or more active wal_acceptors
+        if let Err(e) = file.try_lock_exclusive() {
+            bail!(
+                "Control file {:?} is locked by some other process: {}",
+                &control_file_path,
+                e
+            );
+        }
+
+        match SafeKeeperInfo::des_from(&mut file) {
+            Err(e) => {
+                warn!("read from {:?} failed: {}", control_file_path, e);
+            }
+            Ok(info) => {
+                if info.magic != SK_MAGIC {
+                    bail!("Invalid control file magic: {}", info.magic);
+                }
+                if info.format_version != SK_FORMAT_VERSION {
+                    bail!(
+                        "Incompatible format version: {} vs. {}",
+                        info.format_version,
+                        SK_FORMAT_VERSION
+                    );
+                }
+                self.info = info;
+            }
+        }
+
+        // Keep the handle (and with it the lock) only after validation passed.
+        self.control_file = Some(file);
+        Ok(())
+    }
+
+    /// Write `self.info` to the start of the control file.
+    ///
+    /// When `sync` is true the file is also flushed to disk with `sync_all`.
+    ///
+    /// Returns an error if the control file has not been loaded yet, or if
+    /// seeking, serializing or syncing fails.
+    pub fn save_control_file(&mut self, sync: bool) -> Result<()> {
+        let file = match self.control_file.as_mut() {
+            Some(file) => file,
+            None => bail!("control file is not open; load_control_file must be called first"),
+        };
+        // Always overwrite from the beginning of the file.
+        file.seek(SeekFrom::Start(0))?;
+        self.info.ser_into(file)?;
+        if sync {
+            file.sync_all()?;
+        }
+        Ok(())
+    }
+}
+
+/// Database instance (tenant)
+///
+/// Pairs the mutable `SharedState` of one timeline with the condition
+/// variable that is signalled when its commit LSN advances.
+#[derive(Debug)]
+pub struct Timeline {
+    /// identifier of this timeline
+    pub timelineid: ZTimelineId,
+    /// protects the shared state; every accessor locks it
+    mutex: Mutex<SharedState>,
+    /// condition variable used to notify wal senders
+    cond: Condvar,
+}
+
+impl Timeline {
+    /// Wrap `shared_state` in a mutex and pair it with a fresh condition
+    /// variable for the given timeline id.
+    fn new(timelineid: ZTimelineId, shared_state: SharedState) -> Timeline {
+        let mutex = Mutex::new(shared_state);
+        let cond = Condvar::new();
+        Timeline {
+            timelineid,
+            mutex,
+            cond,
+        }
+    }
+
+    /// Block until the commit LSN has moved past `lsn`.
+    ///
+    /// Returns the commit LSN observed at wake-up, which is strictly
+    /// greater than `lsn`.
+    ///
+    pub fn wait_for_lsn(&self, lsn: Lsn) -> Lsn {
+        let guard = self.mutex.lock().unwrap();
+        // Keep sleeping while the commit LSN is not strictly greater than
+        // `lsn`; the comparison must be `>`, not `>=`.
+        let guard = self
+            .cond
+            .wait_while(guard, |state| !(state.commit_lsn > lsn))
+            .unwrap();
+        guard.commit_lsn
+    }
+
+    // Advance the commit LSN and wake every waiting WAL sender.
+    // Nothing happens unless `commit_lsn` is ahead of the stored value.
+    pub fn notify_wal_senders(&self, commit_lsn: Lsn) {
+        let mut state = self.mutex.lock().unwrap();
+        let advanced = state.commit_lsn < commit_lsn;
+        if advanced {
+            state.commit_lsn = commit_lsn;
+            self.cond.notify_all();
+        }
+    }
+
+    // Wake WAL senders by publishing `END_REPLICATION_MARKER` as the commit LSN.
+    fn _stop_wal_senders(&self) {
+        self.notify_wal_senders(END_REPLICATION_MARKER);
+    }
+
+    /// Return a copy of the safekeeper info held in the shared state.
+    pub fn get_info(&self) -> SafeKeeperInfo {
+        let state = self.mutex.lock().unwrap();
+        state.info.clone()
+    }
+
+    /// Replace the safekeeper info in the shared state with a copy of `info`.
+    pub fn set_info(&self, info: &SafeKeeperInfo) {
+        // Clone before taking the lock, then store the copy.
+        let replacement = info.clone();
+        self.mutex.lock().unwrap().info = replacement;
+    }
+
+    // Fold one replica's hot standby feedback into the combined value.
+    pub fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
+        self.mutex.lock().unwrap().add_hs_feedback(feedback);
+    }
+
+    /// Return a copy of the combined hot standby feedback.
+    pub fn get_hs_feedback(&self) -> HotStandbyFeedback {
+        self.mutex.lock().unwrap().hs_feedback.clone()
+    }
+
+    /// Load the control file for this timeline while holding the state lock.
+    pub fn load_control_file(&self, conf: &WalAcceptorConf) -> Result<()> {
+        self.mutex
+            .lock()
+            .unwrap()
+            .load_control_file(conf, self.timelineid)
+    }
+
+    /// Save the control file while holding the state lock; `sync` is passed
+    /// through to `SharedState::save_control_file`.
+    pub fn save_control_file(&self, sync: bool) -> Result<()> {
+        self.mutex.lock().unwrap().save_control_file(sync)
+    }
+}
+
+/// Utilities needed by various Connection-like objects
+///
+/// Implemented in this file for `Option<Arc<Timeline>>`, the form in which
+/// connections hold their timeline.
+pub trait TimelineTools {
+    /// Attach the timeline identified by `timeline_id`.
+    fn set(&mut self, timeline_id: ZTimelineId) -> Result<()>;
+    /// Return the attached timeline.
+    fn get(&self) -> &Arc<Timeline>;
+    /// Locate the end of WAL under `data_dir`, returning its position and
+    /// timeline id. `precise` selects an exact search over a coarser one.
+    fn find_end_of_wal(&self, data_dir: &Path, precise: bool) -> (Lsn, TimeLineID);
+}
+
+impl TimelineTools for Option<Arc<Timeline>> {
+    /// Attach the timeline for `timeline_id`, obtained from the global map.
+    ///
+    /// Panics if a timeline is already attached.
+    fn set(&mut self, timeline_id: ZTimelineId) -> Result<()> {
+        // The timeline is assigned exactly once: replacing it later would
+        // leave every holder of a cloned Arc pointing at the old one.
+        assert!(self.is_none());
+        let timeline = GlobalTimelines::store(timeline_id)?;
+        *self = Some(timeline);
+        Ok(())
+    }
+
+    /// Return the attached timeline; panics if none has been set.
+    fn get(&self) -> &Arc<Timeline> {
+        let attached = self.as_ref();
+        attached.unwrap()
+    }
+
+    /// Find last WAL record. If "precise" is false then just locate last partial segment
+    ///
+    /// The segment size is taken from the timeline's safekeeper info.
+    fn find_end_of_wal(&self, data_dir: &Path, precise: bool) -> (Lsn, TimeLineID) {
+        let wal_seg_size = self.get().get_info().server.wal_seg_size as usize;
+        let (end_pos, tli) = find_end_of_wal(data_dir, wal_seg_size, precise);
+        (Lsn(end_pos), tli)
+    }
+}
+
+// Process-wide map from timeline id to its shared `Timeline` object.
+// It starts empty; entries are added by `GlobalTimelines::store`.
+lazy_static! {
+    pub static ref TIMELINES: Mutex<HashMap<ZTimelineId, Arc<Timeline>>> =
+        Mutex::new(HashMap::new());
+}
+
+/// A zero-sized struct used to manage access to the global timelines map.
+///
+/// It carries no data; it only serves as a namespace for the functions that
+/// operate on `TIMELINES`.
+struct GlobalTimelines;
+
+impl GlobalTimelines {
+    /// Look up `timeline_id` in the global TIMELINES map, registering a new
+    /// timeline if it is not present yet.
+    ///
+    /// For a new timeline a directory named after the id is created first,
+    /// and the timeline starts out with a fresh `SharedState`. Returns an
+    /// error if that directory cannot be created.
+    fn store(timeline_id: ZTimelineId) -> Result<Arc<Timeline>> {
+        let mut registry = TIMELINES.lock().unwrap();
+
+        // Already registered: hand out another reference to the same object.
+        if let Some(existing) = registry.get(&timeline_id) {
+            return Ok(Arc::clone(existing));
+        }
+
+        info!("creating timeline dir {}", timeline_id);
+        // NOTE(review): this path is relative to the current working
+        // directory, whereas the control file path is built from
+        // `conf.data_dir` - confirm the process runs inside the data dir.
+        fs::create_dir_all(timeline_id.to_string())?;
+
+        let fresh_state = SharedState::new();
+        let timeline = Arc::new(Timeline::new(timeline_id, fresh_state));
+        registry.insert(timeline_id, Arc::clone(&timeline));
+        Ok(timeline)
+    }
+}
--- a/walkeeper/src/wal_service.rs
+++ b/walkeeper/src/wal_service.rs
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "workspace_hack"
+version = "0.1.0"
+edition = "2018"
+
+[target.'cfg(all())'.dependencies]
+libc = { version = "0.2", features = ["default", "extra_traits", "std"] }
+memchr = { version = "2", features = ["default", "std", "use_std"] }
+num-integer = { version = "0.1", default-features = false, features = ["std"] }
+num-traits = { version = "0.2", default-features = false, features = ["std"] }
+regex = { version = "1", features = ["aho-corasick", "default", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
+regex-syntax = { version = "0.6", features = ["default", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
+serde = { version = "1", features = ["default", "derive", "serde_derive", "std"] }
+
+[target.'cfg(all())'.build-dependencies]
+libc = { version = "0.2", features = ["default", "extra_traits", "std"] }
+memchr = { version = "2", features = ["default", "std", "use_std"] }
+proc-macro2 = { version = "1", features = ["default", "proc-macro"] }
+quote = { version = "1", features = ["default", "proc-macro"] }
+regex = { version = "1", features = ["aho-corasick", "default", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
+regex-syntax = { version = "0.6", features = ["default", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
+syn = { version = "1", features = ["clone-impls", "default", "derive", "full", "parsing", "printing", "proc-macro", "quote", "visit", "visit-mut"] }
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`pytest_plugins = ("fixtures.zenith_fixtures")`