Compare commits

162 Commits

Author SHA1 Message Date
Patrick Insinger
32dd786650 pageserver - add naive layer IDs 2021-11-01 00:20:50 -07:00
Kirill Bulatov
e6ef27637b Better API to handle timeline metadata properly 2021-10-29 23:51:40 +03:00
Patrick Insinger
b532470792 Set SO_REUSEADDR for all TCP listeners 2021-10-29 12:45:26 -07:00
Heikki Linnakangas
e0d7ecf91c Refactor 'zenith' CLI subcommand handling
Also fixes 'zenith safekeeper restart -m immediate'. The stop-mode was
previously ignored.
2021-10-29 19:01:01 +03:00
Kirill Bulatov
edba2e9744 Use a proper extension for the readme file 2021-10-28 18:55:14 +03:00
Egor Suvorov
7e552b645f Add disk write/sync metrics to Safekeeper (#745) 2021-10-28 18:38:36 +03:00
anastasia
ea5900f155 Refactoring of checkpointer and GC.
Move them to a separate tenant_threads module to detangle thread management from LayeredRepository implementation.
2021-10-27 20:50:26 +03:00
anastasia
28ab40c8b7 fix init_repo() call in register_relish_download() 2021-10-27 20:50:26 +03:00
Alexey Kondratov
d423142623 Proxy: wait for kick on .pgpass connection (zenithdb/console#227) 2021-10-27 20:24:23 +03:00
Dmitry Rodionov
1c0e85f9a0 review cleanups 2021-10-27 13:30:34 +03:00
Dmitry Rodionov
5bc09074ea add a flag to avoid non incremental size calculation in pageserver http api
This calculation is not that heavy, but it is only needed in tests, and
when the number of tenants/timelines is high it can take
noticeable time.

Resolves https://github.com/zenithdb/zenith/issues/804
2021-10-27 13:30:34 +03:00
Heikki Linnakangas
1fac4a3c91 Fix a few messages.
Pointed out by Egor in https://github.com/zenithdb/zenith/pull/788,
but I accidentally pushed that before fixing these.
2021-10-27 10:58:21 +03:00
Heikki Linnakangas
1bc917324d Use -m immediate for 'immediate' shutdown 2021-10-27 10:49:38 +03:00
Heikki Linnakangas
af429fb401 Improve 'zenith' CLI utility for safekeepers and a config file.
The 'zenith' CLI utility can now be used to launch safekeepers. By
default, one safekeeper is configured. There are new 'safekeeper
start/stop' subcommands to manage the safekeepers. Each safekeeper is
given a name that can be used to identify the safekeeper to start/stop
with the 'zenith start/stop' commands. The safekeeper data is stored
in '.zenith/safekeepers/<name>'.

The 'zenith start' command now starts the pageserver and also all
safekeepers. 'zenith stop' stops pageserver, all safekeepers, and all
postgres nodes.

Introduce new 'zenith pageserver start/stop' subcommands for
starting/stopping just the page server.

The biggest change here is to the 'zenith init' command. This adds a
new 'zenith init --config=<path to toml file>' option. It takes a toml
config file that describes the environment. In the config file, you
can specify options for the pageserver, like the pg and http ports,
and authentication. For each safekeeper, you can define a name and the
pg and http ports. If you don't use the --config option, you get a
default configuration with a pageserver and one safekeeper. Note that
that's different from the previous default of no safekeepers.  Any
fields that are omitted in the configuration file are filled with
defaults. You can also specify the initial tenant ID in the config
file. A couple of sample config files are added in the control_plane/
directory.

The --pageserver-pg-port, --pageserver-http-port, and
--pageserver-auth options to 'zenith init' are removed. Use a config
file instead.

Finally, change the python test fixtures to use the new 'zenith'
commands and the config file to describe the environment.
2021-10-27 10:49:38 +03:00
Heikki Linnakangas
710fe02d0b Return success on 'zenith stop' if the page server is already stopped. 2021-10-27 01:10:24 +03:00
Heikki Linnakangas
de87aad990 Remove a few unused functions 2021-10-27 01:10:24 +03:00
Heikki Linnakangas
41d48719e1 In python tests, skip ports that are already in use.
We've seen some failures with "Address already in use" errors in the
tests. It's not clear why, perhaps some server processes are not cleaned
up properly after test, or maybe the socket is still in TIME_WAIT state.
In any case, let's make the tests more robust by checking that the port
is free, before trying to use it.
2021-10-27 00:46:24 +03:00
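
The check described above lives in the Python test fixtures; purely as an illustration of the idea, here is a minimal Rust sketch (names are mine) that treats a port as free only if it can actually be bound:

    use std::net::TcpListener;

    /// A port is considered free only if we can actually bind to it.
    fn port_is_free(port: u16) -> bool {
        TcpListener::bind(("127.0.0.1", port)).is_ok()
    }

    /// Skip ports that are already in use (e.g. held by a leftover server
    /// process or still in TIME_WAIT) and return the first free candidate.
    fn pick_free_port(mut candidates: impl Iterator<Item = u16>) -> Option<u16> {
        candidates.find(|&p| port_is_free(p))
    }
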
Kirill Bulatov
d88377f9f0 Remove log from zenith_utils 2021-10-26 23:24:11 +03:00
Kirill Bulatov
ecd577c934 Simplify tracing declarations 2021-10-26 23:24:11 +03:00
anastasia
f43f8401ee Don't wait for wal-redo process for non-relational records replay 2021-10-26 19:30:28 +03:00
Arseny Sher
1877bbc7cb bump vendor/postgres to fix reconnection busy loop 2021-10-26 15:43:19 +03:00
Heikki Linnakangas
a064ebb64c Cope with missing 'tenantid' in '.zenith/config' file.
We generate the initial tenantid and store it in the file, so it shouldn't
be missing. But let's cope with it anyway. (This comes in handy with the bigger
changes I'm working on at https://github.com/zenithdb/zenith/pull/788)
2021-10-25 21:24:11 +03:00
Heikki Linnakangas
4726870e8d Remove obsolete comment.
We store the pageserver port in the .zenith/config file.
2021-10-25 21:16:58 +03:00
Heikki Linnakangas
3bbc106c70 Prefer long CLI option name for clarity. 2021-10-25 21:16:58 +03:00
Heikki Linnakangas
66eb081876 Improve comment on 'base_dir' 2021-10-25 21:16:58 +03:00
Kirill Bulatov
f291ab2b87 Do not panic on missing tenant 2021-10-25 18:36:30 +03:00
Heikki Linnakangas
66ec135676 Refactor pytest fixtures
Instead of having a lot of separate fixtures for setting up the page
server, the compute nodes, the safekeepers etc., have one big ZenithEnv
object that encapsulates the whole environment. Most tests use
a shared "zenith_simple_env" fixture, which contains the default setup
of a pageserver with no authentication and no safekeepers. Tests that
want to use safekeepers or authentication set up a custom test-specific
ZenithEnv fixture.

Gathering information about the whole environment into one object makes
some things simpler. For example, when a new compute node is created,
you no longer need to pass the 'wal_acceptors' connection string as
argument to the 'postgres.create_start' function. The 'create_start'
function fetches that information directly from the ZenithEnv object.
2021-10-25 14:14:47 +03:00
Heikki Linnakangas
28af3e5008 Remove some unnecessary fixture arguments 2021-10-25 14:14:45 +03:00
Heikki Linnakangas
f337d73a6c Rearrange output dirs a bit
Each test now gets its own test output directory, like
'test_output/test_foobar', even when TEST_SHARED_FIXTURES is used.
When TEST_SHARED_FIXTURES is not used, the zenith repo for each test
is created under a 'repo' subdir inside the test output dir, e.g.
'test_output/test_foobar/repo'
2021-10-25 14:14:43 +03:00
Heikki Linnakangas
57ce541521 Remove unnecessary 'pg_bin' object from 'postgres' fixture.
It was only used in check_restored_datadir_content(), and that function
can construct it easily from the other information it has.
2021-10-25 14:14:41 +03:00
Heikki Linnakangas
e14f24034f Turn a few path-fixtures to global variables
This way, they're readily accessible from the classes and functions
that are not themselves fixtures
2021-10-25 14:14:38 +03:00
Kirill Bulatov
04fb0a0342 Add core relish backup and restore functionality 2021-10-22 22:22:38 +03:00
Heikki Linnakangas
8c42dcc041 Fix safekeeper -D option.
The -D option to specify working directory was broken:

    $ mkdir foobar
    $ ./target/debug/safekeeper -D foobar
    Error: failed to open "foobar/safekeeper.log"

    Caused by:
        No such file or directory (os error 2)

This was because we both chdir'd into the specified directory, and also
prepended the directory to all the paths. So in the above example, it
actually tried to create the log file in "foobar/foobar/safekeeper.log".
Change it to work the same way as in the pageserver: chdir to the
specified directory, and leave 'workdir' always set to ".".

We wouldn't necessarily need the 'workdir' variable in the config at all,
and could assume that the current working directory is always the
safekeeper data directory, but I'd like to keep this consistent with the
pageserver. The page server doesn't assume that, for the sake of unit
tests. We don't currently have unit tests in the safekeeper that write
to disk but we might want to in the future.
2021-10-22 08:39:58 +03:00
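
A minimal sketch of the approach the commit describes, with hypothetical names (the real safekeeper argument handling differs): chdir into the data directory once, then refer to it as "." everywhere else so paths are never prefixed twice.

    use std::path::PathBuf;

    fn enter_workdir(datadir: &str) -> std::io::Result<PathBuf> {
        // chdir into the specified data directory once...
        std::env::set_current_dir(datadir)?;
        // ...and from here on treat the working directory as ".", so the
        // directory name is never prepended to paths a second time.
        Ok(PathBuf::from("."))
    }
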
Alexey Kondratov
9070a4dc02 Turn off back pressure by default 2021-10-22 01:40:43 +03:00
Egor Suvorov
86a28458c6 test_runner: use Python 3.7 in CI and improve its support (#775)
* We actually need Python 3.7 because of dataclasses
* Rerun 'pipenv lock' under Python 3.7 and add 'pipenv' to dev deps
* Update docs on developing for Python 3.7
* CircleCI: use Python 3.7 via Docker image instead of Orb
2021-10-21 20:01:29 +03:00
Egor Suvorov
c058d04250 Rename WalAcceptor to Safekeeper in most places (#741) 2021-10-21 18:26:43 +03:00
Konstantin Knizhnik
c310932121 Implement backpressure for compute node to avoid WAL overflow
Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
2021-10-21 18:15:50 +03:00
Egor Suvorov
ff563ff080 test_runner: fix mypy errors and force it on CI (#774)
* Fix bugs found by mypy
* Add some missing types and runtime checks, remove unused code
* Make ZenithPageserver start right away for better type safety
* Add `types-*` packages to Pipfile
* Pin mypy version and run it on CircleCI
2021-10-21 13:51:54 +03:00
anastasia
7f9d2a7d05 Change 'zenith tenant list' API to return tenant state added in 0dc7a3fc 2021-10-21 11:04:22 +03:00
Arthur Petukhovsky
13f4e173c9 Wait for safekeepers to catch up in test_restarts_under_load (#776) 2021-10-20 14:42:53 +03:00
Dmitry Ivanov
85116a8375 [proxy] Prevent TLS stream from hanging
This change causes writer halves of a TLS stream to always flush after a
portion of bytes has been written by `std::io::copy`. Furthermore, some
cosmetic and minor functional changes are made to facilitate debugging.
2021-10-20 14:15:49 +03:00
Egor Suvorov
e42c884c2b test_runner/README: add note on capturing logs (#778)
Became relevant after #674
2021-10-20 01:55:49 +03:00
Egor Suvorov
eb706bc9f4 Force yapf (Python code formatter) in CI (#772)
* Add yapf run to CircleCI
* Pin yapf version
* Enable `SPLIT_ALL_TOP_LEVEL_COMMA_SEPARATED_VALUES` setting
* Reformat all existing code with slight manual adjustments
* test_runner/README: note that yapf is forced
2021-10-19 20:13:47 +03:00
Dmitry Rodionov
798df756de suppress FileNotFound exception instead of missing_ok=True, because the latter was added in Python 3.8 and we claim to support >3.6 2021-10-19 17:13:42 +03:00
Dmitry Rodionov
732d13fe06 use the cached-property package because python<3.8 doesn't have cached_property in functools 2021-10-19 17:13:42 +03:00
Heikki Linnakangas
feae7f39c1 Support read-only nodes
Change 'zenith.signal' file to a human-readable format, similar to
backup_label. It can contain a "PREV LSN: %X/%X" line, or a special
value to indicate that it's OK to start with invalid LSN ('none'), or
that it's a read-only node and generating WAL is forbidden
('invalid').

The 'zenith pg create' and 'zenith pg start' commands now take a node
name parameter, separate from the branch name. If the node name is not
given, it defaults to the branch name, so this doesn't break existing
scripts.

If you pass "foo@<lsn>" as the branch name, a read-only node anchored
at that LSN is created. The anchoring is performed by setting the
'recovery_target_lsn' option in the postgresql.conf file, and putting
the server into standby mode with 'standby.signal'.

We no longer store the synthetic checkpoint record in the WAL segment.
The postgres startup code has been changed to use the copy of the
checkpoint record in the pg_control file, when starting in zenith
mode.
2021-10-19 09:48:12 +03:00
Heikki Linnakangas
c2b468c958 Separate node name from the branch name in ComputeControlPlane
This is in preparation for supporting read-only nodes. You can launch
multiple read-only nodes on the same branch, so we need an identifier
for each node, separate from the branch name.
2021-10-19 09:48:10 +03:00
Heikki Linnakangas
e272a380b4 On new repo, start writing WAL only after the initial checkpoint record.
Previously, the first WAL record on the 'main' branch overwrote the
initial checkpoint record, with invalid 'xl_prev'. That's harmless, but
also pretty ugly. I bumped into this while I was trying to tighten up the
checks for when a valid 'prev_lsn' is required. With this patch, the
first WAL record gets a valid 'xl_prev' value. It doesn't matter much
currently, but let's be tidy.
2021-10-19 09:48:04 +03:00
anastasia
0dc7a3fc15 Change tenant_mgr to use TenantState.
This avoids locking the entire TENANTS list while one tenant is bootstrapping
and prepares the code for remote storage integration.
2021-10-18 15:40:06 +03:00
Egor Suvorov
a1bc0ada59 Dockerfile: remove wal_acceptor alias for safekeeper (#743) 2021-10-18 14:56:30 +03:00
Kirill Bulatov
e9b5224a8a Fix toml serde gotchas 2021-10-18 14:14:27 +03:00
Heikki Linnakangas
bdd039a9ee S3 DELETE call returns 204, not 200.
According to the S3 API docs, the DELETE call returns code "204 No content"
on success.
2021-10-17 16:21:58 +03:00
Heikki Linnakangas
b405eef324 Avoid writing the metadata file when it hasn't changed. 2021-10-17 14:54:39 +03:00
Kirill Bulatov
ba557d126b React on sigint 2021-10-15 21:24:24 +03:00
Patrick Insinger
2dde20a227 Bump MSRV to 1.55 2021-10-15 09:10:08 -07:00
Kirill Bulatov
4ade0bb41c Refactor upload/download_relish function signatures.
This makes them more generic, by taking any Read / Write trait
implementation, instead of operating directly on a file.
2021-10-15 11:34:15 +03:00
Stas Kelvich
100da024b6 expose pageserver http socket in docker 2021-10-15 00:26:38 +03:00
Arseny Sher
de744a44dd Add /timeline http request to safekeeper returning its status,
which is mainly generational state (terms) and useful LSNs.

Also add /status basic healthcheck request which is now used in tests to
determine the safekeeper is up; this fixes #726.

ref #115
2021-10-14 19:02:38 +03:00
Heikki Linnakangas
0e026371ec Optimize WAL decoding slightly.
This adds a fast-path for the common case that the record doesn't
cross a page boundary. We now split off a new Bytes directly from the
original input buffer in that case, instead of copying the record to a
new BytesMut. Shaves about 5% of the page server's CPU time on my
laptop, in the 'test_bulk_insert' test.
2021-10-14 14:21:23 +03:00
Arthur Petukhovsky
4b87acb1f6 Use logging in python tests (#674)
* Use logging in python tests

* Use f-strings for logs

* Don't log test output while running

* Use only pytest logging handler

* Add more info about pytest logging
2021-10-14 13:10:09 +03:00
Dmitry Ivanov
43957f4401 [cross-repo-ci] Use solely commit hash to test PRs in CI
See #744 for the discussion.
2021-10-13 17:16:02 +03:00
Heikki Linnakangas
8a4f092e82 Skip syncing the temp initdb installation.
Doesn't make much difference on my laptop with SSD, but every little
helps, and with a slower disk it might be noticeable.
2021-10-13 16:59:00 +03:00
Egor Suvorov
6b6b3f68be Safekeeper metrics refactor (#747) 2021-10-13 16:28:24 +03:00
Arseny Sher
96f1175a80 Cleanup hardcoded oids. 2021-10-13 10:52:47 +03:00
Patrick Insinger
1c29de81de pageserver - remove lsn from WALRecord 2021-10-13 00:03:42 -07:00
Egor Suvorov
f658263543 Revert "Dockerfile: remove wal_acceptor alias for safekeeper"
This reverts commit 64ca947722.
2021-10-12 19:05:58 +00:00
Egor Suvorov
64ca947722 Dockerfile: remove wal_acceptor alias for safekeeper 2021-10-12 19:05:16 +00:00
Egor Suvorov
23f4c0a742 Rename wal_acceptor binary to safekeeper (#740), stage 1/2
* Rename wal_acceptor binary to safekeeper
* Rename wal_acceptor.pid and wal_acceptor.log to safekeeper.pid and safekeeper.log
* Change some mentions of WAL acceptor to safekeeper
* Dockerfile: alias wal_acceptor to safekeeper temporarily until internal scripts are updated
2021-10-12 22:03:06 +03:00
Dmitry Ivanov
7c5b99683c Speed up builds by passing make jobserver to cargo
This change brings the following improvements to our build system:

* Now BUILD_TYPE also affects rust apps.
* From now on, cargo will respect `-jN` passed via `make`. However, note
  that `rustc` may spawn multiple threads depending on compile flags.
* Cargo is able to cooperate with make to better schedule parallel jobs,
  which leads to better build times (-20s in release mode on my machine).
2021-10-12 21:02:39 +03:00
Patrick Insinger
160c4aff61 pageserver - use write guard for checkpointing 2021-10-12 10:02:15 -07:00
Patrick Insinger
6e5ca5dc5c pageserver - create TimelineWriter 2021-10-12 10:02:15 -07:00
Egor Suvorov
f3445949d1 Wal acceptor: report socket bind errors better when daemonizing (#738)
Fixes #664
2021-10-12 16:51:28 +03:00
Heikki Linnakangas
95a85312f5 Simplify code to build walredo messages.
No need to use BytesMut in these functions. Plain Vec is simpler. And
should be marginally faster too; I saw BytesMut functions previously
in 'perf' profile, consuming around 5% of the overall pageserver CPU
time. That's gone with this patch, although I don't see any discernible
difference in the overall performance test results.
2021-10-12 10:16:26 +03:00
Heikki Linnakangas
934fb8592f Detect when a checkpoint is modified in a smarter way.
Previously, the WAL receiver would make a decoded copy of the current
Checkpoint before each WAL record, and compare it with the Checkpoint
after the record had been processed. If it had changed, the checkpoint
relish was updated in the repository. That's somewhat expensive; the
Checkpoint::encode() function is visible in the 'perf' profile. Change that
so that we set a flag whenever the Checkpoint struct is modified, so that
we don't need to compare the whole struct anymore.
2021-10-12 09:09:10 +03:00
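
A minimal sketch of the flag-on-modification pattern the commit describes; the wrapper type and its names are illustrative, not the actual pageserver code. Any mutable access marks the value dirty, so no before/after comparison is needed.

    use std::ops::{Deref, DerefMut};

    struct Tracked<T> {
        value: T,
        dirty: bool,
    }

    impl<T> Deref for Tracked<T> {
        type Target = T;
        fn deref(&self) -> &T {
            &self.value
        }
    }

    impl<T> DerefMut for Tracked<T> {
        // Any mutable borrow is assumed to modify the value, so set the flag.
        fn deref_mut(&mut self) -> &mut T {
            self.dirty = true;
            &mut self.value
        }
    }

    impl<T> Tracked<T> {
        /// Report whether the value was modified since the last check,
        /// and reset the flag.
        fn take_dirty(&mut self) -> bool {
            std::mem::replace(&mut self.dirty, false)
        }
    }
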
Dmitry Ivanov
bb239b4f69 [Makefile] Set default build type to debug 2021-10-11 17:08:31 +03:00
Dmitry Ivanov
1cd7900790 [Makefile] Make build type detection more precise
Previously, typos like `BUILD_TYPE=rlease` would silently
lead to building debug binaries. The current approach is also
more future-proof, since we might add `profile`, `valgrind`
as well as other build types.
2021-10-11 17:03:51 +03:00
Arseny Sher
8c61c3e54e Minor safekeeper readme fix. 2021-10-11 16:31:44 +03:00
anastasia
d7c9dd06f4 Implement graceful shutdown at 'pageserver stop':
- perform checkpoint for each tenant repository.
- wait for the completion of all threads.

Add new option 'immediate' to 'pageserver stop' command to terminate the pageserver immediately.
2021-10-11 13:35:01 +03:00
Heikki Linnakangas
b9119f11bf Add perf test case for buffering GiST build.
When a WAL record affects multiple pages, we currently duplicate the
record for each affected page. That's a bit wasteful, but not too bad
for b-tree splits and non-hot heap updates that affect two pages. But
a buffered GiST index build WAL-logs the whole relation in 32-page chunks,
with one giant WAL record for each chunk. Currently we duplicate
that giant record for each of the 32 pages, which is really wasteful.

Github issue https://github.com/zenithdb/zenith/issues/720 tracks the
problem. This commit adds a test case to demonstrate it.
2021-10-11 11:10:58 +03:00
Heikki Linnakangas
7216f22609 Use tracing crate to have more context in log messages.
Whenever we start processing a request, we now enter a tracing "span"
that includes context information like the tenant and timeline ID, and
the operation we're performing. That context information gets attached
to every log message we create within the span. That way, we don't need
to include basic context information like that in every log message, and
it also becomes easier to filter the logs programmatically.

This removes the explicit timeline and tenant IDs from most log messages,
as you get that information from the enclosing span now.

Also improve log messages in general, dialing down the level of some
messages that are not very useful, and adding information to others.

We now obey the RUST_LOG env variable, if it's set.

The 'tracing' crate allows for different log formatters, like JSON or
bunyan output. The one we use now is a human-readable multi-line format,
which is nice when reading the log directly, but hard to
post-process. For production, we'll probably want JSON output and
some tools for working with it, but that's left as a TODO. The log
format is easy to change.
2021-10-11 08:59:06 +03:00
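
A minimal sketch of the span-per-request pattern the commit describes, assuming the `tracing` and `tracing_subscriber` crates (with the env-filter feature); the span and field names are illustrative:

    use tracing::{info, info_span};

    fn handle_get_page(tenant_id: &str, timeline_id: &str) {
        // Enter a span; its fields are attached to every log line emitted
        // inside it, so messages don't need to repeat the IDs themselves.
        let _span = info_span!("get_page", tenant = %tenant_id, timeline = %timeline_id).entered();
        info!("processing request");
    }

    fn main() {
        // Obey the RUST_LOG environment variable, as mentioned above.
        tracing_subscriber::fmt()
            .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
            .init();
        handle_get_page("c03ba6b7ad4c5e9cf556f059ade44229", "5b014a9e41b4b63ce1a1febc04503636");
    }
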
Kirill Bulatov
bf58f7f649 Expose certain layered repository structs to reuse in relish storage (#688) 2021-10-09 19:23:57 +03:00
Patrick Insinger
3f0ebc6a40 pageserver - move early File::open call 2021-10-09 08:45:52 -07:00
Patrick Insinger
0baf4bc796 fix cargo doc complaints 2021-10-09 08:45:46 -07:00
Patrick Insinger
c356030660 pageserver - use VecMap for delta metadata & sizes 2021-10-08 15:05:22 -07:00
Patrick Insinger
c4bb6d78d4 pageserver - use VecMap for in memory segsizes 2021-10-08 14:37:32 -07:00
Patrick Insinger
3b82e806f2 pageserver - use VecMap for in-memory PageVersions 2021-10-08 14:11:07 -07:00
Egor Suvorov
403d9779d9 safekeeper: add initial metrics and HTTP handler (#699, #541)
* `wal_acceptor`: add HTTP handler, /metrics endpoint only, no authentication
* Two gauges are currently reported: `flush_lsn` and `commit_lsn`
* Add `DEFAULT_PG_LISTEN_PORT` and `DEFAULT_HTTP_LISTEN_PORT` consts for uniformity
2021-10-08 18:55:41 +03:00
Patrick Insinger
b3b8f18f61 tests - fix get_timeline_size signature 2021-10-07 15:38:22 -07:00
Heikki Linnakangas
960c7d69a8 Remove 'predecessor' reference from in-memory and delta layers.
The caller is now responsible for looking up the predecessor layer,
instead. This makes the code simpler, as you don't need to update the
predecessor reference when a layer is frozen or written to disk.

There was a bug in that, as Konstantin noted on discord:

    Assume that freeze doesn't create new inmem layer
    (maybe_new_open=None). Then we temporary place in historics frozen
    layer. Assume that now new put_wal_record request arrives. There is
    no open in-mem layer, so it has to create new one. It is looking for
    previous layer for read and set it as new in-mem layer
    predecessor. But as far as I understand, prev layer should be our
    temporary frozen layer. Which will be then removed from
    historics.

That leaves the predecessor field of the new in-memory layer pointing
at the frozen in-memory layer that has been removed from the layer map,
preventing it from being removed from memory.

This makes two subtle changes:

1. When the first new layer is created on a branch for a segment that
   existed on the ancestor branch, the start_lsn of the new layer is now
   the branch point + 1. We were previously slightly confused on what
   the branch point LSN meant. It means that all the WAL up to and
   *including* the LSN on the old branch is visible to the new branch.
   If we mark the start LSN of the new layer as equal to the branch point,
   that's wrong, because if there is a WAL record with that LSN on the
   predecessor layer, the new layer would hide it. This bug was hidden
   when the layer on the new branch contained a direct reference to the
   layer in the old branch, as get_page_reconstruct_data() followed that
   reference directly when it didn't find the page version in the new
   layer. But now that the caller performs the lookup, it will look up
   the new layer that doesn't contain the record, and you get an error.

2. InMemoryLayer now always stores the segment size at the beginning
   of the layer's LSN range. Previously, get_seg_size() might have
   recursed into the predecessor layer to get the size, but now we
   avoid that by always copying over the last size from the previous
   layer, when a new layer is created.
2021-10-08 00:54:13 +03:00
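
A rough sketch of the caller-driven lookup that replaces the stored predecessor reference; every type and method name below is hypothetical, for illustration only.

    #[derive(Clone, Copy)]
    struct Lsn(u64);

    enum Reconstruct {
        Complete(Vec<u8>), // page image fully reconstructed
        Continue(Lsn),     // need older data: continue the lookup below this LSN
    }

    trait Layer {
        fn get_page_reconstruct_data(&self, lsn: Lsn) -> Reconstruct;
    }

    // The caller walks back through the layer map itself, instead of each
    // layer holding a reference to its predecessor.
    fn reconstruct_page<L: Layer>(
        lookup: impl Fn(Lsn) -> Option<L>,
        mut lsn: Lsn,
    ) -> Option<Vec<u8>> {
        loop {
            let layer = lookup(lsn)?;
            match layer.get_page_reconstruct_data(lsn) {
                Reconstruct::Complete(img) => return Some(img),
                Reconstruct::Continue(older) => lsn = older,
            }
        }
    }
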
Heikki Linnakangas
60dae0b4ac Add test case that demonstrates Write Amplification. 2021-10-08 00:34:29 +03:00
Heikki Linnakangas
c660926a06 Refactor duplicated code to get on-disk timeline size in tests.
Move it to a common function. In passing, remove the obsolete check
to exclude the 'wal' directory. The 'wal' directory is no more.
2021-10-08 00:34:26 +03:00
Egor Suvorov
7fa04e2d14 zenith_metrics: exit process on config errors (#706) 2021-10-08 00:14:56 +03:00
Heikki Linnakangas
db4059cd6d Measure peak memory usage in perf test.
Another useful metric to keep an eye on.
2021-10-07 18:03:20 +03:00
Heikki Linnakangas
fdb19fdb92 Remove unused function.
The caller was removed in commit acc0f41985.
2021-10-07 11:24:27 +03:00
Heikki Linnakangas
53b4dc944d Don't create unused "wal" directory
It hasn't been used since commit ca9af37478.
2021-10-07 10:36:26 +03:00
MMeent
a03e1b3895 Docker build now also uses BUILD_TYPE=release. (#712)
The dockerignore and dockerfile have also been excluded from being copied into
docker images, avoiding docker layer cache busts if only those files are changed.
2021-10-06 23:42:00 +02:00
Heikki Linnakangas
15f1bcc9c2 Remove obsolete code, now that we don't load WAL from local disk anymore.
Commit ca9af37478 removed the import_timeline_wal() call from here.
After that, the info!() message is bogus, as we no longer load the WAL
from local disk. Also, the logical size assertion is pointless now.
2021-10-06 15:59:28 +03:00
MMeent
24580f2493 Improve build system: (#703)
- Build postgresql with -O2 for releases
 - Make `make` build postgresql with 8 parallel threads
   The CI node is xlarge, so it has 8 vCPUs available
2021-10-06 14:37:27 +02:00
Heikki Linnakangas
e3945d94fd Store unlogged tables locally, and replace PD_WAL_LOGGED.
All the changes are in the vendor/postgres side. However, because we now
generate fewer Full Page Writes, the 'branch_behind' test needs to be
modified so that it still generates enough WAL to consume a few WAL
segments.
2021-10-06 10:58:15 +03:00
Heikki Linnakangas
d806c3a47e pageserver - serialize PageVersion as it is
Removes the need for PageVersionMeta struct.
2021-10-05 11:07:50 -07:00
Egor Suvorov
05fe39088b Readme updates based on a fresher Ubuntu installation experience (#627) 2021-10-05 19:19:25 +03:00
Egor Suvorov
530d3eaf09 Add more details to pageserver and safekeeper docs (#680) 2021-10-05 19:10:50 +03:00
Egor Suvorov
7e190d72a5 Make pageserver_ prefix for common metric names configurable (#681) 2021-10-05 19:06:44 +03:00
Patrick Insinger
9c936034b6 pageserver - fix newer clippy lints 2021-10-05 00:28:14 -07:00
Kirill Bulatov
5719f13cb2 Rework the relish thread model (#689) 2021-10-05 10:15:56 +03:00
Patrick Insinger
d134a9856e pageserver - introduce RepoHarness for testing 2021-10-04 08:36:35 -07:00
Patrick Insinger
664b99b5ac pageserver - use constant TIMELINE_ID for tests 2021-10-04 08:36:35 -07:00
Arseny Sher
4256231eb7 Enable test_start_compute with safekeepers.
It should work now.
2021-10-04 16:50:46 +03:00
Andrey Taranik
ae27490281 wal_acceptors added to tenant creation tests 2021-10-04 08:58:49 +03:00
Andrey Taranik
fbd8ca2ff4 minor code beautification 2021-10-04 08:58:49 +03:00
Andrey Taranik
ec673a5d67 bulk tenant create test added 2021-10-04 08:58:49 +03:00
Max Sharnoff
7fab38c51e Use threadlocal for walreceiver check (#692) 2021-10-01 15:47:45 -07:00
Max Sharnoff
84f7dcd052 Fix clippy errors on nightly (2021-09-29) (#691)
Most of the changes are for the new if-then-panic lint added in
https://github.com/rust-lang/rust-clippy/pull/7669.
2021-10-01 15:45:42 -07:00
Patrick Insinger
7095a5d551 pageserver - reject and backup future layer files
If a layer file is found with an LSN after the disk_consistent_lsn, it is
renamed (to avoid conflicts with new layer files) and a warning is logged.
2021-10-01 11:41:39 -07:00
Patrick Insinger
538c2a2a3e pageserver - store timeline metadata durably
The metadata file is now always 512 bytes. The last 4 bytes are a
crc32c checksum of the previous 508 bytes. Padding zeroes are added
between the serde serialization and the start of the checksum.

A single write call is used, and the file is fsyncd after.
On file creation, the parent directory is fsyncd as well.
2021-10-01 11:41:39 -07:00
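
A minimal sketch of the on-disk format described above, assuming the `crc32c` crate for the checksum (the byte order of the stored checksum and the error handling are my assumptions):

    use std::io::Write;

    const METADATA_FILE_SIZE: usize = 512;
    const CHECKSUM_SIZE: usize = 4;

    fn write_metadata(path: &std::path::Path, body: &[u8]) -> anyhow::Result<()> {
        anyhow::ensure!(
            body.len() <= METADATA_FILE_SIZE - CHECKSUM_SIZE,
            "metadata too large"
        );

        // Pad the serialized body with zeroes up to byte 508, then append a
        // crc32c checksum of those 508 bytes as the last 4 bytes.
        let mut buf = vec![0u8; METADATA_FILE_SIZE];
        buf[..body.len()].copy_from_slice(body);
        let crc = crc32c::crc32c(&buf[..METADATA_FILE_SIZE - CHECKSUM_SIZE]);
        buf[METADATA_FILE_SIZE - CHECKSUM_SIZE..].copy_from_slice(&crc.to_le_bytes());

        // A single write call, then fsync the file.
        let mut file = std::fs::File::create(path)?;
        file.write_all(&buf)?;
        file.sync_all()?;

        // On creation, fsync the parent directory too, so the new directory
        // entry itself is durable.
        if let Some(parent) = path.parent() {
            std::fs::File::open(parent)?.sync_all()?;
        }
        Ok(())
    }
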
Patrick Insinger
62f83869f1 pageserver - fsync image/delta layers
Ensure image and delta layer files are durable.
Also, fsync the parent directory to ensure the directory entries are
durable.
2021-10-01 11:41:39 -07:00
Patrick Insinger
69670b61c4 pageserver - use crashsafe_dir utility
Replace usage of std::fs::create_dir/create_dir_all with crashsafe
equivalents.
2021-10-01 11:41:39 -07:00
Patrick Insinger
0a8aaa2c24 zenith_utils - add crashsafe_dir
Utility for creating directories and directory trees in a crash-safe
manner.

Minimizes calls to fsync for trees.
2021-10-01 11:41:39 -07:00
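
A minimal sketch of crash-safe directory creation along the lines described above (names are illustrative; the real zenith_utils API may differ): create the directory, fsync it, then fsync its parent so the new directory entry survives a crash.

    use std::fs::{self, File};
    use std::path::Path;

    fn create_dir_crashsafe(path: &Path) -> std::io::Result<()> {
        fs::create_dir(path)?;
        // fsync the new directory itself...
        File::open(path)?.sync_all()?;
        // ...and its parent, so the entry pointing at the new directory is
        // durable as well (works on Unix, where a directory can be opened
        // and fsynced like a file).
        if let Some(parent) = path.parent() {
            File::open(parent)?.sync_all()?;
        }
        Ok(())
    }
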
Heikki Linnakangas
e474790400 Print more details on errors to log
Fixes https://github.com/zenithdb/zenith/issues/661
2021-10-01 17:57:41 +03:00
Alexey Kondratov
2c99e2461a Allow usage of the compute hostname in the proxy 2021-10-01 16:24:35 +03:00
Stas Kelvich
cf8e27a554 Proxy: pass database name in console too 2021-10-01 14:27:52 +03:00
Kirill Bulatov
287ea2e5e3 Limit concurrent relish storage sync operations 2021-10-01 08:37:09 +03:00
Heikki Linnakangas
86e14f2f1a Bump vendor/postgres 2021-09-30 20:36:57 +03:00
Arseny Sher
adbae62281 Rename SharedState.commit_lsn to notified_commit_lsn.
ref #682
2021-09-30 17:29:15 +03:00
Egor Suvorov
3127a4a13b Safekeeper::Storage::write_wal: clarify behavior (#679)
It previously took &SafeKeeperState similar to persist(), but only for its
`server` member.
Now it takes &ServerInfo only, so it's clear the state is not persisted.
Also added a comment about sync.
2021-09-29 19:58:30 +03:00
Egor Suvorov
6d993410c9 docs/README: fix link to walkeeper's README (#677) 2021-09-29 14:40:16 +03:00
Kirill Bulatov
fb05e4cb0b Show better error messages on pageserver failures 2021-09-29 01:55:41 +03:00
Egor Suvorov
b0a7234759 pageserver: fix stale default listen addrs
* In command line help
* In dummy_conf
2021-09-28 20:57:51 +03:00
Egor Suvorov
ddf4b15ebc pageserver: use const_format crate to generate default listen addrs 2021-09-28 20:57:51 +03:00
Egor Suvorov
3065532f15 pageserver: fix mistype in listen-http arg help 2021-09-28 20:57:51 +03:00
Arthur Petukhovsky
d6fc74a412 Various fixes for test_sync_safekeepers (#668)
* Send ProposerGreeting manually in tests

* Move test_sync_safekeepers to test_wal_acceptor.py

* Capture test_sync_safekeepers output

* Add comment for handle_json_ctrl

* Save captured output in CI
2021-09-28 19:25:05 +03:00
Arseny Sher
7a370394a7 Wait till previous victim recovers in run_restarts_under_load.
Fixes test flakiness, as recovery easily might take the whole iteration.
2021-09-28 19:15:41 +03:00
Stas Kelvich
0f3cf8ac94 Cleanup Dockerfile.
* make .dockerignore `ncdu -X` compatible to easily inspect build context
* remove cargo-chef as it was introducing more problems than it was solving
* remove rocksdb packages
* add ca-certs in the resulting image. We need that to be able to make https
  connections from the proxy container to the console.
2021-09-28 18:26:20 +03:00
Heikki Linnakangas
014be8b230 Use Iterator, to avoid making one copy of page_versions BTreeMap
Reduces the CPU time spent in checkpointing, in the write_to_disk()
function.
2021-09-27 19:28:02 +03:00
Heikki Linnakangas
08978458be Refactor write_to_disk, handling dropped segment as a special case.
Similar to what commit 7fb7f67b did to 'freeze', dealing with the
dropped segment separately from the rest of the logic makes the code
easier to follow. It is also needed by the next commit that replaces
the code to build the new BTreeMap with an iterator; we cannot pass one
of two kinds of closures as an argument, it always has to be the same one.
Having separate DeltaLayer::create() calls for the dropped-segment case
and the other cases works around that.
2021-09-27 19:23:32 +03:00
Heikki Linnakangas
2252d9faa8 Switch to RwLock in InMemoryLayer
Allows more parallelism basically for free.
2021-09-27 19:15:40 +03:00
Arthur Petukhovsky
22e15844ae Fix clippy errors (#673) 2021-09-27 18:59:30 +03:00
Konstantin Knizhnik
ca9af37478 Do not write WAL at pageserver (#645)
* Do not write WAL at pageserver

* Remove import_timeline_wal function
2021-09-27 14:15:55 +03:00
Stas Kelvich
aae41e8661 Proxy pass for existing users.
Ask console to check per-cluster auth info.
2021-09-27 11:56:43 +03:00
Stas Kelvich
8331ce865c Intercept and log errors in the mgmt interface.
That PostgresBackend had better be replaced with an http server or a redis
subscription. For now, let's improve logging and move on.
2021-09-27 11:56:43 +03:00
Stas Kelvich
3bac4d485d Fix EncryptionResponse message in pq_proto.rs
A positive EncryptionResponse should send the 'S' byte, not 'Y'. With that
fix it is possible to connect to the proxy with SSL enabled and read
the deciphered notice text. But after the first query everything gets stuck.
2021-09-27 11:56:43 +03:00
Stas Kelvich
f84eaf4f05 Leave only pkcs8 keys support for proxy.
The rsa_private_keys() function returns an empty vector when it tries to read
a pkcs8-encoded file, instead of returning an error. So the previous check was
failing on pkcs8. Leave only pkcs8 for now.
2021-09-27 11:56:43 +03:00
Arseny Sher
70b08923ed Disable new safekeepers tests as not stable enough. 2021-09-26 22:33:58 +03:00
Heikki Linnakangas
c846a824de Bump vendor/postgres, to use buffered I/O in WAL redo process.
Greatly reduces the CPU overhead in the WAL redo process.
2021-09-24 21:48:30 +03:00
Heikki Linnakangas
b71e3a40e2 Add more details to the log, when an error happens in GetPage request. 2021-09-24 21:44:22 +03:00
Heikki Linnakangas
41dfc117e7 Buffer the writes to the WAL redo process pipe.
Reduces the CPU time spent in the write() syscalls. I noticed that we were
spending a lot of CPU time in libc::write, coming from request_redo(), in
the 'bulk_insert' test. According to some quick profiling with 'perf',
this reduces the CPU time spent in request_redo() from about 30% to 15%.

For some reason, it doesn't reduce the overall runtime of the 'bulk_insert'
test much, maybe by one second if you squint (from about 37s to 36s), so
there must be some other bottleneck, like I/O. But this is surely still
a good idea, just based on the reduced CPU cycles.
2021-09-24 21:12:38 +03:00
sharnoff
a72707b8cb Redo #655 with fix: Allow LeSer/BeSer impls missing either Serialize or Deserialize
Commit message copied below:

* Allow LeSer/BeSer impls missing Serialize/Deserialize

Currently, using `LeSer` or `BeSer` requires that the type implements
both `Serialize` and `DeserializeOwned`, even if we're only using the
trait for one of those functionalities.

Moving the bounds to the methods gives the convenience of the traits
without requiring unnecessary derives.

* Remove unused #[derive(Serialize/Deserialize)]

This should hopefully reduce compile times - if only by a little bit.

Some of these were already unused (we weren't using LeSer/BeSer for the
types), but most have *become* unused with the change to
LeSer/BeSer.
2021-09-24 10:58:01 -07:00
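
A minimal sketch of the pattern the commit message describes, moving the Serialize/DeserializeOwned bounds from the trait to the individual methods; the trait name and the use of plain bincode here are illustrative (the real LeSer/BeSer traits fix a byte order):

    use serde::{de::DeserializeOwned, Serialize};

    trait Ser {
        // The bound lives on the method, so a type that only derives
        // Deserialize can still use the trait for decoding, and vice versa.
        fn ser(&self) -> bincode::Result<Vec<u8>>
        where
            Self: Serialize,
        {
            bincode::serialize(self)
        }

        fn des(buf: &[u8]) -> bincode::Result<Self>
        where
            Self: DeserializeOwned + Sized,
        {
            bincode::deserialize(buf)
        }
    }

    // Blanket impl: every type gets the methods whose bounds it satisfies.
    impl<T> Ser for T {}
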
Max Sharnoff
0f770967b4 Revert "Allow LeSer/BeSer impls missing either Serialize or Deserialize (#655)
This reverts commit bd9f4794d9.
2021-09-24 10:18:36 -07:00
Max Sharnoff
bd9f4794d9 Allow LeSer/BeSer impls missing either Serialize or Deserialize (#655)
* Allow LeSer/BeSer impls missing Serialize/Deserialize

Currently, using `LeSer` or `BeSer` requires that the type implements
both `Serialize` and `DeserializeOwned`, even if we're only using the
trait for one of those functionalities.

Moving the bounds to the methods gives the convenience of the traits
without requiring unnecessary derives.

* Remove unused #[derive(Serialize/Deserialize)]

This should hopefully reduce compile times - if only by a little bit.

Some of these were already unused (we weren't using LeSer/BeSer for the
types), but most have *become* unused with the change to
LeSer/BeSer.
2021-09-24 10:06:03 -07:00
Heikki Linnakangas
ff5cbe2694 Support overlapping and nested Layers in the layer map.
This introduces a new tree data structure for holding intervals, and
queries of the form "which intervals contain the given point?". It then
uses that to store the Layers in the layer map, instead of the BTreeMap.

While we don't currently create overlapping layers in the page server,
that situation might arise in the future if we start to create extra
layers for performance purposes, or as part of some multi-stage
garbage collection operation that creates new layers in some interval
and then removes old ones. The situation might also arise if you have
multiple page servers running on the same timeline, freezing layers at
different points, and both uploading them to S3.

So even though overlapping layers might not happen currently, let's
avoid getting confused if they do happen for some reason.

Fixes https://github.com/zenithdb/zenith/issues/517.
2021-09-24 14:10:52 +03:00
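
A naive sketch of the query the new structure answers, "which intervals contain the given point?"; this is a plain vector scan for illustration, whereas the commit introduces a proper tree for the same query:

    use std::ops::Range;

    struct IntervalSet<T> {
        items: Vec<(Range<u64>, T)>,
    }

    impl<T> IntervalSet<T> {
        /// Return every stored value whose interval contains `point`.
        /// Overlapping and nested intervals are all reported.
        fn containing(&self, point: u64) -> impl Iterator<Item = &T> {
            self.items
                .iter()
                .filter(move |(range, _)| range.contains(&point))
                .map(|(_, value)| value)
        }
    }
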
Heikki Linnakangas
2319e0ec8f Define a layer's start and end bounds more precisely.
After this, a layer's start bound is always defined to be inclusive, and
end bound exclusive.

For example, if you have a layer in the range 100-200, that layer can be
used for GetPage@LSN requests at LSN 100, 199, or anything in between.
But for LSN 200, you need to look at the next layer (if one exists).

This is one part of a fix for https://github.com/zenithdb/zenith/issues/517.
After this, the page server shouldn't create layers for the same segment
with the same LSN, which avoids the issue. However, the same thing would
still happen if you managed to create layers with the same start LSN again.
That could happen e.g. if you had two page servers running, or in some
weird crash/restart scenario, or due to bugs or features added later. The
next commit makes the layer map more robust, so that it tolerates that
situation without deleting wrong files.
2021-09-24 14:10:49 +03:00
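
The rule above, as a trivial sketch: the start bound is inclusive and the end bound exclusive.

    use std::ops::Range;

    /// A layer covering LSNs 100..200 answers GetPage@LSN for 100 and 199,
    /// but LSN 200 belongs to the next layer (if one exists).
    fn layer_covers(layer: &Range<u64>, lsn: u64) -> bool {
        layer.start <= lsn && lsn < layer.end
    }

    fn main() {
        let layer = 100u64..200;
        assert!(layer_covers(&layer, 100));
        assert!(layer_covers(&layer, 199));
        assert!(!layer_covers(&layer, 200));
    }
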
Arthur Petukhovsky
d4e037f1e7 Support for --sync-safekeepers in tests (#647)
A new command has been added to append specially crafted records to safekeeper WAL. This command takes JSON describing the append, encodes a LogicalMessage based on the JSON fields, and processes a new AppendRequest to append and commit WAL in the safekeeper.

The Python test starts up walkeepers and creates a config for walproposer, then appends WAL and checks that --sync-safekeepers works without errors. This test is the simplest one; more useful test cases (like in #545) for different setups will be added soon.
2021-09-24 13:19:59 +03:00
Max Sharnoff
139936197a bump vendor/postgres: Catch walkeeper ErrorResponse (#650)
Postgres commit message:

PQgetCopyData can sometimes indicate that the copy is done if the
backend returns an error response. So while we still expect that the
walkeeper never sends CopyDone, we can't expect it to never produce
errors.
2021-09-23 14:55:38 -07:00
Heikki Linnakangas
d4eed61f57 Refactor code for parsing and creating postgresql.conf.
There's surely more that could be done, but this makes it a bit more
readable at least.
2021-09-23 19:34:27 +03:00
Patrick Insinger
7db3a9e7d9 walredo - don't use RefCell on stdin/stdout 2021-09-23 08:42:58 -07:00
Patrick Insinger
c81ee3bd5b Add some comments to the checkpoint process 2021-09-23 13:19:45 +03:00
anastasia
7fb7f67bb4 Fix relish extension after it was dropped or truncated.
- Turn dropped layers non-writeable in get_layer_for_write().

- Handle non-writeable dropped layers in the checkpointer. They don't need freezing, so just remove them from the list of open_segs and write them out to disk.

- Remove code that handles dropped layers in freeze() function. It is not used anymore.
2021-09-23 13:19:45 +03:00
anastasia
86164c8b33 Add unit tests for drop_lsn.
test_drop_extend and test_truncate_extend illustrate what happens if we dropped a segment and then created it again within the same layer.
2021-09-23 13:19:45 +03:00
Arseny Sher
97c4cd4434 bump vendor/postgres 2021-09-23 12:22:53 +03:00
anastasia
a4fc6da57b Fix gc_internal to treat dropped layers.
Some dropped layers serve as tombstones for earlier layers and thus cannot be garbage collected.
Add new fields to GcResult for layers that are preserved as tombstones
2021-09-23 12:21:47 +03:00
anastasia
c934e724a8 Enable test_list_rels_drop test 2021-09-23 12:21:47 +03:00
anastasia
e554f9514f gc refactoring
- rename 'compact' argument of GC to 'checkpoint_before_gc'.
- gc_iteration_internal() refactoring
2021-09-23 12:21:47 +03:00
138 changed files with 12400 additions and 4748 deletions

View File

@@ -1,13 +1,13 @@
version: 2.1
orbs:
python: circleci/python@1.4.0
executors:
zenith-build-executor:
resource_class: xlarge
docker:
- image: cimg/rust:1.52.1
- image: cimg/rust:1.55.0
zenith-python-executor:
docker:
- image: cimg/python:3.7.10 # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CirlceCI
jobs:
check-codestyle:
@@ -24,6 +24,12 @@ jobs:
# A job to build postgres
build-postgres:
executor: zenith-build-executor
parameters:
build_type:
type: enum
enum: ["debug", "release"]
environment:
BUILD_TYPE: << parameters.build_type >>
steps:
# Checkout the git repo (circleci doesn't have a flag to enable submodules here)
- checkout
@@ -39,7 +45,7 @@ jobs:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
# FIXME We could cache our own docker container, instead of installing packages every time.
- run:
@@ -59,12 +65,12 @@ jobs:
if [ ! -e tmp_install/bin/postgres ]; then
# "depth 1" saves some time by not cloning the whole repo
git submodule update --init --depth 1
make postgres
make postgres -j8
fi
- save_cache:
name: Save postgres cache
key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
paths:
- tmp_install
@@ -96,7 +102,7 @@ jobs:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
- restore_cache:
name: Restore rust cache
@@ -104,7 +110,7 @@ jobs:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
- v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
# Build the rust code, including test binaries
- run:
@@ -122,7 +128,7 @@ jobs:
- save_cache:
name: Save rust cache
key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
paths:
- ~/.cargo/registry
- ~/.cargo/git
@@ -176,9 +182,27 @@ jobs:
paths:
- "*"
check-python:
executor: zenith-python-executor
steps:
- checkout
- run:
name: Install deps
working_directory: test_runner
command: pipenv --python 3.7 install --dev
- run:
name: Run yapf to ensure code format
when: always
working_directory: test_runner
command: pipenv run yapf --recursive --diff .
- run:
name: Run mypy to check types
when: always
working_directory: test_runner
command: pipenv run mypy .
run-pytest:
#description: "Run pytest"
executor: python/default
executor: zenith-python-executor
parameters:
# pytest args to specify the tests to run.
#
@@ -213,11 +237,9 @@ jobs:
steps:
- run: git submodule update --init --depth 1
- run:
name: Install pipenv & deps
name: Install deps
working_directory: test_runner
command: |
pip install pipenv
pipenv install
command: pipenv --python 3.7 install
- run:
name: Run pytest
working_directory: test_runner
@@ -239,13 +261,13 @@ jobs:
#
# The junit.xml file allows CircleCI to display more fine-grained test information
# in its "Tests" tab in the results page.
# -s prevents pytest from capturing output, which helps to see
# what's going on if the test hangs
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
# -n4 uses four processes to run tests via pytest-xdist
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
- run:
# CircleCI artifacts are preserved one file at a time, so skipping
# this step isn't a good idea. If you want to extract the
@@ -254,7 +276,7 @@ jobs:
when: always
command: |
du -sh /tmp/test_output/*
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" -delete
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
du -sh /tmp/test_output/*
- store_artifacts:
path: /tmp/test_output
@@ -319,8 +341,7 @@ jobs:
\"inputs\": {
\"ci_job_name\": \"zenith-remote-ci\",
\"commit_hash\": \"$CIRCLE_SHA1\",
\"remote_repo\": \"$LOCAL_REPO\",
\"zenith_image_branch\": \"$CIRCLE_BRANCH\"
\"remote_repo\": \"$LOCAL_REPO\"
}
}"
@@ -328,14 +349,19 @@ workflows:
build_and_test:
jobs:
- check-codestyle
- build-postgres
- check-python
- build-postgres:
name: build-postgres-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
- build-zenith:
name: build-zenith-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
requires:
- build-postgres
- build-postgres-<< matrix.build_type >>
- run-pytest:
name: pg_regress-tests-<< matrix.build_type >>
matrix:

View File

@@ -2,12 +2,17 @@
**/__pycache__
**/.pytest_cache
/target
/tmp_check
/tmp_install
/tmp_check_cli
/test_output
/.vscode
/.zenith
/integration_tests/.zenith
/Dockerfile
.git
target
tmp_check
tmp_install
tmp_check_cli
test_output
.vscode
.zenith
integration_tests/.zenith
.mypy_cache
Dockerfile
.dockerignore

Cargo.lock (generated), 670 changed lines

File diff suppressed because it is too large

View File

@@ -10,39 +10,21 @@ FROM zenithdb/build:buster AS pg-build
WORKDIR /zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
ENV BUILD_TYPE release
RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
#
# Calculate cargo dependencies.
# This will always run, but only generate recipe.json with list of dependencies without
# installing them.
#
FROM zenithdb/build:buster AS cargo-deps-inspect
WORKDIR /zenith
COPY . .
RUN cargo chef prepare --recipe-path /zenith/recipe.json
#
# Build cargo dependencies.
# This temp cantainner should be rebuilt only if recipe.json was changed.
#
FROM zenithdb/build:buster AS deps-build
WORKDIR /zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=cargo-deps-inspect /usr/local/cargo/bin/cargo-chef /usr/local/cargo/bin/
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
RUN rm -rf postgres_install/build
#
# Build zenith binaries
#
# TODO: build cargo deps as separate layer. We used cargo-chef before but that was
# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
#
FROM zenithdb/build:buster AS build
WORKDIR /zenith
COPY . .
# Copy cached dependencies
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=deps-build /zenith/target target
COPY --from=deps-build /usr/local/cargo/ /usr/local/cargo/
COPY . .
RUN cargo build --release
#
@@ -51,11 +33,11 @@ RUN cargo build --release
FROM debian:buster-slim
WORKDIR /data
RUN apt-get update && apt-get -yq install librocksdb-dev libseccomp-dev openssl && \
RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \
mkdir zenith_install
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install postgres_install
COPY docker-entrypoint.sh /docker-entrypoint.sh

View File

@@ -81,7 +81,7 @@ FROM alpine:3.13
RUN apk add --update openssl build-base libseccomp-dev
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY docker-entrypoint.sh /docker-entrypoint.sh

View File

@@ -9,7 +9,7 @@ WORKDIR /zenith
# Install postgres and zenith build dependencies
# clang is for rocksdb
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libseccomp-dev pkg-config libssl-dev librocksdb-dev clang
libseccomp-dev pkg-config libssl-dev clang
# Install rust tools
RUN rustup component add clippy && cargo install cargo-chef cargo-audit
RUN rustup component add clippy && cargo install cargo-audit

View File

@@ -6,34 +6,55 @@ else
SECCOMP =
endif
#
# We differentiate between release / debug build types using the BUILD_TYPE
# environment variable.
#
BUILD_TYPE ?= debug
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug
PG_CFLAGS = -O2 -g3 $(CFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
PG_CFLAGS = -O0 -g3 $(CFLAGS)
else
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
endif
# Choose whether we should be silent or verbose
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
# Fix for a corner case when make doesn't pass a jobserver
CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
# This option has a side effect of passing make jobserver to cargo.
# However, we shouldn't do this if `make -n` (--dry-run) has been asked.
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
#
# Top level Makefile to build Zenith and PostgreSQL
#
.PHONY: all
all: zenith postgres
# We don't want to run 'cargo build' in parallel with the postgres build,
# because interleaving cargo build output with postgres build output looks
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
# much parallelism. (Recursive invocation of postgres target still gets any
# '-j' flag from the command line, so 'make -j' is still useful.)
.NOTPARALLEL:
### Zenith Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
.PHONY: zenith
zenith: postgres-headers
cargo build
+@echo "Compiling Zenith"
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
### PostgreSQL parts
tmp_install/build/config.status:
+@echo "Configuring postgres build"
mkdir -p tmp_install/build
(cd tmp_install/build && \
../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
--enable-cassert \
--enable-debug \
--enable-depend \
../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
$(SECCOMP) \
--prefix=$(abspath tmp_install) > configure.log)
@@ -47,10 +68,10 @@ postgres-headers: postgres-configure
+@echo "Installing PostgreSQL headers"
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
# Compile and install PostgreSQL and contrib/zenith
.PHONY: postgres
postgres: postgres-configure
postgres: postgres-configure \
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
+@echo "Compiling PostgreSQL"
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+@echo "Compiling contrib/zenith"
@@ -58,18 +79,21 @@ postgres: postgres-configure
+@echo "Compiling contrib/zenith_test_utils"
$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install
.PHONY: postgres-clean
postgres-clean:
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean:
cd tmp_install/build && ${MAKE} clean
cargo clean
cd tmp_install/build && $(MAKE) clean
$(CARGO_CMD_PREFIX) cargo clean
# This removes everything
.PHONY: distclean
distclean:
rm -rf tmp_install
cargo clean
$(CARGO_CMD_PREFIX) cargo clean
.PHONY: fmt
fmt:

View File

@@ -25,15 +25,15 @@ Pageserver consists of:
On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang
libssl-dev clang pkg-config libpq-dev
```
[Rust] 1.52 or later is also required.
[Rust] 1.55 or later is also required.
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
To run the integration tests (not required to use the code), install
Python (3.6 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
Python (3.7 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
2. Build zenith and patched postgres
```sh
@@ -47,17 +47,26 @@ make -j5
# Create repository in .zenith with proper paths to binaries and data
# Later that would be responsibility of a package install script
> ./target/debug/zenith init
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
created main branch
pageserver init succeeded
# start pageserver
# start pageserver and safekeeper
> ./target/debug/zenith start
Starting pageserver at '127.0.0.1:64000' in .zenith
Starting pageserver at 'localhost:64000' in '.zenith'
Pageserver started
initializing for single for 7676
Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
Safekeeper started
# start postgres on top on the pageserver
# start postgres compute node
> ./target/debug/zenith pg start main
Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
Starting new postgres main on main...
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
waiting for server to start.... done
server started
# check list of running postgres instances
> ./target/debug/zenith pg list
@@ -108,6 +117,12 @@ postgres=# insert into t values(2,2);
INSERT 0 1
```
6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
you have just started. You can stop them all with one command:
```sh
> ./target/debug/zenith stop
```
## Running tests
```sh

View File

@@ -18,7 +18,7 @@ regex = "1"
anyhow = "1.0"
thiserror = "1"
bytes = "1.0.1"
nix = "0.20"
nix = "0.23"
url = "2.2.2"
hex = { version = "0.4.3", features = ["serde"] }
reqwest = { version = "0.11", features = ["blocking", "json"] }

View File

@@ -0,0 +1,20 @@
# Page server and three safekeepers.
[pageserver]
pg_port = 64000
http_port = 9898
auth_type = 'Trust'
[[safekeepers]]
name = 'sk1'
pg_port = 5454
http_port = 7676
[[safekeepers]]
name = 'sk2'
pg_port = 5455
http_port = 7677
[[safekeepers]]
name = 'sk3'
pg_port = 5456
http_port = 7678

control_plane/simple.conf (new file, 11 lines)
View File

@@ -0,0 +1,11 @@
# Minimal zenith environment with one safekeeper. This is equivalent to the built-in
# defaults that you get with no --config
[pageserver]
pg_port = 64000
http_port = 9898
auth_type = 'Trust'
[[safekeepers]]
name = 'single'
pg_port = 5454
http_port = 7676

View File

@@ -1,17 +1,16 @@
use std::fs::{self, File, OpenOptions};
use std::collections::BTreeMap;
use std::fs::{self, File};
use std::io::Write;
use std::net::SocketAddr;
use std::net::TcpStream;
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::{collections::BTreeMap, path::PathBuf};
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use zenith_utils::connstring::connection_host_port;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::AuthType;
@@ -19,6 +18,7 @@ use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTimelineId;
use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
use crate::storage::PageServerNode;
//
@@ -39,8 +39,6 @@ impl ComputeControlPlane {
// | |- <tenant_id>
// | | |- <branch name>
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
// TODO: since pageserver do not have config file yet we believe here that
// it is running on default port. Change that when pageserver will have config.
let pageserver = Arc::new(PageServerNode::from_env(&env));
let mut nodes = BTreeMap::default();
@@ -75,40 +73,59 @@ impl ComputeControlPlane {
.unwrap_or(self.base_port)
}
pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
ComputeControlPlane {
base_port: 65431,
pageserver: Arc::clone(pageserver),
nodes: BTreeMap::new(),
env: local_env.clone(),
// FIXME: see also parse_point_in_time in branches.rs.
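// A point-in-time spec is a branch name, optionally followed by '@' and an LSN,
// e.g. "main" (tip of the branch) or "main@0/169C3C8" (branch at that LSN).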
fn parse_point_in_time(
&self,
tenantid: ZTenantId,
s: &str,
) -> Result<(ZTimelineId, Option<Lsn>)> {
let mut strings = s.split('@');
let name = strings.next().unwrap();
let lsn: Option<Lsn>;
if let Some(lsnstr) = strings.next() {
lsn = Some(
Lsn::from_str(lsnstr)
.with_context(|| "invalid LSN in point-in-time specification")?,
);
} else {
lsn = None
}
// Resolve the timeline ID, given the human-readable branch name
let timeline_id = self
.pageserver
.branch_get_by_name(&tenantid, name)?
.timeline_id;
Ok((timeline_id, lsn))
}
pub fn new_node(
&mut self,
tenantid: ZTenantId,
branch_name: &str,
name: &str,
timeline_spec: &str,
port: Option<u16>,
) -> Result<Arc<PostgresNode>> {
let timeline_id = self
.pageserver
.branch_get_by_name(&tenantid, branch_name)?
.timeline_id;
// Resolve the human-readable timeline spec into timeline ID and LSN
let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?;
let port = port.unwrap_or_else(|| self.get_port());
let node = Arc::new(PostgresNode {
name: branch_name.to_owned(),
name: name.to_owned(),
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
is_test: false,
timelineid: timeline_id,
timelineid,
lsn,
tenantid,
uses_wal_proposer: false,
});
node.create_pgdata()?;
node.setup_pg_conf(self.env.auth_type)?;
node.setup_pg_conf(self.env.pageserver.auth_type)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
@@ -127,6 +144,7 @@ pub struct PostgresNode {
pageserver: Arc<PageServerNode>,
is_test: bool,
pub timelineid: ZTimelineId,
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
pub tenantid: ZTenantId,
uses_wal_proposer: bool,
}
@@ -144,76 +162,28 @@ impl PostgresNode {
);
}
lazy_static! {
static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
static ref CONF_TIMELINE_RE: Regex =
Regex::new(r"(?m)^\s*zenith.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap();
static ref CONF_TENANT_RE: Regex =
Regex::new(r"(?m)^\s*zenith.zenith_tenant\s*=\s*'(\w+)'\s*$").unwrap();
}
// parse data directory name
let fname = entry.file_name();
let name = fname.to_str().unwrap().to_string();
// find out tcp port in config file
// Read config file into memory
let cfg_path = entry.path().join("postgresql.conf");
let config = fs::read_to_string(cfg_path.clone()).with_context(|| {
format!(
"failed to read config file in {}",
cfg_path.to_str().unwrap()
)
})?;
let cfg_path_str = cfg_path.to_string_lossy();
let mut conf_file = File::open(&cfg_path)
.with_context(|| format!("failed to open config file in {}", cfg_path_str))?;
let conf = PostgresConf::read(&mut conf_file)
.with_context(|| format!("failed to read config file in {}", cfg_path_str))?;
// parse port
let err_msg = format!(
"failed to find port definition in config file {}",
cfg_path.to_str().unwrap()
);
let port: u16 = CONF_PORT_RE
.captures(config.as_str())
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
.iter()
.last()
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
.as_str()
.parse()
.with_context(|| err_msg)?;
// Read a few options from the config file
let context = format!("in config file {}", cfg_path_str);
let port: u16 = conf.parse_field("port", &context)?;
let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
let uses_wal_proposer = conf.get("wal_acceptors").is_some();
// parse timeline
let err_msg = format!(
"failed to find timeline definition in config file {}",
cfg_path.to_str().unwrap()
);
let timelineid: ZTimelineId = CONF_TIMELINE_RE
.captures(config.as_str())
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
.iter()
.last()
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
.as_str()
.parse()
.with_context(|| err_msg)?;
// parse tenant
let err_msg = format!(
"failed to find tenant definition in config file {}",
cfg_path.to_str().unwrap()
);
let tenantid = CONF_TENANT_RE
.captures(config.as_str())
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
.iter()
.last()
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
.as_str()
.parse()
.with_context(|| err_msg)?;
let uses_wal_proposer = config.contains("wal_acceptors");
// parse recovery_target_lsn, if any
let recovery_target_lsn: Option<Lsn> =
conf.parse_field_optional("recovery_target_lsn", &context)?;
// ok now
Ok(PostgresNode {
@@ -223,12 +193,13 @@ impl PostgresNode {
pageserver: Arc::clone(pageserver),
is_test: false,
timelineid,
lsn: recovery_target_lsn,
tenantid,
uses_wal_proposer,
})
}
fn sync_walkeepers(&self) -> Result<Lsn> {
fn sync_safekeepers(&self) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir().join("postgres");
let sync_handle = Command::new(pg_path)
.arg("--sync-safekeepers")
@@ -253,7 +224,7 @@ impl PostgresNode {
}
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
println!("Walkeepers synced on {}", lsn);
println!("Safekeepers synced on {}", lsn);
Ok(lsn)
}
@@ -284,7 +255,7 @@ impl PostgresNode {
// Read the archive directly from the `CopyOutReader`
tar::Archive::new(copyreader)
.unpack(&self.pgdata())
.with_context(|| "extracting page backup failed")?;
.with_context(|| "extracting base backup failed")?;
Ok(())
}
@@ -308,85 +279,90 @@ impl PostgresNode {
// Connect to a page server, get base backup, and untar it to initialize a
// new data directory
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
File::create(self.pgdata().join("postgresql.conf").to_str().unwrap())?;
let mut conf = PostgresConf::new();
conf.append("max_wal_senders", "10");
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
self.append_conf(
"postgresql.conf",
&format!(
"max_wal_senders = 10\n\
wal_log_hints = on\n\
max_replication_slots = 10\n\
hot_standby = on\n\
shared_buffers = 1MB\n\
fsync = off\n\
max_connections = 100\n\
wal_sender_timeout = 0\n\
wal_level = replica\n\
zenith.file_cache_size = 4096\n\
zenith.file_cache_path = '/tmp/file.cache'\n\
listen_addresses = '{address}'\n\
port = {port}\n",
address = self.address.ip(),
port = self.address.port()
),
)?;
conf.append("wal_log_hints", "on");
conf.append("max_replication_slots", "10");
conf.append("hot_standby", "on");
conf.append("shared_buffers", "1MB");
conf.append("fsync", "off");
conf.append("max_connections", "100");
conf.append("wal_sender_timeout", "0");
conf.append("wal_level", "replica");
conf.append("listen_addresses", &self.address.ip().to_string());
conf.append("port", &self.address.port().to_string());
// Never clean up old WAL. TODO: We should use a replication
// slot or something proper, to prevent the compute node
// from removing WAL that hasn't been streamed to the safekeeper or
// page server yet. (gh issue #349)
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;
conf.append("wal_keep_size", "10TB");
// set up authentication
let password = if let AuthType::ZenithJWT = auth_type {
"$ZENITH_AUTH_TOKEN"
} else {
""
// Configure the node to fetch pages from pageserver
let pageserver_connstr = {
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
// Set up authentication
//
// $ZENITH_AUTH_TOKEN will be replaced with the value from the environment
// variable during compute pg startup. It is done this way because otherwise
// the user would be able to retrieve the value using the SHOW command or
// pg_settings.
let password = if let AuthType::ZenithJWT = auth_type {
"$ZENITH_AUTH_TOKEN"
} else {
""
};
format!("host={} port={} password={}", host, port, password)
};
conf.append("shared_preload_libraries", "zenith");
conf.append_line("");
conf.append("zenith.page_server_connstring", &pageserver_connstr);
conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
if let Some(lsn) = self.lsn {
conf.append("recovery_target_lsn", &lsn.to_string());
}
conf.append_line("");
// Configure that node to take pages from pageserver
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
self.append_conf(
"postgresql.conf",
format!(
concat!(
"shared_preload_libraries = zenith\n",
// $ZENITH_AUTH_TOKEN will be replaced with value from environment variable during compute pg startup
// it is done this way because otherwise user will be able to retrieve the value using SHOW command or pg_settings
"zenith.page_server_connstring = 'host={} port={} password={}'\n",
"zenith.zenith_timeline='{}'\n",
"zenith.zenith_tenant='{}'\n",
),
host, port, password, self.timelineid, self.tenantid,
)
.as_str(),
)?;
if !self.env.safekeepers.is_empty() {
// Configure the node to connect to the safekeepers
conf.append("synchronous_standby_names", "walproposer");
// Configure the node to stream WAL directly to the pageserver
self.append_conf(
"postgresql.conf",
format!(
concat!(
"synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
"zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
),
self.connstr(),
)
.as_str(),
)?;
let wal_acceptors = self
.env
.safekeepers
.iter()
.map(|sk| format!("localhost:{}", sk.pg_port))
.collect::<Vec<String>>()
.join(",");
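// With three safekeepers configured (e.g. the sample config with pg ports
// 5454-5456), this yields "localhost:5454,localhost:5455,localhost:5456".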
conf.append("wal_acceptors", &wal_acceptors);
} else {
// Configure the node to stream WAL directly to the pageserver
// This isn't really a supported configuration, but can be useful for
// testing.
conf.append("synchronous_standby_names", "pageserver");
conf.append("zenith.callmemaybe_connstring", &self.connstr());
}
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
file.write_all(conf.to_string().as_bytes())?;
Ok(())
}
fn load_basebackup(&self) -> Result<()> {
let lsn = if self.uses_wal_proposer {
let backup_lsn = if let Some(lsn) = self.lsn {
Some(lsn)
} else if self.uses_wal_proposer {
// LSN 0 means that this is a bootstrap and we need to download just the
// latest data from the pageserver. That is a bit clumsy, but the whole bootstrap
// procedure is evolving quite actively right now, so let's revisit this
// when things are more stable (TODO).
let lsn = self.sync_walkeepers()?;
let lsn = self.sync_safekeepers()?;
if lsn == Lsn(0) {
None
} else {
@@ -396,7 +372,7 @@ impl PostgresNode {
None
};
self.do_basebackup(lsn)?;
self.do_basebackup(backup_lsn)?;
Ok(())
}
@@ -418,14 +394,6 @@ impl PostgresNode {
}
}
pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> {
OpenOptions::new()
.append(true)
.open(self.pgdata().join(config).to_str().unwrap())?
.write_all(opts.as_bytes())?;
Ok(())
}
fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl");
let mut cmd = Command::new(pg_ctl_path);
@@ -481,6 +449,10 @@ impl PostgresNode {
// 3. Load basebackup
self.load_basebackup()?;
if self.lsn.is_some() {
File::create(self.pgdata().join("standby.signal"))?;
}
// 4. Finally start the compute node postgres
println!("Starting postgres node at '{}'", self.connstr());
self.pg_ctl(&["start"], auth_token)
@@ -527,9 +499,7 @@ impl PostgresNode {
.output()
.expect("failed to execute whoami");
if !output.status.success() {
panic!("whoami failed");
}
assert!(output.status.success(), "whoami failed");
String::from_utf8(output.stdout).unwrap().trim().to_string()
}

View File

@@ -12,6 +12,8 @@ use std::path::Path;
pub mod compute;
pub mod local_env;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod storage;
/// Read a PID file

View File

@@ -7,46 +7,102 @@
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::env;
use std::fmt::Write;
use std::fs;
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::ZTenantId;
//
// This data structures represent deserialized zenith CLI config
// This data structure represents the zenith CLI config
//
// It is deserialized from the .zenith/config file, or the config file passed
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
// an example.
//
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct LocalEnv {
// Pageserver connection settings
pub pageserver_pg_port: u16,
pub pageserver_http_port: u16,
// Base directory for both pageserver and compute nodes
// Base directory for all the nodes (the pageserver, safekeepers and
// compute nodes).
//
// This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the ZENITH_REPO_DIR env variable or
// '.zenith' if not given.
#[serde(skip)]
pub base_data_dir: PathBuf,
// Path to postgres distribution. It's expected that "bin", "include",
// "lib", "share" from postgres distribution are there. If at some point
// in time we are able to run against vanilla postgres, we may split that
// into four separate paths and match the OS-specific installation layout.
#[serde(default)]
pub pg_distrib_dir: PathBuf,
// Path to the directory containing the zenith binaries (pageserver, safekeeper).
#[serde(default)]
pub zenith_distrib_dir: PathBuf,
// keeping tenant id in config to reduce copy paste when running zenith locally with single tenant
#[serde(with = "hex")]
pub tenantid: ZTenantId,
// Default tenant ID to use with the 'zenith' command line utility, when
// --tenantid is not explicitly specified.
#[serde(with = "opt_tenantid_serde")]
#[serde(default)]
pub default_tenantid: Option<ZTenantId>,
// jwt auth token used for communication with pageserver
pub auth_token: String,
// used to issue tokens during e.g pg start
#[serde(default)]
pub private_key_path: PathBuf,
pub pageserver: PageServerConf,
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(default)]
pub struct PageServerConf {
// Pageserver connection settings
pub pg_port: u16,
pub http_port: u16,
// used to determine which auth type is used
pub auth_type: AuthType,
// used to issue tokens during e.g pg start
pub private_key_path: PathBuf,
// jwt auth token used for communication with pageserver
pub auth_token: String,
}
impl Default for PageServerConf {
fn default() -> Self {
Self {
pg_port: 0,
http_port: 0,
auth_type: AuthType::Trust,
auth_token: "".to_string(),
}
}
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(default)]
pub struct SafekeeperConf {
pub name: String,
pub pg_port: u16,
pub http_port: u16,
pub sync: bool,
}
impl Default for SafekeeperConf {
fn default() -> Self {
Self {
name: "".to_string(),
pg_port: 0,
http_port: 0,
sync: true,
}
}
}
impl LocalEnv {
@@ -62,6 +118,10 @@ impl LocalEnv {
Ok(self.zenith_distrib_dir.join("pageserver"))
}
pub fn safekeeper_bin(&self) -> Result<PathBuf> {
Ok(self.zenith_distrib_dir.join("safekeeper"))
}
pub fn pg_data_dirs_path(&self) -> PathBuf {
self.base_data_dir.join("pgdatadirs").join("tenants")
}
@@ -76,6 +136,187 @@ impl LocalEnv {
pub fn pageserver_data_dir(&self) -> PathBuf {
self.base_data_dir.clone()
}
pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf {
self.base_data_dir.join("safekeepers").join(node_name)
}
/// Create a LocalEnv from a config file.
///
/// Unlike 'load_config', this function fills in any defaults that are missing
/// from the config file.
pub fn create_config(toml: &str) -> Result<LocalEnv> {
let mut env: LocalEnv = toml::from_str(toml)?;
// Find postgres binaries.
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
if env.pg_distrib_dir == Path::new("") {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
env.pg_distrib_dir = postgres_bin.into();
} else {
let cwd = env::current_dir()?;
env.pg_distrib_dir = cwd.join("tmp_install")
}
}
if !env.pg_distrib_dir.join("bin/postgres").exists() {
anyhow::bail!(
"Can't find postgres binary at {}",
env.pg_distrib_dir.display()
);
}
// Find zenith binaries.
if env.zenith_distrib_dir == Path::new("") {
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
if !env.zenith_distrib_dir.join("pageserver").exists() {
anyhow::bail!("Can't find pageserver binary.");
}
if !env.zenith_distrib_dir.join("safekeeper").exists() {
anyhow::bail!("Can't find safekeeper binary.");
}
// If no initial tenant ID was given, generate it.
if env.default_tenantid.is_none() {
env.default_tenantid = Some(ZTenantId::generate());
}
env.base_data_dir = base_path();
Ok(env)
}
/// Locate and load config
pub fn load_config() -> Result<LocalEnv> {
let repopath = base_path();
if !repopath.exists() {
anyhow::bail!(
"Zenith config is not found in {}. You need to run 'zenith init' first",
repopath.to_str().unwrap()
);
}
// TODO: check that it looks like a zenith repository
// load and parse file
let config = fs::read_to_string(repopath.join("config"))?;
let mut env: LocalEnv = toml::from_str(config.as_str())?;
env.base_data_dir = repopath;
Ok(env)
}
// This function is used only for testing purposes in the CLI, e.g. to generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> Result<String> {
let private_key_path = if self.private_key_path.is_absolute() {
self.private_key_path.to_path_buf()
} else {
self.base_data_dir.join(&self.private_key_path)
};
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
//
// Initialize a new Zenith repository
//
pub fn init(&mut self) -> Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
if base_path == Path::new("") {
anyhow::bail!("repository base path is missing");
}
if base_path.exists() {
anyhow::bail!(
"directory '{}' already exists. Perhaps already initialized?",
base_path.to_str().unwrap()
);
}
fs::create_dir(&base_path)?;
// generate keys for jwt
// openssl genrsa -out private_key.pem 2048
let private_key_path;
if self.private_key_path == PathBuf::new() {
private_key_path = base_path.join("auth_private_key.pem");
let keygen_output = Command::new("openssl")
.arg("genrsa")
.args(&["-out", private_key_path.to_str().unwrap()])
.arg("2048")
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
self.private_key_path = Path::new("auth_private_key.pem").to_path_buf();
let public_key_path = base_path.join("auth_public_key.pem");
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
let keygen_output = Command::new("openssl")
.arg("rsa")
.args(&["-in", private_key_path.to_str().unwrap()])
.arg("-pubout")
.args(&["-outform", "PEM"])
.args(&["-out", public_key_path.to_str().unwrap()])
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
}
self.pageserver.auth_token =
self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
fs::create_dir_all(self.pg_data_dirs_path())?;
for safekeeper in self.safekeepers.iter() {
fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
}
let mut conf_content = String::new();
// Currently, the user first passes a config file with 'zenith init --config=<path>'
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
write!(
&mut conf_content,
r#"# This file describes a local deployment of the page server
# and safekeeper nodes. It is read by the 'zenith' command-line
# utility.
"#
)?;
// Convert the LocalEnv to a toml file.
//
// This could be as simple as this:
//
// conf_content += &toml::to_string_pretty(env)?;
//
// But it results in a "values must be emitted before tables" error. I'm not sure
// why; AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>', is last.
// Maybe rust reorders the fields to avoid padding or something?
// In any case, converting to toml::Value first, and serializing that, works.
// See https://github.com/alexcrichton/toml-rs/issues/142
conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
fs::write(base_path.join("config"), conf_content)?;
Ok(())
}
}
fn base_path() -> PathBuf {
@@ -85,118 +326,29 @@ fn base_path() -> PathBuf {
}
}
//
// Initialize a new Zenith repository
//
pub fn init(
pageserver_pg_port: u16,
pageserver_http_port: u16,
tenantid: ZTenantId,
auth_type: AuthType,
) -> Result<()> {
// check if config already exists
let base_path = base_path();
if base_path.exists() {
anyhow::bail!(
"{} already exists. Perhaps already initialized?",
base_path.to_str().unwrap()
);
/// Serde routines for Option<ZTenantId>. The serialized form is a hex string.
mod opt_tenantid_serde {
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::str::FromStr;
use zenith_utils::zid::ZTenantId;
pub fn serialize<S>(tenantid: &Option<ZTenantId>, ser: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
tenantid.map(|t| t.to_string()).serialize(ser)
}
fs::create_dir(&base_path)?;
// ok, now check that expected binaries are present
// Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
let pg_distrib_dir: PathBuf = {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
postgres_bin.into()
} else {
let cwd = env::current_dir()?;
cwd.join("tmp_install")
pub fn deserialize<'de, D>(des: D) -> Result<Option<ZTenantId>, D::Error>
where
D: Deserializer<'de>,
{
let s: Option<String> = Option::deserialize(des)?;
if let Some(s) = s {
return Ok(Some(
ZTenantId::from_str(&s).map_err(serde::de::Error::custom)?,
));
}
};
if !pg_distrib_dir.join("bin/postgres").exists() {
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
Ok(None)
}
// generate keys for jwt
// openssl genrsa -out private_key.pem 2048
let private_key_path = base_path.join("auth_private_key.pem");
let keygen_output = Command::new("openssl")
.arg("genrsa")
.args(&["-out", private_key_path.to_str().unwrap()])
.arg("2048")
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
let public_key_path = base_path.join("auth_public_key.pem");
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
let keygen_output = Command::new("openssl")
.arg("rsa")
.args(&["-in", private_key_path.to_str().unwrap()])
.arg("-pubout")
.args(&["-outform", "PEM"])
.args(&["-out", public_key_path.to_str().unwrap()])
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
let auth_token =
encode_from_key_path(&Claims::new(None, Scope::PageServerApi), &private_key_path)?;
// Find zenith binaries.
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
if !zenith_distrib_dir.join("pageserver").exists() {
anyhow::bail!("Can't find pageserver binary.",);
}
let conf = LocalEnv {
pageserver_pg_port,
pageserver_http_port,
pg_distrib_dir,
zenith_distrib_dir,
base_data_dir: base_path,
tenantid,
auth_token,
auth_type,
private_key_path,
};
fs::create_dir_all(conf.pg_data_dirs_path())?;
let toml = toml::to_string_pretty(&conf)?;
fs::write(conf.base_data_dir.join("config"), toml)?;
Ok(())
}
// Locate and load config
pub fn load_config() -> Result<LocalEnv> {
let repopath = base_path();
if !repopath.exists() {
anyhow::bail!(
"Zenith config is not found in {}. You need to run 'zenith init' first",
repopath.to_str().unwrap()
);
}
// TODO: check that it looks like a zenith repository
// load and parse file
let config = fs::read_to_string(repopath.join("config"))?;
toml::from_str(config.as_str()).map_err(|e| e.into())
}

View File

@@ -0,0 +1,228 @@
///
/// Module for parsing postgresql.conf file.
///
/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
/// enough to extract a few settings we need in Zenith, assuming you don't do
/// funny stuff like include directives or exotic escaping.
use anyhow::{anyhow, bail, Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
use std::io::BufRead;
use std::str::FromStr;
/// In-memory representation of a postgresql.conf file
#[derive(Default)]
pub struct PostgresConf {
lines: Vec<String>,
hash: HashMap<String, String>,
}
lazy_static! {
static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap();
}
impl PostgresConf {
pub fn new() -> PostgresConf {
PostgresConf::default()
}
/// Read file into memory
pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
let mut result = Self::new();
for line in std::io::BufReader::new(read).lines() {
let line = line?;
// Store each line in a vector, in original format
result.lines.push(line.clone());
// Also parse each line and insert key=value lines into a hash map.
//
// FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
// But it's close enough for our usage.
let line = line.trim();
if line.starts_with('#') {
// comment, ignore
continue;
} else if let Some(caps) = CONF_LINE_RE.captures(line) {
let name = caps.get(1).unwrap().as_str();
let raw_val = caps.get(2).unwrap().as_str();
if let Ok(val) = deescape_str(raw_val) {
// Note: if there's already an entry in the hash map for
// this key, this will replace it. That's the behavior we want; when
// we want; when PostgreSQL reads the file, each line
// overrides any previous value for the same setting.
result.hash.insert(name.to_string(), val.to_string());
}
}
}
Ok(result)
}
/// Return the current value of 'option'
pub fn get(&self, option: &str) -> Option<&str> {
self.hash.get(option).map(|x| x.as_ref())
}
/// Return the current value of a field, parsed to the right datatype.
///
/// This calls the FromStr::parse() function on the value of the field. If
/// the field does not exist, or parsing fails, returns an error.
///
pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
where
T: FromStr,
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
{
self.get(field_name)
.ok_or_else(|| anyhow!("could not find '{}' option {}", field_name, context))?
.parse::<T>()
.with_context(|| format!("could not parse '{}' option {}", field_name, context))
}
pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
where
T: FromStr,
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
{
if let Some(val) = self.get(field_name) {
let result = val
.parse::<T>()
.with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
Ok(Some(result))
} else {
Ok(None)
}
}
///
/// Note: if you call this multiple times for the same option, the config
/// file will contain a line for each call. It would be nice to have a function
/// to change an existing line, but that's a TODO.
///
pub fn append(&mut self, option: &str, value: &str) {
self.lines
.push(format!("{}={}\n", option, escape_str(value)));
self.hash.insert(option.to_string(), value.to_string());
}
/// Append an arbitrary non-setting line to the config file
pub fn append_line(&mut self, line: &str) {
self.lines.push(line.to_string());
}
}
impl fmt::Display for PostgresConf {
/// Return the whole configuration file as a string
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for line in self.lines.iter() {
f.write_str(line)?;
}
Ok(())
}
}
/// Escape a value for putting in postgresql.conf.
fn escape_str(s: &str) -> String {
// If the string doesn't contain anything that needs quoting or escaping, return it
// as it is.
//
// The first part of the regex, before the '|', matches the INTEGER rule in the
// PostgreSQL flex grammar (guc-file.l). It matches plain integers like "123" and
// "-123", and also accepts units like "10MB". The second part of the regex matches
// the UNQUOTED_STRING rule, and accepts strings that contain a single word, beginning
// with a letter. That covers words like "off" or "posix". Everything else is quoted.
//
// This regex is a bit more conservative than the rules in guc-file.l, so we quote some
// strings that PostgreSQL would accept without quoting, but that's OK.
lazy_static! {
static ref UNQUOTED_RE: Regex =
Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap();
}
if UNQUOTED_RE.is_match(s) {
s.to_string()
} else {
// Otherwise escape and quote it
let s = s
.replace('\\', "\\\\")
.replace('\n', "\\n")
.replace('\'', "''");
"\'".to_owned() + &s + "\'"
}
}
/// De-escape a possibly-quoted value.
///
/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
/// does this.
fn deescape_str(s: &str) -> Result<String> {
// If the string has a quote at the beginning and end, strip them out.
if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
let mut result = String::new();
let mut iter = s[1..(s.len() - 1)].chars().peekable();
while let Some(c) = iter.next() {
let newc = if c == '\\' {
match iter.next() {
Some('b') => '\x08',
Some('f') => '\x0c',
Some('n') => '\n',
Some('r') => '\r',
Some('t') => '\t',
Some('0'..='7') => {
// TODO
bail!("octal escapes not supported");
}
Some(n) => n,
None => break,
}
} else if c == '\'' && iter.peek() == Some(&'\'') {
// doubled quote becomes just one quote
iter.next().unwrap()
} else {
c
};
result.push(newc);
}
Ok(result)
} else {
Ok(s.to_string())
}
}
#[test]
fn test_postgresql_conf_escapes() -> Result<()> {
assert_eq!(escape_str("foo bar"), "'foo bar'");
// these don't need to be quoted
assert_eq!(escape_str("foo"), "foo");
assert_eq!(escape_str("123"), "123");
assert_eq!(escape_str("+123"), "+123");
assert_eq!(escape_str("-10"), "-10");
assert_eq!(escape_str("1foo"), "1foo");
assert_eq!(escape_str("foo1"), "foo1");
assert_eq!(escape_str("10MB"), "10MB");
assert_eq!(escape_str("-10kB"), "-10kB");
// these need quoting and/or escaping
assert_eq!(escape_str("foo bar"), "'foo bar'");
assert_eq!(escape_str("fo'o"), "'fo''o'");
assert_eq!(escape_str("fo\no"), "'fo\\no'");
assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
assert_eq!(escape_str("10 cats"), "'10 cats'");
// Test de-escaping
assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");
// octal-escapes are currently not supported
assert!(deescape_str("'foo\\7\\07\\007'").is_err());
Ok(())
}
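// A minimal usage sketch of the API above (illustrative test): build a config in
// memory, render it through the Display impl, and parse it back to read a typed field.
#[test]
fn test_postgresql_conf_roundtrip_example() -> Result<()> {
    let mut conf = PostgresConf::new();
    conf.append("port", "55432");
    conf.append("listen_addresses", "127.0.0.1");
    // &[u8] implements std::io::Read, so the rendered text can be parsed back in.
    let parsed = PostgresConf::read(conf.to_string().as_bytes())?;
    let port: u16 = parsed.parse_field("port", "in the example config")?;
    assert_eq!(port, 55432);
    assert_eq!(parsed.get("listen_addresses"), Some("127.0.0.1"));
    Ok(())
}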

View File

@@ -0,0 +1,277 @@
use std::io::Write;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::sync::Arc;
use std::time::Duration;
use std::{io, result, thread};
use anyhow::bail;
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::Config;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use zenith_utils::http::error::HttpErrorBody;
use zenith_utils::postgres_backend::AuthType;
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::read_pidfile;
use crate::storage::PageServerNode;
use zenith_utils::connstring::connection_address;
use zenith_utils::connstring::connection_host_port;
#[derive(Error, Debug)]
pub enum SafekeeperHttpError {
#[error("Reqwest error: {0}")]
Transport(#[from] reqwest::Error),
#[error("Error: {0}")]
Response(String),
}
type Result<T> = result::Result<T, SafekeeperHttpError>;
pub trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> Result<Self>;
}
impl ResponseErrorMessageExt for Response {
fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = self.url().to_owned();
Err(SafekeeperHttpError::Response(
match self.json::<HttpErrorBody>() {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
))
}
}
//
// Control routines for safekeeper.
//
// Used in CLI and tests.
//
#[derive(Debug)]
pub struct SafekeeperNode {
pub name: String,
pub conf: SafekeeperConf,
pub pg_connection_config: Config,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
pub pageserver: Arc<PageServerNode>,
}
impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let pageserver = Arc::new(PageServerNode::from_env(env));
println!("initializing safekeeper '{}' (http port {})", conf.name, conf.http_port);
SafekeeperNode {
name: conf.name.clone(),
conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(),
http_client: Client::new(),
http_base_url: format!("http://localhost:{}/v1", conf.http_port),
pageserver,
}
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(port: u16) -> Config {
// TODO safekeeper authentication not implemented yet
format!("postgresql://no_user@localhost:{}/no_db", port)
.parse()
.unwrap()
}
pub fn datadir_path(&self) -> PathBuf {
self.env.safekeeper_data_dir(&self.name)
}
pub fn pid_file(&self) -> PathBuf {
self.datadir_path().join("safekeeper.pid")
}
pub fn start(&self) -> anyhow::Result<()> {
print!(
"Starting safekeeper at '{}' in '{}'",
connection_address(&self.pg_connection_config),
self.datadir_path().display()
);
io::stdout().flush().unwrap();
// Configure connection to page server
//
// FIXME: We extract the host and port from the connection string instead of using
// the connection string directly, because the 'safekeeper' binary expects
// host:port format. That's a bit silly when we already have a full libpq connection
// string at hand.
let pageserver_conn = {
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
format!("{}:{}", host, port)
};
let listen_pg = format!("localhost:{}", self.conf.pg_port);
let listen_http = format!("localhost:{}", self.conf.http_port);
let mut cmd: &mut Command = &mut Command::new(self.env.safekeeper_bin()?);
cmd = cmd
.args(&["-D", self.datadir_path().to_str().unwrap()])
.args(&["--listen-pg", &listen_pg])
.args(&["--listen-http", &listen_http])
.args(&["--pageserver", &pageserver_conn])
.args(&["--recall", "1 second"])
.arg("--daemonize")
.env_clear()
.env("RUST_BACKTRACE", "1");
if !self.conf.sync {
cmd = cmd.arg("--no-sync");
}
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
cmd.env("PAGESERVER_AUTH_TOKEN", &self.env.pageserver.auth_token);
}
if !cmd.status()?.success() {
bail!(
"Safekeeper failed to start. See '{}' for details.",
self.datadir_path().join("safekeeper.log").display()
);
}
// It takes a while for the safekeeper to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(_) => {
println!("\nSafekeeper started");
return Ok(());
}
Err(err) => {
match err {
SafekeeperHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!(
"Safekeeper not responding yet, err {} retrying ({})...",
err, retries
);
}
}
SafekeeperHttpError::Response(msg) => {
bail!("safekeeper failed to start: {} ", msg)
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("safekeeper failed to start in {} seconds", RETRIES);
}
///
/// Stop the server.
///
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
/// Otherwise we use SIGTERM, triggering a clean shutdown
///
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Safekeeper {} is already stopped", self.name);
return Ok(());
}
let pid = read_pidfile(&pid_file)?;
let pid = Pid::from_raw(pid);
let sig = if immediate {
println!("Stop safekeeper immediately");
Signal::SIGQUIT
} else {
println!("Stop safekeeper gracefully");
Signal::SIGTERM
};
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Safekeeper with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to safekeeper with pid {}: {}",
pid,
err.desc()
),
}
let address = connection_address(&self.pg_connection_config);
// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
for _ in 0..100 {
if let Err(_e) = TcpStream::connect(&address) {
println!("Safekeeper stopped receiving connections");
// Now check status
match self.check_status() {
Ok(_) => {
println!("Safekeeper status is OK. Wait a bit.");
thread::sleep(Duration::from_secs(1));
}
Err(err) => {
println!("Safekeeper status is: {}", err);
return Ok(());
}
}
} else {
println!("Safekeeper still receives connections");
thread::sleep(Duration::from_secs(1));
}
}
bail!("Failed to stop safekeeper with pid {}", pid);
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
// TODO: authentication
//if self.env.auth_type == AuthType::ZenithJWT {
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
//}
self.http_client.request(method, url)
}
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
.send()?
.error_from_body()?;
Ok(())
}
}

View File

@@ -6,6 +6,7 @@ use std::time::Duration;
use std::{io, result, thread};
use anyhow::{anyhow, bail};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
@@ -20,6 +21,7 @@ use zenith_utils::zid::ZTenantId;
use crate::local_env::LocalEnv;
use crate::read_pidfile;
use pageserver::branches::BranchInfo;
use pageserver::tenant_mgr::TenantInfo;
use zenith_utils::connstring::connection_address;
#[derive(Error, Debug)]
@@ -62,7 +64,6 @@ impl ResponseErrorMessageExt for Response {
//
#[derive(Debug)]
pub struct PageServerNode {
pub kill_on_exit: bool,
pub pg_connection_config: Config,
pub env: LocalEnv,
pub http_client: Client,
@@ -71,34 +72,34 @@ pub struct PageServerNode {
impl PageServerNode {
pub fn from_env(env: &LocalEnv) -> PageServerNode {
let password = if env.auth_type == AuthType::ZenithJWT {
&env.auth_token
let password = if env.pageserver.auth_type == AuthType::ZenithJWT {
&env.pageserver.auth_token
} else {
""
};
PageServerNode {
kill_on_exit: false,
pg_connection_config: Self::pageserver_connection_config(
password,
env.pageserver_pg_port,
env.pageserver.pg_port,
),
env: env.clone(),
http_client: Client::new(),
http_base_url: format!("http://localhost:{}/v1", env.pageserver_http_port),
http_base_url: format!("http://localhost:{}/v1", env.pageserver.http_port),
}
}
/// Construct libpq connection string for connecting to the pageserver.
fn pageserver_connection_config(password: &str, port: u16) -> Config {
format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
.parse()
.unwrap()
}
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let listen_pg = format!("localhost:{}", self.env.pageserver_pg_port);
let listen_http = format!("localhost:{}", self.env.pageserver_http_port);
let listen_pg = format!("localhost:{}", self.env.pageserver.pg_port);
let listen_http = format!("localhost:{}", self.env.pageserver.http_port);
let mut args = vec![
"--init",
"-D",
@@ -111,10 +112,11 @@ impl PageServerNode {
&listen_http,
];
if enable_auth {
let auth_type_str = &self.env.pageserver.auth_type.to_string();
if self.env.pageserver.auth_type != AuthType::Trust {
args.extend(&["--auth-validation-public-key-path", "auth_public_key.pem"]);
args.extend(&["--auth-type", "ZenithJWT"]);
}
args.extend(&["--auth-type", auth_type_str]);
if let Some(tenantid) = create_tenant {
args.extend(&["--create-tenant", tenantid])
@@ -152,7 +154,7 @@ impl PageServerNode {
let mut cmd = Command::new(self.env.pageserver_bin()?);
cmd.args(&["-D", self.repo_path().to_str().unwrap()])
.arg("-d")
.arg("--daemonize")
.env_clear()
.env("RUST_BACKTRACE", "1");
@@ -199,23 +201,69 @@ impl PageServerNode {
bail!("pageserver failed to start in {} seconds", RETRIES);
}
pub fn stop(&self) -> anyhow::Result<()> {
let pid = read_pidfile(&self.pid_file())?;
let pid = Pid::from_raw(pid);
if kill(pid, Signal::SIGTERM).is_err() {
bail!("Failed to kill pageserver with pid {}", pid);
///
/// Stop the server.
///
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
/// Otherwise we use SIGTERM, triggering a clean shutdown
///
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Pageserver is already stopped");
return Ok(());
}
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
// wait for pageserver stop
let address = connection_address(&self.pg_connection_config);
for _ in 0..5 {
let stream = TcpStream::connect(&address);
thread::sleep(Duration::from_secs(1));
if let Err(_e) = stream {
println!("Pageserver stopped");
let sig = if immediate {
println!("Stop pageserver immediately");
Signal::SIGQUIT
} else {
println!("Stop pageserver gracefully");
Signal::SIGTERM
};
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Pageserver with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
}
println!("Stopping pageserver on {}", address);
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
}
let address = connection_address(&self.pg_connection_config);
// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if pageserver checkpoints a lot of data
for _ in 0..100 {
if let Err(_e) = TcpStream::connect(&address) {
println!("Pageserver stopped receiving connections");
// Now check status
match self.check_status() {
Ok(_) => {
println!("Pageserver status is OK. Wait a bit.");
thread::sleep(Duration::from_secs(1));
}
Err(err) => {
println!("Pageserver status is: {}", err);
return Ok(());
}
}
} else {
println!("Pageserver still receives connections");
thread::sleep(Duration::from_secs(1));
}
}
bail!("Failed to stop pageserver with pid {}", pid);
@@ -234,8 +282,8 @@ impl PageServerNode {
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
let mut builder = self.http_client.request(method, url);
if self.env.auth_type == AuthType::ZenithJWT {
builder = builder.bearer_auth(&self.env.auth_token)
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
builder = builder.bearer_auth(&self.env.pageserver.auth_token)
}
builder
}
@@ -247,7 +295,7 @@ impl PageServerNode {
Ok(())
}
pub fn tenant_list(&self) -> Result<Vec<String>> {
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
Ok(self
.http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
.send()?
@@ -310,11 +358,3 @@ impl PageServerNode {
.json()?)
}
}
impl Drop for PageServerNode {
fn drop(&mut self) {
if self.kill_on_exit {
let _ = self.stop();
}
}
}

View File

@@ -7,7 +7,7 @@ if [ "$1" = 'pageserver' ]; then
pageserver --init -D /data --postgres-distrib /usr/local
fi
echo "Starting pageserver at 0.0.0.0:6400"
pageserver -l 0.0.0.0:6400 -D /data
pageserver -l 0.0.0.0:6400 --listen-http 0.0.0.0:9898 -D /data
else
"$@"
fi

View File

@@ -10,5 +10,5 @@
- [pageserver/README](/pageserver/README) — pageserver overview.
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
- [walkeeper/README](/walkeeper/README) — WAL service overview.
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core

View File

@@ -4,7 +4,7 @@
Currently we build two main images:
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:

View File

@@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id
### Safety
For now particular tenant can only appear on a particular pageserver. Set of WAL acceptors are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
For now, a particular tenant can only appear on a particular pageserver. The set of safekeepers is also pinned to a particular (tenantid, timeline) pair, so there can only be one writer per (tenantid, timeline).

View File

@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
log = "0.4.14"
clap = "2.33.0"
daemonize = "0.4.1"
tokio = { version = "1.11", features = ["process", "macros", "fs"] }
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt", "io-util"] }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
@@ -34,8 +34,16 @@ toml = "0.5"
scopeguard = "1.1.0"
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
async-trait = "0.1"
const_format = "0.2.21"
tracing = "0.1.27"
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
url = "2"
postgres_ffi = { path = "../postgres_ffi" }
zenith_metrics = { path = "../zenith_metrics" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }
[dev-dependencies]
hex-literal = "0.3"
tempfile = "3.2"

View File

@@ -7,8 +7,9 @@ The Page Server has a few different duties:
- Replay WAL that's applicable to the chunks that the Page Server maintains
- Backup to S3
S3 is the main fault-tolerant storage of all data, as there are no Page Server
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
keeps track of WAL records which are not synced to S3 yet.
The Page Server consists of multiple threads that operate on a shared
repository of page versions:
@@ -40,7 +41,7 @@ Legend:
+--+
....
. . Component that we will need, but doesn't exist at the moment. A TODO.
. . Component at its early development phase.
....
---> Data flow
@@ -115,13 +116,49 @@ Remove old on-disk layer files that are no longer needed according to the
PITR retention policy
TODO: Backup service
--------------------
### Backup service
The backup service is responsible for periodically pushing the chunks to S3.
The backup service is responsible for storing pageserver recovery data externally.
TODO: How/when do restore from S3? Whenever we get a GetPage@LSN request for
a chunk we don't currently have? Or when an external Control Plane tells us?
Currently, the pageserver stores its files in the filesystem directory it is pointed to.
That working directory can be rather ephemeral, for example a pageserver pod running in k8s with no persistent volumes attached.
Therefore, the server interacts with external, more reliable storage to back up and restore its state.
The storage support code is extensible and can accommodate arbitrary backends, as long as they implement a certain Rust trait.
The following implementations are present:
* local filesystem — to use in tests mainly
* AWS S3 — to use in production
Implementation details are covered in the [storage readme](./src/relish_storage/README.md) and corresponding Rust file docs.
The backup service is disabled by default and can be enabled to interact with a single remote storage.
CLI examples:
* Local FS: `${PAGESERVER_BIN} --relish-storage-local-path="/some/local/path/"`
* AWS S3 : `${PAGESERVER_BIN} --relish-storage-s3-bucket="some-sample-bucket" --relish-storage-region="eu-north-1" --relish-storage-access-key="SOMEKEYAAAAASADSAH*#" --relish-storage-secret-access-key="SOMEsEcReTsd292v"`
For Amazon AWS S3, the key id and secret access key can be found in `~/.aws/credentials` (if awscli was ever configured to work with the desired bucket) or on the AWS settings page for the user. Also note that bucket names do not include any protocol prefix when used on AWS.
For local S3 installations, refer to their documentation for the name format and credentials.
Similar to other pageserver settings, a toml config file can be used to configure either of the storages as a backup target.
Required sections are:
```toml
[relish_storage]
local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
```
or
```toml
[relish_storage]
bucket_name = 'some-sample-bucket'
bucket_region = 'eu-north-1'
access_key_id = 'SOMEKEYAAAAASADSAH*#'
secret_access_key = 'SOMEsEcReTsd292v'
```
Also, the `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` environment variables can be used to specify the credentials instead of any of the options above.
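For example, a sketch of the same S3 setup with the credentials taken from those environment variables instead of CLI flags (bucket, region and key values are the same placeholders as above):
```sh
export AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#'
export AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v'
${PAGESERVER_BIN} --relish-storage-s3-bucket="some-sample-bucket" --relish-storage-region="eu-north-1"
```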
TODO: Sharding
--------------------

View File

@@ -13,6 +13,7 @@
use anyhow::Result;
use bytes::{BufMut, BytesMut};
use log::*;
use std::fmt::Write as FmtWrite;
use std::io;
use std::io::Write;
use std::sync::Arc;
@@ -31,7 +32,7 @@ use zenith_utils::lsn::Lsn;
pub struct Basebackup<'a> {
ar: Builder<&'a mut dyn Write>,
timeline: &'a Arc<dyn Timeline>,
lsn: Lsn,
pub lsn: Lsn,
prev_record_lsn: Lsn,
}
@@ -83,7 +84,7 @@ impl<'a> Basebackup<'a> {
info!(
"taking basebackup lsn={}, prev_lsn={}",
backup_prev, backup_lsn
backup_lsn, backup_prev
);
Ok(Basebackup {
@@ -97,7 +98,6 @@ impl<'a> Basebackup<'a> {
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
// Create pgdata subdirs structure
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
info!("send subdir {:?}", *dir);
let header = new_tar_header_dir(*dir)?;
self.ar.append(&header, &mut io::empty())?;
}
@@ -249,13 +249,7 @@ impl<'a> Basebackup<'a> {
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
// Generate new pg_control and WAL needed for bootstrap
let checkpoint_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
let checkpoint_lsn = XLogSegNoOffsetToRecPtr(
checkpoint_segno,
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
pg_constants::WAL_SEGMENT_SIZE,
);
// Generate new pg_control needed for bootstrap
checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;
//reset some fields we don't want to preserve
@@ -264,19 +258,24 @@ impl<'a> Basebackup<'a> {
checkpoint.oldestActiveXid = 0;
//save new values in pg_control
pg_control.checkPoint = checkpoint_lsn;
pg_control.checkPoint = 0;
pg_control.checkPointCopy = checkpoint;
pg_control.state = pg_constants::DB_SHUTDOWNED;
// add zenith.signal file
let xl_prev = if self.prev_record_lsn == Lsn(0) {
0xBAD0 // magic value to indicate that we don't know prev_lsn
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.lsn == self.timeline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")?;
} else {
write!(zenith_signal, "PREV LSN: invalid")?;
}
} else {
self.prev_record_lsn.0
};
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
}
self.ar.append(
&new_tar_header("zenith.signal", 8)?,
&xl_prev.to_le_bytes()[..],
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)?;
//send pg_control
@@ -285,14 +284,15 @@ impl<'a> Basebackup<'a> {
self.ar.append(&header, &pg_control_bytes[..])?;
//send wal segment
let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
let wal_file_name = XLogFileName(
1, // FIXME: always use Postgres timeline 1
checkpoint_segno,
segno,
pg_constants::WAL_SEGMENT_SIZE,
);
let wal_file_path = format!("pg_wal/{}", wal_file_name);
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
let wal_seg = generate_wal_segment(&pg_control);
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
self.ar.append(&header, &wal_seg[..])?;
Ok(())

View File

@@ -2,31 +2,40 @@
// Main entry point for the Page Server executable
//
use log::*;
use pageserver::defaults::*;
use serde::{Deserialize, Serialize};
use std::{
env,
net::TcpListener,
path::{Path, PathBuf},
process::exit,
str::FromStr,
thread,
};
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
use tracing::*;
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener};
use anyhow::{bail, ensure, Context, Result};
use signal_hook::consts::signal::*;
use signal_hook::consts::TERM_SIGNALS;
use signal_hook::flag;
use signal_hook::iterator::exfiltrator::WithOrigin;
use signal_hook::iterator::SignalsInfo;
use std::process::exit;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use anyhow::{bail, ensure, Result};
use clap::{App, Arg, ArgMatches};
use daemonize::Daemonize;
use pageserver::{
branches, http, page_service, tenant_mgr, PageServerConf, RelishStorageConfig, S3Config,
LOG_FILE_NAME,
branches, defaults::*, http, page_service, relish_storage, tenant_mgr, PageServerConf,
RelishStorageConfig, RelishStorageKind, S3Config, LOG_FILE_NAME,
};
use zenith_utils::http::endpoint;
use zenith_utils::postgres_backend;
use const_format::formatcp;
/// String arguments that can be declared via CLI or config file
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
struct CfgFileParams {
listen_pg_addr: Option<String>,
listen_http_addr: Option<String>,
@@ -37,11 +46,21 @@ struct CfgFileParams {
pg_distrib_dir: Option<String>,
auth_validation_public_key_path: Option<String>,
auth_type: Option<String>,
// see https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for enum deserialisation examples
relish_storage_max_concurrent_sync: Option<String>,
/////////////////////////////////
//// Don't put `Option<String>` and other "simple" values below.
////
/// `Option<RelishStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
/// Values in TOML cannot be defined after tables (other tables can),
/// and [`toml`] crate serializes all fields in the order of their appearance.
////////////////////////////////
relish_storage: Option<RelishStorage>,
}
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
// Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!).
// See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples
#[serde(untagged)]
enum RelishStorage {
Local {
local_path: String,
@@ -89,6 +108,7 @@ impl CfgFileParams {
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
auth_type: get_arg("auth-type"),
relish_storage,
relish_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
}
}
@@ -108,6 +128,9 @@ impl CfgFileParams {
.or(other.auth_validation_public_key_path),
auth_type: self.auth_type.or(other.auth_type),
relish_storage: self.relish_storage.or(other.relish_storage),
relish_storage_max_concurrent_sync: self
.relish_storage_max_concurrent_sync
.or(other.relish_storage_max_concurrent_sync),
}
}
@@ -176,25 +199,34 @@ impl CfgFileParams {
);
}
let relish_storage_config =
self.relish_storage
.as_ref()
.map(|storage_params| match storage_params.clone() {
RelishStorage::Local { local_path } => {
RelishStorageConfig::LocalFs(PathBuf::from(local_path))
}
RelishStorage::AwsS3 {
bucket_name,
bucket_region,
access_key_id,
secret_access_key,
} => RelishStorageConfig::AwsS3(S3Config {
bucket_name,
bucket_region,
access_key_id,
secret_access_key,
}),
});
let max_concurrent_sync = match self.relish_storage_max_concurrent_sync.as_deref() {
Some(relish_storage_max_concurrent_sync) => {
relish_storage_max_concurrent_sync.parse()?
}
None => DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
};
let relish_storage_config = self.relish_storage.as_ref().map(|storage_params| {
let storage = match storage_params.clone() {
RelishStorage::Local { local_path } => {
RelishStorageKind::LocalFs(PathBuf::from(local_path))
}
RelishStorage::AwsS3 {
bucket_name,
bucket_region,
access_key_id,
secret_access_key,
} => RelishStorageKind::AwsS3(S3Config {
bucket_name,
bucket_region,
access_key_id,
secret_access_key,
}),
};
RelishStorageConfig {
max_concurrent_sync,
storage,
}
});
Ok(PageServerConf {
daemonize: false,
@@ -220,6 +252,7 @@ impl CfgFileParams {
}
fn main() -> Result<()> {
zenith_metrics::set_common_metrics_prefix("pageserver");
let arg_matches = App::new("Zenith page server")
.about("Materializes WAL stream to pages and serves them to Postgres")
.arg(
@@ -228,14 +261,14 @@ fn main() -> Result<()> {
.long("listen-pg")
.alias("listen") // keep some compatibility
.takes_value(true)
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"),
.help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
)
.arg(
Arg::with_name("listen-http")
.long("listen-http")
.alias("http_endpoint") // keep some compatibility
.takes_value(true)
.help("http endpoint address for for metrics and management API calls ip:port (default: 127.0.0.1:5430)"),
.help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
)
.arg(
Arg::with_name("daemonize")
@@ -343,10 +376,19 @@ fn main() -> Result<()> {
.takes_value(true)
.help("Credentials to access the AWS S3 bucket"),
)
.arg(
Arg::with_name("relish-storage-max-concurrent-sync")
.long("relish-storage-max-concurrent-sync")
.takes_value(true)
.help("Maximum allowed concurrent synchronisations with storage"),
)
.get_matches();
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
let cfg_file_path = workdir.canonicalize()?.join("pageserver.toml");
let cfg_file_path = workdir
.canonicalize()
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?
.join("pageserver.toml");
let args_params = CfgFileParams::from_args(&arg_matches);
@@ -358,22 +400,37 @@ fn main() -> Result<()> {
args_params
} else {
// Supplement the CLI arguments with the config file
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)?;
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents)?;
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)
.with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?;
let file_params: CfgFileParams = toml::from_str(&cfg_file_contents).with_context(|| {
format!(
"Failed to read '{}' as pageserver config",
cfg_file_path.display()
)
})?;
args_params.or(file_params)
};
// Set CWD to workdir for non-daemon modes
env::set_current_dir(&workdir)?;
env::set_current_dir(&workdir).with_context(|| {
format!(
"Failed to set application's current dir to '{}'",
workdir.display()
)
})?;
// Ensure the config is valid, even if just init-ing
let mut conf = params.try_into_config()?;
let mut conf = params.try_into_config().with_context(|| {
format!(
"Pageserver config at '{}' is not valid",
cfg_file_path.display()
)
})?;
conf.daemonize = arg_matches.is_present("daemonize");
if init && conf.daemonize {
eprintln!("--daemonize cannot be used with --init");
exit(1);
bail!("--daemonize cannot be used with --init")
}
// The configuration is all set up now. Turn it into a 'static
@@ -383,21 +440,37 @@ fn main() -> Result<()> {
// Create repo and exit if init was requested
if init {
branches::init_pageserver(conf, create_tenant)?;
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
// write the config file
let cfg_file_contents = toml::to_string_pretty(&params)?;
let cfg_file_contents = toml::to_string_pretty(&params)
.context("Failed to create pageserver config contents for initialisation")?;
// TODO support enable-auth flag
std::fs::write(&cfg_file_path, cfg_file_contents)?;
return Ok(());
std::fs::write(&cfg_file_path, cfg_file_contents).with_context(|| {
format!(
"Failed to initialize pageserver config at '{}'",
cfg_file_path.display()
)
})?;
Ok(())
} else {
start_pageserver(conf).context("Failed to start pageserver")
}
start_pageserver(conf)
}
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
// Initialize logger
let (_scope_guard, log_file) = logging::init(LOG_FILE_NAME, conf.daemonize)?;
let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
let term_now = Arc::new(AtomicBool::new(false));
for sig in TERM_SIGNALS {
// When terminated by a second term signal, exit with exit code 1.
// This will do nothing the first time (because term_now is false).
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
// But this will "arm" the above for the second signal, by setting the flag to true.
// The registration order is important: if this call were registered first, the very
// first signal would both arm the conditional shutdown and terminate immediately.
flag::register(*sig, Arc::clone(&term_now))?;
}
// TODO: Check that it looks like a valid repository before going further
@@ -406,13 +479,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
"Starting pageserver http handler on {}",
conf.listen_http_addr
);
let http_listener = TcpListener::bind(conf.listen_http_addr.clone())?;
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
info!(
"Starting pageserver pg protocol handler on {}",
conf.listen_pg_addr
);
let pageserver_listener = TcpListener::bind(conf.listen_pg_addr.clone())?;
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
if conf.daemonize {
info!("daemonizing...");
@@ -430,16 +503,20 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
match daemonize.start() {
Ok(_) => info!("Success, daemonized"),
Err(e) => error!("Error, {}", e),
Err(err) => error!(%err, "could not daemonize"),
}
}
// keep join handles for spawned threads
// don't spawn threads before daemonizing
let mut join_handles = Vec::new();
if let Some(handle) = relish_storage::run_storage_sync_thread(conf)? {
join_handles.push(handle);
}
// Initialize tenant manager.
tenant_mgr::init(conf);
// keep join handles for spawned threads
let mut join_handles = vec![];
// initialize authentication for incoming connections
let auth = match &conf.auth_type {
AuthType::Trust | AuthType::MD5 => None,
@@ -471,13 +548,173 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
})?;
join_handles.push(page_service_thread);
for info in SignalsInfo::<WithOrigin>::new(TERM_SIGNALS)?.into_iter() {
match info.signal {
SIGQUIT => {
info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode");
exit(111);
}
SIGINT | SIGTERM => {
info!("Got SIGINT/SIGTERM. Terminate gracefully in fast shutdown mode");
// Terminate postgres backends
postgres_backend::set_pgbackend_shutdown_requested();
// Stop all tenants and flush their data
tenant_mgr::shutdown_all_tenants()?;
// Wait for pageservice thread to complete the job
page_service_thread
.join()
.expect("thread panicked")
.expect("thread exited with an error");
for handle in join_handles.into_iter() {
handle
.join()
.expect("thread panicked")
.expect("thread exited with an error")
// Shut down http router
endpoint::shutdown();
// Wait for all threads
for handle in join_handles.into_iter() {
handle
.join()
.expect("thread panicked")
.expect("thread exited with an error");
}
info!("Pageserver shutdown completed successfully");
exit(0);
}
unknown_signal => {
debug!("Unknown signal {}", unknown_signal);
}
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn page_server_conf_toml_serde() {
let params = CfgFileParams {
listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
auth_validation_public_key_path: Some(
"auth_validation_public_key_path_VALUE".to_string(),
),
auth_type: Some("auth_type_VALUE".to_string()),
relish_storage: Some(RelishStorage::Local {
local_path: "relish_storage_local_VALUE".to_string(),
}),
relish_storage_max_concurrent_sync: Some(
"relish_storage_max_concurrent_sync_VALUE".to_string(),
),
};
let toml_string = toml::to_string(&params).expect("Failed to serialize correct config");
let toml_pretty_string =
toml::to_string_pretty(&params).expect("Failed to serialize correct config");
assert_eq!(
r#"listen_pg_addr = 'listen_pg_addr_VALUE'
listen_http_addr = 'listen_http_addr_VALUE'
checkpoint_distance = 'checkpoint_distance_VALUE'
checkpoint_period = 'checkpoint_period_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
auth_type = 'auth_type_VALUE'
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
[relish_storage]
local_path = 'relish_storage_local_VALUE'
"#,
toml_pretty_string
);
let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
.expect("Failed to deserialize the serialization result of the config");
let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
.expect("Failed to deserialize the prettified serialization result of the config");
assert!(
params_from_serialized == params,
"Expected the same config in the end of config -> serialize -> deserialize chain"
);
assert!(
params_from_serialized_pretty == params,
"Expected the same config in the end of config -> serialize pretty -> deserialize chain"
);
}
#[test]
fn credentials_omitted_during_serialization() {
let params = CfgFileParams {
listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
auth_validation_public_key_path: Some(
"auth_validation_public_key_path_VALUE".to_string(),
),
auth_type: Some("auth_type_VALUE".to_string()),
relish_storage: Some(RelishStorage::AwsS3 {
bucket_name: "bucket_name_VALUE".to_string(),
bucket_region: "bucket_region_VALUE".to_string(),
access_key_id: Some("access_key_id_VALUE".to_string()),
secret_access_key: Some("secret_access_key_VALUE".to_string()),
}),
relish_storage_max_concurrent_sync: Some(
"relish_storage_max_concurrent_sync_VALUE".to_string(),
),
};
let toml_string = toml::to_string(&params).expect("Failed to serialize correct config");
let toml_pretty_string =
toml::to_string_pretty(&params).expect("Failed to serialize correct config");
assert_eq!(
r#"listen_pg_addr = 'listen_pg_addr_VALUE'
listen_http_addr = 'listen_http_addr_VALUE'
checkpoint_distance = 'checkpoint_distance_VALUE'
checkpoint_period = 'checkpoint_period_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
auth_type = 'auth_type_VALUE'
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
[relish_storage]
bucket_name = 'bucket_name_VALUE'
bucket_region = 'bucket_region_VALUE'
"#,
toml_pretty_string
);
let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
.expect("Failed to deserialize the serialization result of the config");
let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
.expect("Failed to deserialize the prettified serialization result of the config");
let mut expected_params = params;
expected_params.relish_storage = Some(RelishStorage::AwsS3 {
bucket_name: "bucket_name_VALUE".to_string(),
bucket_region: "bucket_region_VALUE".to_string(),
access_key_id: None,
secret_access_key: None,
});
assert!(
params_from_serialized == expected_params,
"Expected the config without credentials in the end of a 'config -> serialize -> deserialize' chain"
);
assert!(
params_from_serialized_pretty == expected_params,
"Expected the config without credentials in the end of a 'config -> serialize pretty -> deserialize' chain"
);
}
}


@@ -4,7 +4,7 @@
// TODO: move all paths construction to conf impl
//
use anyhow::{bail, ensure, Context, Result};
use anyhow::{bail, Context, Result};
use postgres_ffi::ControlFileData;
use serde::{Deserialize, Serialize};
use std::{
@@ -14,14 +14,16 @@ use std::{
str::FromStr,
sync::Arc,
};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use tracing::*;
use log::*;
use zenith_utils::crashsafe_dir;
use zenith_utils::logging;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::tenant_mgr;
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::{repository::Repository, PageServerConf};
use crate::{restore_local_repo, LOG_FILE_NAME};
@@ -34,7 +36,7 @@ pub struct BranchInfo {
pub ancestor_id: Option<String>,
pub ancestor_lsn: Option<String>,
pub current_logical_size: usize,
pub current_logical_size_non_incremental: usize,
pub current_logical_size_non_incremental: Option<usize>,
}
impl BranchInfo {
@@ -43,6 +45,7 @@ impl BranchInfo {
conf: &PageServerConf,
tenantid: &ZTenantId,
repo: &Arc<dyn Repository>,
include_non_incremental_logical_size: bool,
) -> Result<Self> {
let name = path
.as_ref()
@@ -77,6 +80,14 @@ impl BranchInfo {
);
}
// Non-incremental size calculation can be heavy, so make it optional;
// it is only needed by tests that verify the size calculation.
let current_logical_size_non_incremental = include_non_incremental_logical_size
.then(|| {
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
})
.transpose()?;
Ok(BranchInfo {
name,
timeline_id,
@@ -84,8 +95,7 @@ impl BranchInfo {
ancestor_id,
ancestor_lsn,
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental: timeline
.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())?,
current_logical_size_non_incremental,
})
}
}
@@ -99,7 +109,7 @@ pub struct PointInTime {
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
// Initialize logger
// use true as the daemonize parameter, because otherwise we pollute the zenith CLI output with several pages of info messages
let (_scope_guard, _log_file) = logging::init(LOG_FILE_NAME, true)?;
let _log_file = logging::init(LOG_FILE_NAME, true)?;
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
// process during repository initialization.
@@ -118,7 +128,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
println!("initializing tenantid {}", tenantid);
create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?;
}
fs::create_dir_all(conf.tenants_path())?;
crashsafe_dir::create_dir_all(conf.tenants_path())?;
println!("pageserver init succeeded");
Ok(())
@@ -135,12 +145,12 @@ pub fn create_repo(
}
// top-level dir may exist if we are creating it through CLI
fs::create_dir_all(&repo_dir)
crashsafe_dir::create_dir_all(&repo_dir)
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
fs::create_dir(conf.timelines_path(&tenantid))?;
fs::create_dir_all(conf.branches_path(&tenantid))?;
fs::create_dir_all(conf.tags_path(&tenantid))?;
crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
info!("created directory structure in {}", repo_dir.display());
@@ -150,12 +160,13 @@ pub fn create_repo(
conf,
wal_redo_manager,
tenantid,
false,
));
// Load data into pageserver
// TODO To implement zenith import we need to
// move data loading out of create_repo()
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
bootstrap_timeline(conf, tenantid, tli, repo.as_ref())?;
Ok(repo)
}
@@ -174,13 +185,16 @@ fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
// to get bootstrap data for timeline initialization.
//
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
info!("running initdb... ");
info!("running initdb in {}... ", initdbpath.display());
let initdb_path = conf.pg_bin_dir().join("initdb");
let initdb_output = Command::new(initdb_path)
.args(&["-D", initdbpath.to_str().unwrap()])
.args(&["-U", &conf.superuser])
.arg("--no-instructions")
// This is only used for a temporary installation that is deleted shortly after,
// so no need to fsync it
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
@@ -193,7 +207,6 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
String::from_utf8_lossy(&initdb_output.stderr)
);
}
info!("initdb succeeded");
Ok(())
}
@@ -208,6 +221,8 @@ fn bootstrap_timeline(
tli: ZTimelineId,
repo: &dyn Repository,
) -> Result<()> {
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
// Init a temporary repo to get bootstrap data
@@ -216,13 +231,15 @@ fn bootstrap_timeline(
let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn);
// Import the contents of the data directory at the initial checkpoint
// LSN, and any WAL after that.
let timeline = repo.create_empty_timeline(tli)?;
restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
timeline.checkpoint()?;
restore_local_repo::import_timeline_from_postgres_datadir(
&pgdata_path,
timeline.writer().as_ref(),
lsn,
)?;
timeline.checkpoint(CheckpointConfig::Forced)?;
println!(
"created initial timeline {} timeline.lsn {}",
@@ -240,19 +257,11 @@ fn bootstrap_timeline(
Ok(())
}
pub(crate) fn get_tenants(conf: &PageServerConf) -> Result<Vec<String>> {
let tenants_dir = conf.tenants_path();
std::fs::read_dir(&tenants_dir)?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res?;
ensure!(dir_entry.file_type()?.is_dir());
Ok(dir_entry.file_name().to_str().unwrap().to_owned())
})
.collect()
}
pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
pub(crate) fn get_branches(
conf: &PageServerConf,
tenantid: &ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Vec<BranchInfo>> {
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
// Each branch has a corresponding record (text file) in the refs/branches
@@ -262,7 +271,13 @@ pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Resul
std::fs::read_dir(&branches_dir)?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res?;
BranchInfo::from_path(dir_entry.path(), conf, tenantid, &repo)
BranchInfo::from_path(
dir_entry.path(),
conf,
tenantid,
&repo,
include_non_incremental_logical_size,
)
})
.collect()
}
@@ -324,7 +339,7 @@ pub(crate) fn create_branch(
ancestor_id: None,
ancestor_lsn: None,
current_logical_size: 0,
current_logical_size_non_incremental: 0,
current_logical_size_non_incremental: Some(0),
})
}
@@ -413,7 +428,6 @@ fn create_timeline(
let timelinedir = conf.timeline_path(&timelineid, tenantid);
fs::create_dir(&timelinedir)?;
fs::create_dir(&timelinedir.join("wal"))?;
if let Some(ancestor) = ancestor {
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);


@@ -25,6 +25,11 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
@@ -73,6 +78,11 @@ paths:
required: true
schema:
type: string
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
@@ -164,13 +174,13 @@ paths:
description: Get tenants list
responses:
"200":
description: OK
description: TenantInfo
content:
application/json:
schema:
type: array
items:
type: string
$ref: "#/components/schemas/TenantInfo"
"401":
description: Unauthorized Error
content:
@@ -243,6 +253,16 @@ components:
scheme: bearer
bearerFormat: JWT
schemas:
TenantInfo:
type: object
required:
- id
- state
properties:
id:
type: string
state:
type: string
BranchInfo:
type: object
required:
@@ -250,7 +270,6 @@ components:
- timeline_id
- latest_valid_lsn
- current_logical_size
- current_logical_size_non_incremental
properties:
name:
type: string
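For illustration, a minimal serde model (hypothetical; the pageserver's actual Rust type is not shown in this excerpt) matching the TenantInfo schema above, applied to an array-of-TenantInfo body as returned by the tenant list endpoint:

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct TenantInfo {
    id: String,    // required by the schema
    state: String, // required by the schema
}

fn main() -> Result<(), serde_json::Error> {
    // A hypothetical tenant list response body:
    let body = r#"[{"id": "0123456789abcdef", "state": "Active"}]"#;
    let tenants: Vec<TenantInfo> = serde_json::from_str(body)?;
    println!("{} tenant(s)", tenants.len());
    Ok(())
}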


@@ -1,4 +1,3 @@
use std::str::FromStr;
use std::sync::Arc;
use anyhow::Result;
@@ -6,6 +5,7 @@ use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use routerify::{ext::RequestExt, RouterBuilder};
use tracing::*;
use zenith_utils::auth::JwtAuth;
use zenith_utils::http::endpoint::attach_openapi_ui;
use zenith_utils::http::endpoint::auth_middleware;
@@ -15,6 +15,8 @@ use zenith_utils::http::{
endpoint,
error::HttpErrorBody,
json::{json_request, json_response},
request::get_request_param,
request::parse_request_param,
};
use super::models::BranchCreateRequest;
@@ -56,33 +58,6 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
get_state(request).conf
}
fn get_request_param<'a>(
request: &'a Request<Body>,
param_name: &str,
) -> Result<&'a str, ApiError> {
match request.param(param_name) {
Some(arg) => Ok(arg),
None => {
return Err(ApiError::BadRequest(format!(
"no {} specified in path param",
param_name
)))
}
}
}
fn parse_request_param<T: FromStr>(
request: &Request<Body>,
param_name: &str,
) -> Result<T, ApiError> {
match get_request_param(request, param_name)?.parse() {
Ok(v) => Ok(v),
Err(_) => Err(ApiError::BadRequest(
"failed to parse tenant id".to_string(),
)),
}
}
// healthcheck handler
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
Ok(Response::builder()
@@ -98,6 +73,7 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
check_permission(&request, Some(request_data.tenant_id))?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
branches::create_branch(
get_config(&request),
&request_data.name,
@@ -110,29 +86,59 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
Ok(json_response(StatusCode::CREATED, response_data)?)
}
// Gate non-incremental logical size calculation behind a flag:
// after `pgbench -i -s100` the calculation took 28ms, so multiplied by the number of timelines
// and tenants it can take a noticeable amount of time. Also, the value is currently used only
// in tests (see the sketch after this function).
fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
request
.uri()
.query()
.map(|v| {
url::form_urlencoded::parse(v.as_bytes())
.into_owned()
.any(|(param, _)| param == "include-non-incremental-logical-size")
})
.unwrap_or(false)
}
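For illustration, a small standalone check (hypothetical query string; uses the same `url::form_urlencoded` parsing as the handler above) showing that the flag is enabled by the mere presence of the query parameter, whatever its value:

fn main() {
    let query = "include-non-incremental-logical-size&foo=bar";
    let present = url::form_urlencoded::parse(query.as_bytes())
        .into_owned()
        .any(|(param, _)| param == "include-non-incremental-logical-size");
    // Presence alone enables the calculation; no value is required.
    assert!(present);
}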
async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
check_permission(&request, Some(tenantid))?;
let response_data = tokio::task::spawn_blocking(move || {
crate::branches::get_branches(get_config(&request), &tenantid)
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
crate::branches::get_branches(
get_config(&request),
&tenantid,
include_non_incremental_logical_size,
)
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
}
// TODO add to swagger
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let branch_name: &str = get_request_param(&request, "branch_name")?;
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
let conf = get_state(&request).conf;
let path = conf.branch_path(branch_name, &tenantid);
let path = conf.branch_path(&branch_name, &tenantid);
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
BranchInfo::from_path(path, conf, &tenantid, &repo)
BranchInfo::from_path(
path,
conf,
&tenantid,
&repo,
include_non_incremental_logical_size,
)
})
.await
.map_err(ApiError::from_err)??;
@@ -144,10 +150,13 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
// check for management permission
check_permission(&request, None)?;
let response_data =
tokio::task::spawn_blocking(move || crate::branches::get_tenants(get_config(&request)))
.await
.map_err(ApiError::from_err)??;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_list").entered();
crate::tenant_mgr::list_tenants()
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
}
@@ -158,6 +167,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
let request_data: TenantCreateRequest = json_request(&mut request).await?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
})
.await

File diff suppressed because it is too large


@@ -25,11 +25,13 @@ OnDisk layers can be Image or Delta:
Dropped segments are always represented on disk by DeltaLayer.
LSN range defined by start_lsn and end_lsn:
- start_lsn is always inclusive.
- end_lsn depends on layer kind:
- InMemoryLayer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
- ImageLayer represents snapshot at one LSN, so end_lsn = lsn.
- DeltaLayer has explicit end_lsn, which represents end of incremental layer.
- start_lsn is inclusive.
- end_lsn is exclusive.
For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen
in-memory layer or a delta layer, it is a valid end bound. An image
layer represents a snapshot at one LSN, so end_lsn is always the
snapshot LSN + 1.
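As a rough illustration of this convention (a standalone sketch, not code from the repository; u64 stands in for the Lsn type):

// A layer covers the half-open range [start_lsn, end_lsn).
fn layer_covers(start_lsn: u64, end_lsn: u64, lsn: u64) -> bool {
    start_lsn <= lsn && lsn < end_lsn
}

fn main() {
    // An image layer taken at LSN 100 covers exactly that LSN: [100, 101).
    assert!(layer_covers(100, 101, 100));
    assert!(!layer_covers(100, 101, 101));
    // An open in-memory layer is unbounded above: end_lsn = MAX_LSN.
    assert!(layer_covers(100, u64::MAX, 123_456));
}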
Layers can be open or historical:
- Open layer is a writeable one. Only InMemory layer can be open.

View File

@@ -42,15 +42,13 @@ use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
};
use crate::repository::WALRecord;
use crate::waldecoder;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use bytes::Bytes;
use anyhow::{bail, ensure, Result};
use log::*;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use zenith_utils::vec_map::VecMap;
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
@@ -59,7 +57,7 @@ use std::fs::File;
use std::io::{BufWriter, Write};
use std::ops::Bound::Included;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex, MutexGuard};
use std::sync::{Mutex, MutexGuard};
use bookfile::{Book, BookWriter};
@@ -109,12 +107,6 @@ impl From<&DeltaLayer> for Summary {
}
}
#[derive(Serialize, Deserialize)]
struct PageVersionMeta {
page_image_range: Option<BlobRange>,
record_range: Option<BlobRange>,
}
///
/// DeltaLayer is the in-memory data structure associated with an
/// on-disk delta file. We keep a DeltaLayer in memory for each
@@ -139,9 +131,6 @@ pub struct DeltaLayer {
dropped: bool,
/// Predecessor layer
predecessor: Option<Arc<dyn Layer>>,
inner: Mutex<DeltaLayerInner>,
}
@@ -152,10 +141,10 @@ pub struct DeltaLayerInner {
/// All versions of all pages in the file are kept here.
/// Indexed by block number and LSN.
page_version_metas: BTreeMap<(u32, Lsn), PageVersionMeta>,
page_version_metas: VecMap<(u32, Lsn), BlobRange>,
/// `relsizes` tracks the size of the relation at different points in time.
relsizes: BTreeMap<Lsn, u32>,
relsizes: VecMap<Lsn, u32>,
}
impl Layer for DeltaLayer {
@@ -180,29 +169,7 @@ impl Layer for DeltaLayer {
}
fn filename(&self) -> PathBuf {
PathBuf::from(
DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
}
.to_string(),
)
}
fn path(&self) -> Option<PathBuf> {
Some(Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
))
PathBuf::from(self.layer_name().to_string())
}
/// Look up given page in the cache.
@@ -226,20 +193,22 @@ impl Layer for DeltaLayer {
// Scan the metadata BTreeMap backwards, starting from the given entry.
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let mut iter = inner
let iter = inner
.page_version_metas
.range((Included(&minkey), Included(&maxkey)));
while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
if let Some(img_range) = &entry.page_image_range {
.slice_range((Included(&minkey), Included(&maxkey)))
.iter()
.rev();
for ((_blknum, pv_lsn), blob_range) in iter {
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
if let Some(img) = pv.page_image {
// Found a page image, return it
let img = Bytes::from(read_blob(&page_version_reader, img_range)?);
reconstruct_data.page_img = Some(img);
need_image = false;
break;
} else if let Some(rec_range) = &entry.record_range {
let rec = WALRecord::des(&read_blob(&page_version_reader, rec_range)?)?;
} else if let Some(rec) = pv.record {
let will_init = rec.will_init;
reconstruct_data.records.push(rec);
reconstruct_data.records.push((*pv_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
@@ -255,16 +224,9 @@ impl Layer for DeltaLayer {
}
// If an older page image is needed to reconstruct the page, let the
// caller know about the predecessor layer.
// caller know.
if need_image {
if let Some(cont_layer) = &self.predecessor {
Ok(PageReconstructResult::Continue(
self.start_lsn,
Arc::clone(cont_layer),
))
} else {
Ok(PageReconstructResult::Missing(self.start_lsn))
}
Ok(PageReconstructResult::Continue(self.start_lsn))
} else {
Ok(PageReconstructResult::Complete)
}
@@ -273,21 +235,22 @@ impl Layer for DeltaLayer {
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
assert!(lsn >= self.start_lsn);
ensure!(
self.seg.rel.is_blocky(),
"get_seg_size() called on a non-blocky rel"
);
// Scan the BTreeMap backwards, starting from the given entry.
let inner = self.load()?;
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
let slice = inner
.relsizes
.slice_range((Included(&Lsn(0)), Included(&lsn)));
let result;
if let Some((_entry_lsn, entry)) = iter.next_back() {
result = *entry;
// Use the base image if needed
} else if let Some(predecessor) = &self.predecessor {
result = predecessor.get_seg_size(lsn)?;
if let Some((_entry_lsn, entry)) = slice.last() {
Ok(*entry)
} else {
result = 0;
Err(anyhow::anyhow!("could not find seg size in delta layer"))
}
Ok(result)
}
/// Does this segment exist at given LSN?
@@ -307,17 +270,15 @@ impl Layer for DeltaLayer {
///
fn unload(&self) -> Result<()> {
let mut inner = self.inner.lock().unwrap();
inner.page_version_metas = BTreeMap::new();
inner.relsizes = BTreeMap::new();
inner.page_version_metas = VecMap::default();
inner.relsizes = VecMap::default();
inner.loaded = false;
Ok(())
}
fn delete(&self) -> Result<()> {
// delete underlying file
if let Some(path) = self.path() {
fs::remove_file(path)?;
}
fs::remove_file(self.path())?;
Ok(())
}
@@ -334,22 +295,22 @@ impl Layer for DeltaLayer {
println!("--- relsizes ---");
let inner = self.load()?;
for (k, v) in inner.relsizes.iter() {
for (k, v) in inner.relsizes.as_slice() {
println!(" {}: {}", k, v);
}
println!("--- page versions ---");
let (_path, book) = self.open_book()?;
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
for (k, v) in inner.page_version_metas.iter() {
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
let mut desc = String::new();
if let Some(page_image_range) = v.page_image_range.as_ref() {
let image = read_blob(&chapter, page_image_range)?;
write!(&mut desc, " img {} bytes", image.len())?;
let buf = read_blob(&chapter, blob_range)?;
let pv = PageVersion::des(&buf)?;
if let Some(img) = pv.page_image.as_ref() {
write!(&mut desc, " img {} bytes", img.len())?;
}
if let Some(record_range) = v.record_range.as_ref() {
let record_bytes = read_blob(&chapter, record_range)?;
let rec = WALRecord::des(&record_bytes)?;
if let Some(rec) = pv.record.as_ref() {
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
write!(
&mut desc,
@@ -359,7 +320,7 @@ impl Layer for DeltaLayer {
wal_desc
)?;
}
println!(" blk {} at {}: {}", k.0, k.1, desc);
println!(" blk {} at {}: {}", blk, lsn, desc);
}
Ok(())
@@ -381,14 +342,15 @@ impl DeltaLayer {
}
}
/// Create a new delta file, using the given btreemaps containing the page versions and
/// relsizes.
/// Create a new delta file, using the given page versions and relsizes.
/// The page versions are passed by an iterator; the iterator must return
/// page versions in blknum+lsn order.
///
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
/// expedient.
#[allow(clippy::too_many_arguments)]
pub fn create(
pub fn create<'a>(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
@@ -396,10 +358,13 @@ impl DeltaLayer {
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
predecessor: Option<Arc<dyn Layer>>,
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
relsizes: BTreeMap<Lsn, u32>,
page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
relsizes: VecMap<Lsn, u32>,
) -> Result<DeltaLayer> {
if seg.rel.is_blocky() {
assert!(!relsizes.is_empty());
}
let delta_layer = DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
timelineid,
@@ -410,17 +375,14 @@ impl DeltaLayer {
dropped,
inner: Mutex::new(DeltaLayerInner {
loaded: true,
page_version_metas: BTreeMap::new(),
page_version_metas: VecMap::default(),
relsizes,
}),
predecessor,
};
let mut inner = delta_layer.inner.lock().unwrap();
// Write the in-memory btreemaps into a file
let path = delta_layer
.path()
.expect("DeltaLayer is supposed to have a layer path on disk");
let path = delta_layer.path();
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
@@ -430,42 +392,27 @@ impl DeltaLayer {
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
for (key, page_version) in page_versions {
let page_image_range = page_version
.page_image
.map(|page_image| page_version_writer.write_blob(page_image.as_ref()))
.transpose()?;
for (blknum, lsn, page_version) in page_versions {
let buf = PageVersion::ser(page_version)?;
let blob_range = page_version_writer.write_blob(&buf)?;
let record_range = page_version
.record
.map(|record| {
let buf = WALRecord::ser(&record)?;
page_version_writer.write_blob(&buf)
})
.transpose()?;
let old = inner.page_version_metas.insert(
key,
PageVersionMeta {
page_image_range,
record_range,
},
);
assert!(old.is_none());
inner
.page_version_metas
.append((blknum, lsn), blob_range)
.unwrap();
}
let book = page_version_writer.close()?;
// Write out page versions
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
let buf = BTreeMap::ser(&inner.page_version_metas)?;
let buf = VecMap::ser(&inner.page_version_metas)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
// and relsizes to separate chapter
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
let buf = BTreeMap::ser(&inner.relsizes)?;
let buf = VecMap::ser(&inner.relsizes)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
@@ -484,7 +431,8 @@ impl DeltaLayer {
let book = chapter.close()?;
// This flushes the underlying 'buf_writer'.
book.close()?;
let writer = book.close()?;
writer.get_ref().sync_all()?;
trace!("saved {}", &path.display());
@@ -494,17 +442,7 @@ impl DeltaLayer {
}
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
let path = Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
);
let path = self.path();
let file = File::open(&path)?;
let book = Book::new(file)?;
@@ -551,10 +489,10 @@ impl DeltaLayer {
}
let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
let page_version_metas = BTreeMap::des(&chapter)?;
let page_version_metas = VecMap::des(&chapter)?;
let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
let relsizes = BTreeMap::des(&chapter)?;
let relsizes = VecMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
@@ -573,7 +511,6 @@ impl DeltaLayer {
timelineid: ZTimelineId,
tenantid: ZTenantId,
filename: &DeltaFileName,
predecessor: Option<Arc<dyn Layer>>,
) -> DeltaLayer {
DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
@@ -585,10 +522,9 @@ impl DeltaLayer {
dropped: filename.dropped,
inner: Mutex::new(DeltaLayerInner {
loaded: false,
page_version_metas: BTreeMap::new(),
relsizes: BTreeMap::new(),
page_version_metas: VecMap::default(),
relsizes: VecMap::default(),
}),
predecessor,
}
}
@@ -609,10 +545,28 @@ impl DeltaLayer {
dropped: summary.dropped,
inner: Mutex::new(DeltaLayerInner {
loaded: false,
page_version_metas: BTreeMap::new(),
relsizes: BTreeMap::new(),
page_version_metas: VecMap::default(),
relsizes: VecMap::default(),
}),
predecessor: None,
})
}
fn layer_name(&self) -> DeltaFileName {
DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
}
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&self.layer_name(),
)
}
}


@@ -13,6 +13,8 @@ use anyhow::Result;
use log::*;
use zenith_utils::lsn::Lsn;
use super::metadata::METADATA_FILE_NAME;
// Note: LayeredTimeline::load_layer_map() relies on this sort order
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct DeltaFileName {
@@ -35,7 +37,7 @@ impl DeltaFileName {
/// Parse a string as a delta file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn from_str(fname: &str) -> Option<Self> {
pub fn parse_str(fname: &str) -> Option<Self> {
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
@@ -168,7 +170,7 @@ impl ImageFileName {
/// Parse a string as an image file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn from_str(fname: &str) -> Option<Self> {
pub fn parse_str(fname: &str) -> Option<Self> {
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
@@ -286,11 +288,11 @@ pub fn list_files(
let fname = direntry?.file_name();
let fname = fname.to_str().unwrap();
if let Some(deltafilename) = DeltaFileName::from_str(fname) {
if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
deltafiles.push(deltafilename);
} else if let Some(imgfilename) = ImageFileName::from_str(fname) {
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
imgfiles.push(imgfilename);
} else if fname == "wal" || fname == "metadata" || fname == "ancestor" {
} else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") {
// ignore these
} else {
warn!("unrecognized filename in timeline dir: {}", fname);


@@ -114,25 +114,7 @@ pub struct ImageLayerInner {
impl Layer for ImageLayer {
fn filename(&self) -> PathBuf {
PathBuf::from(
ImageFileName {
seg: self.seg,
lsn: self.lsn,
}
.to_string(),
)
}
fn path(&self) -> Option<PathBuf> {
Some(Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&ImageFileName {
seg: self.seg,
lsn: self.lsn,
},
))
PathBuf::from(self.layer_name().to_string())
}
fn get_timeline_id(&self) -> ZTimelineId {
@@ -152,7 +134,8 @@ impl Layer for ImageLayer {
}
fn get_end_lsn(&self) -> Lsn {
self.lsn
// End-bound is exclusive
self.lsn + 1
}
/// Look up given page in the file
@@ -221,9 +204,7 @@ impl Layer for ImageLayer {
fn delete(&self) -> Result<()> {
// delete underlying file
if let Some(path) = self.path() {
fs::remove_file(path)?;
}
fs::remove_file(self.path())?;
Ok(())
}
@@ -299,9 +280,7 @@ impl ImageLayer {
let inner = layer.inner.lock().unwrap();
// Write the images into a file
let path = layer
.path()
.expect("ImageLayer is supposed to have a layer path on disk");
let path = layer.path();
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let file = File::create(&path)?;
@@ -336,9 +315,10 @@ impl ImageLayer {
let book = chapter.close()?;
// This flushes the underlying 'buf_writer'.
book.close()?;
let writer = book.close()?;
writer.get_ref().sync_all()?;
trace!("saved {}", &path.display());
trace!("saved {}", path.display());
drop(inner);
@@ -443,15 +423,7 @@ impl ImageLayer {
}
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
let path = Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&ImageFileName {
seg: self.seg,
lsn: self.lsn,
},
);
let path = self.path();
let file = File::open(&path)?;
let book = Book::new(file)?;
@@ -498,4 +470,21 @@ impl ImageLayer {
}),
})
}
fn layer_name(&self) -> ImageFileName {
ImageFileName {
seg: self.seg,
lsn: self.lsn,
}
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&self.layer_name(),
)
}
}


@@ -12,16 +12,17 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
use crate::repository::WALRecord;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use anyhow::{bail, ensure, Result};
use bytes::Bytes;
use log::*;
use std::collections::BTreeMap;
use std::ops::Bound::Included;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::sync::{Arc, RwLock};
use zenith_utils::vec_map::VecMap;
use zenith_utils::lsn::Lsn;
use super::page_versions::PageVersions;
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenantid: ZTenantId,
@@ -34,57 +35,56 @@ pub struct InMemoryLayer {
///
start_lsn: Lsn,
/// Frozen in-memory layers have an inclusive end LSN.
end_lsn: Option<Lsn>,
/// LSN of the oldest page version stored in this layer
oldest_pending_lsn: Lsn,
/// The above fields never change. The parts that do change are in 'inner',
/// and protected by mutex.
inner: Mutex<InMemoryLayerInner>,
inner: RwLock<InMemoryLayerInner>,
/// Predecessor layer might be needed?
incremental: bool,
}
pub struct InMemoryLayerInner {
/// Frozen in-memory layers have an exclusive end LSN.
/// Writes are only allowed when this is None
end_lsn: Option<Lsn>,
/// If this relation was dropped, remember when that happened.
drop_lsn: Option<Lsn>,
/// The drop LSN is recorded in [`end_lsn`].
dropped: bool,
///
/// All versions of all pages in the layer are kept here.
/// Indexed by block number and LSN.
///
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
page_versions: PageVersions,
///
/// `segsizes` tracks the size of the segment at different points in time.
///
segsizes: BTreeMap<Lsn, u32>,
/// Writes are only allowed when true.
/// Set to false when this layer is in the process of being replaced.
writeable: bool,
/// Predecessor layer
predecessor: Option<Arc<dyn Layer>>,
/// For a blocky rel, there is always one entry, at the layer's start_lsn,
/// so that determining the size never depends on the predecessor layer. For
/// a non-blocky rel, 'segsizes' is not used and is always empty.
///
segsizes: VecMap<Lsn, u32>,
}
impl InMemoryLayerInner {
fn check_writeable(&self) -> WriteResult<()> {
if self.writeable {
Ok(())
} else {
Err(NonWriteableError)
}
fn assert_writeable(&self) {
assert!(self.end_lsn.is_none());
}
fn get_seg_size(&self, lsn: Lsn) -> u32 {
// Scan the BTreeMap backwards, starting from the given entry.
let mut iter = self.segsizes.range((Included(&Lsn(0)), Included(&lsn)));
let slice = self.segsizes.slice_range(..=lsn);
if let Some((_entry_lsn, entry)) = iter.next_back() {
// We make sure there is always at least one entry
if let Some((_entry_lsn, entry)) = slice.last() {
*entry
} else {
0
panic!("could not find seg size in in-memory layer");
}
}
}
@@ -93,33 +93,26 @@ impl Layer for InMemoryLayer {
// An in-memory layer doesn't really have a filename as it's not stored on disk,
// but we construct a filename as if it was a delta layer
fn filename(&self) -> PathBuf {
let inner = self.inner.lock().unwrap();
let inner = self.inner.read().unwrap();
let end_lsn;
let dropped;
if let Some(drop_lsn) = inner.drop_lsn {
if let Some(drop_lsn) = inner.end_lsn {
end_lsn = drop_lsn;
dropped = true;
} else {
end_lsn = Lsn(u64::MAX);
dropped = false;
}
let delta_filename = DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn,
dropped,
dropped: inner.dropped,
}
.to_string();
PathBuf::from(format!("inmem-{}", delta_filename))
}
fn path(&self) -> Option<PathBuf> {
None
}
fn get_timeline_id(&self) -> ZTimelineId {
self.timelineid
}
@@ -133,22 +126,18 @@ impl Layer for InMemoryLayer {
}
fn get_end_lsn(&self) -> Lsn {
if let Some(end_lsn) = self.end_lsn {
return Lsn(end_lsn.0 + 1);
}
let inner = self.inner.read().unwrap();
let inner = self.inner.lock().unwrap();
if let Some(drop_lsn) = inner.drop_lsn {
drop_lsn
if let Some(end_lsn) = inner.end_lsn {
end_lsn
} else {
Lsn(u64::MAX)
}
}
fn is_dropped(&self) -> bool {
let inner = self.inner.lock().unwrap();
inner.drop_lsn.is_some()
let inner = self.inner.read().unwrap();
inner.dropped
}
/// Look up given page in the cache.
@@ -162,24 +151,22 @@ impl Layer for InMemoryLayer {
assert!(self.seg.blknum_in_seg(blknum));
let predecessor: Option<Arc<dyn Layer>>;
{
let inner = self.inner.lock().unwrap();
let inner = self.inner.read().unwrap();
// Scan the BTreeMap backwards, starting from reconstruct_data.lsn.
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let mut iter = inner
// Scan the page versions backwards, starting from `lsn`.
let iter = inner
.page_versions
.range((Included(&minkey), Included(&maxkey)));
while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
.get_block_lsn_range(blknum, ..=lsn)
.iter()
.rev();
for (entry_lsn, entry) in iter {
if let Some(img) = &entry.page_image {
reconstruct_data.page_img = Some(img.clone());
need_image = false;
break;
} else if let Some(rec) = &entry.record {
reconstruct_data.records.push(rec.clone());
reconstruct_data.records.push((*entry_lsn, rec.clone()));
if rec.will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
@@ -190,16 +177,14 @@ impl Layer for InMemoryLayer {
bail!("no page image or WAL record for requested page");
}
}
predecessor = inner.predecessor.clone();
// release lock on 'inner'
}
// If an older page image is needed to reconstruct the page, let the
// caller know about the predecessor layer.
// caller know
if need_image {
if let Some(cont_layer) = predecessor {
Ok(PageReconstructResult::Continue(self.start_lsn, cont_layer))
if self.incremental {
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
} else {
Ok(PageReconstructResult::Missing(self.start_lsn))
}
@@ -211,14 +196,18 @@ impl Layer for InMemoryLayer {
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
assert!(lsn >= self.start_lsn);
ensure!(
self.seg.rel.is_blocky(),
"get_seg_size() called on a non-blocky rel"
);
let inner = self.inner.lock().unwrap();
let inner = self.inner.read().unwrap();
Ok(inner.get_seg_size(lsn))
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
let inner = self.inner.lock().unwrap();
let inner = self.inner.read().unwrap();
// If the segment was created after the requested LSN,
// it doesn't exist in the layer. But we shouldn't
@@ -226,8 +215,8 @@ impl Layer for InMemoryLayer {
assert!(lsn >= self.start_lsn);
// Is the requested LSN after the segment was dropped?
if let Some(drop_lsn) = inner.drop_lsn {
if lsn >= drop_lsn {
if let Some(end_lsn) = inner.end_lsn {
if lsn >= end_lsn {
return Ok(false);
}
}
@@ -250,36 +239,35 @@ impl Layer for InMemoryLayer {
}
fn is_incremental(&self) -> bool {
let inner = self.inner.lock().unwrap();
inner.predecessor.is_some()
self.incremental
}
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
let inner = self.inner.lock().unwrap();
let inner = self.inner.read().unwrap();
let end_str = inner
.drop_lsn
.end_lsn
.as_ref()
.map(|drop_lsn| drop_lsn.to_string())
.map(Lsn::to_string)
.unwrap_or_default();
println!(
"----- in-memory layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, end_str
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
);
for (k, v) in inner.segsizes.iter() {
for (k, v) in inner.segsizes.as_slice() {
println!("segsizes {}: {}", k, v);
}
for (k, v) in inner.page_versions.iter() {
for (blknum, lsn, pv) in inner.page_versions.ordered_page_version_iter(None) {
println!(
"blk {} at {}: {}/{}\n",
k.0,
k.1,
v.page_image.is_some(),
v.record.is_some()
blknum,
lsn,
pv.page_image.is_some(),
pv.record.is_some()
);
}
@@ -287,26 +275,19 @@ impl Layer for InMemoryLayer {
}
}
/// Write failed because the layer is in process of being replaced.
/// See [`LayeredTimeline::perform_write_op`] for how to handle this error.
#[derive(Debug)]
pub struct NonWriteableError;
/// A result of an inmemory layer data being written to disk.
pub struct LayersOnDisk {
pub delta_layers: Vec<DeltaLayer>,
pub image_layers: Vec<ImageLayer>,
}
pub type WriteResult<T> = std::result::Result<T, NonWriteableError>;
/// Helper struct to cleanup `InMemoryLayer::freeze` return signature.
pub struct FreezeLayers {
/// Replacement layer for the layer which freeze was called on.
pub frozen: Arc<InMemoryLayer>,
/// New open layer containing leftover data.
pub open: Option<Arc<InMemoryLayer>>,
impl LayersOnDisk {
pub fn is_empty(&self) -> bool {
self.delta_layers.is_empty() && self.image_layers.is_empty()
}
}
impl InMemoryLayer {
fn assert_not_frozen(&self) {
assert!(self.end_lsn.is_none());
}
/// Return the oldest page version that's stored in this layer
pub fn get_oldest_pending_lsn(&self) -> Lsn {
self.oldest_pending_lsn
@@ -330,20 +311,25 @@ impl InMemoryLayer {
start_lsn
);
// The segment is initially empty, so initialize 'segsizes' with 0.
let mut segsizes = VecMap::default();
if seg.rel.is_blocky() {
segsizes.append(start_lsn, 0).unwrap();
}
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
seg,
start_lsn,
end_lsn: None,
oldest_pending_lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: BTreeMap::new(),
segsizes: BTreeMap::new(),
writeable: true,
predecessor: None,
incremental: false,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
dropped: false,
page_versions: PageVersions::default(),
segsizes,
}),
})
}
@@ -351,10 +337,10 @@ impl InMemoryLayer {
// Write operations
/// Remember new page version, as a WAL record over previous version
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> WriteResult<u32> {
pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> u32 {
self.put_page_version(
blknum,
rec.lsn,
lsn,
PageVersion {
page_image: None,
record: Some(rec),
@@ -363,7 +349,7 @@ impl InMemoryLayer {
}
/// Remember new page version, as a full page image
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> WriteResult<u32> {
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> u32 {
self.put_page_version(
blknum,
lsn,
@@ -376,8 +362,7 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> WriteResult<u32> {
self.assert_not_frozen();
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> u32 {
assert!(self.seg.blknum_in_seg(blknum));
trace!(
@@ -387,11 +372,11 @@ impl InMemoryLayer {
self.timelineid,
lsn
);
let mut inner = self.inner.lock().unwrap();
let mut inner = self.inner.write().unwrap();
inner.check_writeable()?;
inner.assert_writeable();
let old = inner.page_versions.insert((blknum, lsn), pv);
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
if old.is_some() {
// We already had an entry for this LSN. That's odd..
@@ -405,7 +390,7 @@ impl InMemoryLayer {
if self.seg.rel.is_blocky() {
let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1;
// use inner get_seg_size, since calling self.get_seg_size will try to acquire self.inner.lock
// use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock,
// which we've just acquired above
let oldsize = inner.get_seg_size(lsn);
if newsize > oldsize {
@@ -436,7 +421,9 @@ impl InMemoryLayer {
gapblknum,
blknum
);
let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
let old = inner
.page_versions
.append_or_update_last(gapblknum, lsn, zeropv);
// We already had an entry for this LSN. That's odd..
if old.is_some() {
@@ -447,48 +434,47 @@ impl InMemoryLayer {
}
}
inner.segsizes.insert(lsn, newsize);
return Ok(newsize - oldsize);
inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
return newsize - oldsize;
}
}
Ok(0)
0
}
/// Remember that the relation was truncated at given LSN
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> {
self.assert_not_frozen();
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) {
assert!(
self.seg.rel.is_blocky(),
"put_truncation() called on a non-blocky rel"
);
let mut inner = self.inner.lock().unwrap();
inner.check_writeable()?;
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
// check that we truncate to a smaller size than the segment had before the truncation
let oldsize = inner.get_seg_size(lsn);
assert!(segsize < oldsize);
let old = inner.segsizes.insert(lsn, segsize);
let old = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Inserting truncation, but had an entry for the LSN already");
}
Ok(())
}
/// Remember that the segment was dropped at given LSN
pub fn drop_segment(&self, lsn: Lsn) -> WriteResult<()> {
self.assert_not_frozen();
pub fn drop_segment(&self, lsn: Lsn) {
let mut inner = self.inner.write().unwrap();
let mut inner = self.inner.lock().unwrap();
assert!(inner.end_lsn.is_none());
assert!(!inner.dropped);
inner.dropped = true;
assert!(self.start_lsn < lsn);
inner.end_lsn = Some(lsn);
inner.check_writeable()?;
assert!(inner.drop_lsn.is_none());
inner.drop_lsn = Some(lsn);
info!("dropped segment {} at {}", self.seg, lsn);
Ok(())
trace!("dropped segment {} at {}", self.seg, lsn);
}
///
@@ -505,6 +491,9 @@ impl InMemoryLayer {
) -> Result<InMemoryLayer> {
let seg = src.get_seg_tag();
assert!(oldest_pending_lsn.is_aligned());
assert!(oldest_pending_lsn >= start_lsn);
trace!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
seg,
@@ -512,11 +501,11 @@ impl InMemoryLayer {
start_lsn,
);
// For convenience, copy the segment size from the predecessor layer
let mut segsizes = BTreeMap::new();
// Copy the segment size at the start LSN from the predecessor layer.
let mut segsizes = VecMap::default();
if seg.rel.is_blocky() {
let size = src.get_seg_size(start_lsn)?;
segsizes.insert(start_lsn, size);
segsizes.append(start_lsn, size).unwrap();
}
Ok(InMemoryLayer {
@@ -525,117 +514,43 @@ impl InMemoryLayer {
tenantid,
seg,
start_lsn,
end_lsn: None,
oldest_pending_lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: None,
page_versions: BTreeMap::new(),
incremental: true,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
dropped: false,
page_versions: PageVersions::default(),
segsizes,
writeable: true,
predecessor: Some(src),
}),
})
}
/// Splits `self` into two InMemoryLayers: `frozen` and `open`.
/// All data up to and including `cutoff_lsn` (or the drop LSN, if dropped)
/// is copied to `frozen`, while the remaining data is copied to `open`.
/// After completion, self is non-writeable, but not frozen.
pub fn freeze(&self, cutoff_lsn: Lsn) -> Result<FreezeLayers> {
info!(
"freezing in memory layer for {} on timeline {} at {}",
self.seg, self.timelineid, cutoff_lsn
);
pub fn is_writeable(&self) -> bool {
let inner = self.inner.read().unwrap();
inner.end_lsn.is_none()
}
self.assert_not_frozen();
/// Make the layer non-writeable. Only call once.
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is inclusive
pub fn freeze(&self, end_lsn: Lsn) {
let mut inner = self.inner.write().unwrap();
let mut inner = self.inner.lock().unwrap();
assert!(inner.writeable);
inner.writeable = false;
// Normally, use the cutoff LSN as the end of the frozen layer.
// But if the relation was dropped, we know that there are no
// more changes coming in for it, and in particular we know that
// there are no changes "in flight" for the LSN anymore, so we use
// the drop LSN instead. The drop-LSN could be ahead of the
// caller-specified LSN!
let dropped = inner.drop_lsn.is_some();
let end_lsn = if dropped {
inner.drop_lsn.unwrap()
if inner.end_lsn.is_some() {
assert!(inner.dropped);
} else {
cutoff_lsn
};
assert!(!inner.dropped);
assert!(self.start_lsn < end_lsn + 1);
inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
// Divide all the page versions into old and new at the 'end_lsn' cutoff point.
let mut before_page_versions;
let mut before_segsizes;
let mut after_page_versions;
let mut after_segsizes;
if !dropped {
before_segsizes = BTreeMap::new();
after_segsizes = BTreeMap::new();
for (lsn, size) in inner.segsizes.iter() {
if *lsn > end_lsn {
after_segsizes.insert(*lsn, *size);
} else {
before_segsizes.insert(*lsn, *size);
}
if let Some((lsn, _)) = inner.segsizes.as_slice().last() {
assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
}
before_page_versions = BTreeMap::new();
after_page_versions = BTreeMap::new();
for ((blknum, lsn), pv) in inner.page_versions.iter() {
if *lsn > end_lsn {
after_page_versions.insert((*blknum, *lsn), pv.clone());
} else {
before_page_versions.insert((*blknum, *lsn), pv.clone());
}
for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
assert!(lsn <= end_lsn);
}
} else {
before_page_versions = inner.page_versions.clone();
before_segsizes = inner.segsizes.clone();
after_segsizes = BTreeMap::new();
after_page_versions = BTreeMap::new();
}
let frozen = Arc::new(InMemoryLayer {
conf: self.conf,
tenantid: self.tenantid,
timelineid: self.timelineid,
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: Some(end_lsn),
oldest_pending_lsn: self.start_lsn,
inner: Mutex::new(InMemoryLayerInner {
drop_lsn: inner.drop_lsn,
page_versions: before_page_versions,
segsizes: before_segsizes,
writeable: false,
predecessor: inner.predecessor.clone(),
}),
});
let open = if !dropped && (!after_segsizes.is_empty() || !after_page_versions.is_empty()) {
let mut new_open = Self::create_successor_layer(
self.conf,
frozen.clone(),
self.timelineid,
self.tenantid,
end_lsn,
end_lsn,
)?;
let new_inner = new_open.inner.get_mut().unwrap();
new_inner.page_versions.append(&mut after_page_versions);
new_inner.segsizes.append(&mut after_segsizes);
Some(Arc::new(new_open))
} else {
None
};
// TODO could we avoid creating the `frozen` if it contains no data
Ok(FreezeLayers { frozen, open })
}
/// Write the this frozen in-memory layer to disk.
@@ -646,40 +561,62 @@ impl InMemoryLayer {
/// WAL records between start and end LSN. (The delta layer is not needed
/// when a new relish is created with a single LSN, so that the start and
/// end LSN are the same.)
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<Vec<Arc<dyn Layer>>> {
let end_lsn = self.end_lsn.expect("can only write frozen layers to disk");
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<LayersOnDisk> {
trace!(
"write_to_disk {} get_end_lsn is {}",
self.filename().display(),
self.get_end_lsn()
);
let inner = self.inner.lock().unwrap();
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
// though: another thread might have grabbed a reference to this layer
// in `get_layer_for_write' just before the checkpointer called
// `freeze`, and then `write_to_disk` on it. When the thread gets the
// lock, it will see that it's not writeable anymore and retry, but it
// would have to wait until we release it. That race condition is very
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().unwrap();
let end_lsn_exclusive = inner.end_lsn.unwrap();
let drop_lsn = inner.drop_lsn;
let predecessor = inner.predecessor.clone();
let mut before_page_versions;
let mut before_segsizes;
if inner.drop_lsn.is_none() {
before_segsizes = BTreeMap::new();
for (lsn, size) in inner.segsizes.iter() {
if *lsn <= end_lsn {
before_segsizes.insert(*lsn, *size);
}
}
before_page_versions = BTreeMap::new();
for ((blknum, lsn), pv) in inner.page_versions.iter() {
if *lsn < end_lsn {
before_page_versions.insert((*blknum, *lsn), pv.clone());
}
}
} else {
before_page_versions = inner.page_versions.clone();
before_segsizes = inner.segsizes.clone();
if inner.dropped {
let delta_layer = DeltaLayer::create(
self.conf,
self.timelineid,
self.tenantid,
self.seg,
self.start_lsn,
end_lsn_exclusive,
true,
inner.page_versions.ordered_page_version_iter(None),
inner.segsizes.clone(),
)?;
trace!(
"freeze: created delta layer for dropped segment {} {}-{}",
self.seg,
self.start_lsn,
end_lsn_exclusive
);
return Ok(LayersOnDisk {
delta_layers: vec![delta_layer],
image_layers: Vec::new(),
});
}
drop(inner);
// The stored `end_lsn` is exclusive, so subtract 1 to get the last included LSN.
// We want to make an ImageLayer for that last included LSN,
// so the DeltaLayer should exclude it.
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
let mut frozen_layers: Vec<Arc<dyn Layer>> = Vec::new();
let mut page_versions = inner
.page_versions
.ordered_page_version_iter(Some(end_lsn_inclusive));
if self.start_lsn != end_lsn {
let mut delta_layers = Vec::new();
if self.start_lsn != end_lsn_inclusive {
let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive);
// Write the page versions before the cutoff to disk.
let delta_layer = DeltaLayer::create(
self.conf,
@@ -687,35 +624,36 @@ impl InMemoryLayer {
self.tenantid,
self.seg,
self.start_lsn,
end_lsn,
drop_lsn.is_some(),
predecessor,
before_page_versions,
before_segsizes,
end_lsn_inclusive,
false,
page_versions,
segsizes,
)?;
frozen_layers.push(Arc::new(delta_layer));
delta_layers.push(delta_layer);
trace!(
"freeze: created delta layer {} {}-{}",
self.seg,
self.start_lsn,
end_lsn
end_lsn_inclusive
);
} else {
assert!(before_page_versions.is_empty());
assert!(page_versions.next().is_none());
}
if drop_lsn.is_none() {
// Write a new base image layer at the cutoff point
let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
frozen_layers.push(Arc::new(image_layer));
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
}
drop(inner);
Ok(frozen_layers)
}
// Write a new base image layer at the cutoff point
let image_layer =
ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
trace!(
"freeze: created image layer {} at {}",
self.seg,
end_lsn_inclusive
);
pub fn update_predecessor(&self, predecessor: Arc<dyn Layer>) -> Option<Arc<dyn Layer>> {
let mut inner = self.inner.lock().unwrap();
inner.predecessor.replace(predecessor)
Ok(LayersOnDisk {
delta_layers,
image_layers: vec![image_layer],
})
}
}


@@ -0,0 +1,468 @@
///
/// IntervalTree is a data structure for holding intervals. It is generic
/// to make unit testing possible, but its only real user is the layer map.
///
/// It's inspired by the "segment tree" or a "statistic tree" as described in
/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold
/// the points instead of a binary tree. This is called an "interval tree" instead
/// of "segment tree" because the term "segment" is already using Zenith to mean
/// something else. To add to the confusion, there is another data structure known
/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree),
/// for storing intervals, but this isn't that.
///
/// The basic idea is to have a B-tree of "interesting Points". At each Point,
/// there is a list of intervals that contain the point. The Points are formed
/// from the start bounds of each interval; there is a Point for each distinct
/// start bound.
///
/// Operations:
///
/// To find intervals that contain a given point, you search the b-tree to find
/// the nearest Point <= search key. Then you just return the list of intervals.
///
/// To insert an interval, find the Point with start key equal to the inserted item.
/// If the Point doesn't exist yet, create it, by copying all the items from the
/// previous Point that cover the new Point. Then walk right, inserting the new
/// interval to all the Points that are contained by the new interval (including the
/// newly created Point).
///
/// To remove an interval, you scan the tree for all the Points that are contained by
/// the removed interval, and remove it from the list in each Point.
///
/// Requirements and assumptions:
///
/// - Can store overlapping items
/// - But there are not many overlapping items
/// - The interval bounds don't change after it is added to the tree
/// - Intervals are uniquely identified by pointer equality. You must not insert the
/// same interval object twice, and `remove` uses pointer equality to remove the right
/// interval. It is OK to have two intervals with the same bounds, however.
///
use std::collections::BTreeMap;
use std::fmt::Debug;
use std::ops::Range;
use std::sync::Arc;
pub struct IntervalTree<I: ?Sized>
where
I: IntervalItem,
{
points: BTreeMap<I::Key, Point<I>>,
}
struct Point<I: ?Sized> {
/// All intervals that contain this point, in no particular order.
///
/// We assume that there aren't a lot of overlapping intervals, so that this vector
/// never grows very large. If that assumption doesn't hold, we could keep this ordered
/// by the end bound, to speed up `search`. But as long as there are only a few elements,
/// a linear search is OK.
elements: Vec<Arc<I>>,
}
/// Abstraction for an interval that can be stored in the tree
///
/// The start bound is inclusive and the end bound is exclusive. End must be greater
/// than start.
pub trait IntervalItem {
type Key: Ord + Copy + Debug + Sized;
fn start_key(&self) -> Self::Key;
fn end_key(&self) -> Self::Key;
fn bounds(&self) -> Range<Self::Key> {
self.start_key()..self.end_key()
}
}
impl<I: ?Sized> IntervalTree<I>
where
I: IntervalItem,
{
/// Return an element that contains 'key', or precedes it.
///
/// If there are multiple candidates, returns the one with the highest 'end' key.
pub fn search(&self, key: I::Key) -> Option<Arc<I>> {
// Find the greatest point that precedes or is equal to the search key. If there is
// none, returns None.
let (_, p) = self.points.range(..=key).next_back()?;
// Find the element with the highest end key at this point
let highest_item = p
.elements
.iter()
.reduce(|a, b| {
// starting with Rust 1.53, could use `std::cmp::max_by_key` here
if a.end_key() > b.end_key() {
a
} else {
b
}
})
.unwrap();
Some(Arc::clone(highest_item))
}
/// Iterate over all items with start bound >= 'key'
pub fn iter_newer(&self, key: I::Key) -> IntervalIter<I> {
IntervalIter {
point_iter: self.points.range(key..),
elem_iter: None,
}
}
/// Iterate over all items
pub fn iter(&self) -> IntervalIter<I> {
IntervalIter {
point_iter: self.points.range(..),
elem_iter: None,
}
}
pub fn insert(&mut self, item: Arc<I>) {
let start_key = item.start_key();
let end_key = item.end_key();
assert!(start_key < end_key);
let bounds = start_key..end_key;
// Find the starting point and walk forward from there
let mut found_start_point = false;
let iter = self.points.range_mut(bounds);
for (point_key, point) in iter {
if *point_key == start_key {
found_start_point = true;
// It is an error to insert the same item to the tree twice.
assert!(
!point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
"interval is already in the tree"
);
}
point.elements.push(Arc::clone(&item));
}
if !found_start_point {
// Create a new Point for the starting point
// Look at the previous point, and copy over elements that overlap with this
// new point
let mut new_elements: Vec<Arc<I>> = Vec::new();
if let Some((_, prev_point)) = self.points.range(..start_key).next_back() {
let overlapping_prev_elements = prev_point
.elements
.iter()
.filter(|x| x.bounds().contains(&start_key))
.cloned();
new_elements.extend(overlapping_prev_elements);
}
new_elements.push(item);
let new_point = Point {
elements: new_elements,
};
self.points.insert(start_key, new_point);
}
}
pub fn remove(&mut self, item: &Arc<I>) {
// range search points
let start_key = item.start_key();
let end_key = item.end_key();
let bounds = start_key..end_key;
let mut points_to_remove: Vec<I::Key> = Vec::new();
let mut found_start_point = false;
for (point_key, point) in self.points.range_mut(bounds) {
if *point_key == start_key {
found_start_point = true;
}
let len_before = point.elements.len();
point.elements.retain(|other| !Arc::ptr_eq(other, item));
let len_after = point.elements.len();
assert_eq!(len_after + 1, len_before);
if len_after == 0 {
points_to_remove.push(*point_key);
}
}
assert!(found_start_point);
for k in points_to_remove {
self.points.remove(&k).unwrap();
}
}
}
pub struct IntervalIter<'a, I: ?Sized>
where
I: IntervalItem,
{
point_iter: std::collections::btree_map::Range<'a, I::Key, Point<I>>,
elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc<I>>)>,
}
impl<'a, I> Iterator for IntervalIter<'a, I>
where
I: IntervalItem + ?Sized,
{
type Item = Arc<I>;
fn next(&mut self) -> Option<Self::Item> {
// Iterate over all elements in all the points in 'point_iter'. To avoid
// returning the same element twice, we only return each element at its
// starting point.
loop {
// Return next remaining element from the current point
if let Some((point_key, elem_iter)) = &mut self.elem_iter {
for elem in elem_iter {
if elem.start_key() == *point_key {
return Some(Arc::clone(elem));
}
}
}
// No more elements at this point. Move to next point.
if let Some((point_key, point)) = self.point_iter.next() {
self.elem_iter = Some((*point_key, point.elements.iter()));
continue;
} else {
// No more points, all done
return None;
}
}
}
}
impl<I: ?Sized> Default for IntervalTree<I>
where
I: IntervalItem,
{
fn default() -> Self {
IntervalTree {
points: BTreeMap::new(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fmt;
#[derive(Debug)]
struct MockItem {
start_key: u32,
end_key: u32,
val: String,
}
impl IntervalItem for MockItem {
type Key = u32;
fn start_key(&self) -> u32 {
self.start_key
}
fn end_key(&self) -> u32 {
self.end_key
}
}
impl MockItem {
fn new(start_key: u32, end_key: u32) -> Self {
MockItem {
start_key,
end_key,
val: format!("{}-{}", start_key, end_key),
}
}
fn new_str(start_key: u32, end_key: u32, val: &str) -> Self {
MockItem {
start_key,
end_key,
val: format!("{}-{}: {}", start_key, end_key, val),
}
}
}
impl fmt::Display for MockItem {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.val)
}
}
#[rustfmt::skip]
fn assert_search(
tree: &IntervalTree<MockItem>,
key: u32,
expected: &[&str],
) -> Option<Arc<MockItem>> {
if let Some(v) = tree.search(key) {
let vstr = v.to_string();
assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
assert!(
expected.contains(&vstr.as_str()),
"search with {} returned {}, expected one of: {:?}",
key, v, expected,
);
Some(v)
} else {
assert!(
expected.is_empty(),
"search with {} returned None, expected one of {:?}",
key, expected
);
None
}
}
fn assert_contents(tree: &IntervalTree<MockItem>, expected: &[&str]) {
let mut contents: Vec<String> = tree.iter().map(|e| e.to_string()).collect();
contents.sort();
assert_eq!(contents, expected);
}
fn dump_tree(tree: &IntervalTree<MockItem>) {
for (point_key, point) in tree.points.iter() {
print!("{}:", point_key);
for e in point.elements.iter() {
print!(" {}", e);
}
println!();
}
}
#[test]
fn test_interval_tree_simple() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Simple, non-overlapping ranges.
tree.insert(Arc::new(MockItem::new(10, 11)));
tree.insert(Arc::new(MockItem::new(11, 12)));
tree.insert(Arc::new(MockItem::new(12, 13)));
tree.insert(Arc::new(MockItem::new(18, 19)));
tree.insert(Arc::new(MockItem::new(17, 18)));
tree.insert(Arc::new(MockItem::new(15, 16)));
assert_search(&tree, 9, &[]);
assert_search(&tree, 10, &["10-11"]);
assert_search(&tree, 11, &["11-12"]);
assert_search(&tree, 12, &["12-13"]);
assert_search(&tree, 13, &["12-13"]);
assert_search(&tree, 14, &["12-13"]);
assert_search(&tree, 15, &["15-16"]);
assert_search(&tree, 16, &["15-16"]);
assert_search(&tree, 17, &["17-18"]);
assert_search(&tree, 18, &["18-19"]);
assert_search(&tree, 19, &["18-19"]);
assert_search(&tree, 20, &["18-19"]);
// remove a few entries and search around them again
tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry
tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle
tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry
assert_search(&tree, 9, &[]);
assert_search(&tree, 10, &[]);
assert_search(&tree, 11, &["11-12"]);
assert_search(&tree, 12, &["11-12"]);
assert_search(&tree, 14, &["11-12"]);
assert_search(&tree, 15, &["15-16"]);
assert_search(&tree, 17, &["17-18"]);
assert_search(&tree, 18, &["17-18"]);
}
#[test]
fn test_interval_tree_overlap() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Overlapping items
tree.insert(Arc::new(MockItem::new(22, 24)));
tree.insert(Arc::new(MockItem::new(23, 25)));
let x24_26 = Arc::new(MockItem::new(24, 26));
tree.insert(Arc::clone(&x24_26));
let x26_28 = Arc::new(MockItem::new(26, 28));
tree.insert(Arc::clone(&x26_28));
tree.insert(Arc::new(MockItem::new(25, 27)));
assert_search(&tree, 22, &["22-24"]);
assert_search(&tree, 23, &["22-24", "23-25"]);
assert_search(&tree, 24, &["23-25", "24-26"]);
assert_search(&tree, 25, &["24-26", "25-27"]);
assert_search(&tree, 26, &["25-27", "26-28"]);
assert_search(&tree, 27, &["26-28"]);
assert_search(&tree, 28, &["26-28"]);
assert_search(&tree, 29, &["26-28"]);
tree.remove(&x24_26);
tree.remove(&x26_28);
assert_search(&tree, 23, &["22-24", "23-25"]);
assert_search(&tree, 24, &["23-25"]);
assert_search(&tree, 25, &["25-27"]);
assert_search(&tree, 26, &["25-27"]);
assert_search(&tree, 27, &["25-27"]);
assert_search(&tree, 28, &["25-27"]);
assert_search(&tree, 29, &["25-27"]);
}
#[test]
fn test_interval_tree_nested() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Items containing other items
tree.insert(Arc::new(MockItem::new(31, 39)));
tree.insert(Arc::new(MockItem::new(32, 34)));
tree.insert(Arc::new(MockItem::new(33, 35)));
tree.insert(Arc::new(MockItem::new(30, 40)));
assert_search(&tree, 30, &["30-40"]);
assert_search(&tree, 31, &["30-40", "31-39"]);
assert_search(&tree, 32, &["30-40", "32-34", "31-39"]);
assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]);
assert_search(&tree, 34, &["30-40", "33-35", "31-39"]);
assert_search(&tree, 35, &["30-40", "31-39"]);
assert_search(&tree, 36, &["30-40", "31-39"]);
assert_search(&tree, 37, &["30-40", "31-39"]);
assert_search(&tree, 38, &["30-40", "31-39"]);
assert_search(&tree, 39, &["30-40"]);
assert_search(&tree, 40, &["30-40"]);
assert_search(&tree, 41, &["30-40"]);
}
#[test]
fn test_interval_tree_duplicates() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Duplicate keys
let item_a = Arc::new(MockItem::new_str(55, 56, "a"));
tree.insert(Arc::clone(&item_a));
let item_b = Arc::new(MockItem::new_str(55, 56, "b"));
tree.insert(Arc::clone(&item_b));
let item_c = Arc::new(MockItem::new_str(55, 56, "c"));
tree.insert(Arc::clone(&item_c));
let item_d = Arc::new(MockItem::new_str(54, 56, "d"));
tree.insert(Arc::clone(&item_d));
let item_e = Arc::new(MockItem::new_str(55, 57, "e"));
tree.insert(Arc::clone(&item_e));
dump_tree(&tree);
assert_search(
&tree,
55,
&["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"],
);
tree.remove(&item_b);
dump_tree(&tree);
assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]);
tree.remove(&item_d);
dump_tree(&tree);
assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]);
}
#[test]
#[should_panic]
fn test_interval_tree_insert_twice() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Inserting the same item twice is not cool
let item = Arc::new(MockItem::new(1, 2));
tree.insert(Arc::clone(&item));
tree.insert(Arc::clone(&item)); // fails assertion
}
}


@@ -9,13 +9,15 @@
//! new image and delta layers and corresponding files are written to disk.
//!
use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
use crate::layered_repository::InMemoryLayer;
use crate::relish::*;
use anyhow::Result;
use lazy_static::lazy_static;
use std::cmp::Ordering;
use std::collections::{BTreeMap, BinaryHeap, HashMap};
use std::collections::{BinaryHeap, HashMap};
use std::sync::atomic::{self, AtomicU64};
use std::sync::Arc;
use zenith_metrics::{register_int_gauge, IntGauge};
use zenith_utils::lsn::Lsn;
@@ -29,6 +31,17 @@ lazy_static! {
.expect("failed to define a metric");
}
static NEXT_LAYER_ID: AtomicU64 = AtomicU64::new(0);
#[derive(PartialEq, Eq, Hash, Clone, Copy)]
pub struct LayerId(u64);
impl LayerId {
fn next() -> LayerId {
Self(NEXT_LAYER_ID.fetch_add(1, atomic::Ordering::Relaxed))
}
}
///
/// LayerMap tracks what layers exist on a timeline.
///
@@ -42,6 +55,8 @@ pub struct LayerMap {
/// contains the oldest WAL record.
open_layers: BinaryHeap<OpenLayerEntry>,
open_layers_by_id: HashMap<LayerId, Arc<InMemoryLayer>>,
/// Generation number, used to distinguish newly inserted entries in the
/// binary heap from older entries during checkpoint.
current_generation: u64,
@@ -70,23 +85,43 @@ impl LayerMap {
segentry.open.as_ref().map(Arc::clone)
}
#[allow(dead_code)]
pub fn get_open_by_id(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
self.open_layers_by_id.get(layer_id).cloned()
}
///
/// Insert an open in-memory layer
///
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
segentry.insert_open(Arc::clone(&layer));
segentry.update_open(Arc::clone(&layer));
let oldest_pending_lsn = layer.get_oldest_pending_lsn();
// After a crash and restart, 'oldest_pending_lsn' of the oldest in-memory
// layer becomes the WAL streaming starting point, so it better not point
// in the middle of a WAL record.
assert!(oldest_pending_lsn.is_aligned());
let id = LayerId::next();
// Also add it to the binary heap
let open_layer_entry = OpenLayerEntry {
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
layer,
layer: Arc::clone(&layer),
generation: self.current_generation,
id,
};
self.open_layers.push(open_layer_entry);
let old_layer = self.open_layers_by_id.insert(id, layer);
assert!(old_layer.is_none());
NUM_INMEMORY_LAYERS.inc();
id
}
/// Remove the oldest in-memory layer
@@ -97,11 +132,16 @@ impl LayerMap {
// Also remove it from the SegEntry of this segment
let mut segentry = self.segs.get_mut(&segtag).unwrap();
assert!(Arc::ptr_eq(
segentry.open.as_ref().unwrap(),
&oldest_entry.layer
));
segentry.open = None;
if Arc::ptr_eq(segentry.open.as_ref().unwrap(), &oldest_entry.layer) {
segentry.open = None;
} else {
// We could have already updated segentry.open for a
// dropped (non-writeable) layer. This is fine.
assert!(!oldest_entry.layer.is_writeable());
assert!(oldest_entry.layer.is_dropped());
}
self.open_layers_by_id.remove(&oldest_entry.id).unwrap();
NUM_INMEMORY_LAYERS.dec();
}
@@ -121,12 +161,11 @@ impl LayerMap {
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer: &dyn Layer) {
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
let tag = layer.get_seg_tag();
let start_lsn = layer.get_start_lsn();
if let Some(segentry) = self.segs.get_mut(&tag) {
segentry.historic.remove(&start_lsn);
segentry.historic.remove(&layer);
}
NUM_ONDISK_LAYERS.dec();
}
@@ -144,7 +183,7 @@ impl LayerMap {
if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
&& (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
{
if let Some(exists) = segentry.exists_at_lsn(lsn) {
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
rels.insert(seg.rel, exists);
}
}
@@ -152,7 +191,7 @@ impl LayerMap {
}
_ => {
if tag == None {
if let Some(exists) = segentry.exists_at_lsn(lsn) {
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
rels.insert(seg.rel, exists);
}
}
@@ -174,6 +213,20 @@ impl LayerMap {
}
}
/// Is there any layer for given segment that is alive at the lsn?
///
/// This is a public wrapper for the SegEntry function,
/// used for garbage collection, to determine if some alive layer
/// exists at the lsn. If so, we shouldn't delete a newer dropped layer
/// to avoid incorrectly making it visible.
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
Ok(if let Some(segentry) = self.segs.get(&seg) {
segentry.exists_at_lsn(lsn)?.unwrap_or(false)
} else {
false
})
}
/// Return the oldest in-memory layer, along with its generation number.
pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
self.open_layers
@@ -191,7 +244,7 @@ impl LayerMap {
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
HistoricLayerIter {
segiter: self.segs.iter(),
seg_iter: self.segs.iter(),
iter: None,
}
}
@@ -205,7 +258,7 @@ impl LayerMap {
open.dump()?;
}
for (_, layer) in segentry.historic.iter() {
for layer in segentry.historic.iter() {
layer.dump()?;
}
}
@@ -214,34 +267,40 @@ impl LayerMap {
}
}
impl IntervalItem for dyn Layer {
type Key = Lsn;
fn start_key(&self) -> Lsn {
self.get_start_lsn()
}
fn end_key(&self) -> Lsn {
self.get_end_lsn()
}
}
///
/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers
/// associated with the segment.
///
/// The last layer that is open for writes is always an InMemoryLayer,
/// and is kept in a separate field, because there can be only one for
/// each segment. The older layers, stored on disk, are kept in a
/// BTreeMap keyed by the layer's start LSN.
/// each segment. The older layers, stored on disk, are kept in an
/// IntervalTree.
#[derive(Default)]
struct SegEntry {
pub open: Option<Arc<InMemoryLayer>>,
pub historic: BTreeMap<Lsn, Arc<dyn Layer>>,
open: Option<Arc<InMemoryLayer>>,
historic: IntervalTree<dyn Layer>,
}
impl SegEntry {
/// Does the segment exist at given LSN?
/// Return None if object is not found in this SegEntry.
fn exists_at_lsn(&self, lsn: Lsn) -> Option<bool> {
if let Some(layer) = &self.open {
if layer.get_start_lsn() <= lsn && lsn <= layer.get_end_lsn() {
let exists = layer.get_seg_exists(lsn).ok()?;
return Some(exists);
}
} else if let Some((_, layer)) = self.historic.range(..=lsn).next_back() {
let exists = layer.get_seg_exists(lsn).ok()?;
return Some(exists);
fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
if let Some(layer) = self.get(lsn) {
Ok(Some(layer.get_seg_exists(lsn)?))
} else {
Ok(None)
}
None
}
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
@@ -252,40 +311,30 @@ impl SegEntry {
}
}
if let Some((_start_lsn, layer)) = self.historic.range(..=lsn).next_back() {
Some(Arc::clone(layer))
} else {
None
}
self.historic.search(lsn)
}
pub fn newer_image_layer_exists(&self, lsn: Lsn) -> bool {
// We only check on-disk layers, because
// in-memory layers are not durable
for (_newer_lsn, layer) in self.historic.range(lsn..) {
// Ignore incremental layers.
if layer.is_incremental() {
continue;
}
if layer.get_end_lsn() > lsn {
return true;
} else {
continue;
}
}
false
self.historic
.iter_newer(lsn)
.any(|layer| !layer.is_incremental())
}
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
assert!(self.open.is_none());
// Set a new open layer for a SegEntry.
// It's OK to replace the previous open layer,
// but only if it is not writeable anymore.
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) {
if let Some(prev_open) = &self.open {
assert!(!prev_open.is_writeable());
}
self.open = Some(layer);
}
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
let start_lsn = layer.get_start_lsn();
self.historic.insert(start_lsn, layer);
self.historic.insert(layer);
}
}
@@ -299,6 +348,7 @@ struct OpenLayerEntry {
pub oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
pub generation: u64,
pub layer: Arc<InMemoryLayer>,
id: LayerId,
}
impl Ord for OpenLayerEntry {
fn cmp(&self, other: &Self) -> Ordering {
@@ -324,8 +374,8 @@ impl Eq for OpenLayerEntry {}
/// Iterator returned by LayerMap::iter_historic_layers()
pub struct HistoricLayerIter<'a> {
segiter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
iter: Option<std::collections::btree_map::Iter<'a, Lsn, Arc<dyn Layer>>>,
seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
iter: Option<IntervalIter<'a, dyn Layer>>,
}
impl<'a> Iterator for HistoricLayerIter<'a> {
@@ -335,11 +385,11 @@ impl<'a> Iterator for HistoricLayerIter<'a> {
loop {
if let Some(x) = &mut self.iter {
if let Some(x) = x.next() {
return Some(Arc::clone(&*x.1));
return Some(Arc::clone(&x));
}
}
if let Some(seg) = self.segiter.next() {
self.iter = Some(seg.1.historic.iter());
if let Some((_tag, segentry)) = self.seg_iter.next() {
self.iter = Some(segentry.historic.iter());
continue;
} else {
return None;
@@ -394,14 +444,14 @@ mod tests {
let mut layers = LayerMap::default();
let gen1 = layers.increment_generation();
layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(100), Lsn(100)));
layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(100), Lsn(200)));
layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(100), Lsn(120)));
layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(100), Lsn(110)));
layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110)));
let gen2 = layers.increment_generation();
layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(100), Lsn(110)));
layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(100), Lsn(100)));
layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100)));
// A helper function (closure) to pop the next oldest open entry from the layer map,
// and assert that it is what we'd expect
@@ -412,12 +462,12 @@ mod tests {
layers.pop_oldest_open();
};
assert_pop_layer(0, gen1); // 100
assert_pop_layer(5, gen2); // 100
assert_pop_layer(3, gen1); // 110
assert_pop_layer(4, gen2); // 110
assert_pop_layer(2, gen1); // 120
assert_pop_layer(1, gen1); // 200
assert_pop_layer(0, gen1); // 0x100
assert_pop_layer(5, gen2); // 0x100
assert_pop_layer(3, gen1); // 0x110
assert_pop_layer(4, gen2); // 0x110
assert_pop_layer(2, gen1); // 0x120
assert_pop_layer(1, gen1); // 0x200
Ok(())
}


@@ -0,0 +1,202 @@
//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
//! has metadata that needs to be stored persistently.
//!
//! Later, the file is used in [`crate::relish_storage::storage_sync`] as part of
//! external storage import and export operations.
//!
//! This module contains all structs and helper methods related to timeline metadata.
use std::{convert::TryInto, path::PathBuf};
use anyhow::ensure;
use zenith_utils::{
bin_ser::BeSer,
lsn::Lsn,
zid::{ZTenantId, ZTimelineId},
};
use crate::{
layered_repository::{METADATA_CHECKSUM_SIZE, METADATA_MAX_DATA_SIZE, METADATA_MAX_SAFE_SIZE},
PageServerConf,
};
/// The name of the metadata file pageserver creates per timeline.
pub const METADATA_FILE_NAME: &str = "metadata";
/// Metadata stored on disk for each timeline
///
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct TimelineMetadata {
disk_consistent_lsn: Lsn,
// This is only set if we know it. We track it in memory when the page
// server is running, but we only track the value corresponding to
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
// lot. We only store it in the metadata file when we flush *all* the
// in-memory data so that 'last_record_lsn' is the same as
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
// soon as we reprocess at least one record, we will have a valid
// 'prev_record_lsn' value in memory again. This is only really needed when
// doing a clean shutdown, so that there is no more WAL beyond
// 'disk_consistent_lsn'
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<ZTimelineId>,
ancestor_lsn: Lsn,
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> PathBuf {
conf.timeline_path(&timelineid, &tenantid)
.join(METADATA_FILE_NAME)
}
impl TimelineMetadata {
pub fn new(
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<ZTimelineId>,
ancestor_lsn: Lsn,
) -> Self {
Self {
disk_consistent_lsn,
prev_record_lsn,
ancestor_timeline,
ancestor_lsn,
}
}
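// On-disk layout, as implied by the checks in from_bytes/to_bytes below (a
// descriptive note added here, not spelled out in the original source): the
// metadata file is a fixed METADATA_MAX_SAFE_SIZE bytes, consisting of the
// serialized fields zero-padded to METADATA_MAX_DATA_SIZE bytes, followed by a
// METADATA_CHECKSUM_SIZE-byte little-endian crc32c checksum of that data.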
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
ensure!(
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
"metadata bytes size is wrong"
);
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
let calculated_checksum = crc32c::crc32c(data);
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
ensure!(
calculated_checksum == expected_checksum,
"metadata checksum mismatch"
);
let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
assert!(data.disk_consistent_lsn.is_aligned());
Ok(data)
}
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
Ok(metadata_bytes)
}
/// [`Lsn`] that corresponds to the corresponding timeline directory
/// contents, stored locally in the pageserver workdir.
pub fn disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn
}
pub fn prev_record_lsn(&self) -> Option<Lsn> {
self.prev_record_lsn
}
pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
self.ancestor_timeline
}
pub fn ancestor_lsn(&self) -> Lsn {
self.ancestor_lsn
}
}
/// This module is for direct conversion of metadata to bytes and back.
/// Besides the conversion itself, a few verification steps have to be performed
/// for any given metadata, so all serde derives are hidden from the user to avoid
/// accidental verification-less metadata creation.
mod serialize {
use serde::{Deserialize, Serialize};
use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
use super::TimelineMetadata;
#[derive(Serialize)]
pub(super) struct SeTimelineMetadata<'a> {
disk_consistent_lsn: &'a Lsn,
prev_record_lsn: &'a Option<Lsn>,
ancestor_timeline: &'a Option<ZTimelineId>,
ancestor_lsn: &'a Lsn,
}
impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
fn from(other: &'a TimelineMetadata) -> Self {
Self {
disk_consistent_lsn: &other.disk_consistent_lsn,
prev_record_lsn: &other.prev_record_lsn,
ancestor_timeline: &other.ancestor_timeline,
ancestor_lsn: &other.ancestor_lsn,
}
}
}
#[derive(Deserialize)]
pub(super) struct DeTimelineMetadata {
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<ZTimelineId>,
ancestor_lsn: Lsn,
}
impl From<DeTimelineMetadata> for TimelineMetadata {
fn from(other: DeTimelineMetadata) -> Self {
Self {
disk_consistent_lsn: other.disk_consistent_lsn,
prev_record_lsn: other.prev_record_lsn,
ancestor_timeline: other.ancestor_timeline,
ancestor_lsn: other.ancestor_lsn,
}
}
}
}
#[cfg(test)]
mod tests {
use crate::repository::repo_harness::TIMELINE_ID;
use super::*;
#[test]
fn metadata_serializes_correctly() {
let original_metadata = TimelineMetadata {
disk_consistent_lsn: Lsn(0x200),
prev_record_lsn: Some(Lsn(0x100)),
ancestor_timeline: Some(TIMELINE_ID),
ancestor_lsn: Lsn(0),
};
let metadata_bytes = original_metadata
.to_bytes()
.expect("Should serialize correct metadata to bytes");
let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
.expect("Should deserialize its own bytes");
assert_eq!(
deserialized_metadata, original_metadata,
"Metadata that was serialized to bytes and deserialized back should not change"
);
}
}


@@ -0,0 +1,150 @@
use std::{collections::HashMap, ops::RangeBounds, slice};
use zenith_utils::{lsn::Lsn, vec_map::VecMap};
use super::storage_layer::PageVersion;
const EMPTY_SLICE: &[(Lsn, PageVersion)] = &[];
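/// Page versions accumulated in an open in-memory layer: a map from block
/// number to a VecMap of (Lsn, PageVersion) pairs kept in ascending LSN order,
/// so each block's history can be sliced cheaply by LSN range.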
#[derive(Debug, Default)]
pub struct PageVersions(HashMap<u32, VecMap<Lsn, PageVersion>>);
impl PageVersions {
pub fn append_or_update_last(
&mut self,
blknum: u32,
lsn: Lsn,
page_version: PageVersion,
) -> Option<PageVersion> {
let map = self.0.entry(blknum).or_insert_with(VecMap::default);
map.append_or_update_last(lsn, page_version).unwrap()
}
/// Get all [`PageVersion`]s in a block
pub fn get_block_slice(&self, blknum: u32) -> &[(Lsn, PageVersion)] {
self.0
.get(&blknum)
.map(VecMap::as_slice)
.unwrap_or(EMPTY_SLICE)
}
/// Get a range of [`PageVersions`] in a block
pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(
&self,
blknum: u32,
range: R,
) -> &[(Lsn, PageVersion)] {
self.0
.get(&blknum)
.map(|vec_map| vec_map.slice_range(range))
.unwrap_or(EMPTY_SLICE)
}
/// Iterate through [`PageVersion`]s in (block, lsn) order.
/// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
let mut ordered_blocks: Vec<u32> = self.0.keys().cloned().collect();
ordered_blocks.sort_unstable();
let slice = ordered_blocks
.first()
.map(|&blknum| self.get_block_slice(blknum))
.unwrap_or(EMPTY_SLICE);
OrderedPageVersionIter {
page_versions: self,
ordered_blocks,
cur_block_idx: 0,
cutoff_lsn,
cur_slice_iter: slice.iter(),
}
}
}
pub struct OrderedPageVersionIter<'a> {
page_versions: &'a PageVersions,
ordered_blocks: Vec<u32>,
cur_block_idx: usize,
cutoff_lsn: Option<Lsn>,
cur_slice_iter: slice::Iter<'a, (Lsn, PageVersion)>,
}
impl OrderedPageVersionIter<'_> {
fn is_lsn_before_cutoff(&self, lsn: &Lsn) -> bool {
if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() {
lsn < cutoff_lsn
} else {
true
}
}
}
impl<'a> Iterator for OrderedPageVersionIter<'a> {
type Item = (u32, Lsn, &'a PageVersion);
fn next(&mut self) -> Option<Self::Item> {
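// Walk the current block's LSN-ordered slice; when it is exhausted, or the next
// entry reaches the cutoff (which ends this block early, since the slice is
// sorted by LSN), advance to the next block in `ordered_blocks`.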
loop {
if let Some((lsn, page_version)) = self.cur_slice_iter.next() {
if self.is_lsn_before_cutoff(lsn) {
let blknum = self.ordered_blocks[self.cur_block_idx];
return Some((blknum, *lsn, page_version));
}
}
let next_block_idx = self.cur_block_idx + 1;
let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?;
self.cur_block_idx = next_block_idx;
self.cur_slice_iter = self.page_versions.get_block_slice(blknum).iter();
}
}
}
#[cfg(test)]
mod tests {
use super::*;
const EMPTY_PAGE_VERSION: PageVersion = PageVersion {
page_image: None,
record: None,
};
#[test]
fn test_ordered_iter() {
let mut page_versions = PageVersions::default();
const BLOCKS: u32 = 1000;
const LSNS: u64 = 50;
for blknum in 0..BLOCKS {
for lsn in 0..LSNS {
let old = page_versions.append_or_update_last(blknum, Lsn(lsn), EMPTY_PAGE_VERSION);
assert!(old.is_none());
}
}
let mut iter = page_versions.ordered_page_version_iter(None);
for blknum in 0..BLOCKS {
for lsn in 0..LSNS {
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
assert_eq!(actual_blknum, blknum);
assert_eq!(Lsn(lsn), actual_lsn);
}
}
assert!(iter.next().is_none());
assert!(iter.next().is_none()); // should be robust against excessive next() calls
const CUTOFF_LSN: Lsn = Lsn(30);
let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN));
for blknum in 0..BLOCKS {
for lsn in 0..CUTOFF_LSN.0 {
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
assert_eq!(actual_blknum, blknum);
assert_eq!(Lsn(lsn), actual_lsn);
}
}
assert!(iter.next().is_none());
assert!(iter.next().is_none()); // should be robust against excessive next() calls
}
}


@@ -10,7 +10,6 @@ use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::path::PathBuf;
use std::sync::Arc;
use zenith_utils::lsn::Lsn;
@@ -79,7 +78,7 @@ pub struct PageVersion {
/// 'records' contains the records to apply over the base image.
///
pub struct PageReconstructData {
pub records: Vec<WALRecord>,
pub records: Vec<(Lsn, WALRecord)>,
pub page_img: Option<Bytes>,
}
@@ -87,9 +86,9 @@ pub struct PageReconstructData {
pub enum PageReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should collect
/// more data from the returned predecessor layer at the returned LSN.
Continue(Lsn, Arc<dyn Layer>),
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue(Lsn),
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
@@ -111,24 +110,19 @@ pub trait Layer: Send + Sync {
/// Identify the relish segment
fn get_seg_tag(&self) -> SegmentTag;
/// Inclusive start bound of the LSN range that this layer hold
/// Inclusive start bound of the LSN range that this layer holds
fn get_start_lsn(&self) -> Lsn;
/// 'end_lsn' meaning depends on the layer kind:
/// - in-memory layer is either unbounded (end_lsn = MAX_LSN) or dropped (end_lsn = drop_lsn)
/// - image layer represents snapshot at one LSN, so end_lsn = lsn
/// - delta layer has end_lsn
/// Exclusive end bound of the LSN range that this layer holds.
///
/// TODO Is end_lsn always exclusive for all layer kinds?
/// - For an open in-memory layer, this is MAX_LSN.
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
fn get_end_lsn(&self) -> Lsn;
/// Is the segment represented by this layer dropped by PostgreSQL?
fn is_dropped(&self) -> bool;
/// Gets the physical location of the layer on disk.
/// Some layers, such as in-memory, might not have the location.
fn path(&self) -> Option<PathBuf>;
/// Filename used to store this layer on disk. (Even in-memory layers
/// implement this, to print a handy unique identifier for the layer for
/// log messages, even though they're never stored on disk.)
@@ -146,8 +140,8 @@ pub trait Layer: Send + Sync {
///
/// See PageReconstructResult for possible return values. The collected data
/// is appended to reconstruct_data; the caller should pass an empty struct
/// on first call. If this returns PageReconstructResult::Continue, call
/// again on the returned predecessor layer with the same 'reconstruct_data'
/// on first call. If this returns PageReconstructResult::Continue, look up
/// the predecessor layer and call again with the same 'reconstruct_data'
/// to collect more data.
fn get_page_reconstruct_data(
&self,


@@ -1,3 +1,4 @@
use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
@@ -13,21 +14,23 @@ pub mod http;
pub mod layered_repository;
pub mod page_service;
pub mod relish;
mod relish_storage;
pub mod relish_storage;
pub mod repository;
pub mod restore_local_repo;
pub mod tenant_mgr;
pub mod tenant_threads;
pub mod waldecoder;
pub mod walreceiver;
pub mod walredo;
pub mod defaults {
use const_format::formatcp;
use std::time::Duration;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = "127.0.0.1:64000"; // can't format! const yet...
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = "127.0.0.1:9898";
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
// would be more appropriate. But a low value forces the code to be exercised more,
@@ -39,6 +42,7 @@ pub mod defaults {
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
}
lazy_static! {
@@ -89,7 +93,7 @@ impl PageServerConf {
//
fn tenants_path(&self) -> PathBuf {
self.workdir.join("tenants")
self.workdir.join(TENANTS_SEGMENT_NAME)
}
fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
@@ -113,7 +117,7 @@ impl PageServerConf {
}
fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
self.tenant_path(tenantid).join("timelines")
self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
}
fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
@@ -124,10 +128,6 @@ impl PageServerConf {
self.timeline_path(timelineid, tenantid).join("ancestor")
}
fn wal_dir_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
self.timeline_path(timelineid, tenantid).join("wal")
}
//
// Postgres distribution paths
//
@@ -153,8 +153,8 @@ impl PageServerConf {
checkpoint_period: Duration::from_secs(10),
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
listen_pg_addr: "127.0.0.1:5430".to_string(),
listen_http_addr: "127.0.0.1:9898".to_string(),
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
superuser: "zenith_admin".to_string(),
workdir: repo_dir,
pg_distrib_dir: "".into(),
@@ -165,20 +165,48 @@ impl PageServerConf {
}
}
/// Config for the Repository checkpointer
#[derive(Debug, Clone, Copy)]
pub enum CheckpointConfig {
// Flush in-memory data that is older than this
Distance(u64),
// Flush all in-memory data
Forced,
}
/// External relish storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone)]
pub enum RelishStorageConfig {
/// Root folder to place all stored relish data into.
pub struct RelishStorageConfig {
/// Limits the number of concurrent sync operations between pageserver and relish storage.
pub max_concurrent_sync: usize,
/// The storage connection configuration.
pub storage: RelishStorageKind,
}
/// A kind of a relish storage to connect to, with its connection configuration.
#[derive(Debug, Clone)]
pub enum RelishStorageKind {
/// Storage based on local file system.
/// Specify a root folder to place all stored relish data into.
LocalFs(PathBuf),
/// AWS S3 based storage, storing all relishes into the root
/// of the S3 bucket from the config.
AwsS3(S3Config),
}
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
#[derive(Clone)]
pub struct S3Config {
/// Name of the bucket to connect to.
pub bucket_name: String,
/// The region where the bucket is located at.
pub bucket_region: String,
/// "Login" to use when connecting to bucket.
/// Can be empty for cases like AWS k8s IAM
/// where we can allow certain pods to connect
/// to the bucket directly without any credentials.
pub access_key_id: Option<String>,
/// "Password" to use when connecting to bucket.
pub secret_access_key: Option<String>,
}
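// Illustrative sketch (not part of this diff): how the pieces above fit
// together for a local-filesystem relish storage config. The function name and
// the concrete values are hypothetical.
#[allow(dead_code)]
fn example_local_fs_relish_storage() -> RelishStorageConfig {
    RelishStorageConfig {
        // Sync at most 10 layer files with the external storage at a time.
        max_concurrent_sync: 10,
        // Keep the backed-up relish data under a local directory.
        storage: RelishStorageKind::LocalFs(PathBuf::from("/tmp/relish-storage")),
    }
}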


@@ -13,7 +13,6 @@
use anyhow::{anyhow, bail, ensure, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use regex::Regex;
use std::net::TcpListener;
use std::str;
@@ -21,10 +20,12 @@ use std::str::FromStr;
use std::sync::Arc;
use std::thread;
use std::{io, net::TcpStream};
use tracing::*;
use zenith_metrics::{register_histogram_vec, HistogramVec};
use zenith_utils::auth::{self, JwtAuth};
use zenith_utils::auth::{Claims, Scope};
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::is_socket_read_timed_out;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::postgres_backend::{self, AuthType};
use zenith_utils::pq_proto::{
@@ -187,17 +188,32 @@ pub fn thread_main(
listener: TcpListener,
auth_type: AuthType,
) -> anyhow::Result<()> {
loop {
let mut join_handles = Vec::new();
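// Accept connections until shutdown is requested, handling each connection on
// its own named thread. The join handles are collected so that, once the accept
// loop exits, we can wait for all in-flight page-service connections to finish.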
while !tenant_mgr::shutdown_requested() {
let (socket, peer_addr) = listener.accept()?;
debug!("accepted connection from {}", peer_addr);
socket.set_nodelay(true).unwrap();
let local_auth = auth.clone();
thread::spawn(move || {
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
error!("error: {}", err);
}
});
let handle = thread::Builder::new()
.name("serving Page Service thread".into())
.spawn(move || {
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
error!(%err, "page server thread exited with error");
}
})
.unwrap();
join_handles.push(handle);
}
debug!("page_service loop terminated. wait for connections to cancel");
for handle in join_handles.into_iter() {
handle.join().unwrap();
}
Ok(())
}
fn page_service_conn_main(
@@ -216,7 +232,7 @@ fn page_service_conn_main(
}
let mut conn_handler = PageServerHandler::new(conf, auth);
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
pgbackend.run(&mut conn_handler)
}
@@ -260,48 +276,66 @@ impl PageServerHandler {
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> anyhow::Result<()> {
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
// Check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
/* switch client to COPYBOTH */
pgb.write_message(&BeMessage::CopyBothResponse)?;
while let Some(message) = pgb.read_message()? {
trace!("query({:?}): {:?}", timelineid, message);
while !tenant_mgr::shutdown_requested() {
match pgb.read_message() {
Ok(message) => {
if let Some(message) = message {
trace!("query: {:?}", message);
let copy_data_bytes = match message {
FeMessage::CopyData(bytes) => bytes,
_ => continue,
};
let copy_data_bytes = match message {
FeMessage::CopyData(bytes) => bytes,
_ => continue,
};
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
let response = match zenith_fe_msg {
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_exists"])
.observe_closure_duration(|| {
self.handle_get_rel_exists_request(&*timeline, &req)
}),
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_size"])
.observe_closure_duration(|| self.handle_get_nblocks_request(&*timeline, &req)),
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
.with_label_values(&["get_page_at_lsn"])
.observe_closure_duration(|| {
self.handle_get_page_at_lsn_request(&*timeline, &req)
}),
};
let response = match zenith_fe_msg {
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_exists"])
.observe_closure_duration(|| {
self.handle_get_rel_exists_request(&*timeline, &req)
}),
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_size"])
.observe_closure_duration(|| {
self.handle_get_nblocks_request(&*timeline, &req)
}),
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
.with_label_values(&["get_page_at_lsn"])
.observe_closure_duration(|| {
self.handle_get_page_at_lsn_request(&*timeline, &req)
}),
};
let response = response.unwrap_or_else(|e| {
error!("error reading relation or page version: {}", e);
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
});
let response = response.unwrap_or_else(|e| {
// print the all details to the log with {:#}, but for the client the
// error message is enough
error!("error reading relation or page version: {:#}", e);
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
});
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
} else {
break;
}
}
Err(e) => {
if !is_socket_read_timed_out(&e) {
return Err(e);
}
}
}
}
Ok(())
}
@@ -361,6 +395,8 @@ impl PageServerHandler {
timeline: &dyn Timeline,
req: &PagestreamExistsRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
@@ -376,6 +412,7 @@ impl PageServerHandler {
timeline: &dyn Timeline,
req: &PagestreamNblocksRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
@@ -395,6 +432,8 @@ impl PageServerHandler {
timeline: &dyn Timeline,
req: &PagestreamGetPageRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
.entered();
let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
@@ -412,17 +451,20 @@ impl PageServerHandler {
lsn: Option<Lsn>,
tenantid: ZTenantId,
) -> anyhow::Result<()> {
let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
let _enter = span.enter();
// check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
/* switch client to COPYOUT */
// switch client to COPYOUT
pgb.write_message(&BeMessage::CopyOutResponse)?;
info!("sent CopyOut");
/* Send a tarball of the latest layer on the timeline */
{
let mut writer = CopyDataSink { pgb };
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
span.record("lsn", &basebackup.lsn.to_string().as_str());
basebackup.send_tarball()?;
}
pgb.write_message(&BeMessage::CopyDone)?;
@@ -527,11 +569,6 @@ impl postgres_backend::Handler for PageServerHandler {
None
};
info!(
"got basebackup command. tenantid=\"{}\" timelineid=\"{}\" lsn=\"{:#?}\"",
tenantid, timelineid, lsn
);
// Check that the timeline exists
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -549,6 +586,9 @@ impl postgres_backend::Handler for PageServerHandler {
self.check_permission(Some(tenantid))?;
let _enter =
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
// Check that the timeline exists
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
@@ -571,6 +611,9 @@ impl postgres_backend::Handler for PageServerHandler {
self.check_permission(Some(tenantid))?;
let _enter =
info_span!("branch_create", name = %branchname, tenant = %tenantid).entered();
let branch =
branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
let branch = serde_json::to_vec(&branch)?;
@@ -587,14 +630,16 @@ impl postgres_backend::Handler for PageServerHandler {
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let branches = crate::branches::get_branches(self.conf, &tenantid)?;
// since these handlers for tenant/branch commands are deprecated (in favor of http based ones)
// just use false in place of include non incremental logical size
let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
let branches_buf = serde_json::to_vec(&branches)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("tenant_list") {
let tenants = crate::branches::get_tenants(self.conf)?;
let tenants = crate::tenant_mgr::list_tenants()?;
let tenants_buf = serde_json::to_vec(&tenants)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
@@ -654,12 +699,14 @@ impl postgres_backend::Handler for PageServerHandler {
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"elapsed"),
@@ -679,6 +726,12 @@ impl postgres_backend::Handler for PageServerHandler {
.as_bytes(),
),
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
@@ -695,6 +748,12 @@ impl postgres_backend::Handler for PageServerHandler {
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(result.elapsed.as_millis().to_string().as_bytes()),

View File

@@ -1,54 +1,323 @@
//! Abstractions for the page server to store its relish layer data in the external storage.
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
//! This particular module serves as a public API border between pageserver and the internal storage machinery.
//! No other modules from this tree are supposed to be used directly by the external code.
//!
//! Main purpose of this module subtree is to provide a set of abstractions to manage the storage state
//! in a way, optimal for page server.
//! There are a few components the storage machinery consists of:
//! * [`RelishStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
//! * [`local_fs`] allows using the local file system as an external storage
//! * [`rust_s3`] uses AWS S3 bucket entirely as an external storage
//!
//! The abstractions hide multiple custom external storage API implementations,
//! such as AWS S3, local filesystem, etc., located in the submodules.
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
//!
//! * public API to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_upload`]
//!
//! Here's a schematic overview of all interactions relish storage and the rest of the pageserver perform:
//!
//! +------------------------+ +--------->-------+
//! | | - - - (init async loop) - - - -> | |
//! | | | |
//! | | -------------------------------> | async |
//! | pageserver | (schedule frozen layer upload) | upload/download |
//! | | | loop |
//! | | <------------------------------- | |
//! | | (register downloaded layers) | |
//! +------------------------+ +---------<-------+
//! |
//! |
//! CRUD layer file operations |
//! (upload/download/delete/list, etc.) |
//! V
//! +------------------------+
//! | |
//! | [`RelishStorage`] impl |
//! | |
//! | pageserver assumes it |
//! | owns exclusive write |
//! | access to this storage |
//! +------------------------+
//!
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialized, if configured so.
//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
//! If the storage sync loop was successfully started before, pageserver schedules the new image uploads after every checkpoint.
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
//!
//! The storage logic considers an `image` to be a set of local files that fully represents a certain timeline at a given moment (identified by its `disk_consistent_lsn`).
//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
//! by the storage upload, if enabled.
//! When a certain image gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same image state.
//! No files are deleted from either the local or the remote storage; only the files missing locally/remotely get downloaded/uploaded, and the local metadata file is overwritten
//! when a newer timeline is downloaded.
//!
//! Meanwhile, the loop inits the storage connection and checks the remote files stored.
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
//! Based on the remote image data, the storage sync logic queues image downloads, while accepting any potential upload tasks from pageserver and managing the tasks by their priority.
//! On the image download, a [`crate::tenant_mgr::register_relish_download`] function is called to register the new image in pageserver, initializing all related threads and internal state.
//!
//! When the pageserver terminates, the upload loop finishes a current image sync task (if any) and exits.
//!
//! NOTES:
//! * pageserver assumes it has exclusive write access to the relish storage. Multiple pageservers can be separated within the same storage
//! (e.g. by using different directories in the local filesystem external storage), but that is entirely up to the storage implementation and is not covered by the trait API.
//!
//! * the uploads do not happen right after pageserver startup; they are registered when
//! 1. pageserver does the checkpoint, which happens further in the future after the server start
//! 2. pageserver loads the timeline from disk for the first time
//!
//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with higher priority could be waiting already
//!
//! * all synchronization tasks (including the public API to register uploads and downloads, and the sync queue management) happen on an image scale: a big set of relish files,
//! enough to represent (and recover, if needed) a certain timeline state. On the contrary, all internal storage CRUD calls are made per relish file from those images.
//! This way, the synchronization is able to download an image partially, if some state was synced before, but exposes only correctly synced images.
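For orientation, a minimal usage sketch under assumptions: the wrapper function below is invented for illustration, and `schedule_timeline_upload`'s exact signature is not shown in this diff, so it is only referenced in a comment.
fn init_relish_storage(
    conf: &'static PageServerConf,
) -> anyhow::Result<Option<std::thread::JoinHandle<anyhow::Result<()>>>> {
    // Ok(None) simply means no relish storage was configured, so remote sync stays off.
    let sync_loop_handle = run_storage_sync_thread(conf)?;
    // After each checkpoint, the layered repository is expected to call the re-exported
    // `schedule_timeline_upload` (signature not shown here) to queue the frozen layers.
    Ok(sync_loop_handle)
}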
mod local_fs;
mod rust_s3;
/// A queue and the background machinery behind it to upload
/// local page server layer files to external storage.
pub mod storage_uploader;
mod storage_sync;
use std::path::Path;
use std::{
path::{Path, PathBuf},
thread,
};
use anyhow::Context;
use anyhow::{anyhow, ensure, Context};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
pub use self::storage_sync::schedule_timeline_upload;
use self::{local_fs::LocalFs, rust_s3::S3};
use crate::{
layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
PageServerConf, RelishStorageKind,
};
/// Based on the config, initiates the remote storage connection and starts a separate thread
/// that ensures that pageserver and the remote storage are in sync with each other.
/// If no external storage configuration is given, no thread or storage initialization is done.
pub fn run_storage_sync_thread(
config: &'static PageServerConf,
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
match &config.relish_storage_config {
Some(relish_storage_config) => {
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
let handle = match &relish_storage_config.storage {
RelishStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
config,
LocalFs::new(root.clone(), &config.workdir)?,
max_concurrent_sync,
),
RelishStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
config,
S3::new(s3_config, &config.workdir)?,
max_concurrent_sync,
),
};
handle.map(Some)
}
None => Ok(None),
}
}
/// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations with storage files.
#[async_trait::async_trait]
pub trait RelishStorage: Send + Sync {
trait RelishStorage: Send + Sync {
/// A way to uniquely reference relish in the remote storage.
type RelishStoragePath;
fn derive_destination(
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<Self::RelishStoragePath>;
/// Attempts to derive the storage path out of the local path, if the latter is correct.
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath>;
/// Gets the layered storage information about the given entry.
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo>;
/// Lists all items the storage has right now.
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
async fn download_relish(
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
async fn download_relish<W: 'static + std::io::Write + Send>(
&self,
from: &Self::RelishStoragePath,
to: &Path,
) -> anyhow::Result<()>;
// rust_s3's `get_object_stream` method requires a `std::io::BufWriter` for some reason, not the async counterpart;
// that forces us to consume and return the writer to satisfy the blocking-operation async wrapper requirements
to: std::io::BufWriter<W>,
) -> anyhow::Result<std::io::BufWriter<W>>;
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()>;
/// Streams the local file contents into the remote storage entry.
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
&self,
from: &mut tokio::io::BufReader<R>,
to: &Self::RelishStoragePath,
) -> anyhow::Result<()>;
}
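As a hedged sketch of how the trait is meant to be driven, the helper below is invented for illustration and is not part of this change:
async fn upload_local_layer<S: RelishStorage>(
    storage: &S,
    local_layer_path: &Path,
) -> anyhow::Result<()> {
    // Map the workdir-local path to the storage-specific key/path first.
    let remote_path = storage.storage_path(local_layer_path)?;
    // The trait takes a buffered async reader so implementations can stream the contents.
    let file = tokio::fs::File::open(local_layer_path).await?;
    let mut reader = tokio::io::BufReader::new(file);
    storage.upload_relish(&mut reader, &remote_path).await
}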
fn strip_workspace_prefix<'a>(
page_server_workdir: &'a Path,
relish_local_path: &'a Path,
) -> anyhow::Result<&'a Path> {
relish_local_path
.strip_prefix(page_server_workdir)
.with_context(|| {
/// Information about a certain remote storage entry.
#[derive(Debug, PartialEq, Eq)]
struct RemoteRelishInfo {
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
/// Path in the pageserver workdir where the file should go to.
download_destination: PathBuf,
is_metadata: bool,
}
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
if prefix == path {
anyhow::bail!(
"Prefix and the path are equal, cannot strip: '{}'",
prefix.display()
)
} else {
path.strip_prefix(prefix).with_context(|| {
format!(
"Unexpected: relish local path '{}' is not relevant to server workdir",
relish_local_path.display(),
"Path '{}' is not prefixed with '{}'",
path.display(),
prefix.display(),
)
})
}
}
fn parse_ids_from_path<'a, R: std::fmt::Display>(
path_segments: impl Iterator<Item = &'a str>,
path_log_representation: &R,
) -> anyhow::Result<(ZTenantId, ZTimelineId)> {
let mut segments = path_segments.skip_while(|&segment| segment != TENANTS_SEGMENT_NAME);
let tenants_segment = segments.next().ok_or_else(|| {
anyhow!(
"Found no '{}' segment in the storage path '{}'",
TENANTS_SEGMENT_NAME,
path_log_representation
)
})?;
ensure!(
tenants_segment == TENANTS_SEGMENT_NAME,
"Failed to extract '{}' segment from storage path '{}'",
TENANTS_SEGMENT_NAME,
path_log_representation
);
let tenant_id = segments
.next()
.ok_or_else(|| {
anyhow!(
"Found no tenant id in the storage path '{}'",
path_log_representation
)
})?
.parse::<ZTenantId>()
.with_context(|| {
format!(
"Failed to parse tenant id from storage path '{}'",
path_log_representation
)
})?;
let timelines_segment = segments.next().ok_or_else(|| {
anyhow!(
"Found no '{}' segment in the storage path '{}'",
TIMELINES_SEGMENT_NAME,
path_log_representation
)
})?;
ensure!(
timelines_segment == TIMELINES_SEGMENT_NAME,
"Failed to extract '{}' segment from storage path '{}'",
TIMELINES_SEGMENT_NAME,
path_log_representation
);
let timeline_id = segments
.next()
.ok_or_else(|| {
anyhow!(
"Found no timeline id in the storage path '{}'",
path_log_representation
)
})?
.parse::<ZTimelineId>()
.with_context(|| {
format!(
"Failed to parse timeline id from storage path '{}'",
path_log_representation
)
})?;
Ok((tenant_id, timeline_id))
}
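As an illustration of the expected layout (a sketch, not part of this change): `parse_ids_from_path` only accepts paths shaped like `…/<TENANTS_SEGMENT_NAME>/<tenant id>/<TIMELINES_SEGMENT_NAME>/<timeline id>/…`; anything else is rejected, as the hypothetical test below shows.
#[cfg(test)]
mod parse_ids_sketch {
    use super::*;

    #[test]
    fn rejects_paths_without_tenants_segment() {
        // A path that never mentions the tenants segment cannot yield any ids.
        let segments = ["unrelated", "path", "segments"];
        assert!(
            parse_ids_from_path(segments.iter().copied(), &"unrelated/path/segments").is_err(),
            "a path without the '{}' segment should not parse",
            TENANTS_SEGMENT_NAME
        );
    }
}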
/// A set of common test utils to share in unit tests inside the module tree.
#[cfg(test)]
mod test_utils {
use std::path::{Path, PathBuf};
use anyhow::ensure;
use crate::{
layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};
/// Gives a timeline path with pageserver workdir stripped off.
pub fn relative_timeline_path(harness: &RepoHarness) -> anyhow::Result<PathBuf> {
let timeline_path = harness.timeline_path(&TIMELINE_ID);
Ok(timeline_path
.strip_prefix(&harness.conf.workdir)?
.to_path_buf())
}
/// Creates a path with custom tenant id in one of its segments.
/// Useful for emulating paths with wrong ids.
pub fn custom_tenant_id_path(
path_with_tenant_id: &Path,
new_tenant_id: &str,
) -> anyhow::Result<PathBuf> {
let mut new_path = PathBuf::new();
let mut is_tenant_id = false;
let mut tenant_id_replaced = false;
for segment in path_with_tenant_id {
match segment.to_str() {
Some(TENANTS_SEGMENT_NAME) => is_tenant_id = true,
Some(_tenant_id_str) if is_tenant_id => {
is_tenant_id = false;
new_path.push(new_tenant_id);
tenant_id_replaced = true;
continue;
}
_ => {}
}
new_path.push(segment)
}
ensure!(tenant_id_replaced, "Found no tenant id segment to replace");
Ok(new_path)
}
/// Creates a path with custom timeline id in one of its segments.
/// Useful for emulating paths with wrong ids.
pub fn custom_timeline_id_path(
path_with_timeline_id: &Path,
new_timeline_id: &str,
) -> anyhow::Result<PathBuf> {
let mut new_path = PathBuf::new();
let mut is_timeline_id = false;
let mut timeline_id_replaced = false;
for segment in path_with_timeline_id {
match segment.to_str() {
Some(TIMELINES_SEGMENT_NAME) => is_timeline_id = true,
Some(_timeline_id_str) if is_timeline_id => {
is_timeline_id = false;
new_path.push(new_timeline_id);
timeline_id_replaced = true;
continue;
}
_ => {}
}
new_path.push(segment)
}
ensure!(
timeline_id_replaced,
"Found no timeline id segment to replace"
);
Ok(new_path)
}
}

View File

@@ -0,0 +1,82 @@
# Non-implementation details
This document describes the current state of the backup system in pageserver, its existing limitations and concerns, why some things are done the way they are, and the future development plans.
A detailed description of how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../relish_storage.rs) and its submodules.
Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
## Approach
Backup functionality is a new component that appeared well after the core DB functionality was implemented.
Pageserver layer functionality is also quite volatile at the moment, there's a risk its local file management changes over time.
To avoid adding more churn there, the backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop.
This way, the backups are managed in the background without directly affecting other pageserver parts: the backup and restoration process may lag behind, but eventually catches up with reality. To track that, a set of prometheus metrics is exposed from the pageserver.
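As a rough illustration of that shape (a sketch with placeholder names, not the actual `storage_sync` internals, and assuming a tokio runtime as elsewhere in the pageserver), the loop is driven by a dedicated thread running a single-threaded async runtime:
use std::thread;

fn spawn_sync_loop<F, Fut>(mut step: F) -> std::io::Result<thread::JoinHandle<()>>
where
    F: FnMut() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = ()>,
{
    // A single-threaded async runtime owned by one dedicated OS thread.
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()?;
    thread::Builder::new()
        .name("relish storage sync".to_string())
        .spawn(move || loop {
            // Each iteration processes queued upload/download tasks; failures are logged
            // and retried inside the step, so the rest of the pageserver is never blocked.
            runtime.block_on(step());
        })
}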
## What's done
Current implementation
* provides remote storage wrappers for AWS S3 and local FS
* uploads layers, frozen by pageserver checkpoint thread
* downloads and registers layers, found on the remote storage, but missing locally
No serious optimisation or performance testing has been done yet; the feature is disabled by default and gets polished over time.
The plan is to address all currently open questions and prepare the feature to be enabled by default in cloud environments.
### Peculiarities
As mentioned, the backup component is rather new and currently under development, so not everything is done properly from the start.
Here's the list of known compromises with comments:
* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage.
This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
AWS charges both per API call and for traffic, and layers are expected to be updated frequently, so this model is most probably inefficient.
Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
The storage sync API operates on images when backing up or restoring a backup, so we are free to repack the layer contents the way we want, which will most probably be done later.
* no proper file comparison
Currently, every layer contains an `Lsn` in its name, mapping the data it holds to a certain DB state.
Then images with the same ids and different `Lsn`'s are compared; files are considered equal if their local file paths are equal (for remote files, the "local file path" is their download destination).
No file contents assertion is done currently, but it should be.
AWS S3 returns file checksums during the `list` operation, so that could be used to ensure backup consistency, but that needs further research, since the current pageserver implementation would also need to deal with layer file checksums.
For now, due to this, we consider the local workdir files the source of truth, never removing them and adjusting the remote files instead if image files mismatch. A small sketch of this path-based comparison is given at the end of this document.
* no proper retry management
Currently, the storage sync simply retries the upload/download operation for the image files that failed.
No proper task eviction or backpressure is implemented currently: the tasks will stay in the queue forever, reattempting the downloads.
This will be fixed when more details of the file consistency model are agreed on.
* sad rust-s3 api
rust-s3 is not very pleasant to use:
1. it returns `anyhow::Result` and it's hard to distinguish, for instance, the "missing file" case from the "no connection" one
2. at least one function in its API that we need (`get_object_stream`) has the `async` keyword and yet blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091)
3. it's a prerelease library with unclear maintenance status
4. it is noisy on the debug level
But it's already used in the project, so for now it's reused to avoid bloating the dependency tree.
Based on previous evaluation, even `rusoto-s3` could be a better choice than this library, but that needs further benchmarking.
* gc and branches are ignored
So far, we don't consider non-main images and don't adjust the remote storage based on the GC thread loop results.
Only the checkpointer loop affects the remote storage.
* more layers should be downloaded on demand
Since we download and load remote layers into the pageserver, a need for those layers' ancestors may arise.
Most probably, a downloaded image's ancestors are not present locally either, but currently there's no logic for downloading such ancestors and their metadata,
so the pageserver is unable to respond properly to requests touching such ancestors.
To implement the downloading, more `tenant_mgr` refactoring is needed to properly handle web requests for layers and handle the state changes.
[Here](https://github.com/zenithdb/zenith/pull/689#issuecomment-931216193) are the details about initial state management updates needed.
* no IT tests
Automated S3 testing is currently lacking, since there is no convenient way to enable backups during the tests.
After it's fixed, benchmark runs should also be carried out to find bottlenecks.
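For reference, here is a minimal sketch (illustrative only, not the actual `storage_sync` code) of the path-based comparison described in the "no proper file comparison" item above:
use std::collections::HashSet;
use std::path::PathBuf;

/// Files present remotely but missing locally are download candidates, and vice versa;
/// contents are never compared, only the (workdir-relative) paths.
fn diff_image_files(
    local_files: &HashSet<PathBuf>,
    remote_download_destinations: &HashSet<PathBuf>,
) -> (Vec<PathBuf>, Vec<PathBuf>) {
    let to_download: Vec<PathBuf> = remote_download_destinations
        .difference(local_files)
        .cloned()
        .collect();
    let to_upload: Vec<PathBuf> = local_files
        .difference(remote_download_destinations)
        .cloned()
        .collect();
    (to_download, to_upload)
}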

View File

@@ -1,38 +1,45 @@
//! Local filesystem relish storage.
//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
//!
//! Page server already stores layer data on the server, when freezing it.
//! This storage serves a way to
//!
//! * test things locally simply
//! * allow comparing both binary sets
//! * help validating the relish storage API
//! This storage is used in pageserver tests, but can also be used in cases when a certain persistent
//! volume is mounted to the local FS.
use std::{
ffi::OsStr,
future::Future,
io::Write,
path::{Path, PathBuf},
pin::Pin,
};
use anyhow::{bail, Context};
use tokio::{fs, io};
use tracing::*;
use super::{strip_workspace_prefix, RelishStorage};
use crate::layered_repository::metadata::METADATA_FILE_NAME;
use super::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo};
pub struct LocalFs {
pageserver_workdir: &'static Path,
root: PathBuf,
}
impl LocalFs {
/// Attempts to create a local FS relish storage; also creates the directory provided, if it does not exist.
pub fn new(root: PathBuf) -> anyhow::Result<Self> {
/// Attempts to create local FS relish storage, along with the storage root directory.
pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
if !root.exists() {
std::fs::create_dir_all(&root).with_context(|| {
format!(
"Failed to create all directories in the given root path {}",
"Failed to create all directories in the given root path '{}'",
root.display(),
)
})?;
}
Ok(Self { root })
Ok(Self {
pageserver_workdir,
root,
})
}
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
@@ -53,27 +60,63 @@ impl LocalFs {
impl RelishStorage for LocalFs {
type RelishStoragePath = PathBuf;
fn derive_destination(
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<Self::RelishStoragePath> {
Ok(strip_workspace_prefix(page_server_workdir, relish_local_path)?.to_path_buf())
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
Ok(self.root.join(
strip_path_prefix(self.pageserver_workdir, local_path)
.context("local path does not belong to this storage")?,
))
}
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
let is_metadata =
storage_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME);
let relative_path = strip_path_prefix(&self.root, storage_path)
.context("local path does not belong to this storage")?;
let download_destination = self.pageserver_workdir.join(relative_path);
let (tenant_id, timeline_id) = parse_ids_from_path(
relative_path.iter().filter_map(|segment| segment.to_str()),
&relative_path.display(),
)?;
Ok(RemoteRelishInfo {
tenant_id,
timeline_id,
download_destination,
is_metadata,
})
}
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
Ok(get_all_files(&self.root).await?.into_iter().collect())
}
async fn download_relish(
async fn download_relish<W: 'static + std::io::Write + Send>(
&self,
from: &Self::RelishStoragePath,
to: &Path,
) -> anyhow::Result<()> {
mut to: std::io::BufWriter<W>,
) -> anyhow::Result<std::io::BufWriter<W>> {
let file_path = self.resolve_in_storage(from)?;
if file_path.exists() && file_path.is_file() {
create_target_directory(to).await?;
tokio::fs::copy(file_path, to).await?;
Ok(())
let updated_buffer = tokio::task::spawn_blocking(move || {
let mut source = std::io::BufReader::new(
std::fs::OpenOptions::new()
.read(true)
.open(&file_path)
.with_context(|| {
format!(
"Failed to open source file '{}' to use in the download",
file_path.display()
)
})?,
);
std::io::copy(&mut source, &mut to)
.context("Failed to download the relish file")?;
to.flush().context("Failed to flush the download buffer")?;
Ok::<_, anyhow::Error>(to)
})
.await
.context("Failed to spawn a blocking task")??;
Ok(updated_buffer)
} else {
bail!(
"File '{}' either does not exist or is not a file",
@@ -85,7 +128,7 @@ impl RelishStorage for LocalFs {
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
let file_path = self.resolve_in_storage(path)?;
if file_path.exists() && file_path.is_file() {
Ok(tokio::fs::remove_file(file_path).await?)
Ok(fs::remove_file(file_path).await?)
} else {
bail!(
"File '{}' either does not exist or is not a file",
@@ -94,18 +137,30 @@ impl RelishStorage for LocalFs {
}
}
async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> {
async fn upload_relish<R: io::AsyncRead + std::marker::Unpin + Send>(
&self,
from: &mut io::BufReader<R>,
to: &Self::RelishStoragePath,
) -> anyhow::Result<()> {
let target_file_path = self.resolve_in_storage(to)?;
create_target_directory(&target_file_path).await?;
let mut destination = io::BufWriter::new(
fs::OpenOptions::new()
.write(true)
.create(true)
.open(&target_file_path)
.await
.with_context(|| {
format!(
"Failed to open target fs destination at '{}'",
target_file_path.display()
)
})?,
);
tokio::fs::copy(&from, &target_file_path)
io::copy_buf(from, &mut destination)
.await
.with_context(|| {
format!(
"Failed to upload relish '{}' to local storage",
from.display(),
)
})?;
.context("Failed to upload relish to local storage")?;
Ok(())
}
}
@@ -121,12 +176,12 @@ where
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = tokio::fs::read_dir(directory_path).await?;
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path = dir_entry.path();
if file_type.is_symlink() {
log::debug!("{:?} us a symlink, skipping", entry_path)
debug!("{:?} us a symlink, skipping", entry_path)
} else if file_type.is_dir() {
paths.extend(get_all_files(entry_path).await?.into_iter())
} else {
@@ -152,7 +207,370 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
),
};
if !target_dir.exists() {
tokio::fs::create_dir_all(target_dir).await?;
fs::create_dir_all(target_dir).await?;
}
Ok(())
}
#[cfg(test)]
mod pure_tests {
use crate::{
layered_repository::metadata::METADATA_FILE_NAME,
relish_storage::test_utils::{
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};
use super::*;
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("storage_path_positive")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root.clone(),
};
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("relish_name");
let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
assert_eq!(
expected_path,
storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
"Relish paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
);
Ok(())
}
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
match storage.storage_path(mismatching_path) {
Ok(wrong_path) => panic!(
"Expected path '{}' to error, but got storage path: {:?}",
mismatching_path.display(),
wrong_path,
),
Err(e) => format!("{:?}", e),
}
}
let repo_harness = RepoHarness::create("storage_path_negatives")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root,
};
let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
assert!(error_string.contains("does not belong to this storage"));
assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
let mismatching_path_str = "/something/else";
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
assert!(
error_message.contains(mismatching_path_str),
"Error should mention wrong path"
);
assert!(
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
"Error should mention server workdir"
);
assert!(error_message.contains("does not belong to this storage"));
Ok(())
}
#[test]
fn info_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("info_positive")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root.clone(),
};
let name = "not a metadata";
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: local_path.clone(),
is_metadata: false,
},
storage
.info(&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?))
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote delta relish"
);
let local_metadata_path = repo_harness
.timeline_path(&TIMELINE_ID)
.join(METADATA_FILE_NAME);
let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: local_metadata_path,
is_metadata: true,
},
storage
.info(&remote_metadata_path)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote metadata file"
);
Ok(())
}
#[test]
fn info_negatives() -> anyhow::Result<()> {
#[track_caller]
#[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.info` parameter requirements
fn storage_info_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
match storage.info(storage_path) {
Ok(wrong_info) => panic!(
"Expected storage path input {:?} to cause an error, but got relish info: {:?}",
storage_path, wrong_info,
),
Err(e) => format!("{:?}", e),
}
}
let repo_harness = RepoHarness::create("info_negatives")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root.clone(),
};
let totally_wrong_path = "wrong_wrong_wrong";
let error_message = storage_info_error(&storage, &PathBuf::from(totally_wrong_path));
assert!(error_message.contains(totally_wrong_path));
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
let relative_relish_path =
custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?
.join("wrong_tenant_id_name");
let wrong_tenant_id_path = storage_root.join(&relative_relish_path);
let error_message = storage_info_error(&storage, &wrong_tenant_id_path);
assert!(
error_message.contains(relative_relish_path.to_str().unwrap()),
"Error message '{}' does not contain the expected substring",
error_message
);
let relative_relish_path =
custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?
.join("wrong_timeline_id_name");
let wrong_timeline_id_path = storage_root.join(&relative_relish_path);
let error_message = storage_info_error(&storage, &wrong_timeline_id_path);
assert!(
error_message.contains(relative_relish_path.to_str().unwrap()),
"Error message '{}' does not contain the expected substring",
error_message
);
Ok(())
}
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
let storage_root = PathBuf::from("somewhere").join("else");
let dummy_storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root,
};
let storage_path = dummy_storage.storage_path(&original_path)?;
let download_destination = dummy_storage.info(&storage_path)?.download_destination;
assert_eq!(
original_path, download_destination,
"'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
);
Ok(())
}
}
#[cfg(test)]
mod fs_tests {
use crate::{
relish_storage::test_utils::relative_timeline_path, repository::repo_harness::RepoHarness,
};
use super::*;
use tempfile::tempdir;
#[tokio::test]
async fn upload_relish() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("upload_relish")?;
let storage = create_storage()?;
let mut source = create_file_for_upload(
&storage.pageserver_workdir.join("whatever"),
"whatever_contents",
)
.await?;
let target_path = PathBuf::from("/").join("somewhere").join("else");
match storage.upload_relish(&mut source, &target_path).await {
Ok(()) => panic!("Should not allow storing files with wrong target path"),
Err(e) => {
let message = format!("{:?}", e);
assert!(message.contains(&target_path.display().to_string()));
assert!(message.contains("does not belong to the current storage"));
}
}
assert!(storage.list_relishes().await?.is_empty());
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
assert_eq!(
storage.list_relishes().await?,
vec![target_path_1.clone()],
"Should list a single file after first upload"
);
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
assert_eq!(
list_relishes_sorted(&storage).await?,
vec![target_path_1.clone(), target_path_2.clone()],
"Should list a two different files after second upload"
);
// match storage.upload_relish(&mut source, &target_path_1).await {
// Ok(()) => panic!("Should not allow reuploading storage files"),
// Err(e) => {
// let message = format!("{:?}", e);
// assert!(message.contains(&target_path_1.display().to_string()));
// assert!(message.contains("File exists"));
// }
// }
assert_eq!(
list_relishes_sorted(&storage).await?,
vec![target_path_1, target_path_2],
"Should list a two different files after all upload attempts"
);
Ok(())
}
fn create_storage() -> anyhow::Result<LocalFs> {
let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
Ok(storage)
}
#[tokio::test]
async fn download_relish() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_relish")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
let contents_bytes = storage
.download_relish(&upload_target, std::io::BufWriter::new(Vec::new()))
.await?
.into_inner()?;
let contents = String::from_utf8(contents_bytes)?;
assert_eq!(
dummy_contents(upload_name),
contents,
"We should upload and download the same contents"
);
let non_existing_path = PathBuf::from("somewhere").join("else");
match storage
.download_relish(&non_existing_path, std::io::BufWriter::new(Vec::new()))
.await
{
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&non_existing_path.display().to_string()));
}
}
Ok(())
}
#[tokio::test]
async fn delete_relish() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("delete_relish")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
storage.delete_relish(&upload_target).await?;
assert!(storage.list_relishes().await?.is_empty());
match storage.delete_relish(&upload_target).await {
Ok(()) => panic!("Should not allow deleting non-existing storage files"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&upload_target.display().to_string()));
}
}
Ok(())
}
async fn upload_dummy_file(
harness: &RepoHarness,
storage: &LocalFs,
name: &str,
) -> anyhow::Result<PathBuf> {
let storage_path = storage
.root
.join(relative_timeline_path(harness)?)
.join(name);
storage
.upload_relish(
&mut create_file_for_upload(
&storage.pageserver_workdir.join(name),
&dummy_contents(name),
)
.await?,
&storage_path,
)
.await?;
Ok(storage_path)
}
async fn create_file_for_upload(
path: &Path,
contents: &str,
) -> anyhow::Result<io::BufReader<fs::File>> {
std::fs::create_dir_all(path.parent().unwrap())?;
let mut file_for_writing = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.open(path)?;
write!(file_for_writing, "{}", contents)?;
drop(file_for_writing);
Ok(io::BufReader::new(
fs::OpenOptions::new().read(true).open(&path).await?,
))
}
fn dummy_contents(name: &str) -> String {
format!("contents for {}", name)
}
async fn list_relishes_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
let mut relishes = storage.list_relishes().await?;
relishes.sort();
Ok(relishes)
}
}

View File

@@ -1,33 +1,45 @@
//! A wrapper around the AWS S3 client library `rust_s3` to be used as a relish storage.
//! AWS S3 relish storage wrapper around `rust_s3` library.
//! Currently does not allow multiple pageservers to use the same bucket concurrently: relishes are
//! placed in the root of the bucket.
use std::path::Path;
use std::{
io::Write,
path::{Path, PathBuf},
};
use anyhow::Context;
use s3::{bucket::Bucket, creds::Credentials, region::Region};
use crate::{relish_storage::strip_workspace_prefix, S3Config};
use super::RelishStorage;
use crate::{
layered_repository::metadata::METADATA_FILE_NAME,
relish_storage::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo},
S3Config,
};
const S3_FILE_SEPARATOR: char = '/';
#[derive(Debug)]
#[derive(Debug, Eq, PartialEq)]
pub struct S3ObjectKey(String);
impl S3ObjectKey {
fn key(&self) -> &str {
&self.0
}
fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf {
pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>())
}
}
/// AWS S3 relish storage.
pub struct RustS3 {
pub struct S3 {
pageserver_workdir: &'static Path,
bucket: Bucket,
}
impl RustS3 {
impl S3 {
/// Creates the relish storage, errors if incorrect AWS S3 configuration provided.
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
let region = aws_config
.bucket_region
.parse::<Region>()
@@ -47,19 +59,17 @@ impl RustS3 {
credentials,
)
.context("Failed to create the s3 bucket")?,
pageserver_workdir,
})
}
}
#[async_trait::async_trait]
impl RelishStorage for RustS3 {
impl RelishStorage for S3 {
type RelishStoragePath = S3ObjectKey;
fn derive_destination(
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<Self::RelishStoragePath> {
let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
let mut key = String::new();
for segment in relative_path {
key.push(S3_FILE_SEPARATOR);
@@ -68,6 +78,21 @@ impl RelishStorage for RustS3 {
Ok(S3ObjectKey(key))
}
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
let storage_path_key = &storage_path.0;
let is_metadata =
storage_path_key.ends_with(&format!("{}{}", S3_FILE_SEPARATOR, METADATA_FILE_NAME));
let download_destination = storage_path.download_destination(self.pageserver_workdir);
let (tenant_id, timeline_id) =
parse_ids_from_path(storage_path_key.split(S3_FILE_SEPARATOR), storage_path_key)?;
Ok(RemoteRelishInfo {
tenant_id,
timeline_id,
download_destination,
is_metadata,
})
}
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
let list_response = self
.bucket
@@ -82,18 +107,14 @@ impl RelishStorage for RustS3 {
.collect())
}
async fn download_relish(
async fn download_relish<W: 'static + std::io::Write + Send>(
&self,
from: &Self::RelishStoragePath,
to: &Path,
) -> anyhow::Result<()> {
let mut target_file = std::fs::OpenOptions::new()
.write(true)
.open(to)
.with_context(|| format!("Failed to open target s3 destination at {}", to.display()))?;
mut to: std::io::BufWriter<W>,
) -> anyhow::Result<std::io::BufWriter<W>> {
let code = self
.bucket
.get_object_stream(from.key(), &mut target_file)
.get_object_stream(from.key(), &mut to)
.await
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
if code != 200 {
@@ -102,7 +123,12 @@ impl RelishStorage for RustS3 {
code
))
} else {
Ok(())
tokio::task::spawn_blocking(move || {
to.flush().context("Failed to flush the download buffer")?;
Ok::<_, anyhow::Error>(to)
})
.await
.context("Failed to join the download buffer flush task")?
}
}
@@ -112,9 +138,9 @@ impl RelishStorage for RustS3 {
.delete_object(path.key())
.await
.with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
if code != 200 {
if code != 204 {
Err(anyhow::format_err!(
"Received non-200 exit code during deleting object with key '{}', code: {}",
"Received non-204 exit code during deleting object with key '{}', code: {}",
path.key(),
code
))
@@ -123,12 +149,14 @@ impl RelishStorage for RustS3 {
}
}
async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> {
let mut local_file = tokio::fs::OpenOptions::new().read(true).open(from).await?;
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
&self,
from: &mut tokio::io::BufReader<R>,
to: &Self::RelishStoragePath,
) -> anyhow::Result<()> {
let code = self
.bucket
.put_object_stream(&mut local_file, to.key())
.put_object_stream(from, to.key())
.await
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
if code != 200 {
@@ -142,3 +170,226 @@ impl RelishStorage for RustS3 {
}
}
}
#[cfg(test)]
mod tests {
use crate::{
relish_storage::test_utils::{
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};
use super::*;
#[test]
fn download_destination() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_destination")?;
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
let key = S3ObjectKey(format!(
"{}{}",
S3_FILE_SEPARATOR,
relative_path
.iter()
.map(|segment| segment.to_str().unwrap())
.collect::<Vec<_>>()
.join(&S3_FILE_SEPARATOR.to_string()),
));
assert_eq!(
local_path,
key.download_destination(&repo_harness.conf.workdir),
"Download destination should consist of s3 path joined with the pageserver workdir prefix"
);
Ok(())
}
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("storage_path_positive")?;
let segment_1 = "matching";
let segment_2 = "relish";
let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
let expected_key = S3ObjectKey(format!(
"{SEPARATOR}{}{SEPARATOR}{}",
segment_1,
segment_2,
SEPARATOR = S3_FILE_SEPARATOR,
));
let actual_key = dummy_storage(&repo_harness.conf.workdir)
.storage_path(local_path)
.expect("Matching path should map to S3 path normally");
assert_eq!(
expected_key,
actual_key,
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
);
Ok(())
}
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String {
match storage.storage_path(mismatching_path) {
Ok(wrong_key) => panic!(
"Expected path '{}' to error, but got S3 key: {:?}",
mismatching_path.display(),
wrong_key,
),
Err(e) => e.to_string(),
}
}
let repo_harness = RepoHarness::create("storage_path_negatives")?;
let storage = dummy_storage(&repo_harness.conf.workdir);
let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
assert!(
error_message.contains("Prefix and the path are equal"),
"Message '{}' does not contain the required string",
error_message
);
let mismatching_path = PathBuf::from("somewhere").join("else");
let error_message = storage_path_error(&storage, &mismatching_path);
assert!(
error_message.contains(mismatching_path.to_str().unwrap()),
"Error should mention wrong path"
);
assert!(
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
"Error should mention server workdir"
);
assert!(
error_message.contains("is not prefixed with"),
"Message '{}' does not contain a required string",
error_message
);
Ok(())
}
#[test]
fn info_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("info_positive")?;
let storage = dummy_storage(&repo_harness.conf.workdir);
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata"));
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
is_metadata: false,
},
storage
.info(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote delta relish"
);
let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME));
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
is_metadata: true,
},
storage
.info(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote metadata file"
);
Ok(())
}
#[test]
fn info_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_info_error(storage: &S3, s3_key: &S3ObjectKey) -> String {
match storage.info(s3_key) {
Ok(wrong_info) => panic!(
"Expected key {:?} to error, but got relish info: {:?}",
s3_key, wrong_info,
),
Err(e) => e.to_string(),
}
}
let repo_harness = RepoHarness::create("info_negatives")?;
let storage = dummy_storage(&repo_harness.conf.workdir);
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
let totally_wrong_path = "wrong_wrong_wrong";
let error_message =
storage_info_error(&storage, &S3ObjectKey(totally_wrong_path.to_string()));
assert!(error_message.contains(totally_wrong_path));
let wrong_tenant_id = create_s3_key(
&custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?.join("name"),
);
let error_message = storage_info_error(&storage, &wrong_tenant_id);
assert!(error_message.contains(&wrong_tenant_id.0));
let wrong_timeline_id = create_s3_key(
&custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?.join("name"),
);
let error_message = storage_info_error(&storage, &wrong_timeline_id);
assert!(error_message.contains(&wrong_timeline_id.0));
Ok(())
}
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
let key = dummy_storage.storage_path(&original_path)?;
let download_destination = dummy_storage.info(&key)?.download_destination;
assert_eq!(
original_path, download_destination,
"'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
);
Ok(())
}
fn dummy_storage(pageserver_workdir: &'static Path) -> S3 {
S3 {
pageserver_workdir,
bucket: Bucket::new(
"dummy-bucket",
"us-east-1".parse().unwrap(),
Credentials::anonymous().unwrap(),
)
.unwrap(),
}
}
fn create_s3_key(relative_relish_path: &Path) -> S3ObjectKey {
S3ObjectKey(
relative_relish_path
.iter()
.fold(String::new(), |mut path_string, segment| {
path_string.push(S3_FILE_SEPARATOR);
path_string.push_str(segment.to_str().unwrap());
path_string
}),
)
}
}

File diff suppressed because it is too large

View File

@@ -1,116 +0,0 @@
use std::{
collections::VecDeque,
path::{Path, PathBuf},
sync::{Arc, Mutex},
thread,
};
use zenith_utils::zid::ZTimelineId;
use crate::{relish_storage::RelishStorage, RelishStorageConfig};
use super::{local_fs::LocalFs, rust_s3::RustS3};
pub struct QueueBasedRelishUploader {
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
}
impl QueueBasedRelishUploader {
pub fn new(
config: &RelishStorageConfig,
page_server_workdir: &'static Path,
) -> anyhow::Result<Self> {
let upload_queue = Arc::new(Mutex::new(VecDeque::new()));
let _handle = match config {
RelishStorageConfig::LocalFs(root) => {
let relish_storage = LocalFs::new(root.clone())?;
create_upload_thread(
Arc::clone(&upload_queue),
relish_storage,
page_server_workdir,
)?
}
RelishStorageConfig::AwsS3(s3_config) => {
let relish_storage = RustS3::new(s3_config)?;
create_upload_thread(
Arc::clone(&upload_queue),
relish_storage,
page_server_workdir,
)?
}
};
Ok(Self { upload_queue })
}
pub fn schedule_upload(&self, timeline_id: ZTimelineId, relish_path: PathBuf) {
self.upload_queue
.lock()
.unwrap()
.push_back((timeline_id, relish_path))
}
}
fn create_upload_thread<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
relish_storage: S,
page_server_workdir: &'static Path,
) -> std::io::Result<thread::JoinHandle<()>> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
thread::Builder::new()
.name("Queue based relish uploader".to_string())
.spawn(move || loop {
runtime.block_on(async {
upload_loop_step(&upload_queue, &relish_storage, page_server_workdir).await;
})
})
}
async fn upload_loop_step<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
upload_queue: &Mutex<VecDeque<(ZTimelineId, PathBuf)>>,
relish_storage: &S,
page_server_workdir: &Path,
) {
let mut queue_accessor = upload_queue.lock().unwrap();
log::debug!("current upload queue length: {}", queue_accessor.len());
let next_upload = queue_accessor.pop_front();
drop(queue_accessor);
let (relish_timeline_id, relish_local_path) = match next_upload {
Some(data) => data,
None => {
// Don't spin and allow others to use the queue.
// In future, could be improved to be more clever about delays depending on relish upload stats
thread::sleep(std::time::Duration::from_secs(1));
return;
}
};
if let Err(e) = upload_relish(relish_storage, page_server_workdir, &relish_local_path).await {
log::error!(
"Failed to upload relish '{}' for timeline {}, reason: {}",
relish_local_path.display(),
relish_timeline_id,
e
);
upload_queue
.lock()
.unwrap()
.push_back((relish_timeline_id, relish_local_path))
} else {
log::debug!("Relish successfully uploaded");
}
}
async fn upload_relish<P, S: RelishStorage<RelishStoragePath = P>>(
relish_storage: &S,
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<()> {
let destination = S::derive_destination(page_server_workdir, relish_local_path)?;
relish_storage
.upload_relish(relish_local_path, &destination)
.await
}

View File

@@ -1,9 +1,10 @@
use crate::relish::*;
use crate::CheckpointConfig;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::ops::AddAssign;
use std::ops::{AddAssign, Deref};
use std::sync::Arc;
use std::time::Duration;
use zenith_utils::lsn::{Lsn, RecordLsn};
@@ -13,6 +14,8 @@ use zenith_utils::zid::ZTimelineId;
/// A repository corresponds to one .zenith directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
pub trait Repository: Send + Sync {
fn shutdown(&self) -> Result<()>;
/// Get Timeline handle for given zenith timeline ID.
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
@@ -22,25 +25,25 @@ pub trait Repository: Send + Sync {
/// Branch a timeline
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
/// perform one garbage collection iteration.
/// garbage collection is periodically performed by gc thread,
/// but it can be explicitly requested through page server api.
/// perform one garbage collection iteration, removing old data files from disk.
/// this function is periodically called by the gc thread.
/// also it can be explicitly requested through page server api 'do_gc' command.
///
/// 'timelineid' specifies the timeline to GC, or None for all.
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
/// `compact` parameter is used to force compaction of storage.
/// some storage implementations are based on an lsm tree and require periodic merge (compaction).
/// usually the storage implementation itself determines when compaction should be performed.
/// but for gc tests it may be useful to force compaction just after completion of a gc iteration
/// to make sure that all detected garbage is removed.
/// so right now `compact` is set to true when gc is explicitly requested through the page server api,
/// and is set to false in gc threads, which infinitely repeat gc iterations in a loop.
/// `checkpoint_before_gc` parameter is used to force a checkpoint of the storage before GC
/// to make tests more deterministic.
/// TODO Do we still need it, or can we call checkpoint explicitly in the tests where needed?
fn gc_iteration(
&self,
timelineid: Option<ZTimelineId>,
horizon: u64,
compact: bool,
checkpoint_before_gc: bool,
) -> Result<GcResult>;
/// perform one checkpoint iteration, flushing in-memory data on disk.
/// this function is periodically called by the checkpointer thread.
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
}
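As a small, hedged sketch (illustrative only, not taken from the actual code) of how a caller might combine the new `shutdown` method with the rest of the trait:
fn flush_and_shutdown(repo: &dyn Repository, cconf: CheckpointConfig) -> Result<()> {
    // Flush in-memory data to disk first, so nothing is lost once background threads stop.
    repo.checkpoint_iteration(cconf)?;
    repo.shutdown()
}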
///
@@ -52,6 +55,7 @@ pub struct GcResult {
pub ondisk_relfiles_needed_by_cutoff: u64,
pub ondisk_relfiles_needed_by_branches: u64,
pub ondisk_relfiles_not_updated: u64,
pub ondisk_relfiles_needed_as_tombstone: u64,
pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped
@@ -59,6 +63,7 @@ pub struct GcResult {
pub ondisk_nonrelfiles_needed_by_cutoff: u64,
pub ondisk_nonrelfiles_needed_by_branches: u64,
pub ondisk_nonrelfiles_not_updated: u64,
pub ondisk_nonrelfiles_needed_as_tombstone: u64,
pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped
@@ -71,6 +76,7 @@ impl AddAssign for GcResult {
self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff;
self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches;
self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated;
self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone;
self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped;
@@ -78,6 +84,7 @@ impl AddAssign for GcResult {
self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff;
self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches;
self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated;
self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone;
self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed;
self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped;
@@ -117,17 +124,54 @@ pub trait Timeline: Send + Sync {
/// Get a list of all existing non-relational objects
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
/// Get the LSN where this branch was created
fn get_ancestor_lsn(&self) -> Lsn;
//------------------------------------------------------------------------------
// Public PUT functions, to update the repository with new page versions.
//
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------
/// Atomically get both last and prev.
fn get_last_record_rlsn(&self) -> RecordLsn;
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
fn get_disk_consistent_lsn(&self) -> Lsn;
/// Mutate the timeline with a [`TimelineWriter`].
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
///
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
/// Retrieve the current logical size of the timeline
///
/// NOTE: counted incrementally, includes ancestors,
/// doesn't support TwoPhase relishes yet
fn get_current_logical_size(&self) -> usize;
/// Does the same as get_current_logical_size, but computed on demand.
/// Used in tests to ensure that the incremental and non-incremental variants match.
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
}
/// Various functions to mutate the timeline.
// TODO: Currently, Deref is used to allow easy access to read methods from this trait.
// This is probably considered bad practice in Rust and should be fixed eventually,
// but doing so would cause large code changes.
pub trait TimelineWriter: Deref<Target = dyn Timeline> {
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
fn put_wal_record(&self, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;
fn put_wal_record(&self, lsn: Lsn, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;
/// Like put_wal_record, but with ready-made image of the page.
fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>;
@@ -143,34 +187,10 @@ pub trait Timeline: Send + Sync {
/// Advance requires an aligned LSN as an argument and wakes up wait_lsn() callers.
/// The previous last-record LSN is stored alongside the latest and can be read.
fn advance_last_record_lsn(&self, lsn: Lsn);
/// Atomically get both last and prev.
fn get_last_record_rlsn(&self) -> RecordLsn;
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
///
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
fn checkpoint(&self) -> Result<()>;
/// Retrieve current logical size of the timeline
///
/// NOTE: counted incrementally, includes ancestors,
/// doesn't support TwoPhase relishes yet
fn get_current_logical_size(&self) -> usize;
/// Does the same as get_current_logical_size, but computed on demand.
/// Used in tests to ensure that the incremental and non-incremental variants match.
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
}
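// Illustrative sketch (names borrowed from the tests below): because TimelineWriter
// derefs to Timeline, a caller can mix writes and reads through the same handle.
//
//     let writer = tline.writer();
//     writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("blk 0 at 2"))?;
//     writer.advance_last_record_lsn(Lsn(0x20));
//     // Read methods are reachable through Deref<Target = dyn Timeline>:
//     assert_eq!(writer.get_last_record_lsn(), Lsn(0x20));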
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct WALRecord {
pub lsn: Lsn, // LSN at the *end* of the record
pub will_init: bool,
pub rec: Bytes,
// Remember the offset of main_data in rec,
@@ -181,42 +201,133 @@ pub struct WALRecord {
impl WALRecord {
pub fn pack(&self, buf: &mut BytesMut) {
buf.put_u64(self.lsn.0);
buf.put_u8(self.will_init as u8);
buf.put_u32(self.main_data_offset);
buf.put_u32(self.rec.len() as u32);
buf.put_slice(&self.rec[..]);
}
pub fn unpack(buf: &mut Bytes) -> WALRecord {
let lsn = Lsn::from(buf.get_u64());
let will_init = buf.get_u8() != 0;
let main_data_offset = buf.get_u32();
let mut dst = vec![0u8; buf.get_u32() as usize];
buf.copy_to_slice(&mut dst);
let rec_len = buf.get_u32() as usize;
let rec = buf.split_to(rec_len);
WALRecord {
lsn,
will_init,
rec: Bytes::from(dst),
rec,
main_data_offset,
}
}
}
#[cfg(test)]
pub mod repo_harness {
use std::{fs, path::PathBuf};
use crate::{
layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
walredo::{WalRedoError, WalRedoManager},
PageServerConf,
};
use super::*;
use hex_literal::hex;
use zenith_utils::zid::ZTenantId;
pub const TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
pub const NEW_TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
pub fn TEST_IMG(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(8192, 0);
buf.freeze()
}
pub struct RepoHarness {
pub conf: &'static PageServerConf,
pub tenant_id: ZTenantId,
}
impl RepoHarness {
pub fn create(test_name: &'static str) -> Result<Self> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be freed, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
Ok(Self { conf, tenant_id })
}
pub fn load(&self) -> Box<dyn Repository> {
let walredo_mgr = Arc::new(TestRedoManager);
Box::new(LayeredRepository::new(
self.conf,
walredo_mgr,
self.tenant_id,
false,
))
}
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
self.conf.timeline_path(timeline_id, &self.tenant_id)
}
}
// Mock WAL redo manager that doesn't do much
struct TestRedoManager;
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for {} blk {} to get to {}, with {} and {} records",
rel,
blknum,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{}", s);
Ok(TEST_IMG(&s))
}
}
}
///
/// Tests that should work the same with any Repository/Timeline implementation.
///
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
use crate::layered_repository::metadata::METADATA_FILE_NAME;
use super::repo_harness::*;
use super::*;
use crate::layered_repository::LayeredRepository;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
use std::fs;
use std::str::FromStr;
use zenith_utils::zid::ZTenantId;
use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
/// Arbitrary relation tag, for testing.
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
@@ -232,16 +343,6 @@ mod tests {
forknum: 0,
});
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
fn TEST_IMG(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(8192, 0);
buf.freeze()
}
fn assert_current_logical_size(timeline: &Arc<dyn Timeline>, lsn: Lsn) {
let incremental = timeline.get_current_logical_size();
let non_incremental = timeline
@@ -253,47 +354,23 @@ mod tests {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be freed, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenantid = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
let walredo_mgr = TestRedoManager {};
let repo = Box::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
));
Ok(repo)
}
#[test]
fn test_relsize() -> Result<()> {
let repo = get_test_repo("test_relsize")?;
let repo = RepoHarness::create("test_relsize")?.load();
// get_timeline() with non-existent timeline id should fail
//repo.get_timeline("11223344556677881122334455667788");
// Create timeline to work on
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
tline.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
tline.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
tline.advance_last_record_lsn(Lsn(0x50));
writer.advance_last_record_lsn(Lsn(0x50));
assert_current_logical_size(&tline, Lsn(0x50));
@@ -339,8 +416,8 @@ mod tests {
);
// Truncate last block
tline.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
tline.advance_last_record_lsn(Lsn(0x60));
writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
writer.advance_last_record_lsn(Lsn(0x60));
assert_current_logical_size(&tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -362,13 +439,13 @@ mod tests {
);
// Truncate to zero length
tline.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
tline.advance_last_record_lsn(Lsn(0x68));
writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
writer.advance_last_record_lsn(Lsn(0x68));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0);
// Extend from 0 to 2 blocks, leaving a gap
tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
tline.advance_last_record_lsn(Lsn(0x70));
writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
writer.advance_last_record_lsn(Lsn(0x70));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
assert_eq!(
@@ -395,21 +472,159 @@ mod tests {
Ok(())
}
// Test what happens if we dropped a relation
// and then created it again within the same layer.
#[test]
fn test_drop_extend() -> Result<()> {
let repo = RepoHarness::create("test_drop_extend")?.load();
// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.advance_last_record_lsn(Lsn(0x20));
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1);
// Drop relish
writer.drop_relish(TESTREL_A, Lsn(0x30))?;
writer.advance_last_record_lsn(Lsn(0x30));
// Check that rel is not visible anymore
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none());
// Extend it again
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
writer.advance_last_record_lsn(Lsn(0x40));
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x40))?.unwrap(), 1);
Ok(())
}
// Test what happens if we truncated a relation
// so that one of its segments was dropped
// and then extended it again within the same layer.
#[test]
fn test_truncate_extend() -> Result<()> {
let repo = RepoHarness::create("test_truncate_extend")?.load();
// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
//from storage_layer.rs
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
let relsize = RELISH_SEG_SIZE * 2;
// Create relation with relsize blocks
for blkno in 0..relsize {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
}
writer.advance_last_record_lsn(Lsn(0x20));
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none());
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(),
relsize
);
// Check relation content
for blkno in 0..relsize {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blkno, lsn)?,
TEST_IMG(&data)
);
}
// Truncate relation so that second segment was dropped
// - only leave one page
writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
writer.advance_last_record_lsn(Lsn(0x60));
// Check reported size and contents after truncation
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1);
for blkno in 0..1 {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?,
TEST_IMG(&data)
);
}
// should still see all blocks with older LSN
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(),
relsize
);
for blkno in 0..relsize {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?,
TEST_IMG(&data)
);
}
// Extend relation again.
// Add enough blocks to create second segment
for blkno in 0..relsize {
let lsn = Lsn(0x80);
let data = format!("foo blk {} at {}", blkno, lsn);
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
}
writer.advance_last_record_lsn(Lsn(0x80));
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(),
relsize
);
// Check relation content
for blkno in 0..relsize {
let lsn = Lsn(0x80);
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?,
TEST_IMG(&data)
);
}
Ok(())
}
/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
/// split into multiple 1 GB segments in Postgres.
#[test]
fn test_large_rel() -> Result<()> {
let repo = get_test_repo("test_large_rel")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;
let repo = RepoHarness::create("test_large_rel")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
let mut lsn = 0x10;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
lsn += 0x10;
tline.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
}
tline.advance_last_record_lsn(Lsn(lsn));
writer.advance_last_record_lsn(Lsn(lsn));
assert_current_logical_size(&tline, Lsn(lsn));
@@ -420,8 +635,8 @@ mod tests {
// Truncate one block
lsn += 0x10;
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
tline.advance_last_record_lsn(Lsn(lsn));
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
writer.advance_last_record_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
pg_constants::RELSEG_SIZE
@@ -430,8 +645,8 @@ mod tests {
// Truncate another block
lsn += 0x10;
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
tline.advance_last_record_lsn(Lsn(lsn));
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
writer.advance_last_record_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
pg_constants::RELSEG_SIZE - 1
@@ -443,8 +658,8 @@ mod tests {
let mut size: i32 = 3000;
while size >= 0 {
lsn += 0x10;
tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
tline.advance_last_record_lsn(Lsn(lsn));
writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
writer.advance_last_record_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
size as u32
@@ -461,23 +676,20 @@ mod tests {
/// Test list_rels() function, with branches and dropped relations
///
#[test]
// FIXME: The last assertion in this test is currently failing, see
// https://github.com/zenithdb/zenith/issues/502. Ignore the failure until that's fixed.
#[ignore]
fn test_list_rels_drop() -> Result<()> {
let repo = get_test_repo("test_list_rels_drop")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;
let repo = RepoHarness::create("test_list_rels_drop")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
const TESTDB: u32 = 111;
// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
// Create a relation on the timeline
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.advance_last_record_lsn(Lsn(0x30));
writer.advance_last_record_lsn(Lsn(0x30));
// Check that list_rels() lists it after LSN 2, but not before it
assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A));
@@ -485,17 +697,19 @@ mod tests {
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A));
// Create a branch, check that the relation is visible there
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
let newtline = repo.get_timeline(newtimelineid)?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
let new_writer = newtline.writer();
assert!(newtline
.list_rels(0, TESTDB, Lsn(0x30))?
.contains(&TESTREL_A));
// Drop it on the branch
newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
newtline.advance_last_record_lsn(Lsn(0x40));
new_writer.drop_relish(TESTREL_A, Lsn(0x40))?;
new_writer.advance_last_record_lsn(Lsn(0x40));
drop(new_writer);
// Check that it's no longer listed on the branch after the point where it was dropped
assert!(newtline
@@ -506,10 +720,9 @@ mod tests {
.contains(&TESTREL_A));
// Run checkpoint and garbage collection and check that it's still not visible
newtline.checkpoint()?;
repo.gc_iteration(Some(newtimelineid), 0, true)?;
newtline.checkpoint(CheckpointConfig::Forced)?;
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
// FIXME: this is currently failing
assert!(!newtline
.list_rels(0, TESTDB, Lsn(0x40))?
.contains(&TESTREL_A));
@@ -522,32 +735,32 @@ mod tests {
///
#[test]
fn test_branch() -> Result<()> {
let repo = get_test_repo("test_branch")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;
let repo = RepoHarness::create("test_branch")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
// Create a relation on the timeline
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
// Create another relation
tline.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
tline.advance_last_record_lsn(Lsn(0x40));
writer.advance_last_record_lsn(Lsn(0x40));
assert_current_logical_size(&tline, Lsn(0x40));
// Branch the history, modify relation differently on the new timeline
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
let newtline = repo.get_timeline(newtimelineid)?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
let new_writer = newtline.writer();
newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
newtline.advance_last_record_lsn(Lsn(0x40));
new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
new_writer.advance_last_record_lsn(Lsn(0x40));
// Check page contents on both branches
assert_eq!(
@@ -572,32 +785,84 @@ mod tests {
Ok(())
}
// Mock WAL redo manager that doesn't do much
struct TestRedoManager {}
#[test]
fn corrupt_metadata() -> Result<()> {
const TEST_NAME: &str = "corrupt_metadata";
let harness = RepoHarness::create(TEST_NAME)?;
let repo = harness.load();
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for {} blk {} to get to {}, with {} and {} records",
rel,
blknum,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{}", s);
Ok(TEST_IMG(&s))
}
repo.create_empty_timeline(TIMELINE_ID)?;
drop(repo);
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
assert!(metadata_path.is_file());
let mut metadata_bytes = std::fs::read(&metadata_path)?;
assert_eq!(metadata_bytes.len(), 512);
metadata_bytes[512 - 4 - 2] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;
let new_repo = harness.load();
let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
assert!(err.to_string().contains("checksum"));
Ok(())
}
#[test]
fn future_layerfiles() -> Result<()> {
const TEST_NAME: &str = "future_layerfiles";
let harness = RepoHarness::create(TEST_NAME)?;
let repo = harness.load();
repo.create_empty_timeline(TIMELINE_ID)?;
drop(repo);
let timeline_path = harness.timeline_path(&TIMELINE_ID);
let make_empty_file = |filename: &str| -> std::io::Result<()> {
let path = timeline_path.join(filename);
assert!(!path.exists());
std::fs::write(&path, &[])?;
Ok(())
};
let image_filename = format!("pg_control_0_{:016X}", 8000);
let delta_filename = format!("pg_control_0_{:016X}_{:016X}", 8000, 8008);
make_empty_file(&image_filename)?;
make_empty_file(&delta_filename)?;
let new_repo = harness.load();
new_repo.get_timeline(TIMELINE_ID).unwrap();
drop(new_repo);
let check_old = |filename: &str, num: u32| {
let path = timeline_path.join(filename);
assert!(!path.exists());
let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
assert!(backup_path.exists());
};
check_old(&image_filename, 0);
check_old(&delta_filename, 0);
make_empty_file(&image_filename)?;
make_empty_file(&delta_filename)?;
let new_repo = harness.load();
new_repo.get_timeline(TIMELINE_ID).unwrap();
drop(new_repo);
check_old(&image_filename, 0);
check_old(&delta_filename, 0);
check_old(&image_filename, 1);
check_old(&delta_filename, 1);
Ok(())
}
}

View File

@@ -2,17 +2,17 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! zenith Timeline.
//!
use log::*;
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
use std::cmp::min;
use std::fs;
use std::fs::File;
use std::io::Read;
use std::path::Path;
use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use anyhow::Result;
use anyhow::{anyhow, bail, Result};
use bytes::{Buf, Bytes};
use tracing::*;
use crate::relish::*;
use crate::repository::*;
@@ -34,9 +34,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
///
pub fn import_timeline_from_postgres_datadir(
path: &Path,
timeline: &dyn Timeline,
writer: &dyn TimelineWriter,
lsn: Lsn,
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
// Scan 'global'
for direntry in fs::read_dir(path.join("global"))? {
let direntry = direntry?;
@@ -44,10 +46,10 @@ pub fn import_timeline_from_postgres_datadir(
None => continue,
Some("pg_control") => {
import_control_file(timeline, lsn, &direntry.path())?;
pg_control = Some(import_control_file(writer, lsn, &direntry.path())?);
}
Some("pg_filenode.map") => import_nonrel_file(
timeline,
writer,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::GLOBALTABLESPACE_OID,
@@ -59,7 +61,7 @@ pub fn import_timeline_from_postgres_datadir(
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
timeline,
writer,
lsn,
pg_constants::GLOBALTABLESPACE_OID,
0,
@@ -86,7 +88,7 @@ pub fn import_timeline_from_postgres_datadir(
Some("PG_VERSION") => continue,
Some("pg_filenode.map") => import_nonrel_file(
timeline,
writer,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
@@ -98,7 +100,7 @@ pub fn import_timeline_from_postgres_datadir(
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
timeline,
writer,
lsn,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
@@ -108,24 +110,36 @@ pub fn import_timeline_from_postgres_datadir(
}
for entry in fs::read_dir(path.join("pg_xact"))? {
let entry = entry?;
import_slru_file(timeline, lsn, SlruKind::Clog, &entry.path())?;
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
let entry = entry?;
import_slru_file(timeline, lsn, SlruKind::MultiXactMembers, &entry.path())?;
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
let entry = entry?;
import_slru_file(timeline, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_twophase"))? {
let entry = entry?;
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
}
// TODO: Scan pg_tblspc
timeline.advance_last_record_lsn(lsn);
writer.advance_last_record_lsn(lsn);
// Import WAL. This is needed even when starting from a shutdown checkpoint, because
// this reads the checkpoint record itself, advancing the tip of the timeline to
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'
let pg_control = pg_control.ok_or_else(|| anyhow!("pg_control file not found"))?;
import_wal(
&path.join("pg_wal"),
writer,
Lsn(pg_control.checkPointCopy.redo),
lsn,
&mut pg_control.checkPointCopy.clone(),
)?;
Ok(())
}
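// Call-site sketch (hypothetical caller, e.g. initial timeline creation): the importer
// now takes a TimelineWriter instead of a Timeline, so callers obtain a writer first.
//
//     let writer = timeline.writer();
//     import_timeline_from_postgres_datadir(&pgdata_path, &*writer, lsn)?;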
@@ -133,12 +147,13 @@ pub fn import_timeline_from_postgres_datadir(
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_relfile(
path: &Path,
timeline: &dyn Timeline,
timeline: &dyn TimelineWriter,
lsn: Lsn,
spcoid: Oid,
dboid: Oid,
) -> Result<()> {
// Does it look like a relation file?
trace!("importing rel file {}", path.display());
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
@@ -166,15 +181,14 @@ fn import_relfile(
}
// TODO: UnexpectedEof is expected
Err(e) => match e.kind() {
Err(err) => match err.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
break;
}
_ => {
error!("error reading file: {:?} ({})", path, e);
break;
bail!("error reading file {}: {:#}", path.display(), err);
}
},
};
@@ -191,7 +205,7 @@ fn import_relfile(
/// are just slurped into the repository as one blob.
///
fn import_nonrel_file(
timeline: &dyn Timeline,
timeline: &dyn TimelineWriter,
lsn: Lsn,
tag: RelishTag,
path: &Path,
@@ -201,7 +215,7 @@ fn import_nonrel_file(
// read the whole file
file.read_to_end(&mut buffer)?;
info!("importing non-rel file {}", path.display());
trace!("importing non-rel file {}", path.display());
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
@@ -212,13 +226,17 @@ fn import_nonrel_file(
///
/// The control file is imported as is, but we also extract the checkpoint record
/// from it and store it separately.
fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result<()> {
fn import_control_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
path: &Path,
) -> Result<ControlFileData> {
let mut file = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
file.read_to_end(&mut buffer)?;
info!("importing control file {}", path.display());
trace!("importing control file {}", path.display());
// Import it as ControlFile
timeline.put_page_image(
@@ -233,19 +251,24 @@ fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result
let checkpoint_bytes = pg_control.checkPointCopy.encode();
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;
Ok(())
Ok(pg_control)
}
///
/// Import an SLRU segment file
///
fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Path) -> Result<()> {
fn import_slru_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
slru: SlruKind,
path: &Path,
) -> Result<()> {
// Does it look like an SLRU file?
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
info!("importing slru file {}", path.display());
trace!("importing slru file {}", path.display());
let mut rpageno = 0;
loop {
@@ -261,15 +284,14 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa
}
// TODO: UnexpectedEof is expected
Err(e) => match e.kind() {
Err(err) => match err.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
break;
}
_ => {
error!("error reading file: {:?} ({})", path, e);
break;
bail!("error reading file {}: {:#}", path.display(), err);
}
},
};
@@ -281,18 +303,119 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa
Ok(())
}
/// Scan PostgreSQL WAL files in the given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
fn import_wal(
walpath: &Path,
timeline: &dyn TimelineWriter,
startpoint: Lsn,
endpoint: Lsn,
checkpoint: &mut CheckPoint,
) -> Result<()> {
let mut waldecoder = WalStreamDecoder::new(startpoint);
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
let mut last_lsn = startpoint;
while last_lsn <= endpoint {
// FIXME: assume postgresql tli 1 for now
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
let mut buf = Vec::new();
// Read local file
let mut path = walpath.join(&filename);
// The segment may exist only as a .partial file
if !PathBuf::from(&path).exists() {
path = walpath.join(filename + ".partial");
}
// Slurp the WAL file
let mut file = File::open(&path)?;
if offset > 0 {
file.seek(SeekFrom::Start(offset as u64))?;
}
let nread = file.read_to_end(&mut buf)?;
if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
// Maybe allow this for .partial files?
error!("read only {} bytes from WAL file", nread);
}
waldecoder.feed_bytes(&buf);
let mut nrecords = 0;
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let mut checkpoint_modified = false;
let decoded = decode_wal_record(recdata.clone());
save_decoded_record(
checkpoint,
&mut checkpoint_modified,
timeline,
&decoded,
recdata,
lsn,
)?;
last_lsn = lsn;
if checkpoint_modified {
let checkpoint_bytes = checkpoint.encode();
timeline.put_page_image(
RelishTag::Checkpoint,
0,
last_lsn,
checkpoint_bytes,
)?;
}
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
timeline.advance_last_record_lsn(last_lsn);
nrecords += 1;
trace!("imported record at {} (end {})", lsn, endpoint);
}
}
debug!("imported {} records up to {}", nrecords, last_lsn);
segno += 1;
offset = 0;
}
if last_lsn != startpoint {
debug!(
"reached end of WAL at {}, updating checkpoint info",
last_lsn
);
timeline.advance_last_record_lsn(last_lsn);
} else {
info!("no WAL to import at {}", last_lsn);
}
Ok(())
}
///
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
/// relations/pages that the record affects.
///
pub fn save_decoded_record(
checkpoint: &mut CheckPoint,
timeline: &dyn Timeline,
checkpoint_modified: &mut bool,
timeline: &dyn TimelineWriter,
decoded: &DecodedWALRecord,
recdata: Bytes,
lsn: Lsn,
) -> Result<()> {
checkpoint.update_next_xid(decoded.xl_xid);
if checkpoint.update_next_xid(decoded.xl_xid) {
*checkpoint_modified = true;
}
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
@@ -305,13 +428,12 @@ pub fn save_decoded_record(
});
let rec = WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(tag, blk.blkno, rec)?;
timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
}
let mut buf = decoded.record.clone();
@@ -376,7 +498,7 @@ pub fn save_decoded_record(
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
let xlrec = XlClogTruncate::decode(&mut buf);
save_clog_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
save_clog_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
@@ -445,10 +567,17 @@ pub fn save_decoded_record(
)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
save_multixact_create_record(
checkpoint,
checkpoint_modified,
timeline,
lsn,
&xlrec,
decoded,
)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
save_multixact_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
@@ -457,7 +586,10 @@ pub fn save_decoded_record(
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
let next_oid = buf.get_u32_le();
checkpoint.nextOid = next_oid;
if checkpoint.nextOid != next_oid {
checkpoint.nextOid = next_oid;
*checkpoint_modified = true;
}
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
{
@@ -473,6 +605,7 @@ pub fn save_decoded_record(
);
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
*checkpoint_modified = true;
}
}
}
@@ -480,7 +613,11 @@ pub fn save_decoded_record(
}
/// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record.
fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> {
fn save_xlog_dbase_create(
timeline: &dyn TimelineWriter,
lsn: Lsn,
rec: &XlCreateDatabase,
) -> Result<()> {
let db_id = rec.db_id;
let tablespace_id = rec.tablespace_id;
let src_db_id = rec.src_db_id;
@@ -557,7 +694,11 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
/// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record.
///
/// This is the same logic as in PostgreSQL's smgr_redo() function.
fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> {
fn save_xlog_smgr_truncate(
timeline: &dyn TimelineWriter,
lsn: Lsn,
rec: &XlSmgrTruncate,
) -> Result<()> {
let spcnode = rec.rnode.spcnode;
let dbnode = rec.rnode.dbnode;
let relnode = rec.rnode.relnode;
@@ -619,7 +760,7 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca
/// Subroutine of save_decoded_record(), to handle an XLOG_XACT_* records.
///
fn save_xact_record(
timeline: &dyn Timeline,
timeline: &dyn TimelineWriter,
lsn: Lsn,
parsed: &XlXactParsedRecord,
decoded: &DecodedWALRecord,
@@ -630,12 +771,12 @@ fn save_xact_record(
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
let rec = WALRecord {
lsn,
will_init: false,
rec: decoded.record.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(
lsn,
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
@@ -651,6 +792,7 @@ fn save_xact_record(
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
timeline.put_wal_record(
lsn,
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
@@ -676,7 +818,8 @@ fn save_xact_record(
fn save_clog_truncate_record(
checkpoint: &mut CheckPoint,
timeline: &dyn Timeline,
checkpoint_modified: &mut bool,
timeline: &dyn TimelineWriter,
lsn: Lsn,
xlrec: &XlClogTruncate,
) -> Result<()> {
@@ -694,6 +837,7 @@ fn save_clog_truncate_record(
// TODO: Figure out if there will be any issues with replicas.
checkpoint.oldestXid = xlrec.oldest_xid;
checkpoint.oldestXidDB = xlrec.oldest_xid_db;
*checkpoint_modified = true;
// TODO: Handle AdvanceOldestClogXid(), or write a comment explaining why we don't need it
@@ -736,13 +880,13 @@ fn save_clog_truncate_record(
fn save_multixact_create_record(
checkpoint: &mut CheckPoint,
timeline: &dyn Timeline,
checkpoint_modified: &mut bool,
timeline: &dyn TimelineWriter,
lsn: Lsn,
xlrec: &XlMultiXactCreate,
decoded: &DecodedWALRecord,
) -> Result<()> {
let rec = WALRecord {
lsn,
will_init: false,
rec: decoded.record.clone(),
main_data_offset: decoded.main_data_offset as u32,
@@ -751,6 +895,7 @@ fn save_multixact_create_record(
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
timeline.put_wal_record(
lsn,
RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno,
@@ -770,6 +915,7 @@ fn save_multixact_create_record(
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
timeline.put_wal_record(
lsn,
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno,
@@ -792,9 +938,11 @@ fn save_multixact_create_record(
}
if xlrec.mid >= checkpoint.nextMulti {
checkpoint.nextMulti = xlrec.mid + 1;
*checkpoint_modified = true;
}
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
*checkpoint_modified = true;
}
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
@@ -804,18 +952,22 @@ fn save_multixact_create_record(
}
});
checkpoint.update_next_xid(max_mbr_xid);
if checkpoint.update_next_xid(max_mbr_xid) {
*checkpoint_modified = true;
}
Ok(())
}
fn save_multixact_truncate_record(
checkpoint: &mut CheckPoint,
timeline: &dyn Timeline,
checkpoint_modified: &mut bool,
timeline: &dyn TimelineWriter,
lsn: Lsn,
xlrec: &XlMultiXactTruncate,
) -> Result<()> {
checkpoint.oldestMulti = xlrec.end_trunc_off;
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
*checkpoint_modified = true;
// PerformMembersTruncation
let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
@@ -849,7 +1001,7 @@ fn save_multixact_truncate_record(
}
fn save_relmap_page(
timeline: &dyn Timeline,
timeline: &dyn TimelineWriter,
lsn: Lsn,
xlrec: &XlRelmapUpdate,
decoded: &DecodedWALRecord,

View File

@@ -4,74 +4,241 @@
use crate::branches;
use crate::layered_repository::LayeredRepository;
use crate::repository::{Repository, Timeline};
use crate::tenant_threads;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use anyhow::{anyhow, bail, Context, Result};
use lazy_static::lazy_static;
use log::info;
use log::{debug, info};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use std::fs;
use std::str::FromStr;
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, MutexGuard};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
lazy_static! {
pub static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
Mutex::new(HashMap::new());
static ref TENANTS: Mutex<HashMap<ZTenantId, Tenant>> = Mutex::new(HashMap::new());
}
pub fn init(conf: &'static PageServerConf) {
let mut m = REPOSITORY.lock().unwrap();
struct Tenant {
state: TenantState,
repo: Option<Arc<dyn Repository>>,
}
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
pub enum TenantState {
// This tenant only exists in cloud storage. It cannot be accessed.
CloudOnly,
// This tenant exists in cloud storage, and we are currently downloading it to local disk.
// It cannot be accessed yet, not until it's been fully downloaded to local disk.
Downloading,
// All data for this tenant is complete on local disk, but we haven't loaded the Repository,
// Timeline and Layer structs into memory yet, so it cannot be accessed yet.
//Ready,
// This tenant exists on local disk, and the layer map has been loaded into memory.
// The local disk might have some newer files that don't exist in cloud storage yet.
Active,
// Tenant is active, but there is no walreceiver connection.
Idle,
// This tenant exists on local disk, and the layer map has been loaded into memory.
// The local disk might have some newer files that don't exist in cloud storage yet.
// The tenant can no longer be accessed for any reason other than graceful shutdown.
Stopping,
}
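// Rough lifecycle sketch, inferred from the states above (not an enforced state
// machine in this code):
//
//     CloudOnly -> Downloading -> Active <-> Idle -> Stopping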
impl fmt::Display for TenantState {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TenantState::CloudOnly => f.write_str("CloudOnly"),
TenantState::Downloading => f.write_str("Downloading"),
TenantState::Active => f.write_str("Active"),
TenantState::Idle => f.write_str("Idle"),
TenantState::Stopping => f.write_str("Stopping"),
}
}
}
fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
TENANTS.lock().unwrap()
}
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
pub fn init(conf: &'static PageServerConf) {
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
let tenantid =
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
// Set up an object repository, for actual data storage.
let repo = Arc::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
));
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
LayeredRepository::launch_gc_thread(conf, repo.clone());
{
let mut m = access_tenants();
let tenant = Tenant {
state: TenantState::CloudOnly,
repo: None,
};
m.insert(tenantid, tenant);
}
init_repo(conf, tenantid);
info!("initialized storage for tenant: {}", &tenantid);
m.insert(tenantid, repo);
}
}
fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
// Set up an object repository, for actual data storage.
let repo = Arc::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenant_id,
true,
));
let mut m = access_tenants();
let tenant = m.get_mut(&tenant_id).unwrap();
tenant.repo = Some(repo);
tenant.state = TenantState::Active;
// TODO: Start these threads only if the tenant is actively receiving WAL
tenant_threads::start_tenant_threads(conf, tenant_id);
}
pub fn register_relish_download(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
) {
log::info!(
"Registering new download, tenant id {}, timeline id: {}",
tenant_id,
timeline_id
);
{
let mut m = access_tenants();
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
state: TenantState::Downloading,
repo: None,
});
tenant.state = TenantState::Downloading;
match &tenant.repo {
Some(repo) => {
init_timeline(repo.as_ref(), timeline_id);
tenant.state = TenantState::Active;
return;
}
None => log::warn!("Initialize new repo"),
}
tenant.state = TenantState::Active;
}
// init_repo() updates the tenant state
init_repo(conf, tenant_id);
let new_repo = get_repository_for_tenant(tenant_id).unwrap();
init_timeline(new_repo.as_ref(), timeline_id);
}
fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
match repo.get_timeline(timeline_id) {
Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id),
Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e),
}
}
// Check this flag in the thread loops to know when to exit
pub fn shutdown_requested() -> bool {
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
}
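// Intended usage sketch: background loops poll this flag and exit cleanly once
// shutdown_all_tenants() has flipped it (the per-tenant loops further below
// additionally key off the tenant state).
//
//     while !shutdown_requested() {
//         // ... perform one unit of background work ...
//     }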
pub fn shutdown_all_tenants() -> Result<()> {
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
let tenantids = list_tenantids()?;
for tenantid in &tenantids {
set_tenant_state(*tenantid, TenantState::Stopping)?;
}
for tenantid in tenantids {
// Wait for checkpointer and GC to finish their job
tenant_threads::wait_for_tenant_threads_to_stop(tenantid);
let repo = get_repository_for_tenant(tenantid)?;
debug!("shutdown tenant {}", tenantid);
repo.shutdown()?;
}
Ok(())
}
pub fn create_repository_for_tenant(
conf: &'static PageServerConf,
tenantid: ZTenantId,
) -> Result<()> {
let mut m = REPOSITORY.lock().unwrap();
// First check that the tenant doesn't exist already
if m.get(&tenantid).is_some() {
bail!("tenant {} already exists", tenantid);
{
let mut m = access_tenants();
// First check that the tenant doesn't exist already
if m.get(&tenantid).is_some() {
bail!("tenant {} already exists", tenantid);
}
let tenant = Tenant {
state: TenantState::CloudOnly,
repo: None,
};
m.insert(tenantid, tenant);
}
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
m.insert(tenantid, repo);
let mut m = access_tenants();
let tenant = m.get_mut(&tenantid).unwrap();
tenant.repo = Some(repo);
tenant.state = TenantState::Active;
Ok(())
}
pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
let o = &mut REPOSITORY.lock().unwrap();
o.insert(tenantid, repo);
// If the tenant is not found in the tenants map, return the CloudOnly state
pub fn get_tenant_state(tenantid: ZTenantId) -> TenantState {
let m = access_tenants();
match m.get(&tenantid) {
Some(tenant) => tenant.state,
None => TenantState::CloudOnly,
}
}
pub fn set_tenant_state(tenantid: ZTenantId, state: TenantState) -> Result<TenantState> {
let mut m = access_tenants();
let tenant = m.get_mut(&tenantid);
match tenant {
Some(tenant) => {
if state == TenantState::Idle && tenant.state != TenantState::Active {
// Only Active tenant can become Idle
return Ok(tenant.state);
}
info!("set_tenant_state: {} -> {}", tenant.state, state);
tenant.state = state;
Ok(tenant.state)
}
None => bail!("Tenant not found for tenant {}", tenantid),
}
}
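// Behavior sketch (hypothetical caller): a request to go Idle is ignored unless the
// tenant is currently Active, so callers should inspect the returned state.
//
//     let state = set_tenant_state(tenantid, TenantState::Idle)?;
//     if state != TenantState::Idle {
//         debug!("tenant {} stayed in state {}", tenantid, state);
//     }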
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
let o = &REPOSITORY.lock().unwrap();
o.get(&tenantid)
.map(|repo| Arc::clone(repo))
.ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid))
let m = access_tenants();
let tenant = m
.get(&tenantid)
.ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid))?;
match &tenant.repo {
Some(repo) => Ok(Arc::clone(repo)),
None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid),
}
}
pub fn get_timeline_for_tenant(
@@ -82,3 +249,33 @@ pub fn get_timeline_for_tenant(
.get_timeline(timelineid)
.with_context(|| format!("cannot fetch timeline {}", timelineid))
}
fn list_tenantids() -> Result<Vec<ZTenantId>> {
let m = access_tenants();
m.iter()
.map(|v| {
let (tenantid, _) = v;
Ok(*tenantid)
})
.collect()
}
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
#[serde(with = "hex")]
pub id: ZTenantId,
pub state: TenantState,
}
pub fn list_tenants() -> Result<Vec<TenantInfo>> {
let m = access_tenants();
m.iter()
.map(|v| {
let (id, tenant) = v;
Ok(TenantInfo {
id: *id,
state: tenant.state,
})
})
.collect()
}

View File

@@ -0,0 +1,125 @@
//! This module contains functions that run per-tenant background processes,
//! such as the checkpointer and GC
use crate::tenant_mgr;
use crate::tenant_mgr::TenantState;
use crate::CheckpointConfig;
use crate::PageServerConf;
use anyhow::Result;
use lazy_static::lazy_static;
use std::collections::HashMap;
use std::sync::Mutex;
use std::thread::JoinHandle;
use std::time::Duration;
use tracing::*;
use zenith_utils::zid::ZTenantId;
struct TenantHandleEntry {
checkpointer_handle: Option<JoinHandle<()>>,
gc_handle: Option<JoinHandle<()>>,
}
// Preserve handles to wait for thread completion
// at shutdown
lazy_static! {
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
Mutex::new(HashMap::new());
}
pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
// Ensure that any old threads have stopped
wait_for_tenant_threads_to_stop(tenantid);
let checkpointer_handle = std::thread::Builder::new()
.name("Checkpointer thread".into())
.spawn(move || {
checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
})
.ok();
let gc_handle = std::thread::Builder::new()
.name("GC thread".into())
.spawn(move || {
gc_loop(tenantid, conf).expect("GC thread died");
})
.ok();
// TODO handle thread errors if any
let mut handles = TENANT_HANDLES.lock().unwrap();
let h = TenantHandleEntry {
checkpointer_handle,
gc_handle,
};
handles.insert(tenantid, h);
}
pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
let mut handles = TENANT_HANDLES.lock().unwrap();
if let Some(h) = handles.get_mut(&tenantid) {
h.checkpointer_handle.take().map(JoinHandle::join);
trace!("checkpointer for tenant {} has stopped", tenantid);
h.gc_handle.take().map(JoinHandle::join);
trace!("gc for tenant {} has stopped", tenantid);
}
handles.remove(&tenantid);
}
///
/// Checkpointer thread's main loop
///
fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
break;
}
std::thread::sleep(conf.checkpoint_period);
trace!("checkpointer thread for tenant {} waking up", tenantid);
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
// bytes of WAL since last checkpoint.
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
}
trace!(
"checkpointer thread stopped for tenant {} state is {}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}
///
/// GC thread's main loop
///
fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
break;
}
trace!("gc thread for tenant {} waking up", tenantid);
// Garbage collect old files that are not needed for PITR anymore
if conf.gc_horizon > 0 {
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.gc_iteration(None, conf.gc_horizon, false).unwrap();
}
// TODO: Rewrite this in a more adequate way using condvar.wait_timeout()
// or similar; a possible shape is sketched after this function.
let mut sleep_time = conf.gc_period.as_secs();
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == TenantState::Active {
sleep_time -= 1;
std::thread::sleep(Duration::from_secs(1));
}
}
trace!(
"GC thread stopped for tenant {} state is {}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}
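// Possible shape for the TODO above (a sketch, not the current implementation): park on
// a Condvar with a timeout, so a shutdown can interrupt the sleep immediately instead of
// the loop polling once per second.
//
//     use std::sync::{Condvar, Mutex};
//     let pair = (Mutex::new(()), Condvar::new());
//     let guard = pair.0.lock().unwrap();
//     // Sleeps for gc_period, or returns early if a shutdown path calls pair.1.notify_all()
//     let (_guard, res) = pair.1.wait_timeout(guard, conf.gc_period).unwrap();
//     if !res.timed_out() {
//         // Woken explicitly: re-check the tenant state / shutdown flag right away.
//     }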

View File

@@ -72,6 +72,10 @@ impl WalStreamDecoder {
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
///
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
let recordbuf;
// Run state machine that validates page headers, and reassembles records
// that cross page boundaries.
loop {
// parse and verify page boundaries as we go
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
@@ -120,29 +124,41 @@ impl WalStreamDecoder {
self.lsn += self.padlen as u64;
self.padlen = 0;
} else if self.contlen == 0 {
// need to have at least the xl_tot_len field
assert!(self.recordbuf.is_empty());
// need to have at least the xl_tot_len field
if self.inputbuf.remaining() < 4 {
return Ok(None);
}
// read xl_tot_len FIXME: assumes little-endian
// peek xl_tot_len at the beginning of the record.
// FIXME: assumes little-endian
self.startlsn = self.lsn;
let xl_tot_len = self.inputbuf.get_u32_le();
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
return Err(WalDecodeError {
msg: format!("invalid xl_tot_len {}", xl_tot_len),
lsn: self.lsn,
});
}
self.lsn += 4;
self.recordbuf.clear();
self.recordbuf.reserve(xl_tot_len as usize);
self.recordbuf.put_u32_le(xl_tot_len);
self.contlen = xl_tot_len - 4;
continue;
// Fast path for the common case that the whole record fits on the page.
let pageleft = self.lsn.remaining_in_block() as u32;
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
// Take the record from the 'inputbuf', and validate it.
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
self.lsn += xl_tot_len as u64;
break;
} else {
// Need to assemble the record from pieces. Remember the size of the
// record, and loop back. On next iteration, we will reach the 'else'
// branch below, and copy the part of the record that was on this page
// to 'recordbuf'. Subsequent iterations will skip page headers, and
// append the continuations from the next pages to 'recordbuf'.
self.recordbuf.reserve(xl_tot_len as usize);
self.contlen = xl_tot_len;
continue;
}
} else {
// we're continuing a record, possibly from previous page.
let pageleft = self.lsn.remaining_in_block() as u32;
@@ -159,47 +175,42 @@ impl WalStreamDecoder {
self.contlen -= n as u32;
if self.contlen == 0 {
let recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new());
let recordbuf = recordbuf.freeze();
let mut buf = recordbuf.clone();
let xlogrec = XLogRecord::from_bytes(&mut buf);
// XLOG_SWITCH records are special. If we see one, we need to skip
// to the next WAL segment.
if xlogrec.is_xlog_switch_record() {
trace!("saw xlog switch record at {}", self.lsn);
self.padlen =
self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
} else {
// Pad to an 8-byte boundary
self.padlen = self.lsn.calc_padding(8u32) as u32;
}
let mut crc = crc32c_append(0, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
if crc != xlogrec.xl_crc {
return Err(WalDecodeError {
msg: "WAL record crc mismatch".into(),
lsn: self.lsn,
});
}
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
// and WalReceiver integration. Since this code is used both for WalReceiver and
// initial WAL import let's force alignment right here.
let result = (self.lsn.align(), recordbuf);
return Ok(Some(result));
// The record is now complete.
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
break;
}
continue;
}
}
// check record boundaries
// deal with continuation records
// We now have a record in the 'recordbuf' local variable.
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
// deal with xlog_switch records
let mut crc = 0;
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
if crc != xlogrec.xl_crc {
return Err(WalDecodeError {
msg: "WAL record crc mismatch".into(),
lsn: self.lsn,
});
}
// XLOG_SWITCH records are special. If we see one, we need to skip
// to the next WAL segment.
if xlogrec.is_xlog_switch_record() {
trace!("saw xlog switch record at {}", self.lsn);
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
} else {
// Pad to an 8-byte boundary
self.padlen = self.lsn.calc_padding(8u32) as u32;
}
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
// and WalReceiver integration. Since this code is used both for WalReceiver and
// initial WAL import let's force alignment right here.
let result = (self.lsn.align(), recordbuf);
Ok(Some(result))
}
}
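
For context on the `get_u32_le` change above: calling it on the buffer itself consumes four bytes, while reading through a sub-slice only peeks, which is what lets the fast path copy the complete record (length word included) out of `inputbuf`. A small self-contained illustration using the `bytes` crate; the buffer contents are made up.

```rust
use bytes::{Buf, BytesMut};

fn peek_vs_consume() {
    let data: &[u8] = &[0x2a, 0x00, 0x00, 0x00, 0xff];
    let mut buf = BytesMut::from(data);

    // Peek: read the u32 through a slice; `buf` still holds all 5 bytes.
    let peeked = (&buf[0..4]).get_u32_le();
    assert_eq!(peeked, 42);
    assert_eq!(buf.remaining(), 5);

    // Consume: `get_u32_le` on the buffer itself advances it by 4 bytes.
    let consumed = buf.get_u32_le();
    assert_eq!(consumed, 42);
    assert_eq!(buf.remaining(), 1);
}
```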

View File

@@ -10,24 +10,24 @@ use crate::restore_local_repo;
use crate::tenant_mgr;
use crate::waldecoder::*;
use crate::PageServerConf;
use anyhow::{Error, Result};
use anyhow::{bail, Error, Result};
use lazy_static::lazy_static;
use log::*;
use postgres::fallible_iterator::FallibleIterator;
use postgres::replication::ReplicationIter;
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use std::cmp::{max, min};
use std::cell::Cell;
use std::collections::HashMap;
use std::fs;
use std::str::FromStr;
use std::sync::Mutex;
use std::thread;
use std::thread::sleep;
use std::thread::JoinHandle;
use std::thread_local;
use std::time::{Duration, SystemTime};
use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTimelineId;
@@ -37,6 +37,7 @@ use zenith_utils::zid::ZTimelineId;
//
struct WalReceiverEntry {
wal_producer_connstr: String,
wal_receiver_handle: Option<JoinHandle<()>>,
}
lazy_static! {
@@ -44,6 +45,26 @@ lazy_static! {
Mutex::new(HashMap::new());
}
thread_local! {
// Boolean that is true only for WAL receiver threads
//
// This is used in `wait_lsn` to guard against usage that might lead to a deadlock.
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
}
// Wait for walreceiver to stop
// Now it stops when pageserver shutdown is requested.
// In the future we can make this more granular and send shutdown signals
// per tenant/timeline to cancel inactive walreceivers.
// TODO deal with blocking pg connections
pub fn stop_wal_receiver(timelineid: ZTimelineId) {
let mut receivers = WAL_RECEIVERS.lock().unwrap();
if let Some(r) = receivers.get_mut(&timelineid) {
r.wal_receiver_handle.take();
// r.wal_receiver_handle.take().map(JoinHandle::join);
}
}
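
`stop_wal_receiver` above only drops the stored handle; the join is still commented out because the receiver thread can block on its postgres connection. A hypothetical sketch of what joining could look like once that is solved; `stop_and_join` and the trimmed-down entry struct are purely illustrative.

```rust
use std::thread::JoinHandle;

struct WalReceiverEntry {
    wal_receiver_handle: Option<JoinHandle<()>>,
}

fn stop_and_join(entry: &mut WalReceiverEntry) {
    // Taking the handle out of the Option lets us join it by value.
    if let Some(handle) = entry.wal_receiver_handle.take() {
        // This blocks until thread_main returns, which today only happens after
        // the blocking postgres connection notices the shutdown request.
        if handle.join().is_err() {
            eprintln!("WAL receiver thread panicked");
        }
    }
}
```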
// Launch a new WAL receiver, or tell one that's running about change in connection string
pub fn launch_wal_receiver(
conf: &'static PageServerConf,
@@ -58,21 +79,19 @@ pub fn launch_wal_receiver(
receiver.wal_producer_connstr = wal_producer_connstr.into();
}
None => {
let receiver = WalReceiverEntry {
wal_producer_connstr: wal_producer_connstr.into(),
};
receivers.insert(timelineid, receiver);
// Also launch a new thread to handle this connection
//
// NOTE: This thread name is checked in the assertion in wait_lsn. If you change
// this, make sure you update the assertion too.
let _walreceiver_thread = thread::Builder::new()
let wal_receiver_handle = thread::Builder::new()
.name("WAL receiver thread".into())
.spawn(move || {
IS_WAL_RECEIVER.with(|c| c.set(true));
thread_main(conf, timelineid, tenantid);
})
.unwrap();
let receiver = WalReceiverEntry {
wal_producer_connstr: wal_producer_connstr.into(),
wal_receiver_handle: Some(wal_receiver_handle),
};
receivers.insert(timelineid, receiver);
}
};
}
@@ -92,16 +111,14 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
// This is the entry point for the WAL receiver thread.
//
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
info!(
"WAL receiver thread started for timeline : '{}'",
timelineid
);
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
info!("WAL receiver thread started");
//
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
// and start streaming WAL from it. If the connection is lost, keep retrying.
//
loop {
while !tenant_mgr::shutdown_requested() {
// Look up the current WAL producer address
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
@@ -115,10 +132,11 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
sleep(Duration::from_secs(1));
}
}
debug!("WAL streaming shut down");
}
fn walreceiver_main(
conf: &PageServerConf,
_conf: &PageServerConf,
timelineid: ZTimelineId,
wal_producer_connstr: &str,
tenantid: ZTenantId,
@@ -158,15 +176,15 @@ fn walreceiver_main(
let mut startpoint = last_rec_lsn;
if startpoint == Lsn(0) {
error!("No previous WAL position");
bail!("No previous WAL position");
}
// There might be some padding after the last full record, skip it.
startpoint += startpoint.calc_padding(8u32);
info!(
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
last_rec_lsn, startpoint, timelineid, end_of_wal
"last_record_lsn {} starting replication from {}, server is at {}...",
last_rec_lsn, startpoint, end_of_wal
);
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
@@ -188,34 +206,38 @@ fn walreceiver_main(
let data = xlog_data.data();
let startlsn = Lsn::from(xlog_data.wal_start());
let endlsn = startlsn + data.len() as u64;
let prev_last_rec_lsn = last_rec_lsn;
trace!("received XLogData between {} and {}", startlsn, endlsn);
waldecoder.feed_bytes(data);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// Save old checkpoint value to compare with it after decoding WAL record
let old_checkpoint_bytes = checkpoint.encode();
let decoded = decode_wal_record(recdata.clone());
let _enter = info_span!("processing record", lsn = %lsn).entered();
// It is important to deal with the aligned records, as the lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
assert!(lsn.is_aligned());
let writer = timeline.writer();
let mut checkpoint_modified = false;
let decoded = decode_wal_record(recdata.clone());
restore_local_repo::save_decoded_record(
&mut checkpoint,
&*timeline,
&mut checkpoint_modified,
writer.as_ref(),
&decoded,
recdata,
lsn,
)?;
let new_checkpoint_bytes = checkpoint.encode();
// Check if checkpoint data was updated by save_decoded_record
if new_checkpoint_bytes != old_checkpoint_bytes {
timeline.put_page_image(
if checkpoint_modified {
let new_checkpoint_bytes = checkpoint.encode();
writer.put_page_image(
RelishTag::Checkpoint,
0,
lsn,
@@ -225,38 +247,10 @@ fn walreceiver_main(
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
timeline.advance_last_record_lsn(lsn);
writer.advance_last_record_lsn(lsn);
last_rec_lsn = lsn;
}
// Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
// "checkpoint" the repository to flush all the changes from WAL we've processed
// so far to disk. After this, we don't need the original WAL anymore, and it
// can be removed. This is probably too aggressive for production, but it's useful
// to expose bugs now.
//
// TODO: We don't actually dare to remove the WAL. It's useful for debugging,
// and we might need it for logical decoding or other things in the future. Although
// we should also be able to fetch it back from the WAL safekeepers or S3 if
// needed.
if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
!= last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
{
info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
let (oldest_segno, newest_segno) = find_wal_file_range(
conf,
&timelineid,
pg_constants::WAL_SEGMENT_SIZE,
last_rec_lsn,
&tenantid,
)?;
if newest_segno - oldest_segno >= 10 {
// TODO: This is where we could remove WAL older than last_rec_lsn.
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
}
}
if !caught_up && endlsn >= end_of_wal {
info!("caught up at LSN {}", endlsn);
caught_up = true;
@@ -278,7 +272,7 @@ fn walreceiver_main(
);
if reply_requested {
Some(timeline.get_last_record_lsn())
Some(last_rec_lsn)
} else {
None
}
@@ -290,59 +284,25 @@ fn walreceiver_main(
if let Some(last_lsn) = status_update {
// TODO: More thought should go into what values are sent here.
let last_lsn = PgLsn::from(u64::from(last_lsn));
let write_lsn = last_lsn;
// We are using the disk consistent LSN as `write_lsn`, i.e. the LSN up to which the page
// server can guarantee persistence of all received data. The safekeeper is then free to
// remove WAL preceding `write_lsn`: it will not be requested by this page server again.
let write_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
let flush_lsn = last_lsn;
let apply_lsn = PgLsn::from(0);
let ts = SystemTime::now();
const NO_REPLY: u8 = 0;
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
}
if tenant_mgr::shutdown_requested() {
debug!("stop walreceiver because pageserver shutdown is requested");
break;
}
}
Ok(())
}
fn find_wal_file_range(
conf: &PageServerConf,
timeline: &ZTimelineId,
wal_seg_size: usize,
written_upto: Lsn,
tenant: &ZTenantId,
) -> Result<(u64, u64)> {
let written_upto_segno = written_upto.segment_number(wal_seg_size);
let mut oldest_segno = written_upto_segno;
let mut newest_segno = written_upto_segno;
// Scan the wal directory, and count how many WAL files we could remove
let wal_dir = conf.wal_dir_path(timeline, tenant);
for entry in fs::read_dir(wal_dir)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
continue;
}
let filename = path.file_name().unwrap().to_str().unwrap();
if IsXLogFileName(filename) {
let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
if segno > written_upto_segno {
// that's strange.
warn!("there is a WAL file from future at {}", path.display());
continue;
}
oldest_segno = min(oldest_segno, segno);
newest_segno = max(newest_segno, segno);
}
}
// FIXME: would be good to assert that there are no gaps in the WAL files
Ok((oldest_segno, newest_segno))
}
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
///
/// See the [postgres docs] for more details.

View File

@@ -22,8 +22,7 @@ use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use serde::{Deserialize, Serialize};
use std::cell::RefCell;
use serde::Serialize;
use std::fs;
use std::fs::OpenOptions;
use std::io::prelude::*;
@@ -60,7 +59,7 @@ use postgres_ffi::XLogRecord;
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
pub struct BufferTag {
pub rel: RelTag,
pub blknum: u32,
@@ -83,7 +82,7 @@ pub trait WalRedoManager: Send + Sync {
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<WALRecord>,
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError>;
}
@@ -100,7 +99,7 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
_blknum: u32,
_lsn: Lsn,
_base_img: Option<Bytes>,
_records: Vec<WALRecord>,
_records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
Err(WalRedoError::InvalidState)
}
@@ -151,9 +150,16 @@ struct WalRedoRequest {
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<WALRecord>,
records: Vec<(Lsn, WALRecord)>,
}
impl WalRedoRequest {
// Can this request be served by zenith redo functions,
// or do we need to pass it to the wal-redo postgres process?
fn can_apply_in_zenith(&self) -> bool {
!matches!(self.rel, RelishTag::Relation(_))
}
}
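
`can_apply_in_zenith` above is just a negative `matches!` on the relish kind. A tiny, self-contained illustration of the same dispatch predicate with a stand-in enum; the real `RelishTag` has more variants and richer payloads.

```rust
#[derive(Debug)]
enum Relish {
    Relation(u32),
    Checkpoint,
}

fn can_apply_in_zenith(rel: &Relish) -> bool {
    // Everything except ordinary relation pages goes through the custom redo path.
    !matches!(rel, Relish::Relation(_))
}

fn main() {
    assert!(!can_apply_in_zenith(&Relish::Relation(42)));
    assert!(can_apply_in_zenith(&Relish::Checkpoint));
}
```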
/// An error happened in WAL redo
#[derive(Debug, thiserror::Error)]
pub enum WalRedoError {
@@ -162,6 +168,8 @@ pub enum WalRedoError {
#[error("cannot perform WAL redo now")]
InvalidState,
#[error("cannot perform WAL redo for this request")]
InvalidRequest,
}
///
@@ -180,10 +188,9 @@ impl WalRedoManager for PostgresRedoManager {
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<WALRecord>,
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
let start_time;
let lock_time;
let end_time;
let request = WalRedoRequest {
@@ -195,9 +202,16 @@ impl WalRedoManager for PostgresRedoManager {
};
start_time = Instant::now();
let result = {
let result;
if request.can_apply_in_zenith() {
result = self.handle_apply_request_zenith(&request);
end_time = Instant::now();
WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
} else {
let mut process_guard = self.process.lock().unwrap();
lock_time = Instant::now();
let lock_time = Instant::now();
// launch the WAL redo process on first use
if process_guard.is_none() {
@@ -206,15 +220,16 @@ impl WalRedoManager for PostgresRedoManager {
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
*process_guard = Some(p);
}
let process = (*process_guard).as_ref().unwrap();
let process = process_guard.as_mut().unwrap();
self.runtime
.block_on(self.handle_apply_request(process, &request))
};
end_time = Instant::now();
result = self
.runtime
.block_on(self.handle_apply_request_postgres(process, &request));
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
end_time = Instant::now();
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
}
result
}
@@ -243,13 +258,47 @@ impl PostgresRedoManager {
}
///
/// Process one request for WAL redo.
/// Process one request for WAL redo using wal-redo postgres
///
async fn handle_apply_request(
async fn handle_apply_request_postgres(
&self,
process: &PostgresRedoProcess,
process: &mut PostgresRedoProcess,
request: &WalRedoRequest,
) -> Result<Bytes, WalRedoError> {
let blknum = request.blknum;
let lsn = request.lsn;
let base_img = request.base_img.clone();
let records = &request.records;
let nrecords = records.len();
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
if let RelishTag::Relation(rel) = request.rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
let duration = start.elapsed();
debug!(
"postgres applied {} WAL records in {} ms to reconstruct page image at LSN {}",
nrecords,
duration.as_millis(),
lsn
);
apply_result.map_err(WalRedoError::IoError)
} else {
Err(WalRedoError::InvalidRequest)
}
}
///
/// Process one request for WAL redo using custom zenith code
///
fn handle_apply_request_zenith(&self, request: &WalRedoRequest) -> Result<Bytes, WalRedoError> {
let rel = request.rel;
let blknum = request.blknum;
let lsn = request.lsn;
@@ -261,176 +310,158 @@ impl PostgresRedoManager {
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
if let RelishTag::Relation(rel) = rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
// If full-page image is provided, then use it...
page.extend_from_slice(&fpi[..]);
} else {
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
// If full-page image is provided, then use it...
page.extend_from_slice(&fpi[..]);
} else {
// otherwise initialize page with zeros
page.extend_from_slice(&ZERO_PAGE);
// otherwise initialize page with zeros
page.extend_from_slice(&ZERO_PAGE);
}
// Apply all collected WAL records
for (_lsn, record) in records {
let mut buf = record.rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
}
// Apply all collected WAL records
for record in records {
let mut buf = record.rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
}
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let rec_segno = match rel {
RelishTag::Slru { slru, segno } => {
if slru != SlruKind::Clog {
panic!("Not valid XACT relish tag {:?}", rel);
}
segno
}
_ => panic!("Not valid XACT relish tag {:?}", rel),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let rec_segno = match rel {
RelishTag::Slru { slru, segno } => {
assert!(
slru == SlruKind::Clog,
"Not valid XACT relish tag {:?}",
rel
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
}
}
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
}
segno
}
_ => panic!("Not valid XACT relish tag {:?}", rel),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multixact operations
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
if let RelishTag::Slru {
slru,
segno: rec_segno,
} = rel
{
if slru == SlruKind::MultiXactMembers {
for i in 0..xlrec.nmembers {
let pageno =
i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if segno == rec_segno && rpageno == blknum {
// update only target block
let offset = xlrec.moff + i;
let memberoff = mx_offset_to_member_offset(offset);
let flagsoff = mx_offset_to_flags_offset(offset);
let bshift = mx_offset_to_flags_bitshift(offset);
let mut flagsval =
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &= !(((1
<< pg_constants::MXACT_MEMBER_BITS_PER_XACT)
- 1)
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
}
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multixact operations
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
if let RelishTag::Slru {
slru,
segno: rec_segno,
} = rel
{
if slru == SlruKind::MultiXactMembers {
for i in 0..xlrec.nmembers {
let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if segno == rec_segno && rpageno == blknum {
// update only target block
let offset = xlrec.moff + i;
let memberoff = mx_offset_to_member_offset(offset);
let flagsoff = mx_offset_to_flags_offset(offset);
let bshift = mx_offset_to_flags_bitshift(offset);
let mut flagsval =
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &=
!(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
<< bshift);
flagsval |= xlrec.members[i as usize].status << bshift;
LittleEndian::write_u32(
&mut page[flagsoff..flagsoff + 4],
flagsval,
);
LittleEndian::write_u32(
&mut page[memberoff..memberoff + 4],
xlrec.members[i as usize].xid,
);
}
flagsval |= xlrec.members[i as usize].status << bshift;
LittleEndian::write_u32(
&mut page[flagsoff..flagsoff + 4],
flagsval,
);
LittleEndian::write_u32(
&mut page[memberoff..memberoff + 4],
xlrec.members[i as usize].xid,
);
}
} else {
// Multixact offsets SLRU
let offs = (xlrec.mid
% pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
* 4) as usize;
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
}
} else {
panic!();
// Multixact offsets SLRU
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
* 4) as usize;
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
}
} else {
panic!();
}
} else {
panic!();
}
}
apply_result = Ok::<Bytes, Error>(page.freeze());
}
apply_result = Ok::<Bytes, Error>(page.freeze());
let duration = start.elapsed();
let result: Result<Bytes, WalRedoError>;
debug!(
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
"zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}",
nrecords,
duration.as_millis(),
lsn
);
if let Err(e) = apply_result {
error!("could not apply WAL records: {}", e);
result = Err(WalRedoError::IoError(e));
} else {
let img = apply_result.unwrap();
result = Ok(img);
}
// The caller is responsible for sending the response
result
apply_result.map_err(WalRedoError::IoError)
}
}
@@ -438,8 +469,8 @@ impl PostgresRedoManager {
/// Handle to the Postgres WAL redo process
///
struct PostgresRedoProcess {
stdin: RefCell<ChildStdin>,
stdout: RefCell<ChildStdout>,
stdin: ChildStdin,
stdout: ChildStdout,
}
impl PostgresRedoProcess {
@@ -459,7 +490,7 @@ impl PostgresRedoProcess {
if datadir.exists() {
info!("directory {:?} exists, removing", &datadir);
if let Err(e) = fs::remove_dir_all(&datadir) {
error!("could not remove old wal-redo-datadir: {:?}", e);
error!("could not remove old wal-redo-datadir: {:#}", e);
}
}
info!("running initdb in {:?}", datadir.display());
@@ -532,10 +563,7 @@ impl PostgresRedoProcess {
};
tokio::spawn(f_stderr);
Ok(PostgresRedoProcess {
stdin: RefCell::new(stdin),
stdout: RefCell::new(stdout),
})
Ok(PostgresRedoProcess { stdin, stdout })
}
//
@@ -543,13 +571,14 @@ impl PostgresRedoProcess {
// new page image.
//
async fn apply_wal_records(
&self,
&mut self,
tag: BufferTag,
base_img: Option<Bytes>,
records: &[WALRecord],
records: &[(Lsn, WALRecord)],
) -> Result<Bytes, std::io::Error> {
let mut stdin = self.stdin.borrow_mut();
let mut stdout = self.stdout.borrow_mut();
let stdout = &mut self.stdout;
// Buffer the writes to avoid a lot of small syscalls.
let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
// We do three things simultaneously: send the old base image and WAL records to
// the child process's stdin, read the result from child's stdout, and forward any logging
@@ -566,22 +595,16 @@ impl PostgresRedoProcess {
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
)
.await??;
if base_img.is_some() {
timeout(
TIMEOUT,
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
)
.await??;
if let Some(img) = base_img {
timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
}
// Send WAL records.
for rec in records.iter() {
let r = rec.clone();
for (lsn, rec) in records.iter() {
WAL_REDO_RECORD_COUNTER.inc();
stdin
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.write_all(&build_apply_record_msg(*lsn, &rec.rec))
.await?;
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
@@ -618,58 +641,41 @@ impl PostgresRedoProcess {
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
// explanation of the protocol.
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
let len = 4 + 1 + 4 * 4;
let mut buf = BytesMut::with_capacity(1 + len);
let mut buf = Vec::with_capacity(1 + len);
buf.put_u8(b'B');
buf.put_u32(len as u32);
// FIXME: this is a temporary hack that should go away when we refactor
// the postgres protocol serialization + handlers.
//
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
// not in the std library. To write to a BytesMut from a serde serializer,
// we need to either:
// - pre-allocate the required buffer space. This is annoying because we
// shouldn't care what the exact serialized size is-- that's the
// serializer's job.
// - Or, we need to create a temporary "writer" (which implements the
// `Write` trait). It's a bit awkward, because the writer consumes the
// underlying BytesMut, and we need to extract it later with
// `into_inner`.
let mut writer = buf.writer();
tag.ser_into(&mut writer)
tag.ser_into(&mut buf)
.expect("serialize BufferTag should always succeed");
let buf = writer.into_inner();
debug_assert!(buf.len() == 1 + len);
buf.freeze()
buf
}
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
assert!(base_img.len() == 8192);
let len = 4 + 1 + 4 * 4 + base_img.len();
let mut buf = BytesMut::with_capacity(1 + len);
let mut buf = Vec::with_capacity(1 + len);
buf.put_u8(b'P');
buf.put_u32(len as u32);
let mut writer = buf.writer();
tag.ser_into(&mut writer)
tag.ser_into(&mut buf)
.expect("serialize BufferTag should always succeed");
let mut buf = writer.into_inner();
buf.put(base_img);
debug_assert!(buf.len() == 1 + len);
buf.freeze()
buf
}
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
let len = 4 + 8 + rec.len();
let mut buf = BytesMut::with_capacity(1 + len);
let mut buf: Vec<u8> = Vec::with_capacity(1 + len);
buf.put_u8(b'A');
buf.put_u32(len as u32);
@@ -678,21 +684,19 @@ fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
debug_assert!(buf.len() == 1 + len);
buf.freeze()
buf
}
fn build_get_page_msg(tag: BufferTag) -> Bytes {
fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
let len = 4 + 1 + 4 * 4;
let mut buf = BytesMut::with_capacity(1 + len);
let mut buf = Vec::with_capacity(1 + len);
buf.put_u8(b'G');
buf.put_u32(len as u32);
let mut writer = buf.writer();
tag.ser_into(&mut writer)
tag.ser_into(&mut buf)
.expect("serialize BufferTag should always succeed");
let buf = writer.into_inner();
debug_assert!(buf.len() == 1 + len);
buf.freeze()
buf
}

View File

@@ -9,7 +9,6 @@
use crate::pg_constants;
use crate::CheckPoint;
use crate::ControlFileData;
use crate::FullTransactionId;
use crate::XLogLongPageHeaderData;
use crate::XLogPageHeaderData;
@@ -18,8 +17,8 @@ use crate::XLOG_PAGE_MAGIC;
use anyhow::{bail, Result};
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use bytes::{Buf, Bytes};
use bytes::{BufMut, BytesMut};
use crc32c::*;
use log::*;
use std::cmp::max;
@@ -329,7 +328,12 @@ pub fn main() {
}
impl XLogRecord {
pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
pub fn from_slice(buf: &[u8]) -> XLogRecord {
use zenith_utils::bin_ser::LeSer;
XLogRecord::des(buf).unwrap()
}
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
use zenith_utils::bin_ser::LeSer;
XLogRecord::des_from(&mut buf.reader()).unwrap()
}
@@ -377,10 +381,12 @@ impl CheckPoint {
Ok(CheckPoint::des(buf)?)
}
// Update next XID based on provided new_xid and stored epoch.
// Next XID should be greater than new_xid.
// Also take in account 32-bit wrap-around.
pub fn update_next_xid(&mut self, xid: u32) {
/// Update next XID based on provided new_xid and stored epoch.
/// Next XID should be greater than new_xid. This handles 32-bit
/// XID wraparound correctly.
///
/// Returns 'true' if the XID was updated.
pub fn update_next_xid(&mut self, xid: u32) -> bool {
let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
let full_xid = self.nextXid.value;
let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
@@ -391,35 +397,37 @@ impl CheckPoint {
// wrap-around
epoch += 1;
}
self.nextXid = FullTransactionId {
value: (epoch << 32) | new_xid as u64,
};
let nextXid = (epoch << 32) | new_xid as u64;
if nextXid != self.nextXid.value {
self.nextXid = FullTransactionId { value: nextXid };
return true;
}
}
false
}
}
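
A standalone illustration of the rounding arithmetic `update_next_xid` uses above. The interval value here (1024) is made up purely for the example; the real `XID_CHECKPOINT_INTERVAL` lives in the postgres_ffi constants. The point is that `wrapping_add` keeps the rounding correct near `u32::MAX`, which is why the function also tracks the epoch.

```rust
const XID_CHECKPOINT_INTERVAL: u32 = 1024; // assumption for the example only

fn round_up_to_interval(xid: u32) -> u32 {
    xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1)
}

fn main() {
    assert_eq!(round_up_to_interval(1), 1024);
    assert_eq!(round_up_to_interval(1024), 1024);
    assert_eq!(round_up_to_interval(1025), 2048);
    // Near the 32-bit limit the rounded value wraps to 0, which is why the
    // caller compares against the stored nextXid and bumps the epoch on wraparound.
    assert_eq!(round_up_to_interval(u32::MAX), 0);
}
```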
//
// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record.
// Generate new, empty WAL segment.
// We need this segment to start a compute node.
// In order to minimize changes in Postgres core, we prefer to
// provide a WAL segment from which it can extract the checkpoint record in the standard way,
// rather than implement some alternative mechanism.
//
pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);
let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
let hdr = XLogLongPageHeaderData {
std: {
XLogPageHeaderData {
xlp_magic: XLOG_PAGE_MAGIC as u16,
xlp_info: pg_constants::XLP_LONG_HEADER,
xlp_tli: 1, // FIXME: always use Postgres timeline 1
xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64,
xlp_pageaddr: pageaddr,
xlp_rem_len: 0,
..Default::default() // Put 0 in padding fields.
}
},
xlp_sysid: pg_control.system_identifier,
xlp_sysid: system_id,
xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32,
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
};
@@ -427,36 +435,6 @@ pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
let hdr_bytes = hdr.encode();
seg_buf.extend_from_slice(&hdr_bytes);
let rec_hdr = XLogRecord {
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD
+ SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT
+ SIZEOF_CHECKPOINT) as u32,
xl_xid: 0, //0 is for InvalidTransactionId
xl_prev: 0,
xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN,
xl_rmid: pg_constants::RM_XLOG_ID,
xl_crc: 0,
..Default::default() // Put 0 in padding fields.
};
let mut rec_shord_hdr_bytes = BytesMut::new();
rec_shord_hdr_bytes.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT);
rec_shord_hdr_bytes.put_u8(SIZEOF_CHECKPOINT as u8);
let rec_bytes = rec_hdr.encode();
let checkpoint_bytes = pg_control.checkPointCopy.encode();
//calculate record checksum
let mut crc = 0;
crc = crc32c_append(crc, &rec_shord_hdr_bytes[..]);
crc = crc32c_append(crc, &checkpoint_bytes[..]);
crc = crc32c_append(crc, &rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
seg_buf.extend_from_slice(&rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
seg_buf.put_u32_le(crc);
seg_buf.extend_from_slice(&rec_shord_hdr_bytes);
seg_buf.extend_from_slice(&checkpoint_bytes);
//zero out the rest of the file
seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
seg_buf.freeze()

View File

@@ -18,5 +18,6 @@ tokio = "1.11"
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
clap = "2.33.0"
rustls = "0.19.1"
reqwest = { version = "0.11", features = ["blocking", "json"] }
zenith_utils = { path = "../zenith_utils" }

View File

@@ -1,92 +1,88 @@
use anyhow::{bail, Result};
use anyhow::{bail, Context, Result};
use serde::{Deserialize, Serialize};
use std::{
collections::HashMap,
net::{IpAddr, SocketAddr},
};
use std::net::{SocketAddr, ToSocketAddrs};
pub struct CPlaneApi {
// address: SocketAddr,
auth_endpoint: &'static str,
}
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Debug)]
pub struct DatabaseInfo {
pub host: IpAddr, // TODO: allow host name here too
pub host: String,
pub port: u16,
pub dbname: String,
pub user: String,
pub password: String,
pub password: Option<String>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct ProxyAuthResult {
pub ready: bool,
pub error: Option<String>,
pub conn_info: Option<DatabaseInfo>,
}
impl DatabaseInfo {
pub fn socket_addr(&self) -> SocketAddr {
SocketAddr::new(self.host, self.port)
}
pub fn conn_string(&self) -> String {
format!(
"dbname={} user={} password={}",
self.dbname, self.user, self.password
)
pub fn socket_addr(&self) -> Result<SocketAddr> {
let host_port = format!("{}:{}", self.host, self.port);
host_port
.to_socket_addrs()
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
.next()
.ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
}
}
// mock cplane api
impl CPlaneApi {
pub fn new(_address: &SocketAddr) -> CPlaneApi {
CPlaneApi {
// address: address.clone(),
impl From<DatabaseInfo> for tokio_postgres::Config {
fn from(db_info: DatabaseInfo) -> Self {
let mut config = tokio_postgres::Config::new();
config
.host(&db_info.host)
.port(db_info.port)
.dbname(&db_info.dbname)
.user(&db_info.user);
if let Some(password) = db_info.password {
config.password(password);
}
config
}
}
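
`socket_addr` above now resolves a host name instead of requiring an `IpAddr`. A self-contained sketch of the same `ToSocketAddrs` pattern; the host, port, and error messages here are illustrative, not the proxy's exact code.

```rust
use std::net::{SocketAddr, ToSocketAddrs};

fn resolve(host: &str, port: u16) -> anyhow::Result<SocketAddr> {
    let host_port = format!("{}:{}", host, port);
    // to_socket_addrs() may yield several addresses (e.g. IPv4 and IPv6);
    // like the code above, simply take the first one.
    host_port
        .to_socket_addrs()?
        .next()
        .ok_or_else(|| anyhow::anyhow!("cannot resolve {} to a SocketAddr", host_port))
}

fn main() -> anyhow::Result<()> {
    println!("resolved to {}", resolve("localhost", 5432)?);
    Ok(())
}
```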
impl CPlaneApi {
pub fn new(auth_endpoint: &'static str) -> CPlaneApi {
CPlaneApi { auth_endpoint }
}
pub fn check_auth(&self, user: &str, md5_response: &[u8], salt: &[u8; 4]) -> Result<()> {
// passwords for both are "mypass"
let auth_map: HashMap<_, &str> = vec![
("stas@zenith", "716ee6e1c4a9364d66285452c47402b1"),
("stas2@zenith", "3996f75df64c16a8bfaf01301b61d582"),
]
.into_iter()
.collect();
pub fn authenticate_proxy_request(
&self,
user: &str,
database: &str,
md5_response: &[u8],
salt: &[u8; 4],
psql_session_id: &str,
) -> Result<ProxyAuthResult> {
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
url.query_pairs_mut()
.append_pair("login", user)
.append_pair("database", database)
.append_pair("md5response", std::str::from_utf8(md5_response)?)
.append_pair("salt", &hex::encode(salt))
.append_pair("psql_session_id", psql_session_id);
let stored_hash = auth_map
.get(&user)
.ok_or_else(|| anyhow::Error::msg("user not found"))?;
let salted_stored_hash = format!(
"md5{:x}",
md5::compute([stored_hash.as_bytes(), salt].concat())
);
println!("cplane request: {}", url.as_str());
let received_hash = std::str::from_utf8(md5_response)?;
let resp = reqwest::blocking::get(url)?;
println!(
"auth: {} rh={} sh={} ssh={} {:?}",
user, received_hash, stored_hash, salted_stored_hash, salt
);
if resp.status().is_success() {
let auth_info: ProxyAuthResult = serde_json::from_str(resp.text()?.as_str())?;
println!("got auth info: #{:?}", auth_info);
if received_hash == salted_stored_hash {
Ok(())
Ok(auth_info)
} else {
bail!("Auth failed")
}
}
pub fn get_database_uri(&self, _user: &str, _database: &str) -> Result<DatabaseInfo> {
Ok(DatabaseInfo {
host: "127.0.0.1".parse()?,
port: 5432,
dbname: "stas".to_string(),
user: "stas".to_string(),
password: "mypass".to_string(),
})
}
// pub fn create_database(&self, _user: &String, _database: &String) -> Result<DatabaseInfo> {
// Ok(DatabaseInfo {
// host: "127.0.0.1".parse()?,
// port: 5432,
// dbname: "stas".to_string(),
// user: "stas".to_string(),
// password: "mypass".to_string(),
// })
// }
}

View File

@@ -7,7 +7,7 @@
///
use std::{
collections::HashMap,
net::{SocketAddr, TcpListener},
net::SocketAddr,
sync::{mpsc, Arc, Mutex},
thread,
};
@@ -17,6 +17,7 @@ use clap::{App, Arg, ArgMatches};
use cplane_api::DatabaseInfo;
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
use zenith_utils::tcp_listener;
mod cplane_api;
mod mgmt;
@@ -34,7 +35,7 @@ pub struct ProxyConf {
pub redirect_uri: String,
/// control plane address where we would check auth.
pub cplane_address: SocketAddr,
pub auth_endpoint: String,
pub ssl_config: Option<Arc<ServerConfig>>,
}
@@ -56,8 +57,7 @@ fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result<Option<Arc<ServerCo
let key = {
let key_bytes = std::fs::read(key_path).context("SSL key file")?;
let mut keys = pemfile::rsa_private_keys(&mut &key_bytes[..])
.or_else(|_| pemfile::pkcs8_private_keys(&mut &key_bytes[..]))
let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
.map_err(|_| anyhow!("couldn't read TLS keys"))?;
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
keys.pop().unwrap()
@@ -102,6 +102,14 @@ fn main() -> anyhow::Result<()> {
.help("redirect unauthenticated users to given uri")
.default_value("http://localhost:3000/psql_session/"),
)
.arg(
Arg::with_name("auth-endpoint")
.short("a")
.long("auth-endpoint")
.takes_value(true)
.help("redirect unauthenticated users to given uri")
.default_value("http://localhost:3000/authenticate_proxy_request/"),
)
.arg(
Arg::with_name("ssl-key")
.short("k")
@@ -122,7 +130,7 @@ fn main() -> anyhow::Result<()> {
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
cplane_address: "127.0.0.1:3000".parse()?,
auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
ssl_config: configure_ssl(&arg_matches)?,
};
let state = ProxyState {
@@ -133,23 +141,23 @@ fn main() -> anyhow::Result<()> {
// Check that we can bind to address before further initialization
println!("Starting proxy on {}", state.conf.proxy_address);
let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;
let pageserver_listener = tcp_listener::bind(state.conf.proxy_address)?;
println!("Starting mgmt on {}", state.conf.mgmt_address);
let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?;
let threads = vec![
let threads = [
// Spawn a thread to listen for connections. It will spawn further threads
// for each connection.
thread::Builder::new()
.name("Proxy thread".into())
.name("Listener thread".into())
.spawn(move || proxy::thread_main(state, pageserver_listener))?,
thread::Builder::new()
.name("Mgmt thread".into())
.spawn(move || mgmt::thread_main(state, mgmt_listener))?,
];
for t in threads.into_iter() {
for t in threads {
t.join().unwrap()?;
}

View File

@@ -5,7 +5,7 @@ use std::{
use anyhow::bail;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use serde::Deserialize;
use zenith_utils::{
postgres_backend::{self, query_from_cstring, AuthType, PostgresBackend},
pq_proto::{BeMessage, SINGLE_COL_ROWDESC},
@@ -34,7 +34,7 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow:
pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
let mut conn_handler = MgmtHandler { state };
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
pgbackend.run(&mut conn_handler)
}
@@ -49,7 +49,7 @@ struct MgmtHandler {
// "host": "127.0.0.1",
// "port": 5432,
// "dbname": "stas",
// "user": "stas"
// "user": "stas",
// "password": "mypass"
// }
// }
@@ -60,13 +60,16 @@ struct MgmtHandler {
// "Failure": "oops"
// }
// }
#[derive(Serialize, Deserialize)]
//
// // to test manually by sending a query to mgmt interface:
// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
#[derive(Deserialize)]
pub struct PsqlSessionResponse {
session_id: String,
result: PsqlSessionResult,
}
#[derive(Serialize, Deserialize)]
#[derive(Deserialize)]
pub enum PsqlSessionResult {
Success(DatabaseInfo),
Failure(String),
@@ -78,34 +81,47 @@ impl postgres_backend::Handler for MgmtHandler {
pgb: &mut PostgresBackend,
query_string: Bytes,
) -> anyhow::Result<()> {
let query_string = query_from_cstring(query_string);
let res = try_process_query(self, pgb, query_string);
// intercept and log error message
if res.is_err() {
println!("Mgmt query failed: #{:?}", res);
}
res
}
}
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
fn try_process_query(
mgmt: &mut MgmtHandler,
pgb: &mut PostgresBackend,
query_string: Bytes,
) -> anyhow::Result<()> {
let query_string = query_from_cstring(query_string);
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
let waiters = self.state.waiters.lock().unwrap();
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
let sender = waiters
.get(&resp.session_id)
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
let waiters = mgmt.state.waiters.lock().unwrap();
match resp.result {
PsqlSessionResult::Success(db_info) => {
sender.send(Ok(db_info))?;
let sender = waiters
.get(&resp.session_id)
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
pgb.flush()?;
Ok(())
}
match resp.result {
PsqlSessionResult::Success(db_info) => {
sender.send(Ok(db_info))?;
PsqlSessionResult::Failure(message) => {
sender.send(Err(anyhow::Error::msg(message.clone())))?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
pgb.flush()?;
Ok(())
}
bail!("psql session request failed: {}", message)
}
PsqlSessionResult::Failure(message) => {
sender.send(Err(anyhow::Error::msg(message.clone())))?;
bail!("psql session request failed: {}", message)
}
}
}

View File

@@ -6,7 +6,6 @@ use anyhow::bail;
use tokio_postgres::NoTls;
use rand::Rng;
use std::io::Write;
use std::{io, sync::mpsc::channel, thread};
use zenith_utils::postgres_backend::Stream;
use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
@@ -28,11 +27,13 @@ pub fn thread_main(
println!("accepted connection from {}", peer_addr);
socket.set_nodelay(true).unwrap();
thread::spawn(move || {
if let Err(err) = proxy_conn_main(state, socket) {
println!("error: {}", err);
}
});
thread::Builder::new()
.name("Proxy thread".into())
.spawn(move || {
if let Err(err) = proxy_conn_main(state, socket) {
println!("error: {}", err);
}
})?;
}
}
@@ -57,13 +58,14 @@ pub fn proxy_conn_main(
) -> anyhow::Result<()> {
let mut conn = ProxyConnection {
state,
cplane: CPlaneApi::new(&state.conf.cplane_address),
cplane: CPlaneApi::new(&state.conf.auth_endpoint),
user: "".into(),
database: "".into(),
pgb: PostgresBackend::new(
socket,
postgres_backend::AuthType::MD5,
state.conf.ssl_config.clone(),
false,
)?,
md5_salt: [0u8; 4],
psql_session_id: "".into(),
@@ -73,14 +75,20 @@ pub fn proxy_conn_main(
// This will set conn.existing_user and we can decide on next actions
conn.handle_startup()?;
let mut psql_session_id_buf = [0u8; 8];
rand::thread_rng().fill(&mut psql_session_id_buf);
conn.psql_session_id = hex::encode(psql_session_id_buf);
// both scenarios here should end up producing a database connection string
let db_info = if conn.is_existing_user() {
let conn_info = if conn.is_existing_user() {
conn.handle_existing_user()?
} else {
conn.handle_new_user()?
};
proxy_pass(conn.pgb, db_info)
// XXX: move that inside handle_new_user/handle_existing_user to be able to
// report wrong connection error.
proxy_pass(conn.pgb, conn_info)
}
impl ProxyConnection {
@@ -152,9 +160,25 @@ impl ProxyConnection {
Ok(())
}
// Wait for the proxy kick from the console with conninfo
fn wait_for_conninfo(&mut self) -> anyhow::Result<DatabaseInfo> {
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
let _ = self
.state
.waiters
.lock()
.unwrap()
.insert(self.psql_session_id.clone(), tx);
// Wait for web console response
// TODO: respond with error to client
rx.recv()?
}
fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
// ask password
rand::thread_rng().fill(&mut self.md5_salt);
self.pgb
.write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
self.pgb.state = ProtoState::Authentication; // XXX
@@ -172,28 +196,61 @@ impl ProxyConnection {
.split_last()
.ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
if let Err(e) = self.check_auth_md5(md5_response) {
self.pgb
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
bail!("auth failed: {}", e);
} else {
self.pgb
.write_message_noflush(&BeMessage::AuthenticationOk)?;
self.pgb
.write_message_noflush(&BeMessage::ParameterStatus)?;
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
}
}
match self.cplane.authenticate_proxy_request(
self.user.as_str(),
self.database.as_str(),
md5_response,
&self.md5_salt,
&self.psql_session_id,
) {
Err(e) => {
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
"cannot authenticate proxy: {}",
e
)))?;
// ok, we are authorized
self.cplane.get_database_uri(&self.user, &self.database)
bail!("auth failed: {}", e);
}
Ok(auth_info) => {
let conn_info = if auth_info.ready {
// Cluster is ready, so just take `conn_info` and respond to the client.
auth_info
.conn_info
.expect("conn_info should be provided with ready cluster")
} else {
match auth_info.error {
Some(e) => {
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
"cannot authenticate proxy: {}",
e
)))?;
bail!("auth failed: {}", e);
}
None => {
// Cluster exists, but isn't active, await its start and proxy kick
// with `conn_info`.
self.wait_for_conninfo()?
}
}
};
self.pgb
.write_message_noflush(&BeMessage::AuthenticationOk)?;
self.pgb
.write_message_noflush(&BeMessage::ParameterStatus)?;
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
Ok(conn_info)
}
}
} else {
bail!("protocol violation");
}
}
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
let mut psql_session_id_buf = [0u8; 8];
rand::thread_rng().fill(&mut psql_session_id_buf);
self.psql_session_id = hex::encode(psql_session_id_buf);
let hello_message = format!("☀️ Welcome to Zenith!
To proceed with database creation, open the following link:
@@ -212,80 +269,83 @@ databases without opening the browser.
self.pgb
.write_message(&BeMessage::NoticeResponse(hello_message))?;
// await for database creation
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
let _ = self
.state
.waiters
.lock()
.unwrap()
.insert(self.psql_session_id.clone(), tx);
// Wait for web console response
// XXX: respond with error to client
let dbinfo = rx.recv()??;
// We requested the DB creation from the console. Now wait for conninfo
let conn_info = self.wait_for_conninfo()?;
self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
"Connecting to database.".to_string(),
))?;
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
Ok(dbinfo)
}
fn check_auth_md5(&self, md5_response: &[u8]) -> anyhow::Result<()> {
assert!(self.is_existing_user());
self.cplane
.check_auth(self.user.as_str(), md5_response, &self.md5_salt)
Ok(conn_info)
}
}
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
let config = tokio_postgres::Config::from(db_info);
let _ = config.connect_raw(&mut socket, NoTls).await?;
Ok(socket)
}
/// Concurrently proxy both directions of the client and server connections
fn proxy(
client_read: ReadStream,
client_write: WriteStream,
server_read: ReadStream,
server_write: WriteStream,
(client_read, client_write): (ReadStream, WriteStream),
(server_read, server_write): (ReadStream, WriteStream),
) -> anyhow::Result<()> {
fn do_proxy(mut reader: ReadStream, mut writer: WriteStream) -> io::Result<()> {
std::io::copy(&mut reader, &mut writer)?;
writer.flush()?;
writer.shutdown(std::net::Shutdown::Both)
fn do_proxy(mut reader: impl io::Read, mut writer: WriteStream) -> io::Result<u64> {
/// FlushWriter will make sure that every message is sent as soon as possible
struct FlushWriter<W>(W);
impl<W: io::Write> io::Write for FlushWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
// `std::io::copy` is guaranteed to exit if we return an error,
// so we can afford to lose `res` in case `flush` fails
let res = self.0.write(buf);
if res.is_ok() {
self.0.flush()?;
}
res
}
fn flush(&mut self) -> io::Result<()> {
self.0.flush()
}
}
let res = std::io::copy(&mut reader, &mut FlushWriter(&mut writer));
writer.shutdown(std::net::Shutdown::Both)?;
res
}
let client_to_server_jh = thread::spawn(move || do_proxy(client_read, server_write));
let res1 = do_proxy(server_read, client_write);
let res2 = client_to_server_jh.join().unwrap();
res1?;
res2?;
do_proxy(server_read, client_write)?;
client_to_server_jh.join().unwrap()?;
Ok(())
}
/// Proxy a client connection to a postgres database
fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
let runtime = tokio::runtime::Builder::new_current_thread().build()?;
let db_stream = runtime.block_on(connect_to_db(db_info))?;
let db_stream = db_stream.into_std()?;
db_stream.set_nonblocking(false)?;
let db_stream = {
// We'll get rid of this once migration to async is complete
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let db_stream = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
let (db_read, db_write) = db_stream.split();
let stream = runtime.block_on(connect_to_db(db_info))?.into_std()?;
stream.set_nonblocking(false)?;
stream
};
let stream = match pgb.into_stream() {
let db = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
let client = match pgb.into_stream() {
Stream::Bidirectional(bidi_stream) => bidi_stream,
_ => bail!("invalid stream"),
};
let (client_read, client_write) = stream.split();
proxy(client_read, client_write, db_read, db_write)
proxy(client.split(), db.split())
}

View File

@@ -11,12 +11,19 @@ pyjwt = {extras = ["crypto"], version = "*"}
requests = "*"
pytest-xdist = "*"
asyncpg = "*"
cached-property = "*"
[dev-packages]
yapf = "*"
# Behavior may change slightly between versions. These are run continuously,
# so we pin exact versions to avoid surprising breaks. Update if comfortable.
yapf = "==0.31.0"
mypy = "==0.910"
# Non-pinned packages follow.
pipenv = "*"
flake8 = "*"
mypy = "*"
types-requests = "*"
types-psycopg2 = "*"
[requires]
# we need at least 3.6, but pipenv doesn't allow to say this directly
# we need at least 3.7, but pipenv doesn't allow us to say this directly
python_version = "3"

352
test_runner/Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "3cdc048691824d0b93912b6b78a0aa01dc98f278212c1badb0cc2edbd2103c3a"
"sha256": "63b72760ef37375186a638066ba0ad5804dbace99ddc503ea654e9749070ab24"
},
"pipfile-spec": 6,
"requires": {
@@ -43,94 +43,108 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.2.0"
},
"cached-property": {
"hashes": [
"sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
],
"index": "pypi",
"version": "==1.5.2"
},
"certifi": {
"hashes": [
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.5.30"
"version": "==2021.10.8"
},
"cffi": {
"hashes": [
"sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d",
"sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771",
"sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872",
"sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c",
"sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc",
"sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762",
"sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202",
"sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5",
"sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548",
"sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a",
"sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f",
"sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20",
"sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218",
"sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c",
"sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e",
"sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56",
"sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224",
"sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a",
"sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2",
"sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a",
"sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819",
"sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346",
"sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b",
"sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e",
"sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534",
"sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb",
"sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0",
"sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156",
"sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd",
"sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87",
"sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc",
"sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195",
"sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33",
"sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f",
"sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d",
"sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd",
"sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728",
"sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7",
"sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca",
"sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99",
"sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf",
"sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e",
"sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c",
"sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5",
"sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69"
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
],
"version": "==1.14.6"
"version": "==1.15.0"
},
"charset-normalizer": {
"hashes": [
"sha256:5d209c0a931f215cee683b6445e2d77677e7e75e159f78def0db09d68fafcaa6",
"sha256:5ec46d183433dcbd0ab716f2d7f29d8dee50505b3fdb40c6b985c7c4f5a3591f"
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
],
"markers": "python_version >= '3'",
"version": "==2.0.6"
"version": "==2.0.7"
},
"cryptography": {
"hashes": [
"sha256:0a7dcbcd3f1913f664aca35d47c1331fce738d44ec34b7be8b9d332151b0b01e",
"sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b",
"sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7",
"sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085",
"sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc",
"sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d",
"sha256:3fa3a7ccf96e826affdf1a0a9432be74dc73423125c8f96a909e3835a5ef194a",
"sha256:5b0fbfae7ff7febdb74b574055c7466da334a5371f253732d7e2e7525d570498",
"sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89",
"sha256:8695456444f277af73a4877db9fc979849cd3ee74c198d04fc0776ebc3db52b9",
"sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c",
"sha256:94fff993ee9bc1b2440d3b7243d488c6a3d9724cc2b09cdb297f6a886d040ef7",
"sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb",
"sha256:a00cf305f07b26c351d8d4e1af84ad7501eca8a342dedf24a7acb0e7b7406e14",
"sha256:a305600e7a6b7b855cd798e00278161b681ad6e9b7eca94c721d5f588ab212af",
"sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e",
"sha256:d2a6e5ef66503da51d2110edf6c403dc6b494cc0082f85db12f54e9c5d4c3ec5",
"sha256:d9ec0e67a14f9d1d48dd87a2531009a9b251c02ea42851c060b25c782516ff06",
"sha256:f44d141b8c4ea5eb4dbc9b3ad992d45580c1d22bf5e24363f2fbf50c2d7ae8a7"
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
],
"version": "==3.4.8"
"version": "==35.0.0"
},
"execnet": {
"hashes": [
@@ -142,11 +156,19 @@
},
"idna": {
"hashes": [
"sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
"sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.2"
"version": "==3.3"
},
"importlib-metadata": {
"hashes": [
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
],
"markers": "python_version < '3.8'",
"version": "==4.8.1"
},
"iniconfig": {
"hashes": [
@@ -207,11 +229,11 @@
"crypto"
],
"hashes": [
"sha256:934d73fbba91b0483d3857d1aff50e96b2a892384ee2c17417ed3203f173fca1",
"sha256:fba44e7898bbca160a2b2b501f492824fc8382485d3a6f11ba5d0c1937ce6130"
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
"sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
],
"index": "pypi",
"version": "==2.1.0"
"version": "==2.3.0"
},
"pyparsing": {
"hashes": [
@@ -272,21 +294,67 @@
},
"urllib3": {
"hashes": [
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.6"
"version": "==1.26.7"
},
"zipp": {
"hashes": [
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
],
"markers": "python_version >= '3.6'",
"version": "==3.6.0"
}
},
"develop": {
"backports.entry-points-selectable": {
"hashes": [
"sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
"sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
],
"markers": "python_version >= '2.7'",
"version": "==1.1.0"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.10.8"
},
"distlib": {
"hashes": [
"sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
"sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
],
"version": "==0.3.3"
},
"filelock": {
"hashes": [
"sha256:2b5eb3589e7fdda14599e7eb1a50e09b4cc14f34ed98b8ba56d33bfaafcbef2f",
"sha256:34a9f35f95c441e7b38209775d6e0337f9a3759f3565f6c5798f19618527c76f"
],
"markers": "python_version >= '3.6'",
"version": "==3.3.1"
},
"flake8": {
"hashes": [
"sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b",
"sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
"sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
],
"index": "pypi",
"version": "==3.9.2"
"version": "==4.0.1"
},
"importlib-metadata": {
"hashes": [
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
],
"markers": "python_version < '3.8'",
"version": "==4.8.1"
},
"mccabe": {
"hashes": [
@@ -331,21 +399,45 @@
],
"version": "==0.4.3"
},
"pipenv": {
"hashes": [
"sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
"sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
],
"index": "pypi",
"version": "==2021.5.29"
},
"platformdirs": {
"hashes": [
"sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
"sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
],
"markers": "python_version >= '3.6'",
"version": "==2.4.0"
},
"pycodestyle": {
"hashes": [
"sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068",
"sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
"sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.7.0"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==2.8.0"
},
"pyflakes": {
"hashes": [
"sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3",
"sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
"sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.3.1"
"version": "==2.4.0"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"toml": {
"hashes": [
@@ -355,6 +447,58 @@
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.2"
},
"typed-ast": {
"hashes": [
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
],
"markers": "python_version < '3.8'",
"version": "==1.4.3"
},
"types-psycopg2": {
"hashes": [
"sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
"sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
],
"index": "pypi",
"version": "==2.9.1"
},
"types-requests": {
"hashes": [
"sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
"sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
],
"index": "pypi",
"version": "==2.25.11"
},
"typing-extensions": {
"hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
@@ -364,6 +508,22 @@
"index": "pypi",
"version": "==3.10.0.2"
},
"virtualenv": {
"hashes": [
"sha256:10062e34c204b5e4ec5f62e6ef2473f8ba76513a9a617e873f1f8fb4a519d300",
"sha256:bcc17f0b3a29670dd777d6f0755a4c04f28815395bca279cdcb213b97199a6b8"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==20.8.1"
},
"virtualenv-clone": {
"hashes": [
"sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
"sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.5.7"
},
"yapf": {
"hashes": [
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
@@ -371,6 +531,14 @@
],
"index": "pypi",
"version": "==0.31.0"
},
"zipp": {
"hashes": [
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
],
"markers": "python_version >= '3.6'",
"version": "==3.6.0"
}
}
}

View File

@@ -3,10 +3,13 @@
This directory contains integration tests.
Prerequisites:
- Python 3.6 or later
- Python 3.7 or later
- Development headers may also be needed to build `psycopg2` from source.
- Python 3.7 is recommended if you want to update tests.
- Dependencies: install them via `pipenv install`. Note that the Debian/Ubuntu
packages for these dependencies are commonly stale, so installing them manually
from distro packages is not recommended.
The exact version of `pipenv` does not matter unless you change dependencies.
Run `pipenv shell` to activate the venv or use `pipenv run` to run a single
command in the venv, e.g. `pipenv run pytest`.
- Zenith and Postgres binaries
@@ -53,8 +56,8 @@ Useful environment variables:
should go.
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
Let stdout and stderr go to the terminal instead of capturing them:
`pytest -s ...`
Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them:
`pytest -s --log-cli-level=INFO ...`
(Note many tests capture subprocess outputs separately, so this may not
show much.)
@@ -62,44 +65,87 @@ Exit after the first test failure:
`pytest -x ...`
(there are many more pytest options; run `pytest -h` to see them.)
### Writing a test
### Building new tests
Every test needs a Zenith Environment, or ZenithEnv to operate in. A Zenith Environment
is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and
compute Postgres nodes. The connections between them can be configured to use JWT
authentication tokens, and some other configuration options can be tweaked too.
The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env`
fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
or make other destructive changes in that environment. Also don't assume that the
cluster contains no tenants, branches, or data. For convenience, there is a
branch called `empty`, though. The convention is to create a test-specific branch off it
and load any test data there, instead of using the 'main' branch. A minimal sketch follows.
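For illustration, a minimal test against the shared environment could look like the sketch below. The test name is made up; the fixture and helper calls (`zenith_simple_env`, `env.zenith_cli`, `env.postgres.create_start`, `safe_psql`) mirror the ones used elsewhere in this diff.
```python
from fixtures.zenith_fixtures import ZenithEnv

pytest_plugins = ("fixtures.zenith_fixtures")


def test_readme_sketch(zenith_simple_env: ZenithEnv):
    env = zenith_simple_env

    # Branch off 'empty' so the shared environment's 'main' branch is untouched.
    env.zenith_cli(["branch", "test_readme_sketch", "empty"])
    pg = env.postgres.create_start('test_readme_sketch')

    # psycopg2 returns rows as tuples, hence the (1, ) comparison.
    assert pg.safe_psql('SELECT 1') == [(1, )]
```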
Essentially, this means that each time you see a fixture name as an input parameter, the function with that name will be run and its result passed to the test function as that parameter.
So this code:
For more complicated cases, you can build a custom Zenith Environment, with the `zenith_env`
fixture:
```python
def test_something(zenith_cli, pg_bin):
    pass

def test_foobar(zenith_env_builder: ZenithEnvBuilder):
    # Prescribe the environment.
    # We want to have 3 safekeeper nodes, and use JWT authentication in the
    # connections to the page server
    zenith_env_builder.num_safekeepers = 3
    zenith_env_builder.set_pageserver_auth(True)

    # Now create the environment. This initializes the repository, and starts
    # up the page server and the safekeepers
    env = zenith_env_builder.init()

    # Run the test
    ...
```
... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html
Fixtures can't be imported using the normal python syntax. Instead, use this:
At the end of a test, all the nodes in the environment are automatically stopped, so you
don't need to worry about cleaning up. Logs and test data are preserved for analysis
in a directory under `../test_output/<testname>`.
```python
pytest_plugins = ("fixtures.something")
```
### Before submitting a patch
#### Obligatory checks
Install dev dependencies via `pipenv --python 3.7 install --dev` (preferred)
or `pipenv install --dev` (if you don't have Python 3.7 and don't need to change dependencies).
We enforce code formatting via yapf and type checking via mypy.
Run the following commands in the `test_runner/` directory:
```bash
pipenv run yapf -ri . # All code is reformatted
pipenv run mypy . # Ensure there are no typing errors
```
That will make all the fixtures in the `fixtures/something.py` file available.
Anything that's likely to be used in multiple tests should be built into a fixture.
Note that fixtures can clean up after themselves if they use the `yield` syntax;
the cleanup runs even if the test fails (raises an unhandled exception).
Python destructors, e.g. `__del__()`, aren't recommended for cleanup. A minimal
sketch of such a fixture follows.
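As a sketch of that pattern (the fixture name and its body are invented for illustration; only the `yield`-based setup/teardown shape is the point):
```python
import shutil
import tempfile

import pytest


@pytest.fixture
def scratch_dir():
    # Setup: runs before the test body.
    path = tempfile.mkdtemp(prefix='scratch_')
    yield path
    # Teardown: runs after the test, even if the test raised.
    shutil.rmtree(path, ignore_errors=True)
```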
### Code quality
Before submitting a patch, please consider:
#### Advisable actions
* Writing a couple of docstrings to clarify the reasoning behind a new test.
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing any reported issues.
* Formatting the code with `yapf -r -i .` (TODO: implement an opt-in pre-commit hook for that).
* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
* Adding more type hints to your code to avoid `Any`, especially (see the sketch after this list):
* For fixture parameters, since they are not deduced automatically.
* For function arguments and return values.
The tools can be installed with `pipenv install --dev`.
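As an example of the annotations meant in the list above (a sketch; the test name is hypothetical, while `ZenithEnv` and `PgBin` are fixture types that appear in this diff):
```python
from fixtures.zenith_fixtures import PgBin, ZenithEnv


# Fixture parameters are annotated explicitly, since their types are not
# deduced automatically; the return annotation keeps mypy from inferring 'Any'.
def test_with_hints(zenith_simple_env: ZenithEnv, pg_bin: PgBin) -> None:
    ...
```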
#### Changing dependencies
You have to update `Pipfile.lock` if you have changed `Pipfile`:
```bash
pipenv --python 3.7 install --dev # Re-create venv for Python 3.7 and install recent pipenv inside
pipenv run pipenv --version # Should be at least 2021.5.29
pipenv run pipenv lock # Regenerate Pipfile.lock
```
Because the minimum supported version is Python 3.7 and that is what CI uses,
you have to use a Python 3.7 environment when updating `Pipfile.lock`;
otherwise some backwards-compatibility packages will be missing.
It is also important to run a recent `pipenv`,
since older versions drop markers from `Pipfile.lock`.
If you don't have Python 3.7, you should install it and its headers (for `psycopg2`)
separately, e.g.:
```bash
# In Ubuntu
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install python3.7 python3.7-dev
```

View File

@@ -1,21 +1,22 @@
from contextlib import closing
from typing import Iterator
from uuid import uuid4
import psycopg2
from fixtures.zenith_fixtures import PortDistributor, Postgres, ZenithCli, ZenithPageserver, PgBin
from fixtures.zenith_fixtures import ZenithEnvBuilder
import pytest
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
ps = pageserver_auth_enabled
def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.pageserver_auth_enabled = True
env = zenith_env_builder.init()
tenant_token = ps.auth_keys.generate_tenant_token(ps.initial_tenant)
invalid_tenant_token = ps.auth_keys.generate_tenant_token(uuid4().hex)
management_token = ps.auth_keys.generate_management_token()
ps = env.pageserver
tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant)
invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex)
management_token = env.auth_keys.generate_management_token()
# this does not invoke auth check and only decodes jwt and checks it for validity
# check both tokens
@@ -23,57 +24,41 @@ def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
ps.safe_psql("status", password=management_token)
# tenant can create branches
ps.safe_psql(f"branch_create {ps.initial_tenant} new1 main", password=tenant_token)
ps.safe_psql(f"branch_create {env.initial_tenant} new1 main", password=tenant_token)
# console can create branches for tenant
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=management_token)
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=management_token)
# fail to create branch using token with different tenantid
with pytest.raises(psycopg2.DatabaseError, match='Tenant id mismatch. Permission denied'):
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=invalid_tenant_token)
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=invalid_tenant_token)
# create tenant using management token
ps.safe_psql(f"tenant_create {uuid4().hex}", password=management_token)
# fail to create tenant using tenant token
with pytest.raises(psycopg2.DatabaseError, match='Attempt to access management api with tenant scope. Permission denied'):
with pytest.raises(
psycopg2.DatabaseError,
match='Attempt to access management api with tenant scope. Permission denied'):
ps.safe_psql(f"tenant_create {uuid4().hex}", password=tenant_token)
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_compute_auth_to_pageserver(
zenith_cli: ZenithCli,
wa_factory,
pageserver_auth_enabled: ZenithPageserver,
repo_dir: str,
with_wal_acceptors: bool,
pg_bin: PgBin,
port_distributor: PortDistributor,
):
ps = pageserver_auth_enabled
# since we are in progress of refactoring protocols between compute safekeeper and page server
# use hardcoded management token in safekeeper
management_token = ps.auth_keys.generate_management_token()
def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
zenith_env_builder.pageserver_auth_enabled = True
if with_wal_acceptors:
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}"
zenith_cli.run(["branch", branch, "empty"])
if with_wal_acceptors:
wa_factory.start_n_new(3, management_token)
env.zenith_cli(["branch", branch, "main"])
with Postgres(
zenith_cli=zenith_cli,
repo_dir=repo_dir,
pg_bin=pg_bin,
tenant_id=ps.initial_tenant,
port=port_distributor.get_port(),
).create_start(
branch,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
) as pg:
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute('CREATE TABLE t(key int primary key, value text)')
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT sum(key) FROM t')
assert cur.fetchone() == (5000050000, )
pg = env.postgres.create_start(branch)
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute('CREATE TABLE t(key int primary key, value text)')
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT sum(key) FROM t')
assert cur.fetchone() == (5000050000, )

View File

@@ -1,6 +1,6 @@
import subprocess
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -8,12 +8,13 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Create a couple of branches off the main branch, at a historical point in time.
#
def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_branch_behind(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Branch at the point where only 100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind", "empty"])
env.zenith_cli(["branch", "test_branch_behind", "empty"])
pgmain = postgres.create_start('test_branch_behind')
print("postgres is running on 'test_branch_behind' branch")
pgmain = env.postgres.create_start('test_branch_behind')
log.info("postgres is running on 'test_branch_behind' branch")
main_pg_conn = pgmain.connect()
main_cur = main_pg_conn.cursor()
@@ -27,38 +28,38 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
''')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_a = main_cur.fetchone()[0]
print('LSN after 100 rows: ' + lsn_a)
log.info(f'LSN after 100 rows: {lsn_a}')
# Insert some more rows. (This generates enough WAL to fill a few segments.)
main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100000) g
FROM generate_series(1, 200000) g
''')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_b = main_cur.fetchone()[0]
print('LSN after 100100 rows: ' + lsn_b)
log.info(f'LSN after 200100 rows: {lsn_b}')
# Branch at the point where only 100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
env.zenith_cli(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
# Insert many more rows. This generates enough WAL to fill a few segments.
main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100000) g
FROM generate_series(1, 200000) g
''')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_c = main_cur.fetchone()[0]
print('LSN after 200100 rows: ' + lsn_c)
log.info(f'LSN after 400100 rows: {lsn_c}')
# Branch at the point where only 200 rows were inserted
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
# Branch at the point where only 200100 rows were inserted
env.zenith_cli(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
pg_hundred = postgres.create_start("test_branch_behind_hundred")
pg_more = postgres.create_start("test_branch_behind_more")
pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
pg_more = env.postgres.create_start("test_branch_behind_more")
# On the 'hundred' branch, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
@@ -70,23 +71,26 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
more_pg_conn = pg_more.connect()
more_cur = more_pg_conn.cursor()
more_cur.execute('SELECT count(*) FROM foo')
assert more_cur.fetchone() == (100100, )
assert more_cur.fetchone() == (200100, )
# All the rows are visible on the main branch
main_cur.execute('SELECT count(*) FROM foo')
assert main_cur.fetchone() == (200100, )
assert main_cur.fetchone() == (400100, )
# Check bad lsn's for branching
# branch at segment boundary
zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = postgres.create_start("test_branch_segment_boundary")
env.zenith_cli(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = env.postgres.create_start("test_branch_segment_boundary")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
# branch at pre-initdb lsn
#
# FIXME: This works currently, but probably shouldn't be allowed
try:
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
# FIXME: assert false, "branch with invalid LSN should have failed"
except subprocess.CalledProcessError:
print("Branch creation with pre-initdb LSN failed (as expected)")
log.info("Branch creation with pre-initdb LSN failed (as expected)")

View File

@@ -3,7 +3,8 @@ import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -11,20 +12,24 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test compute node start after clog truncation
#
def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_clog_truncate(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_clog_truncate", "empty"])
env.zenith_cli(["branch", "test_clog_truncate", "empty"])
# set aggressive autovacuum to make sure that truncation will happen
config = [
'autovacuum_max_workers=10', 'autovacuum_vacuum_threshold=0',
'autovacuum_vacuum_insert_threshold=0', 'autovacuum_vacuum_cost_delay=0',
'autovacuum_vacuum_cost_limit=10000', 'autovacuum_naptime =1s',
'autovacuum_max_workers=10',
'autovacuum_vacuum_threshold=0',
'autovacuum_vacuum_insert_threshold=0',
'autovacuum_vacuum_cost_delay=0',
'autovacuum_vacuum_cost_limit=10000',
'autovacuum_naptime =1s',
'autovacuum_freeze_max_age=100000'
]
pg = postgres.create_start('test_clog_truncate', config_lines=config)
print('postgres is running on test_clog_truncate branch')
pg = env.postgres.create_start('test_clog_truncate', config_lines=config)
log.info('postgres is running on test_clog_truncate branch')
# Install extension containing function needed for test
pg.safe_psql('CREATE EXTENSION zenith_test_utils')
@@ -33,22 +38,22 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute('select test_consume_xids(1000*1000*10);')
print('xids consumed')
log.info('xids consumed')
# call a checkpoint to trigger TruncateSubtrans
cur.execute('CHECKPOINT;')
# ensure WAL flush
cur.execute('select txid_current()')
print(cur.fetchone())
log.info(cur.fetchone())
# wait for autovacuum to truncate the pg_xact
# XXX Is it worth to add a timeout here?
pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), '0000')
print("pg_xact_0000_path = " + pg_xact_0000_path)
log.info(f"pg_xact_0000_path = {pg_xact_0000_path}")
while os.path.isfile(pg_xact_0000_path):
print("file exists. wait for truncation. " "pg_xact_0000_path = " + pg_xact_0000_path)
log.info(f"file exists. wait for truncation. " "pg_xact_0000_path = {pg_xact_0000_path}")
time.sleep(5)
# checkpoint to advance latest lsn
@@ -59,14 +64,14 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
lsn_after_truncation = cur.fetchone()[0]
# create new branch after clog truncation and start a compute node on it
print('create branch at lsn_after_truncation ' + lsn_after_truncation)
zenith_cli.run(
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
env.zenith_cli(
["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])
pg2 = postgres.create_start('test_clog_truncate_new')
print('postgres is running on test_clog_truncate_new branch')
pg2 = env.postgres.create_start('test_clog_truncate_new')
log.info('postgres is running on test_clog_truncate_new branch')
# check that new node doesn't contain truncated segment
pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000')
print("pg_xact_0000_path_new = " + pg_xact_0000_path_new)
log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}")
assert os.path.isfile(pg_xact_0000_path_new) is False

View File

@@ -1,6 +1,7 @@
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -8,13 +9,14 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test starting Postgres with custom options
#
def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_config(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_config", "empty"])
env.zenith_cli(["branch", "test_config", "empty"])
# change config
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
print('postgres is running on test_config branch')
pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
log.info('postgres is running on test_config branch')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:

View File

@@ -2,7 +2,8 @@ import os
import pathlib
from contextlib import closing
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -10,16 +11,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test CREATE DATABASE when there have been relmapper changes
#
def test_createdb(
zenith_cli: ZenithCli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
):
zenith_cli.run(["branch", "test_createdb", "empty"])
def test_createdb(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_createdb", "empty"])
pg = postgres.create_start('test_createdb')
print("postgres is running on 'test_createdb' branch")
pg = env.postgres.create_start('test_createdb')
log.info("postgres is running on 'test_createdb' branch")
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -32,28 +29,24 @@ def test_createdb(
lsn = cur.fetchone()[0]
# Create a branch
zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])
env.zenith_cli(["branch", "test_createdb2", "test_createdb@" + lsn])
pg2 = postgres.create_start('test_createdb2')
pg2 = env.postgres.create_start('test_createdb2')
# Test that you can connect to the new database on both branches
for db in (pg, pg2):
db.connect(dbname='foodb').close()
#
# Test DROP DATABASE
#
def test_dropdb(
zenith_cli: ZenithCli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
test_output_dir
):
zenith_cli.run(["branch", "test_dropdb", "empty"])
def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
env.zenith_cli(["branch", "test_dropdb", "empty"])
pg = postgres.create_start('test_dropdb')
print("postgres is running on 'test_dropdb' branch")
pg = env.postgres.create_start('test_dropdb')
log.info("postgres is running on 'test_dropdb' branch")
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -65,7 +58,6 @@ def test_dropdb(
cur.execute("SELECT oid FROM pg_database WHERE datname='foodb';")
dboid = cur.fetchone()[0]
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute('DROP DATABASE foodb')
@@ -75,28 +67,29 @@ def test_dropdb(
cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_after_drop = cur.fetchone()[0]
# Create two branches before and after database drop.
zenith_cli.run(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
pg_before = postgres.create_start('test_before_dropdb')
env.zenith_cli(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
pg_before = env.postgres.create_start('test_before_dropdb')
zenith_cli.run(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
pg_after = postgres.create_start('test_after_dropdb')
env.zenith_cli(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
pg_after = env.postgres.create_start('test_after_dropdb')
# Test that database exists on the branch before drop
pg_before.connect(dbname='foodb').close()
# Test that database subdir exists on the branch before drop
assert pg_before.pgdata_dir
dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
print(dbpath)
log.info(dbpath)
assert os.path.isdir(dbpath) == True
# Test that database subdir doesn't exist on the branch after drop
assert pg_after.pgdata_dir
dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
print(dbpath)
log.info(dbpath)
assert os.path.isdir(dbpath) == False
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
check_restored_datadir_content(test_output_dir, env, pg)

View File

@@ -1,6 +1,7 @@
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -8,11 +9,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test CREATE USER to check shared catalog restore
#
def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
zenith_cli.run(["branch", "test_createuser", "empty"])
def test_createuser(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_createuser", "empty"])
pg = postgres.create_start('test_createuser')
print("postgres is running on 'test_createuser' branch")
pg = env.postgres.create_start('test_createuser')
log.info("postgres is running on 'test_createuser' branch")
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -25,9 +27,9 @@ def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: Postgres
lsn = cur.fetchone()[0]
# Create a branch
zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])
env.zenith_cli(["branch", "test_createuser2", "test_createuser@" + lsn])
pg2 = postgres.create_start('test_createuser2')
pg2 = env.postgres.create_start('test_createuser2')
# Test that you can connect to new branch as a new user
assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]

View File

@@ -1,4 +1,5 @@
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,13 +10,13 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# it only checks next_multixact_id field in restored pg_control,
# since we don't have functions to check multixact internals.
#
def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
pg_bin, zenith_cli, base_dir, test_output_dir):
def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_multixact", "empty"])
pg = postgres.create_start('test_multixact')
env.zenith_cli(["branch", "test_multixact", "empty"])
pg = env.postgres.create_start('test_multixact')
print("postgres is running on 'test_multixact' branch")
log.info("postgres is running on 'test_multixact' branch")
pg_conn = pg.connect()
cur = pg_conn.cursor()
@@ -52,10 +53,10 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
assert int(next_multixact_id) > int(next_multixact_id_old)
# Branch at this point
zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
pg_new = postgres.create_start('test_multixact_new')
env.zenith_cli(["branch", "test_multixact_new", "test_multixact@" + lsn])
pg_new = env.postgres.create_start('test_multixact_new')
print("postgres is running on 'test_multixact_new' branch")
log.info("postgres is running on 'test_multixact_new' branch")
pg_new_conn = pg_new.connect()
cur_new = pg_new_conn.cursor()
@@ -66,4 +67,4 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
assert next_multixact_id_new == next_multixact_id
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg_new, pageserver.service_port.pg)
check_restored_datadir_content(test_output_dir, env, pg_new)

View File

@@ -1,9 +1,11 @@
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test where Postgres generates a lot of WAL, and it's garbage collected away, but
# no pages are evicted so that Postgres uses an old LSN in a GetPage request.
@@ -14,11 +16,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# just a hint that the page hasn't been modified since that LSN, and the page
# server should return the latest page version regardless of the LSN.
#
def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_old_request_lsn(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
pg = postgres.create_start('test_old_request_lsn')
print('postgres is running on test_old_request_lsn branch')
env.zenith_cli(["branch", "test_old_request_lsn", "empty"])
pg = env.postgres.create_start('test_old_request_lsn')
log.info('postgres is running on test_old_request_lsn branch')
pg_conn = pg.connect()
cur = pg_conn.cursor()
@@ -27,7 +30,7 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
psconn = pageserver.connect()
psconn = env.pageserver.connect()
pscur = psconn.cursor()
# Create table, and insert some rows. Make it big enough that it doesn't fit in
@@ -46,20 +49,20 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
from pg_settings where name = 'shared_buffers'
''')
row = cur.fetchone()
print("shared_buffers is {}, table size {}", row[0], row[1]);
log.info(f'shared_buffers is {row[0]}, table size {row[1]}')
assert int(row[0]) < int(row[1])
cur.execute('VACUUM foo');
cur.execute('VACUUM foo')
# Make a lot of updates on a single row, generating a lot of WAL. Trigger
# garbage collections so that the page server will remove old page versions.
for i in range(10):
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
for j in range(100):
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;');
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')
# All (or at least most of) the updates should've been on the same page, so
# that we haven't had to evict any dirty pages for a long time. Now run
# a query that sends GetPage@LSN requests with the old LSN.
cur.execute("SELECT COUNT(*), SUM(val) FROM foo");
cur.execute("SELECT COUNT(*), SUM(val) FROM foo")
assert cur.fetchone() == (100000, 101000)

View File

@@ -3,25 +3,28 @@ from uuid import uuid4
import pytest
import psycopg2
import requests
from fixtures.zenith_fixtures import ZenithPageserver, ZenithPageserverHttpClient
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
from typing import cast
pytest_plugins = ("fixtures.zenith_fixtures")
def test_status_psql(pageserver):
assert pageserver.safe_psql('status') == [
def test_status_psql(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
assert env.pageserver.safe_psql('status') == [
('hello world', ),
]
def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
def test_branch_list_psql(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_branch_list_main", "empty"])
env.zenith_cli(["branch", "test_branch_list_main", "empty"])
conn = pageserver.connect()
conn = env.pageserver.connect()
cur = conn.cursor()
cur.execute(f'branch_list {pageserver.initial_tenant}')
cur.execute(f'branch_list {env.initial_tenant}')
branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
branches = [x for x in branches if x['name'].startswith('test_branch_list')]
@@ -34,10 +37,10 @@ def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
assert 'ancestor_lsn' in branches[0]
# Create another branch, and start Postgres on it
zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
env.zenith_cli(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
env.zenith_cli(['pg', 'create', 'test_branch_list_experimental'])
cur.execute(f'branch_list {pageserver.initial_tenant}')
cur.execute(f'branch_list {env.initial_tenant}')
new_branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
@@ -53,18 +56,22 @@ def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
conn.close()
def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
res = zenith_cli.run(["tenant", "list"])
res.check_returncode()
tenants = res.stdout.splitlines()
assert tenants == [pageserver.initial_tenant]
def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
# don't use zenith_simple_env, because there might be other tenants there,
# left over from other tests.
env = zenith_env_builder.init()
conn = pageserver.connect()
res = env.zenith_cli(["tenant", "list"])
res.check_returncode()
tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert tenants == [env.initial_tenant]
conn = env.pageserver.connect()
cur = conn.cursor()
# check same tenant cannot be created twice
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {pageserver.initial_tenant} already exists'):
cur.execute(f'tenant_create {pageserver.initial_tenant}')
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {env.initial_tenant} already exists'):
cur.execute(f'tenant_create {env.initial_tenant}')
# create one more tenant
tenant1 = uuid4().hex
@@ -73,20 +80,20 @@ def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
cur.execute('tenant_list')
# compare tenants list
new_tenants = sorted(json.loads(cur.fetchone()[0]))
assert sorted([pageserver.initial_tenant, tenant1]) == new_tenants
new_tenants = sorted(map(lambda t: cast(str, t['id']), json.loads(cur.fetchone()[0])))
assert sorted([env.initial_tenant, tenant1]) == new_tenants
def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
client.check_status()
# check initial tenant is there
assert initial_tenant in set(client.tenant_list())
assert initial_tenant in {t['id'] for t in client.tenant_list()}
# create new tenant and check it is also there
tenant_id = uuid4()
client.tenant_create(tenant_id)
assert tenant_id.hex in set(client.tenant_list())
assert tenant_id.hex in {t['id'] for t in client.tenant_list()}
# create branch
branch_name = uuid4().hex
@@ -96,11 +103,17 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
assert branch_name in {b['name'] for b in client.branch_list(tenant_id)}
def test_pageserver_http_api_client(pageserver: ZenithPageserver):
client = pageserver.http_client()
check_client(client, pageserver.initial_tenant)
def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
client = env.pageserver.http_client()
check_client(client, env.initial_tenant)
def test_pageserver_http_api_client_auth_enabled(pageserver_auth_enabled: ZenithPageserver):
client = pageserver_auth_enabled.http_client(auth_token=pageserver_auth_enabled.auth_keys.generate_management_token())
check_client(client, pageserver_auth_enabled.initial_tenant)
def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.pageserver_auth_enabled = True
env = zenith_env_builder.init()
management_token = env.auth_keys.generate_management_token()
client = env.pageserver.http_client(auth_token=management_token)
check_client(client, env.initial_tenant)

View File

@@ -4,21 +4,22 @@ import time
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
# times, with fault_probability chance of getting a wal acceptor down or up
# along the way. 2 of 3 are always alive, so the work keeps going.
def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
# One safekeeper is enough for this test.
wa_factory.start_n_new(1)
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init()
zenith_cli.run(["branch", "test_pageserver_restart", "empty"])
pg = postgres.create_start('test_pageserver_restart',
wal_acceptors=wa_factory.get_connstrs())
env.zenith_cli(["branch", "test_pageserver_restart", "main"])
pg = env.postgres.create_start('test_pageserver_restart')
pg_conn = pg.connect()
cur = pg_conn.cursor()
@@ -40,14 +41,14 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres:
from pg_settings where name = 'shared_buffers'
''')
row = cur.fetchone()
print("shared_buffers is {}, table size {}", row[0], row[1]);
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
assert int(row[0]) < int(row[1])
# Stop and restart pageserver. This is a more or less graceful shutdown, although
# the page server doesn't currently have a shutdown routine so there's no difference
# between stopping and crashing.
pageserver.stop();
pageserver.start();
env.pageserver.stop()
env.pageserver.start()
# Stopping the pageserver breaks the connection from the postgres backend to
# the page server, and causes the next query on the connection to fail. Start a new
@@ -61,6 +62,5 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres:
assert cur.fetchone() == (100000, )
# Stop the page server by force, and restart it
pageserver.stop();
pageserver.start();
env.pageserver.stop()
env.pageserver.start()

View File

@@ -1,16 +1,18 @@
from fixtures.zenith_fixtures import PostgresFactory
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pgbench(postgres: PostgresFactory, pg_bin, zenith_cli):
def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_pgbench", "empty"])
env.zenith_cli(["branch", "test_pgbench", "empty"])
pg = postgres.create_start('test_pgbench')
print("postgres is running on 'test_pgbench' branch")
pg = env.postgres.create_start('test_pgbench')
log.info("postgres is running on 'test_pgbench' branch")
connstr = pg.connstr()
pg_bin.run_capture(['pgbench', '-i', '-s', '100', connstr])
pg_bin.run_capture(['pgbench'] + '-c 1 -N -T 100 -P 1 -M prepared'.split() + [connstr])
pg_bin.run_capture(['pgbench', '-i', connstr])
pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr])

View File

@@ -0,0 +1,91 @@
import subprocess
from fixtures.zenith_fixtures import ZenithEnv
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Create read-only compute nodes, anchored at historical points in time.
#
# This is very similar to the 'test_branch_behind' test, but instead of
# creating branches, creates read-only nodes.
#
def test_readonly_node(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_readonly_node", "empty"])
pgmain = env.postgres.create_start('test_readonly_node')
print("postgres is running on 'test_readonly_node' branch")
main_pg_conn = pgmain.connect()
main_cur = main_pg_conn.cursor()
# Create table, and insert the first 100 rows
main_cur.execute('CREATE TABLE foo (t text)')
main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100) g
''')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_a = main_cur.fetchone()[0]
print('LSN after 100 rows: ' + lsn_a)
# Insert some more rows. (This generates enough WAL to fill a few segments.)
main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 200000) g
''')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_b = main_cur.fetchone()[0]
print('LSN after 200100 rows: ' + lsn_b)
# Insert many more rows. This generates enough WAL to fill a few segments.
main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 200000) g
''')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_c = main_cur.fetchone()[0]
print('LSN after 400100 rows: ' + lsn_c)
# Create first read-only node at the point where only 100 rows were inserted
pg_hundred = env.postgres.create_start("test_readonly_node_hundred",
branch=f'test_readonly_node@{lsn_a}')
# And another at the point where 200100 rows were inserted
pg_more = env.postgres.create_start("test_readonly_node_more",
branch=f'test_readonly_node@{lsn_b}')
# On the 'hundred' node, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
hundred_cur = hundred_pg_conn.cursor()
hundred_cur.execute('SELECT count(*) FROM foo')
assert hundred_cur.fetchone() == (100, )
# On the 'more' node, we should see 100200 rows
more_pg_conn = pg_more.connect()
more_cur = more_pg_conn.cursor()
more_cur.execute('SELECT count(*) FROM foo')
assert more_cur.fetchone() == (200100, )
# All the rows are visible on the main branch
main_cur.execute('SELECT count(*) FROM foo')
assert main_cur.fetchone() == (400100, )
# Check creating a node at segment boundary
pg = env.postgres.create_start("test_branch_segment_boundary",
branch="test_readonly_node@0/3000000")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
# Create node at pre-initdb lsn
try:
env.zenith_cli(["pg", "start", "test_branch_preinitdb", "test_readonly_node@0/42"])
assert False, "compute node startup with invalid LSN should have failed"
except Exception:
print("Node creation with pre-initdb LSN failed (as expected)")

View File

@@ -1,7 +1,8 @@
import pytest
from contextlib import closing
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,28 +10,17 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test restarting and recreating a postgres instance
#
# XXX: with_wal_acceptors=True fails now, would be fixed with
# `postgres --sync-walkeepers` patches.
#
@pytest.mark.parametrize('with_wal_acceptors', [False])
def test_restart_compute(
zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
wa_factory,
with_wal_acceptors: bool,
):
wal_acceptor_connstrs = None
zenith_cli.run(["branch", "test_restart_compute", "empty"])
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
zenith_env_builder.pageserver_auth_enabled = True
if with_wal_acceptors:
wa_factory.start_n_new(3)
wal_acceptor_connstrs = wa_factory.get_connstrs()
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
pg = postgres.create_start('test_restart_compute',
wal_acceptors=wal_acceptor_connstrs)
print("postgres is running on 'test_restart_compute' branch")
env.zenith_cli(["branch", "test_restart_compute", "main"])
pg = env.postgres.create_start('test_restart_compute')
log.info("postgres is running on 'test_restart_compute' branch")
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -39,12 +29,10 @@ def test_restart_compute(
cur.execute('SELECT sum(key) FROM t')
r = cur.fetchone()
assert r == (5000050000, )
print("res = ", r)
log.info(f"res = {r}")
# Remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute',
wal_acceptors=wal_acceptor_connstrs)
pg.stop_and_destroy().create_start('test_restart_compute')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -52,7 +40,7 @@ def test_restart_compute(
cur.execute('SELECT sum(key) FROM t')
r = cur.fetchone()
assert r == (5000050000, )
print("res = ", r)
log.info(f"res = {r}")
# Insert another row
cur.execute("INSERT INTO t VALUES (100001, 'payload2')")
@@ -60,11 +48,10 @@ def test_restart_compute(
r = cur.fetchone()
assert r == (100001, )
print("res = ", r)
log.info(f"res = {r}")
# Again remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute',
wal_acceptors=wal_acceptor_connstrs)
pg.stop_and_destroy().create_start('test_restart_compute')
# That select causes lots of FPIs and increases the probability of walkeepers
# lagging behind after query completion
@@ -75,11 +62,10 @@ def test_restart_compute(
r = cur.fetchone()
assert r == (100001, )
print("res = ", r)
log.info(f"res = {r}")
# And again remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute',
wal_acceptors=wal_acceptor_connstrs)
pg.stop_and_destroy().create_start('test_restart_compute')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -88,4 +74,4 @@ def test_restart_compute(
r = cur.fetchone()
assert r == (100001, )
print("res = ", r)
log.info(f"res = {r}")

View File

@@ -1,13 +1,20 @@
from contextlib import closing
import psycopg2.extras
import time;
import time
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
def print_gc_result(row):
print("GC duration {elapsed} ms".format_map(row));
print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
log.info("GC duration {elapsed} ms".format_map(row))
log.info(
" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}"
.format_map(row))
log.info(
" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}"
.format_map(row))
#
@@ -16,14 +23,15 @@ def print_gc_result(row):
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_layerfiles_gc", "empty"])
pg = postgres.create_start('test_layerfiles_gc')
def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_layerfiles_gc", "empty"])
pg = env.postgres.create_start('test_layerfiles_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
@@ -33,9 +41,9 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
cur.execute("CREATE TABLE foo(x integer)")
cur.execute("INSERT INTO foo VALUES (1)")
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass");
row = cur.fetchone();
print("relfilenode is {}", row[0]);
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass")
row = cur.fetchone()
log.info(f"relfilenode is {row[0]}")
# Run GC, to clear out any garbage left behind in the catalogs by
# the CREATE TABLE command. We want to have a clean slate with no garbage
@@ -50,55 +58,58 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
# update to confuse our numbers either.
cur.execute("DELETE FROM foo")
print("Running GC before test")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
log.info("Running GC before test")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
print_gc_result(row)
# remember the number of files
layer_relfiles_remain = row['layer_relfiles_total'] - row['layer_relfiles_removed']
layer_relfiles_remain = (row['layer_relfiles_total'] -
row['layer_relfiles_removed'])
assert layer_relfiles_remain > 0
# Insert a row.
print("Inserting one row and running GC")
# Insert a row and run GC. Checkpoint should freeze the layer
# so that there is only the most recent image layer left for the rel,
# removing the old image and delta layer.
log.info("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
assert row['layer_relfiles_removed'] == 1
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
assert row['layer_relfiles_removed'] == 2
assert row['layer_relfiles_dropped'] == 0
# Insert two more rows and run GC.
# This should create a new layer file with the new contents, and
# remove the old one.
print("Inserting two more rows and running GC")
# This should create new image and delta layer files with the new contents, and
# then remove the old image and the just-created delta layer.
log.info("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
assert row['layer_relfiles_removed'] == 1
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
assert row['layer_relfiles_removed'] == 2
assert row['layer_relfiles_dropped'] == 0
# Do it again. Should again create a new layer file and remove old one.
print("Inserting two more rows and running GC")
# Do it again. Should again create two new layer files and remove old ones.
log.info("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
assert row['layer_relfiles_total'] == layer_relfiles_remain + 1
assert row['layer_relfiles_removed'] == 1
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
assert row['layer_relfiles_removed'] == 2
assert row['layer_relfiles_dropped'] == 0
# Run GC again, with no changes in the database. Should not remove anything.
print("Run GC again, with nothing to do")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
log.info("Run GC again, with nothing to do")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain
assert row['layer_relfiles_removed'] == 0
assert row['layer_relfiles_dropped'] == 0
@@ -106,19 +117,26 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
#
print("Drop table and run GC again");
log.info("Drop table and run GC again")
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
print_gc_result(row)
# We still cannot remove the latest layers
# because they serve as tombstones for earlier layers.
assert row['layer_relfiles_dropped'] == 0
# Each relation fork is counted separately, hence 3.
assert row['layer_relfiles_dropped'] == 3
assert row['layer_relfiles_needed_as_tombstone'] == 3
# The catalog updates also create new layer files of the catalogs, which
# are counted as 'removed'
assert row['layer_relfiles_removed'] > 0
# TODO Change the test to check actual GC of dropped layers.
# Each relation fork is counted separately, hence 3.
#assert row['layer_relfiles_dropped'] == 3
# TODO: perhaps we should count catalog and user relations separately,
# to make this kind of testing more robust
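The do_gc call-and-fetch pattern repeats several times above; it could be factored into a small helper. A sketch against the pageserver cursor from the fixtures (the helper name is hypothetical):

def run_gc(pscur, tenant: str, timeline: str, horizon: int = 0):
    """Run the pageserver 'do_gc' command and return the GC result row."""
    pscur.execute(f"do_gc {tenant} {timeline} {horizon}")
    return pscur.fetchone()

# e.g. row = run_gc(pscur, env.initial_tenant, timeline)
#      print_gc_result(row)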

View File

@@ -2,39 +2,41 @@ from contextlib import closing
import pytest
from fixtures.zenith_fixtures import (
TenantFactory,
ZenithCli,
PostgresFactory,
)
from fixtures.zenith_fixtures import ZenithEnvBuilder
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_tenants_normal_work(
zenith_cli: ZenithCli,
tenant_factory: TenantFactory,
postgres: PostgresFactory,
wa_factory,
with_wal_acceptors: bool,
):
"""Tests tenants with and without wal acceptors"""
tenant_1 = tenant_factory.create()
tenant_2 = tenant_factory.create()
zenith_cli.run(["branch", f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", "main", f"--tenantid={tenant_1}"])
zenith_cli.run(["branch", f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", "main", f"--tenantid={tenant_2}"])
def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
if with_wal_acceptors:
wa_factory.start_n_new(3)
zenith_env_builder.num_safekeepers = 3
pg_tenant1 = postgres.create_start(
env = zenith_env_builder.init()
"""Tests tenants with and without wal acceptors"""
tenant_1 = env.create_tenant()
tenant_2 = env.create_tenant()
env.zenith_cli([
"branch",
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
f"--tenantid={tenant_1}"
])
env.zenith_cli([
"branch",
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
f"--tenantid={tenant_2}"
])
pg_tenant1 = env.postgres.create_start(
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_1,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
)
pg_tenant2 = postgres.create_start(
pg_tenant2 = env.postgres.create_start(
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_2,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
)
for pg in [pg_tenant1, pg_tenant2]:
@@ -45,4 +47,4 @@ def test_tenants_normal_work(
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute("SELECT sum(key) FROM t")
assert cur.fetchone() == (5000050000,)
assert cur.fetchone() == (5000050000, )

View File

@@ -1,21 +1,21 @@
from contextlib import closing
from uuid import UUID
import psycopg2.extras
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
def test_timeline_size(
zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin
):
def test_timeline_size(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Branch at the point where only 100 rows were inserted
zenith_cli.run(["branch", "test_timeline_size", "empty"])
env.zenith_cli(["branch", "test_timeline_size", "empty"])
client = pageserver.http_client()
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
client = env.pageserver.http_client()
res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
pgmain = postgres.create_start("test_timeline_size")
print("postgres is running on 'test_timeline_size' branch")
pgmain = env.postgres.create_start("test_timeline_size")
log.info("postgres is running on 'test_timeline_size' branch")
with closing(pgmain.connect()) as conn:
with conn.cursor() as cur:
@@ -23,17 +23,15 @@ def test_timeline_size(
# Create table, and insert the first 100 rows
cur.execute("CREATE TABLE foo (t text)")
cur.execute(
"""
cur.execute("""
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 10) g
"""
)
""")
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
cur.execute("TRUNCATE foo")
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]

View File

@@ -1,7 +1,7 @@
import os
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, PgBin
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,11 +9,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test branching, when a transaction is in prepared state
#
def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin: PgBin):
zenith_cli.run(["branch", "test_twophase", "empty"])
def test_twophase(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_twophase", "empty"])
pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
print("postgres is running on 'test_twophase' branch")
pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
log.info("postgres is running on 'test_twophase' branch")
conn = pg.connect()
cur = conn.cursor()
@@ -45,7 +46,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
cur.execute('CHECKPOINT')
twophase_files = os.listdir(pg.pg_twophase_dir_path())
print(twophase_files)
log.info(twophase_files)
assert len(twophase_files) == 4
cur.execute("COMMIT PREPARED 'insert_three'")
@@ -53,21 +54,21 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
cur.execute('CHECKPOINT')
twophase_files = os.listdir(pg.pg_twophase_dir_path())
print(twophase_files)
log.info(twophase_files)
assert len(twophase_files) == 2
# Create a branch with the transaction in prepared state
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
env.zenith_cli(["branch", "test_twophase_prepared", "test_twophase"])
# Start compute on the new branch
pg2 = postgres.create_start(
pg2 = env.postgres.create_start(
'test_twophase_prepared',
config_lines=['max_prepared_transactions=5'],
)
# Check that we restored only needed twophase files
twophase_files2 = os.listdir(pg2.pg_twophase_dir_path())
print(twophase_files2)
log.info(twophase_files2)
assert sorted(twophase_files2) == sorted(twophase_files)
conn2 = pg2.connect()
@@ -79,8 +80,8 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
cur2.execute("ROLLBACK PREPARED 'insert_two'")
cur2.execute('SELECT * FROM foo')
assert cur2.fetchall() == [('one',), ('three',)]
assert cur2.fetchall() == [('one', ), ('three', )]
# Only one committed insert is visible on the original branch
cur.execute('SELECT * FROM foo')
assert cur.fetchall() == [('three',)]
assert cur.fetchall() == [('three', )]

View File

@@ -1,17 +1,21 @@
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test that the VM bit is cleared correctly at a HEAP_DELETE and
# HEAP_UPDATE record.
#
def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, base_dir):
# Create a branch for us
zenith_cli.run(["branch", "test_vm_bit_clear", "empty"])
pg = postgres.create_start('test_vm_bit_clear')
def test_vm_bit_clear(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
print("postgres is running on 'test_vm_bit_clear' branch")
# Create a branch for us
env.zenith_cli(["branch", "test_vm_bit_clear", "empty"])
pg = env.postgres.create_start('test_vm_bit_clear')
log.info("postgres is running on 'test_vm_bit_clear' branch")
pg_conn = pg.connect()
cur = pg_conn.cursor()
@@ -32,7 +36,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1')
# Branch at this point, to test that later
zenith_cli.run(["branch", "test_vm_bit_clear_new", "test_vm_bit_clear"])
env.zenith_cli(["branch", "test_vm_bit_clear_new", "test_vm_bit_clear"])
# Clear the buffer cache, to force the VM page to be re-fetched from
# the page server
@@ -48,22 +52,21 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
''')
cur.execute('SELECT * FROM vmtest_delete WHERE id = 1')
assert(cur.fetchall() == []);
assert (cur.fetchall() == [])
cur.execute('SELECT * FROM vmtest_update WHERE id = 1')
assert(cur.fetchall() == []);
assert (cur.fetchall() == [])
cur.close()
# Check the same thing on the branch that we created right after the DELETE
#
# As of this writing, the code in smgrwrite() creates a full-page image whenever
# a dirty VM page is evicted. If the VM bit was not correctly cleared by the
# earlier WAL record, the full-page image hides the problem. Starting a new
# server at the right point-in-time avoids that full-page image.
pg_new = postgres.create_start('test_vm_bit_clear_new')
pg_new = env.postgres.create_start('test_vm_bit_clear_new')
print("postgres is running on 'test_vm_bit_clear_new' branch")
log.info("postgres is running on 'test_vm_bit_clear_new' branch")
pg_new_conn = pg_new.connect()
cur_new = pg_new_conn.cursor()
@@ -74,6 +77,6 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
''')
cur_new.execute('SELECT * FROM vmtest_delete WHERE id = 1')
assert(cur_new.fetchall() == []);
assert (cur_new.fetchall() == [])
cur_new.execute('SELECT * FROM vmtest_update WHERE id = 1')
assert(cur_new.fetchall() == []);
assert (cur_new.fetchall() == [])

View File

@@ -1,21 +1,28 @@
import pytest
import random
import time
import os
import subprocess
import uuid
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
from fixtures.zenith_fixtures import PgBin, ZenithEnv, ZenithEnvBuilder
from fixtures.utils import lsn_to_hex, mkdir_if_needed
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
# basic test, write something in setup with wal acceptors, ensure that commits
# succeed and data is written
def test_normal_work(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory):
zenith_cli.run(["branch", "test_wal_acceptors_normal_work", "empty"])
wa_factory.start_n_new(3)
pg = postgres.create_start('test_wal_acceptors_normal_work',
wal_acceptors=wa_factory.get_connstrs())
def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
env.zenith_cli(["branch", "test_wal_acceptors_normal_work", "main"])
pg = env.postgres.create_start('test_wal_acceptors_normal_work')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -29,18 +36,19 @@ def test_normal_work(zenith_cli, pageserver: ZenithPageserver, postgres: Postgre
# Run page server and multiple acceptors, and multiple compute nodes running
# against different timelines.
def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory):
n_timelines = 2
def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
wa_factory.start_n_new(3)
n_timelines = 2
branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]
# start postgres on each timeline
pgs = []
for branch in branches:
zenith_cli.run(["branch", branch, "empty"])
pgs.append(postgres.create_start(branch, wal_acceptors=wa_factory.get_connstrs()))
env.zenith_cli(["branch", branch, "main"])
pgs.append(env.postgres.create_start(branch))
# Do everything in different loops to have actions on different timelines
# interleaved.
@@ -61,16 +69,16 @@ def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: Post
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
# times, with fault_probability chance of getting a wal acceptor down or up
# along the way. 2 of 3 are always alive, so the work keeps going.
def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
def test_restarts(zenith_env_builder: ZenithEnvBuilder):
fault_probability = 0.01
n_inserts = 1000
n_acceptors = 3
wa_factory.start_n_new(n_acceptors)
zenith_env_builder.num_safekeepers = n_acceptors
env = zenith_env_builder.init()
zenith_cli.run(["branch", "test_wal_acceptors_restarts", "empty"])
pg = postgres.create_start('test_wal_acceptors_restarts',
wal_acceptors=wa_factory.get_connstrs())
env.zenith_cli(["branch", "test_wal_acceptors_restarts", "main"])
pg = env.postgres.create_start('test_wal_acceptors_restarts')
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -84,7 +92,7 @@ def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
if random.random() <= fault_probability:
if failed_node is None:
failed_node = wa_factory.instances[random.randrange(0, n_acceptors)]
failed_node = env.safekeepers[random.randrange(0, n_acceptors)]
failed_node.stop()
else:
failed_node.start()
@@ -102,12 +110,12 @@ def delayed_wal_acceptor_start(wa):
# When majority of acceptors is offline, commits are expected to be frozen
def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
wa_factory.start_n_new(2)
def test_unavailability(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 2
env = zenith_env_builder.init()
zenith_cli.run(["branch", "test_wal_acceptors_unavailability", "empty"])
pg = postgres.create_start('test_wal_acceptors_unavailability',
wal_acceptors=wa_factory.get_connstrs())
env.zenith_cli(["branch", "test_wal_acceptors_unavailability", "main"])
pg = env.postgres.create_start('test_wal_acceptors_unavailability')
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -119,9 +127,9 @@ def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
cur.execute("INSERT INTO t values (1, 'payload')")
# shutdown one of two acceptors, that is, majority
wa_factory.instances[0].stop()
env.safekeepers[0].stop()
proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[0], ))
proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[0], ))
proc.start()
start = time.time()
@@ -131,9 +139,9 @@ def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
proc.join()
# for the world's balance, do the same with second acceptor
wa_factory.instances[1].stop()
env.safekeepers[1].stop()
proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[1], ))
proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[1], ))
proc.start()
start = time.time()
@@ -172,13 +180,13 @@ def stop_value():
# do inserts while concurrently getting up/down subsets of acceptors
def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory, stop_value):
def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):
wa_factory.start_n_new(3)
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
zenith_cli.run(["branch", "test_wal_acceptors_race_conditions", "empty"])
pg = postgres.create_start('test_wal_acceptors_race_conditions',
wal_acceptors=wa_factory.get_connstrs())
env.zenith_cli(["branch", "test_wal_acceptors_race_conditions", "main"])
pg = env.postgres.create_start('test_wal_acceptors_race_conditions')
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -187,7 +195,7 @@ def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
cur.execute('CREATE TABLE t(key int primary key, value text)')
proc = Process(target=xmas_garland, args=(wa_factory.instances, stop_value))
proc = Process(target=xmas_garland, args=(env.safekeepers, stop_value))
proc.start()
for i in range(1000):
@@ -198,3 +206,127 @@ def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
stop_value.value = 1
proc.join()
class ProposerPostgres:
"""Object for running safekeepers sync with walproposer"""
def __init__(self, env: ZenithEnv, pgdata_dir: str, pg_bin, timeline_id: str, tenant_id: str):
self.env = env
self.pgdata_dir: str = pgdata_dir
self.pg_bin: PgBin = pg_bin
self.timeline_id: str = timeline_id
self.tenant_id: str = tenant_id
def pg_data_dir_path(self) -> str:
""" Path to data directory """
return self.pgdata_dir
def config_file_path(self) -> str:
""" Path to postgresql.conf """
return os.path.join(self.pgdata_dir, 'postgresql.conf')
def create_dir_config(self, wal_acceptors: str):
""" Create dir and config for running --sync-safekeepers """
mkdir_if_needed(self.pg_data_dir_path())
with open(self.config_file_path(), "w") as f:
f.writelines([
"synchronous_standby_names = 'walproposer'\n",
f"zenith.zenith_timeline = '{self.timeline_id}'\n",
f"zenith.zenith_tenant = '{self.tenant_id}'\n",
f"wal_acceptors = '{wal_acceptors}'\n",
])
def sync_safekeepers(self) -> str:
"""
Run 'postgres --sync-safekeepers'.
Returns execution result, which is commit_lsn after sync.
"""
command = ["postgres", "--sync-safekeepers"]
env = {
"PGDATA": self.pg_data_dir_path(),
}
basepath = self.pg_bin.run_capture(command, env)
stdout_filename = basepath + '.stdout'
with open(stdout_filename, 'r') as stdout_f:
stdout = stdout_f.read()
return stdout.strip("\n ")
# insert wal in all safekeepers and run sync on proposer
def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin):
# We don't really need the full environment for this test, just the
# safekeepers would be enough.
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
timeline_id = uuid.uuid4().hex
tenant_id = uuid.uuid4().hex
# write config for proposer
pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata")
pg = ProposerPostgres(env, pgdata_dir, pg_bin, timeline_id, tenant_id)
pg.create_dir_config(env.get_safekeeper_connstrs())
# valid lsn, which is not in the segment start, nor in zero segment
epoch_start_lsn = 0x16B9188 # 0/16B9188
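# (With the default 16 MB WAL segments this LSN sits inside the second
# segment, above 0x1000000 and not at a segment boundary, which is what
# the two conditions above mean.)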
begin_lsn = epoch_start_lsn
# append and commit WAL
lsn_after_append = []
for i in range(3):
res = env.safekeepers[i].append_logical_message(
tenant_id,
timeline_id,
{
"lm_prefix": "prefix",
"lm_message": "message",
"set_commit_lsn": True,
"term": 2,
"begin_lsn": begin_lsn,
"epoch_start_lsn": epoch_start_lsn,
"truncate_lsn": epoch_start_lsn,
},
)
lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"])
lsn_after_append.append(lsn_hex)
log.info(f"safekeeper[{i}] lsn after append: {lsn_hex}")
# run sync safekeepers
lsn_after_sync = pg.sync_safekeepers()
log.info(f"lsn after sync = {lsn_after_sync}")
assert all(lsn_after_sync == lsn for lsn in lsn_after_append)
def test_timeline_status(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init()
env.zenith_cli(["branch", "test_timeline_status", "main"])
pg = env.postgres.create_start('test_timeline_status')
wa = env.safekeepers[0]
wa_http_cli = wa.http_client()
wa_http_cli.check_status()
# learn zenith timeline from compute
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
# fetch something sensible from status
epoch = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch
pg.safe_psql("create table t(i int)")
# ensure epoch goes up after reboot
pg.stop().start()
pg.safe_psql("insert into t values(10)")
epoch_after_reboot = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch
assert epoch_after_reboot > epoch

View File

@@ -1,11 +1,14 @@
import asyncio
import asyncpg
import random
import time
from fixtures.zenith_fixtures import WalAcceptor, WalAcceptorFactory, ZenithPageserver, PostgresFactory, Postgres
from fixtures.zenith_fixtures import ZenithEnvBuilder, Postgres, Safekeeper
from fixtures.log_helper import getLogger
from fixtures.utils import lsn_from_hex, lsn_to_hex
from typing import List
from fixtures.utils import debug_print
log = getLogger('root.wal_acceptor_async')
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -18,13 +21,16 @@ class BankClient(object):
async def initdb(self):
await self.conn.execute('DROP TABLE IF EXISTS bank_accs')
await self.conn.execute('CREATE TABLE bank_accs(uid int primary key, amount int)')
await self.conn.execute('''
await self.conn.execute(
'''
INSERT INTO bank_accs
SELECT *, $1 FROM generate_series(0, $2)
''', self.init_amount, self.n_accounts - 1)
''',
self.init_amount,
self.n_accounts - 1)
await self.conn.execute('DROP TABLE IF EXISTS bank_log')
await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)')
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
await self.conn.execute('ALTER TABLE bank_accs SET (autovacuum_enabled = false)')
await self.conn.execute('ALTER TABLE bank_log SET (autovacuum_enabled = false)')
@@ -33,6 +39,7 @@ class BankClient(object):
row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs')
assert row['sum'] == self.n_accounts * self.init_amount
async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount):
# avoid deadlocks by sorting uids
if from_uid > to_uid:
@@ -41,16 +48,22 @@ async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount):
async with conn.transaction():
await conn.execute(
'UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2',
amount, to_uid,
amount,
to_uid,
)
await conn.execute(
'UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2',
amount, from_uid,
amount,
from_uid,
)
await conn.execute('INSERT INTO bank_log VALUES ($1, $2, $3)',
from_uid, to_uid, amount,
await conn.execute(
'INSERT INTO bank_log VALUES ($1, $2, $3)',
from_uid,
to_uid,
amount,
)
class WorkerStats(object):
def __init__(self, n_workers):
self.counters = [0] * n_workers
@@ -63,18 +76,18 @@ class WorkerStats(object):
self.counters[worker_id] += 1
def check_progress(self):
debug_print("Workers progress: {}".format(self.counters))
log.debug("Workers progress: {}".format(self.counters))
# every worker should finish at least one tx
assert all(cnt > 0 for cnt in self.counters)
progress = sum(self.counters)
print('All workers made {} transactions'.format(progress))
log.info('All workers made {} transactions'.format(progress))
async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer):
pg_conn = await pg.connect_async()
debug_print('Started worker {}'.format(worker_id))
log.debug('Started worker {}'.format(worker_id))
while stats.running:
from_uid = random.randint(0, n_accounts - 1)
@@ -84,18 +97,50 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou
await bank_transfer(pg_conn, from_uid, to_uid, amount)
stats.inc_progress(worker_id)
debug_print('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid))
log.debug('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid))
debug_print('Finished worker {}'.format(worker_id))
log.debug('Finished worker {}'.format(worker_id))
await pg_conn.close()
async def wait_for_lsn(safekeeper: Safekeeper,
tenant_id: str,
timeline_id: str,
wait_lsn: str,
polling_interval=1,
timeout=600):
"""
Poll flush_lsn from the safekeeper until it is greater than or equal to
the provided wait_lsn. To do that, timeline_status is fetched from the
safekeeper every polling_interval seconds.
"""
started_at = time.time()
client = safekeeper.http_client()
flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn
log.info(
f'Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}'
)
while lsn_from_hex(wait_lsn) > lsn_from_hex(flush_lsn):
elapsed = time.time() - started_at
if elapsed > timeout:
raise RuntimeError(
f"timed out waiting for safekeeper at port {safekeeper.port.pg} to reach {wait_lsn}, current lsn is {flush_lsn}"
)
await asyncio.sleep(polling_interval)
flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn
log.debug(f'safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}')
# This test will run several iterations and check progress in each of them.
# On each iteration 1 acceptor is stopped, and the other 2 should allow
# background workers to execute transactions. In the end, the state should remain
# consistent.
async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_workers=10):
async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_workers=10):
n_accounts = 100
init_amount = 100000
max_transfer = 100
@@ -103,6 +148,9 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
iterations = 6
pg_conn = await pg.connect_async()
tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant")
timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline")
bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount)
# create tables and initial balances
await bank.initdb()
@@ -113,14 +161,19 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer)
workers.append(asyncio.create_task(worker))
for it in range(iterations):
victim = acceptors[it % len(acceptors)]
victim_idx = it % len(acceptors)
victim = acceptors[victim_idx]
victim.stop()
# wait for transactions that could have started and finished before
# victim acceptor was stopped
await asyncio.sleep(1)
flush_lsn = await pg_conn.fetchval('SELECT pg_current_wal_flush_lsn()')
flush_lsn = lsn_to_hex(flush_lsn)
log.info(f'Postgres flush_lsn {flush_lsn}')
# Wait until alive safekeepers catch up with postgres
for idx, safekeeper in enumerate(acceptors):
if idx != victim_idx:
await wait_for_lsn(safekeeper, tenant_id, timeline_id, flush_lsn)
stats.reset()
await asyncio.sleep(period_time)
@@ -129,7 +182,7 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
victim.start()
print('Iterations are finished, exiting coroutines...')
log.info('Iterations are finished, exiting coroutines...')
stats.running = False
# await all workers
await asyncio.gather(*workers)
@@ -139,16 +192,14 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
# restart acceptors one by one, while executing and validating bank transactions
def test_restarts_under_load(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory,
wa_factory: WalAcceptorFactory):
def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
wa_factory.start_n_new(3)
env.zenith_cli(["branch", "test_wal_acceptors_restarts_under_load", "main"])
pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load')
zenith_cli.run(["branch", "test_wal_acceptors_restarts_under_load", "empty"])
pg = postgres.create_start('test_wal_acceptors_restarts_under_load',
wal_acceptors=wa_factory.get_connstrs())
asyncio.run(run_restarts_under_load(pg, wa_factory.instances))
asyncio.run(run_restarts_under_load(pg, env.safekeepers))
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
pg.stop()

View File

@@ -1,97 +1,107 @@
import json
import uuid
from fixtures.zenith_fixtures import ZenithCli, ZenithPageserver
from psycopg2.extensions import cursor as PgCursor
from fixtures.zenith_fixtures import ZenithEnv
from typing import cast
pytest_plugins = ("fixtures.zenith_fixtures")
def helper_compare_branch_list(page_server_cur, zenith_cli, initial_tenant: str):
def helper_compare_branch_list(page_server_cur: PgCursor, env: ZenithEnv, initial_tenant: str):
"""
Compare branches list returned by CLI and directly via API.
Filters out branches created by other tests.
"""
page_server_cur.execute(f'branch_list {initial_tenant}')
branches_api = sorted(map(lambda b: b['name'], json.loads(page_server_cur.fetchone()[0])))
branches_api = sorted(
map(lambda b: cast(str, b['name']), json.loads(page_server_cur.fetchone()[0])))
branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]
res = zenith_cli.run(["branch"])
res = env.zenith_cli(["branch"])
res.check_returncode()
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
res = zenith_cli.run(["branch", f"--tenantid={initial_tenant}"])
res = env.zenith_cli(["branch", f"--tenantid={initial_tenant}"])
res.check_returncode()
branches_cli_with_tenant_arg = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli_with_tenant_arg = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
branches_cli_with_tenant_arg = sorted(
map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli_with_tenant_arg = [
    b for b in branches_cli_with_tenant_arg if b.startswith('test_cli_') or b in ('empty', 'main')
]
assert branches_api == branches_cli == branches_cli_with_tenant_arg
def test_cli_branch_list(pageserver: ZenithPageserver, zenith_cli):
page_server_conn = pageserver.connect()
def test_cli_branch_list(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
page_server_conn = env.pageserver.connect()
page_server_cur = page_server_conn.cursor()
# Initial sanity check
helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
helper_compare_branch_list(page_server_cur, env, env.initial_tenant)
# Create a branch for us
res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"])
res = env.zenith_cli(["branch", "test_cli_branch_list_main", "empty"])
assert res.stderr == ''
helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
helper_compare_branch_list(page_server_cur, env, env.initial_tenant)
# Create a nested branch
res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
res = env.zenith_cli(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
assert res.stderr == ''
helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
helper_compare_branch_list(page_server_cur, env, env.initial_tenant)
# Check that all new branches are visible via CLI
res = zenith_cli.run(["branch"])
res = env.zenith_cli(["branch"])
assert res.stderr == ''
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
assert 'test_cli_branch_list_main' in branches_cli
assert 'test_cli_branch_list_nested' in branches_cli
def helper_compare_tenant_list(page_server_cur, zenith_cli: ZenithCli):
page_server_cur.execute(f'tenant_list')
tenants_api = sorted(json.loads(page_server_cur.fetchone()[0]))
res = zenith_cli.run(["tenant", "list"])
def helper_compare_tenant_list(page_server_cur: PgCursor, env: ZenithEnv):
page_server_cur.execute(f'tenant_list')
tenants_api = sorted(
map(lambda t: cast(str, t['id']), json.loads(page_server_cur.fetchone()[0])))
res = env.zenith_cli(["tenant", "list"])
assert res.stderr == ''
tenants_cli = sorted(res.stdout.splitlines())
tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert tenants_api == tenants_cli
def test_cli_tenant_list(pageserver: ZenithPageserver, zenith_cli: ZenithCli):
page_server_conn = pageserver.connect()
def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
page_server_conn = env.pageserver.connect()
page_server_cur = page_server_conn.cursor()
# Initial sanity check
helper_compare_tenant_list(page_server_cur, zenith_cli)
helper_compare_tenant_list(page_server_cur, env)
# Create new tenant
tenant1 = uuid.uuid4().hex
res = zenith_cli.run(["tenant", "create", tenant1])
res = env.zenith_cli(["tenant", "create", tenant1])
res.check_returncode()
# check tenant1 appeared
helper_compare_tenant_list(page_server_cur, zenith_cli)
helper_compare_tenant_list(page_server_cur, env)
# Create new tenant
tenant2 = uuid.uuid4().hex
res = zenith_cli.run(["tenant", "create", tenant2])
res = env.zenith_cli(["tenant", "create", tenant2])
res.check_returncode()
# check tenant2 appeared
helper_compare_tenant_list(page_server_cur, zenith_cli)
helper_compare_tenant_list(page_server_cur, env)
res = zenith_cli.run(["tenant", "list"])
res = env.zenith_cli(["tenant", "list"])
res.check_returncode()
tenants = sorted(res.stdout.splitlines())
tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert pageserver.initial_tenant in tenants
assert env.initial_tenant in tenants
assert tenant1 in tenants
assert tenant2 in tenants

View File

@@ -1,20 +1,20 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir
pytest_plugins = ("fixtures.zenith_fixtures")
def test_isolation(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
base_dir, capsys):
def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_isolation", "empty"])
env.zenith_cli(["branch", "test_isolation", "empty"])
# Connect to postgres and create a database called "regression".
# isolation tests use prepared transactions, so enable them
pg = postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100'])
pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100'])
pg.safe_psql('CREATE DATABASE isolation_regression')
# Create some local directories for pg_isolation_regress to run in.
@@ -38,7 +38,7 @@ def test_isolation(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_b
'--schedule={}'.format(schedule),
]
env = {
env_vars = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
@@ -48,4 +48,4 @@ def test_isolation(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_b
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_isolation_regress_command, env=env, cwd=runpath)
pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)

View File

@@ -1,19 +1,19 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, base_dir, pg_distrib_dir
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
base_dir, capsys):
def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_pg_regress", "empty"])
env.zenith_cli(["branch", "test_pg_regress", "empty"])
# Connect to postgres and create a database called "regression".
pg = postgres.create_start('test_pg_regress')
pg = env.postgres.create_start('test_pg_regress')
pg.safe_psql('CREATE DATABASE regression')
# Create some local directories for pg_regress to run in.
@@ -38,7 +38,7 @@ def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_
'--inputdir={}'.format(src_path),
]
env = {
env_vars = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
@@ -48,11 +48,11 @@ def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
check_restored_datadir_content(test_output_dir, env, pg)

View File

@@ -1,19 +1,23 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import PageserverPort, PostgresFactory, check_restored_datadir_content
from fixtures.zenith_fixtures import (ZenithEnv,
check_restored_datadir_content,
base_dir,
pg_distrib_dir)
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
base_dir, capsys, pageserver_port: PageserverPort):
def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_zenith_regress", "empty"])
env.zenith_cli(["branch", "test_zenith_regress", "empty"])
# Connect to postgres and create a database called "regression".
pg = postgres.create_start('test_zenith_regress')
pg = env.postgres.create_start('test_zenith_regress')
pg.safe_psql('CREATE DATABASE regression')
# Create some local directories for pg_regress to run in.
@@ -38,8 +42,8 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
'--inputdir={}'.format(src_path),
]
print(pg_regress_command)
env = {
log.info(pg_regress_command)
env_vars = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
@@ -49,11 +53,11 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver_port.pg)
check_restored_datadir_content(test_output_dir, env, pg)

View File

@@ -1,5 +1,3 @@
from pprint import pprint
import os
import re
import timeit
@@ -26,7 +24,6 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast
from typing_extensions import Literal
from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture)
"""
This file contains fixtures for micro-benchmarks.
@@ -34,11 +31,11 @@ To use, declare the 'zenbenchmark' fixture in the test function. Run the
benchmark, and then record the result by calling zenbenchmark.record. For example:
import timeit
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
def test_mybench(postgres: PostgresFactory, pageserver: ZenithPageserver, zenbenchmark):
def test_mybench(zenith_simple_env: ZenithEnv, zenbenchmark):
# Initialize the test
...
@@ -58,15 +55,9 @@ in the test initialization, or measure disk usage after the test query.
"""
# All the results are collected in this list, as a tuple:
# (test_name: str, metric_name: str, metric_value: float, unit: str)
#
# TODO: It would perhaps be better to store the results as additional
# properties in the pytest TestReport objects, to make them visible to
# other pytest tools.
global zenbenchmark_results
zenbenchmark_results = []
class ZenithBenchmarkResults:
""" An object for recording benchmark results. """
def __init__(self):
@@ -79,6 +70,11 @@ class ZenithBenchmarkResults:
self.results.append((test_name, metric_name, metric_value, unit))
# Will be recreated in each session.
zenbenchmark_results: ZenithBenchmarkResults = ZenithBenchmarkResults()
# Session scope fixture that initializes the results object
@pytest.fixture(autouse=True, scope='session')
def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
@@ -90,6 +86,7 @@ def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
yield zenbenchmark_results
class ZenithBenchmarker:
"""
An object for recording benchmark results. This is created for each test
@@ -105,7 +102,6 @@ class ZenithBenchmarker:
"""
self.results.record(self.request.node.name, metric_name, metric_value, unit)
@contextmanager
def record_duration(self, metric_name):
"""
@@ -136,9 +132,36 @@ class ZenithBenchmarker:
# The metric should be an integer, as it's a number of bytes. But in general
# all prometheus metrics are floats. So to be pedantic, read it as a float
# and round to integer.
matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$',
all_metrics,
re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))
def get_peak_mem(self, pageserver) -> int:
"""
Fetch the "maxrss" metric from the pageserver
"""
# Fetch all the exposed prometheus metrics from page server
all_metrics = pageserver.http_client().get_metrics()
# See comment in get_io_writes()
matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))
def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str):
"""
Calculate the on-disk size of a timeline
"""
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
totalbytes = 0
for root, dirs, files in os.walk(path):
for name in files:
totalbytes += os.path.getsize(os.path.join(root, name))
return totalbytes
@contextmanager
def record_pageserver_writes(self, pageserver, metric_name):
"""
@@ -148,7 +171,11 @@ class ZenithBenchmarker:
yield
after = self.get_io_writes(pageserver)
self.results.record(self.request.node.name, metric_name, round((after - before) / (1024 * 1024)), 'MB')
self.results.record(self.request.node.name,
metric_name,
round((after - before) / (1024 * 1024)),
'MB')
@pytest.fixture(scope='function')
def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
@@ -162,9 +189,7 @@ def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
# Hook to print the results at the end
@pytest.hookimpl(hookwrapper=True)
def pytest_terminal_summary(
terminalreporter: TerminalReporter, exitstatus: int, config: Config
):
def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config):
yield
global zenbenchmark_results
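Putting the pieces of this fixture together, a benchmark might look like the sketch below. It follows the docstring example near the top of the file; the branch name, workload, and metric names are illustrative:

from fixtures.zenith_fixtures import ZenithEnv

def test_mybench_writes(zenith_simple_env: ZenithEnv, zenbenchmark):
    env = zenith_simple_env
    env.zenith_cli(["branch", "test_mybench_writes", "empty"])
    pg = env.postgres.create_start('test_mybench_writes')

    # Record pageserver disk writes and wall-clock time for the same workload.
    with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
        with zenbenchmark.record_duration('insert'):
            pg.safe_psql('CREATE TABLE t AS SELECT generate_series(1, 100000)')

    # Peak pageserver memory, converted from kB to MB.
    zenbenchmark.record('peak_mem', zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB')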

View File

@@ -0,0 +1,45 @@
import logging
import logging.config
"""
This file configures logging to use in python tests.
Logs are automatically captured and shown in their
own section after all tests are executed.
To see logs for all (even successful) tests, run
pytest with the following command:
- `pipenv run pytest -n8 -rA`
Other log config can be set in pytest.ini file.
You can add `log_cli = true` to it to watch
logs in real time.
To get more info about logging with pytest, see
https://docs.pytest.org/en/6.2.x/logging.html
"""
# this config is only used for default log levels,
# log format is specified in pytest.ini file
LOGGING = {
"version": 1,
"loggers": {
"root": {
"level": "INFO"
},
"root.wal_acceptor_async": {
"level": "INFO" # a lot of logs on DEBUG level
}
}
}
def getLogger(name='root') -> logging.Logger:
"""Method to get logger for tests.
Should be used to get correctly initialized logger. """
return logging.getLogger(name)
# default logger for tests
log = getLogger()
logging.config.dictConfig(LOGGING)
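For reference, a minimal usage sketch of this helper; the `fixtures.log_helper` import path is taken from the test files later in this diff, and the test body itself is made up:

```python
from fixtures.log_helper import log, getLogger

def test_logging_example():
    # Shows up in pytest's captured-log section, or live when log_cli = true is set.
    log.info("starting test")
    # A named logger picks up the levels from the LOGGING dict above;
    # "root.wal_acceptor_async" stays at INFO, so this DEBUG message is suppressed.
    getLogger("root.wal_acceptor_async").debug("not shown")
```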

View File

@@ -2,6 +2,7 @@ import os
import subprocess
from typing import Any, List
from fixtures.log_helper import log
def get_self_dir() -> str:
@@ -21,7 +22,7 @@ def mkdir_if_needed(path: str) -> None:
assert os.path.isdir(path)
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
""" Run a process and capture its output
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
@@ -29,6 +30,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
counter.
If those files already exist, we will overwrite them.
Returns basepath for files with captured output.
"""
assert type(cmd) is list
base = os.path.basename(cmd[0]) + '_{}'.format(global_counter())
@@ -38,9 +40,11 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> None:
with open(stdout_filename, 'w') as stdout_f:
with open(stderr_filename, 'w') as stderr_f:
print('(capturing output to "{}.stdout")'.format(base))
log.info('(capturing output to "{}.stdout")'.format(base))
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
return basepath
_global_counter = 0
@@ -55,10 +59,13 @@ def global_counter() -> int:
_global_counter += 1
return _global_counter
def debug_print(*args, **kwargs) -> None:
""" Print to the console if TEST_DEBUG_PRINT is set in env.
All parameters are passed to print().
"""
if os.environ.get('TEST_DEBUG_PRINT') is not None:
print(*args, **kwargs)
def lsn_to_hex(num: int) -> str:
""" Convert lsn from int to standard hex notation. """
return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)
def lsn_from_hex(lsn_hex: str) -> int:
""" Convert lsn from hex notation to int. """
l, r = lsn_hex.split('/')
return (int(l, 16) << 32) + int(r, 16)
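A worked example of the two LSN helpers, with an arbitrarily chosen value (the `fixtures.utils` module path is assumed from the package layout used elsewhere in this diff):

```python
from fixtures.utils import lsn_to_hex, lsn_from_hex  # assumed import path

# 0x1_6B374D48 splits into a high 32-bit half (1) and a low 32-bit half (0x6B374D48).
lsn = (1 << 32) + 0x6B374D48
assert lsn_to_hex(lsn) == "1/6B374D48"
assert lsn_from_hex("1/6B374D48") == lsn  # round-trips back to the same integer
```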

File diff suppressed because it is too large

View File

@@ -1,21 +1,10 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
totalbytes = 0
for root, dirs, files in os.walk(path):
for name in files:
totalbytes += os.path.getsize(os.path.join(root, name))
if 'wal' in dirs:
dirs.remove('wal') # don't visit 'wal' subdirectory
return totalbytes
#
# Run bulk INSERT test.
@@ -25,17 +14,19 @@ def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
# 1. Time to INSERT 5 million rows
# 2. Disk writes
# 3. Disk space used
# 4. Peak memory usage
#
def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_bulk_insert", "empty"])
env.zenith_cli(["branch", "test_bulk_insert", "empty"])
pg = postgres.create_start('test_bulk_insert')
print("postgres is running on 'test_bulk_insert' branch")
pg = env.postgres.create_start('test_bulk_insert')
log.info("postgres is running on 'test_bulk_insert' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = pageserver.connect();
psconn = env.pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -47,14 +38,19 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg
cur.execute("create table huge (i int, j int);")
# Run INSERT, recording the time and I/O it takes
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('insert'):
cur.execute("insert into huge values (generate_series(1, 5000000), 0);")
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
# Record peak memory usage
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB')
# Report disk space used by the repository
timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
env.initial_tenant,
timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -0,0 +1,57 @@
import timeit
import pytest
from fixtures.zenith_fixtures import ZenithEnvBuilder
pytest_plugins = ("fixtures.benchmark_fixture")
# Run bulk tenant creation test.
#
# Collects metrics:
#
# 1. Time to create {1, 5, 10} tenants
# 2. Average creation time per tenant
@pytest.mark.parametrize('tenants_count', [1, 5, 10])
@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa'])
def test_bulk_tenant_create(
zenith_env_builder: ZenithEnvBuilder,
use_wal_acceptors: str,
tenants_count: int,
zenbenchmark,
):
"""Measure tenant creation time (with and without wal acceptors)"""
if use_wal_acceptors == 'with_wa':
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
time_slices = []
for i in range(tenants_count):
start = timeit.default_timer()
tenant = env.create_tenant()
env.zenith_cli([
"branch",
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
"main",
f"--tenantid={tenant}"
])
# FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now?
#if use_wal_acceptors == 'with_wa':
# wa_factory.start_n_new(3)
pg_tenant = env.postgres.create_start(
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
None, # branch name, None means same as node name
tenant,
)
end = timeit.default_timer()
time_slices.append(end - start)
pg_tenant.stop()
zenbenchmark.record('tenant_creation_time', sum(time_slices) / len(time_slices), 's')

View File

@@ -0,0 +1,57 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
#
# Test buffering GiST index build. It WAL-logs the whole relation in 32-page chunks.
# As of this writing, we duplicate those giant WAL records for each page,
# which makes the delta layer about 32x larger than it needs to be.
# (A rough arithmetic sketch of that 32x figure follows below.)
#
def test_gist_buffering_build(zenith_simple_env: ZenithEnv, zenbenchmark):
env = zenith_simple_env
# Create a branch for us
env.zenith_cli(["branch", "test_gist_buffering_build", "empty"])
pg = env.postgres.create_start('test_gist_buffering_build')
log.info("postgres is running on 'test_gist_buffering_build' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = env.pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create test table.
cur.execute("create table gist_point_tbl(id int4, p point)")
cur.execute(
"insert into gist_point_tbl select g, point(g, g) from generate_series(1, 1000000) g;"
)
# Build the index.
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('build'):
cur.execute(
"create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)"
)
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 1000000")
# Record peak memory usage
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB')
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
env.initial_tenant,
timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
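Rough arithmetic behind the "about 32x larger" remark in the header comment, as a hedged sketch; the 8 KB page size is the Postgres default, the 32-page chunk size comes from the comment, and the index page count is a made-up placeholder:

```python
PAGE_KB = 8          # Postgres default block size
CHUNK_PAGES = 32     # the buffering build WAL-logs the relation in 32-page chunks
index_pages = 5_000  # hypothetical GiST index size, in pages

needed_kb = index_pages * PAGE_KB     # each page logged once
written_kb = needed_kb * CHUNK_PAGES  # the chunk-sized record repeated for every page in it
print(f"~{needed_kb // 1024} MB needed vs ~{written_kb // 1024} MB written "
      f"({written_kb // needed_kb}x)")  # ~39 MB vs ~1250 MB, i.e. 32x
```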

View File

@@ -1,21 +1,10 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
totalbytes = 0
for root, dirs, files in os.walk(path):
for name in files:
totalbytes += os.path.getsize(os.path.join(root, name))
if 'wal' in dirs:
dirs.remove('wal') # don't visit 'wal' subdirectory
return totalbytes
#
# Run a very short pgbench test.
@@ -26,16 +15,17 @@ def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
# 2. Time to run 5000 pgbench transactions
# 3. Disk space used
#
def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_pgbench_perf", "empty"])
env.zenith_cli(["branch", "test_pgbench_perf", "empty"])
pg = postgres.create_start('test_pgbench_perf')
print("postgres is running on 'test_pgbench_perf' branch")
pg = env.postgres.create_start('test_pgbench_perf')
log.info("postgres is running on 'test_pgbench_perf' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = pageserver.connect();
psconn = env.pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -47,13 +37,13 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
connstr = pg.connstr()
# Initialize pgbench database, recording the time and I/O it takes
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('init'):
pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
# Run pgbench for 5000 transactions
with zenbenchmark.record_duration('5000_xacts'):
@@ -61,8 +51,8 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
# Flush the layers to disk again. This is *not* included in the reported time,
# though.
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
# Report disk space used by the repository
timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -0,0 +1,79 @@
# Demonstrate Write Amplification with the naive oldest-first layer checkpointing
# algorithm.
#
# In each iteration of the test, we create a new table that's slightly under 10
# MB in size (10 MB is the current "segment size" used by the page server). Then
# we make a tiny update to all the tables already created. This creates a WAL
# pattern where you have a lot of updates on one segment (the newly created
# one), alternating with small updates on all relations. This is the worst
# case scenario for the naive checkpointing policy where we write out the layers
# in LSN order, writing the oldest layer first. That causes a new 10 MB image
# layer to be created for each of those small updates. This is the Write
# Amplification problem at its finest.
# (A back-of-the-envelope estimate of the resulting write volume follows below.)
import os
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
def test_write_amplification(zenith_simple_env: ZenithEnv, zenbenchmark):
env = zenith_simple_env
# Create a branch for us
env.zenith_cli(["branch", "test_write_amplification", "empty"])
pg = env.postgres.create_start('test_write_amplification')
log.info("postgres is running on 'test_write_amplification' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = env.pageserver.connect()
pscur = psconn.cursor()
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('run'):
# NOTE: Because each iteration updates every table already created,
# the runtime and write amplification are O(n^2), where n is the
# number of iterations.
for i in range(25):
cur.execute(f'''
CREATE TABLE tbl{i} AS
SELECT g as i, 'long string to consume some space' || g as t
FROM generate_series(1, 100000) g
''')
cur.execute(f"create index on tbl{i} (i);")
for j in range(1, i):
cur.execute(f"delete from tbl{j} where i = {i}")
# Force checkpointing. As of this writing, we don't have
# a back-pressure mechanism, and the page server cannot
# keep up digesting and checkpointing the WAL at the
# rate that it is generated. If we don't force a
# checkpoint, the WAL will just accumulate in memory
# until you hit an OOM error. So in effect, we use much
# more memory to hold the incoming WAL, and write it
# out in larger batches than we'd really want. Using
# more memory hides the write amplification problem this
# test tries to demonstrate.
#
# The write amplification problem is real, and using
# more memory isn't the right solution. We could
# demonstrate the effect also by generating the WAL
# more slowly, adding some delays in this loop. But forcing
# the checkpointing and GC makes the test go faster,
# with the same total I/O effect.
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
env.initial_tenant,
timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
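A back-of-the-envelope estimate of the write volume the loop above provokes under the naive policy, using the numbers stated in the header comment (25 iterations, ~10 MB segments); this is an approximation, not measured output:

```python
SEGMENT_MB = 10   # "segment size" from the header comment
ITERATIONS = 25

# Iteration i makes tiny updates to roughly i previously created tables, and the
# naive oldest-first policy then rewrites a full ~10 MB image layer for each of them.
rewritten_mb = sum(i * SEGMENT_MB for i in range(ITERATIONS))
inserted_mb = ITERATIONS * SEGMENT_MB

print(f"~{rewritten_mb} MB rewritten for ~{inserted_mb} MB of new data "
      f"(~{rewritten_mb / inserted_mb:.0f}x write amplification)")  # ~3000 MB vs 250 MB, ~12x
```

The quadratic growth in `rewritten_mb` is the O(n^2) behaviour the NOTE in the loop refers to.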

View File

@@ -1,2 +1,5 @@
[pytest]
minversion = 6.0
log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
log_date_format = %Y-%m-%d %H:%M:%S
log_cli = true

View File

@@ -10,6 +10,7 @@ max-line-length = 100
[yapf]
based_on_style = pep8
column_limit = 100
split_all_top_level_comma_separated_values = true
[mypy]
# some tests don't typecheck when this flag is set
@@ -21,7 +22,11 @@ disallow_untyped_decorators = false
disallow_untyped_defs = false
strict = true
[mypy-psycopg2.*]
[mypy-asyncpg.*]
# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
ignore_missing_imports = true
[mypy-cached_property.*]
ignore_missing_imports = true
[mypy-pytest.*]

View File

@@ -1,6 +1,9 @@
import pytest
import os
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
"""
Use this test to see what happens when tests fail.
@@ -17,12 +20,14 @@ run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None,
@run_broken
def test_broken(zenith_cli, pageserver, postgres, pg_bin):
def test_broken(zenith_simple_env: ZenithEnv, pg_bin):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_broken", "empty"])
env.zenith_cli(["branch", "test_broken", "empty"])
postgres.create_start("test_broken")
print('postgres is running')
env.postgres.create_start("test_broken")
log.info('postgres is running')
print('THIS NEXT COMMAND WILL FAIL:')
log.info('THIS NEXT COMMAND WILL FAIL:')
pg_bin.run('pgbench -i_am_a_broken_test'.split())

Some files were not shown because too many files have changed in this diff