Compare commits


55 Commits

Author SHA1 Message Date
Patrick Insinger
32dd786650 pageserver - add naive layer IDs 2021-11-01 00:20:50 -07:00
Kirill Bulatov
e6ef27637b Better API to handle timeline metadata properly 2021-10-29 23:51:40 +03:00
Patrick Insinger
b532470792 Set SO_REUSEADDR for all TCP listeners 2021-10-29 12:45:26 -07:00
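SO_REUSEADDR lets a restarted server rebind its listen port even while old sockets linger in TIME_WAIT. The repository's own helper shows up later in this compare as zenith_utils::tcp_listener; the snippet below is only a rough sketch of the general pattern, assuming the socket2 crate (0.4-style API):

```rust
use socket2::{Domain, Protocol, Socket, Type};
use std::net::{SocketAddr, TcpListener};

// Sketch: bind a TCP listener with SO_REUSEADDR set, so a quick restart
// does not fail with "Address already in use".
fn bind_reuseaddr(addr: SocketAddr) -> std::io::Result<TcpListener> {
    let socket = Socket::new(Domain::IPV4, Type::STREAM, Some(Protocol::TCP))?;
    socket.set_reuse_address(true)?;
    socket.bind(&addr.into())?;
    socket.listen(128)?;
    Ok(socket.into())
}
```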
Heikki Linnakangas
e0d7ecf91c Refactor 'zenith' CLI subcommand handling
Also fixes 'zenith safekeeper restart -m immediate'. The stop-mode was
previously ignored.
2021-10-29 19:01:01 +03:00
Kirill Bulatov
edba2e9744 Use a proper extension for the readme file 2021-10-28 18:55:14 +03:00
Egor Suvorov
7e552b645f Add disk write/sync metrics to Safekeeper (#745) 2021-10-28 18:38:36 +03:00
anastasia
ea5900f155 Refactoring of checkpointer and GC.
Move them to a separate tenant_threads module to disentangle thread management from the LayeredRepository implementation.
2021-10-27 20:50:26 +03:00
anastasia
28ab40c8b7 fix init_repo() call in register_relish_download() 2021-10-27 20:50:26 +03:00
Alexey Kondratov
d423142623 Proxy: wait for kick on .pgpass connection (zenithdb/console#227) 2021-10-27 20:24:23 +03:00
Dmitry Rodionov
1c0e85f9a0 review cleanups 2021-10-27 13:30:34 +03:00
Dmitry Rodionov
5bc09074ea add a flag to avoid non incremental size calculation in pageserver http api
This calculation is not that heavy, but it is only needed in tests, and
when the number of tenants/timelines is high it can take noticeable
time.

Resolves https://github.com/zenithdb/zenith/issues/804
2021-10-27 13:30:34 +03:00
Heikki Linnakangas
1fac4a3c91 Fix a few messages.
Pointed out by Egor in https://github.com/zenithdb/zenith/pull/788,
but I accidentally pushed that before fixing these.
2021-10-27 10:58:21 +03:00
Heikki Linnakangas
1bc917324d Use -m immediate for 'immediate' shutdown 2021-10-27 10:49:38 +03:00
Heikki Linnakangas
af429fb401 Improve 'zenith' CLI utility for safekeepers and a config file.
The 'zenith' CLI utility can now be used to launch safekeepers. By
default, one safekeeper is configured. There are new 'safekeeper
start/stop' subcommands to manage the safekeepers. Each safekeeper is
given a name that can be used to identify the safekeeper to start/stop
with the 'zenith start/stop' commands. The safekeeper data is stored
in '.zenith/safekeepers/<name>'.

The 'zenith start' command now starts the pageserver and also all
safekeepers. 'zenith stop' stops pageserver, all safekeepers, and all
postgres nodes.

Introduce new 'zenith pageserver start/stop' subcommands for
starting/stopping just the page server.

The biggest change here is to the 'zenith init' command. This adds a
new 'zenith init --config=<path to toml file>' option. It takes a toml
config file that describes the environment. In the config file, you
can specify options for the pageserver, like the pg and http ports,
and authentication. For each safekeeper, you can define a name and the
pg and http ports. If you don't use the --config option, you get a
default configuration with a pageserver and one safekeeper. Note that
that's different from the previous default of no safekeepers.  Any
fields that are omitted in the configuration file are filled with
defaults. You can also specify the initial tenant ID in the config
file. A couple of sample config files are added in the control_plane/
directory.

The --pageserver-pg-port, --pageserver-http-port, and
--pageserver-auth options to 'zenith init' are removed. Use a config
file instead.

Finally, change the python test fixtures to use the new 'zenith'
commands and the config file to describe the environment.
2021-10-27 10:49:38 +03:00
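To illustrate the "omitted fields are filled with defaults" behaviour described above, here is a small, self-contained sketch using serde's `#[serde(default)]`. The struct and field names here are only illustrative; the real ones are LocalEnv, PageServerConf and SafekeeperConf in control_plane, shown further down in this compare.

```rust
use serde::Deserialize;

#[derive(Deserialize, Debug, Default)]
#[serde(default)]
struct PageserverSection {
    pg_port: u16,
    http_port: u16,
    auth_type: String,
}

#[derive(Deserialize, Debug, Default)]
#[serde(default)]
struct EnvConfig {
    pageserver: PageserverSection,
    safekeepers: Vec<toml::Value>,
}

fn main() -> Result<(), toml::de::Error> {
    // http_port, auth_type and safekeepers are omitted and get default values.
    let cfg: EnvConfig = toml::from_str("[pageserver]\npg_port = 64000\n")?;
    println!("{:?}", cfg);
    Ok(())
}
```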
Heikki Linnakangas
710fe02d0b Return success on 'zenith stop' if the page server is already stopped. 2021-10-27 01:10:24 +03:00
Heikki Linnakangas
de87aad990 Remove a few unused functions 2021-10-27 01:10:24 +03:00
Heikki Linnakangas
41d48719e1 In python tests, skip ports that are already in use.
We've seen some failures with "Address already in use" errors in the
tests. It's not clear why; perhaps some server processes are not cleaned
up properly after a test, or maybe the socket is still in the TIME_WAIT
state. In any case, let's make the tests more robust by checking that
the port is free before trying to use it.
2021-10-27 00:46:24 +03:00
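The change itself is in the Python test fixtures, but the underlying check is just a probe-bind. A sketch of the same idea in Rust (note the inherent race: the port can still be taken between the probe and its real use):

```rust
use std::net::{SocketAddr, TcpListener};

// Return the first port in the range that we can currently bind on
// localhost, skipping ports that are already in use.
fn find_free_port(ports: std::ops::Range<u16>) -> Option<u16> {
    ports.into_iter().find(|port| {
        TcpListener::bind(SocketAddr::from(([127, 0, 0, 1], *port))).is_ok()
    })
}
```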
Kirill Bulatov
d88377f9f0 Remove log from zenith_utils 2021-10-26 23:24:11 +03:00
Kirill Bulatov
ecd577c934 Simplify tracing declarations 2021-10-26 23:24:11 +03:00
anastasia
f43f8401ee Don't wait for wal-redo process for non-relational records replay 2021-10-26 19:30:28 +03:00
Arseny Sher
1877bbc7cb bump vendor/postgres to fix reconnection busy loop 2021-10-26 15:43:19 +03:00
Heikki Linnakangas
a064ebb64c Cope with missing 'tenantid' in '.zenith/config' file.
We generate the initial tenantid and store it in the file, so it shouldn't
be missing, but let's cope with it anyway. (This comes in handy with the bigger
changes I'm working on at https://github.com/zenithdb/zenith/pull/788)
2021-10-25 21:24:11 +03:00
Heikki Linnakangas
4726870e8d Remove obsolete comment.
We store the pageserver port in the .zenith/config file.
2021-10-25 21:16:58 +03:00
Heikki Linnakangas
3bbc106c70 Prefer long CLI option name for clarity. 2021-10-25 21:16:58 +03:00
Heikki Linnakangas
66eb081876 Improve comment on 'base_dir' 2021-10-25 21:16:58 +03:00
Kirill Bulatov
f291ab2b87 Do not panic on missing tenant 2021-10-25 18:36:30 +03:00
Heikki Linnakangas
66ec135676 Refactor pytest fixtures
Instead of having a lot of separate fixtures for setting up the page
server, the compute nodes, the safekeepers etc., have one big ZenithEnv
object that encapsulates the whole environment. Every test either uses
the shared "zenith_simple_env" fixture, which contains the default setup
of a pageserver with no authentication and no safekeepers, or, if it
wants to use safekeepers or authentication, sets up a custom
test-specific ZenithEnv fixture.

Gathering information about the whole environment into one object makes
some things simpler. For example, when a new compute node is created,
you no longer need to pass the 'wal_acceptors' connection string as
argument to the 'postgres.create_start' function. The 'create_start'
function fetches that information directly from the ZenithEnv object.
2021-10-25 14:14:47 +03:00
Heikki Linnakangas
28af3e5008 Remove some unnecessary fixture arguments 2021-10-25 14:14:45 +03:00
Heikki Linnakangas
f337d73a6c Rearrange output dirs a bit
Each test now gets its own test output directory, like
'test_output/test_foobar', even when TEST_SHARED_FIXTURES is used.
When TEST_SHARED_FIXTURES is not used, the zenith repo for each test
is created under a 'repo' subdir inside the test output dir, e.g.
'test_output/test_foobar/repo'
2021-10-25 14:14:43 +03:00
Heikki Linnakangas
57ce541521 Remove unnecessary 'pg_bin' object from 'postgres' fixture.
It was only used in check_restored_datadir_content(), and that function
can construct it easily from the other information it has.
2021-10-25 14:14:41 +03:00
Heikki Linnakangas
e14f24034f Turn a few path-fixtures to global variables
This way, they're readily accessible from the classes and functions
that are not themselves fixtures
2021-10-25 14:14:38 +03:00
Kirill Bulatov
04fb0a0342 Add core relish backup and restore functionality 2021-10-22 22:22:38 +03:00
Heikki Linnakangas
8c42dcc041 Fix safekeeper -D option.
The -D option to specify working directory was broken:

    $ mkdir foobar
    $ ./target/debug/safekeeper -D foobar
    Error: failed to open "foobar/safekeeper.log"

    Caused by:
        No such file or directory (os error 2)

This was because we both chdir'd into the specified directory and also
prepended the directory to all the paths. So in the above example, it
actually tried to create the log file in "foobar/foobar/safekeeper.log".
Change it to work the same way as in the pageserver: chdir to the
specified directory, and leave 'workdir' always set to ".".

We wouldn't necessarily need the 'workdir' variable in the config at all,
and could assume that the current working directory is always the
safekeeper data directory, but I'd like to keep this consistent with the
pageserver. The page server doesn't assume that, for the sake of unit
tests. We don't currently have unit tests in the safekeeper that write
to disk, but we might want to in the future.
2021-10-22 08:39:58 +03:00
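The convention described above boils down to: chdir once into the -D directory, then treat "." as the working directory everywhere else, so nothing prepends the data directory twice. A tiny sketch of that convention (names are illustrative, not the actual safekeeper code):

```rust
use std::env;
use std::io;
use std::path::{Path, PathBuf};

// chdir into the data directory given via -D, then report "." as workdir so
// relative paths like "safekeeper.log" resolve inside it exactly once.
fn enter_workdir(data_dir: &Path) -> io::Result<PathBuf> {
    env::set_current_dir(data_dir)?;
    Ok(PathBuf::from("."))
}
```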
Alexey Kondratov
9070a4dc02 Turn off back pressure by default 2021-10-22 01:40:43 +03:00
Egor Suvorov
86a28458c6 test_runner: use Python 3.7 in CI and improve its support (#775)
* We actually need Python 3.7 because of dataclasses
* Rerun 'pipenv lock' under Python 3.7 and add 'pipenv' to dev deps
* Update docs on developing for Python 3.7
* CircleCI: use Python 3.7 via Docker image instead of Orb
2021-10-21 20:01:29 +03:00
Egor Suvorov
c058d04250 Rename WalAcceptor to Safekeeper in most places (#741) 2021-10-21 18:26:43 +03:00
Konstantin Knizhnik
c310932121 Implement backpressure for compute node to avoid WAL overflow
Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
2021-10-21 18:15:50 +03:00
Egor Suvorov
ff563ff080 test_runner: fix mypy errors and force it on CI (#774)
* Fix bugs found by mypy
* Add some missing types and runtime checks, remove unused code
* Make ZenithPageserver start right away for better type safety
* Add `types-*` packages to Pipfile
* Pin mypy version and run it on CircleCI
2021-10-21 13:51:54 +03:00
anastasia
7f9d2a7d05 Change 'zenith tenant list' API to return tenant state added in 0dc7a3fc 2021-10-21 11:04:22 +03:00
Arthur Petukhovsky
13f4e173c9 Wait for safekeepers to catch up in test_restarts_under_load (#776) 2021-10-20 14:42:53 +03:00
Dmitry Ivanov
85116a8375 [proxy] Prevent TLS stream from hanging
This change causes writer halves of a TLS stream to always flush after a
portion of bytes has been written by `std::io::copy`. Furthermore, some
cosmetic and minor functional changes are made to facilitate debugging.
2021-10-20 14:15:49 +03:00
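The description above amounts to wrapping the writer half so that every chunk `std::io::copy` writes is pushed through the TLS layer immediately rather than sitting in a buffer. A sketch of that pattern (type names are illustrative, not the proxy's actual code):

```rust
use std::io::{self, Write};

// Writer wrapper that flushes after every write, so data copied with
// std::io::copy cannot get stuck in the underlying (TLS) buffer.
struct FlushingWriter<W: Write>(W);

impl<W: Write> Write for FlushingWriter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let n = self.0.write(buf)?;
        self.0.flush()?;
        Ok(n)
    }

    fn flush(&mut self) -> io::Result<()> {
        self.0.flush()
    }
}
```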
Egor Suvorov
e42c884c2b test_runner/README: add note on capturing logs (#778)
Became relevant after #674
2021-10-20 01:55:49 +03:00
Egor Suvorov
eb706bc9f4 Force yapf (Python code formatter) in CI (#772)
* Add yapf run to CircleCI
* Pin yapf version
* Enable `SPLIT_ALL_TOP_LEVEL_COMMA_SEPARATED_VALUES` setting
* Reformat all existing code with slight manual adjustments
* test_runner/README: note that yapf is forced
2021-10-19 20:13:47 +03:00
Dmitry Rodionov
798df756de suppress FileNotFound exception instead of using missing_ok=True, because the latter was added in Python 3.8 and we claim to support >3.6 2021-10-19 17:13:42 +03:00
Dmitry Rodionov
732d13fe06 use cached-property package because python<3.8 doesn't have cached_property in functools 2021-10-19 17:13:42 +03:00
Heikki Linnakangas
feae7f39c1 Support read-only nodes
Change 'zenith.signal' file to a human-readable format, similar to
backup_label. It can contain a "PREV LSN: %X/%X" line, or a special
value to indicate that it's OK to start with invalid LSN ('none'), or
that it's a read-only node and generating WAL is forbidden
('invalid').

The 'zenith pg create' and 'zenith pg start' commands now take a node
name parameter, separate from the branch name. If the node name is not
given, it defaults to the branch name, so this doesn't break existing
scripts.

If you pass "foo@<lsn>" as the branch name, a read-only node anchored
at that LSN is created. The anchoring is performed by setting the
'recovery_target_lsn' option in the postgresql.conf file, and putting
the server into standby mode with 'standby.signal'.

We no longer store the synthetic checkpoint record in the WAL segment.
The postgres startup code has been changed to use the copy of the
checkpoint record in the pg_control file, when starting in zenith
mode.
2021-10-19 09:48:12 +03:00
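The writer side of the new human-readable 'zenith.signal' format appears further down in this compare (basebackup.rs). The reader side lives in the patched postgres startup code, so the following is only an illustration of how the three cases described above could be parsed:

```rust
// "PREV LSN: none"    -> OK to start with an invalid prev LSN
// "PREV LSN: invalid" -> read-only node, generating WAL is forbidden
// "PREV LSN: X/Y"     -> known prev LSN, printed as two hex halves
enum PrevLsn {
    None,
    Invalid,
    Known(u64),
}

fn parse_zenith_signal(contents: &str) -> Option<PrevLsn> {
    let rest = contents.trim().strip_prefix("PREV LSN:")?.trim();
    match rest {
        "none" => Some(PrevLsn::None),
        "invalid" => Some(PrevLsn::Invalid),
        lsn => {
            let (hi, lo) = lsn.split_once('/')?;
            let hi = u64::from_str_radix(hi, 16).ok()?;
            let lo = u64::from_str_radix(lo, 16).ok()?;
            Some(PrevLsn::Known((hi << 32) | lo))
        }
    }
}
```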
Heikki Linnakangas
c2b468c958 Separate node name from the branch name in ComputeControlPlane
This is in preparation for supporting read-only nodes. You can launch
multiple read-only nodes on the same branch, so we need an identifier
for each node, separate from the branch name.
2021-10-19 09:48:10 +03:00
Heikki Linnakangas
e272a380b4 On new repo, start writing WAL only after the initial checkpoint record.
Previously, the first WAL record on the 'main' branch overwrote the
initial checkpoint record, with invalid 'xl_prev'. That's harmless, but
also pretty ugly. I bumped into this while I was trying to tighten up the
checks for when a valid 'prev_lsn' is required. With this patch, the
first WAL record gets a valid 'xl_prev' value. It doesn't matter much
currently, but let's be tidy.
2021-10-19 09:48:04 +03:00
anastasia
0dc7a3fc15 Change tenant_mgr to use TenantState.
This avoids locking the entire TENANTS list while one tenant is bootstrapping,
and prepares the code for remote storage integration.
2021-10-18 15:40:06 +03:00
Egor Suvorov
a1bc0ada59 Dockerfile: remove wal_acceptor alias for safekeeper (#743) 2021-10-18 14:56:30 +03:00
Kirill Bulatov
e9b5224a8a Fix toml serde gotchas 2021-10-18 14:14:27 +03:00
Heikki Linnakangas
bdd039a9ee S3 DELETE call returns 204, not 200.
According to the S3 API docs, the DELETE call returns code "204 No content"
on success.
2021-10-17 16:21:58 +03:00
Heikki Linnakangas
b405eef324 Avoid writing the metadata file when it hasn't changed. 2021-10-17 14:54:39 +03:00
Kirill Bulatov
ba557d126b React on sigint 2021-10-15 21:24:24 +03:00
Patrick Insinger
2dde20a227 Bump MSRV to 1.55 2021-10-15 09:10:08 -07:00
109 changed files with 7451 additions and 2872 deletions

View File

@@ -1,13 +1,13 @@
version: 2.1
orbs:
python: circleci/python@1.4.0
executors:
zenith-build-executor:
resource_class: xlarge
docker:
- image: cimg/rust:1.52.1
- image: cimg/rust:1.55.0
zenith-python-executor:
docker:
- image: cimg/python:3.7.10 # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CircleCI
jobs:
check-codestyle:
@@ -110,7 +110,7 @@ jobs:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
- v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
# Build the rust code, including test binaries
- run:
@@ -128,7 +128,7 @@ jobs:
- save_cache:
name: Save rust cache
key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
paths:
- ~/.cargo/registry
- ~/.cargo/git
@@ -182,9 +182,27 @@ jobs:
paths:
- "*"
check-python:
executor: zenith-python-executor
steps:
- checkout
- run:
name: Install deps
working_directory: test_runner
command: pipenv --python 3.7 install --dev
- run:
name: Run yapf to ensure code format
when: always
working_directory: test_runner
command: pipenv run yapf --recursive --diff .
- run:
name: Run mypy to check types
when: always
working_directory: test_runner
command: pipenv run mypy .
run-pytest:
#description: "Run pytest"
executor: python/default
executor: zenith-python-executor
parameters:
# pytest args to specify the tests to run.
#
@@ -219,11 +237,9 @@ jobs:
steps:
- run: git submodule update --init --depth 1
- run:
name: Install pipenv & deps
name: Install deps
working_directory: test_runner
command: |
pip install pipenv
pipenv install
command: pipenv --python 3.7 install
- run:
name: Run pytest
working_directory: test_runner
@@ -241,8 +257,6 @@ jobs:
if << parameters.run_in_parallel >>; then
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
fi;
./netstat-script.sh &
NS_PID=$!
# Run the tests.
#
# The junit.xml file allows CircleCI to display more fine-grained test information
@@ -254,8 +268,6 @@ jobs:
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
kill $NS_PID
awk '/===/ {if (count) print count; print; count=0; next} {count++} END {print count}' $TEST_OUTPUT/netstat.stdout > $TEST_OUTPUT/netstat_stats.stdout
- run:
# CircleCI artifacts are preserved one file at a time, so skipping
# this step isn't a good idea. If you want to extract the
@@ -337,6 +349,7 @@ workflows:
build_and_test:
jobs:
- check-codestyle
- check-python
- build-postgres:
name: build-postgres-<< matrix.build_type >>
matrix:

Cargo.lock (generated)

File diff suppressed because it is too large.

View File

@@ -38,8 +38,6 @@ RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
# TODO: temporary alias for compatibility, see https://github.com/zenithdb/zenith/pull/740
RUN ln -s /usr/local/bin/safekeeper /usr/local/bin/wal_acceptor
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install postgres_install
COPY docker-entrypoint.sh /docker-entrypoint.sh

View File

@@ -82,8 +82,6 @@ RUN apk add --update openssl build-base libseccomp-dev
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
# TODO: temporary alias for compatibility, see https://github.com/zenithdb/zenith/pull/740
RUN ln -s /usr/local/bin/safekeeper /usr/local/bin/wal_acceptor
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY docker-entrypoint.sh /docker-entrypoint.sh

View File

@@ -28,12 +28,12 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec
libssl-dev clang pkg-config libpq-dev
```
[Rust] 1.52 or later is also required.
[Rust] 1.55 or later is also required.
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
To run the integration tests (not required to use the code), install
Python (3.6 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
Python (3.7 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
2. Build zenith and patched postgres
```sh
@@ -47,17 +47,26 @@ make -j5
# Create repository in .zenith with proper paths to binaries and data
# Later that would be responsibility of a package install script
> ./target/debug/zenith init
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
created main branch
pageserver init succeeded
# start pageserver
# start pageserver and safekeeper
> ./target/debug/zenith start
Starting pageserver at '127.0.0.1:64000' in .zenith
Starting pageserver at 'localhost:64000' in '.zenith'
Pageserver started
initializing for single for 7676
Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
Safekeeper started
# start postgres on top on the pageserver
# start postgres compute node
> ./target/debug/zenith pg start main
Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
Starting new postgres main on main...
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
waiting for server to start.... done
server started
# check list of running postgres instances
> ./target/debug/zenith pg list
@@ -108,10 +117,9 @@ postgres=# insert into t values(2,2);
INSERT 0 1
```
6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started:
6. If you want to run tests afterwards (see below), you have to stop all of the pageserver, safekeeper and postgres instances
you have just started. You can stop them all with one command:
```sh
> ./target/debug/zenith pg stop migration_check
> ./target/debug/zenith pg stop main
> ./target/debug/zenith stop
```

View File

@@ -18,7 +18,7 @@ regex = "1"
anyhow = "1.0"
thiserror = "1"
bytes = "1.0.1"
nix = "0.20"
nix = "0.23"
url = "2.2.2"
hex = { version = "0.4.3", features = ["serde"] }
reqwest = { version = "0.11", features = ["blocking", "json"] }

View File

@@ -0,0 +1,20 @@
# Page server and three safekeepers.
[pageserver]
pg_port = 64000
http_port = 9898
auth_type = 'Trust'
[[safekeepers]]
name = 'sk1'
pg_port = 5454
http_port = 7676
[[safekeepers]]
name = 'sk2'
pg_port = 5455
http_port = 7677
[[safekeepers]]
name = 'sk3'
pg_port = 5456
http_port = 7678

control_plane/simple.conf (new file)
View File

@@ -0,0 +1,11 @@
# Minimal zenith environment with one safekeeper. This is equivalent to the built-in
# defaults that you get with no --config
[pageserver]
pg_port = 64000
http_port = 9898
auth_type = 'Trust'
[[safekeepers]]
name = 'single'
pg_port = 5454
http_port = 7676

View File

@@ -39,8 +39,6 @@ impl ComputeControlPlane {
// | |- <tenant_id>
// | | |- <branch name>
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
// TODO: since pageserver do not have config file yet we believe here that
// it is running on default port. Change that when pageserver will have config.
let pageserver = Arc::new(PageServerNode::from_env(&env));
let mut nodes = BTreeMap::default();
@@ -75,40 +73,59 @@ impl ComputeControlPlane {
.unwrap_or(self.base_port)
}
pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
ComputeControlPlane {
base_port: 65431,
pageserver: Arc::clone(pageserver),
nodes: BTreeMap::new(),
env: local_env.clone(),
// FIXME: see also parse_point_in_time in branches.rs.
fn parse_point_in_time(
&self,
tenantid: ZTenantId,
s: &str,
) -> Result<(ZTimelineId, Option<Lsn>)> {
let mut strings = s.split('@');
let name = strings.next().unwrap();
let lsn: Option<Lsn>;
if let Some(lsnstr) = strings.next() {
lsn = Some(
Lsn::from_str(lsnstr)
.with_context(|| "invalid LSN in point-in-time specification")?,
);
} else {
lsn = None
}
// Resolve the timeline ID, given the human-readable branch name
let timeline_id = self
.pageserver
.branch_get_by_name(&tenantid, name)?
.timeline_id;
Ok((timeline_id, lsn))
}
pub fn new_node(
&mut self,
tenantid: ZTenantId,
branch_name: &str,
name: &str,
timeline_spec: &str,
port: Option<u16>,
) -> Result<Arc<PostgresNode>> {
let timeline_id = self
.pageserver
.branch_get_by_name(&tenantid, branch_name)?
.timeline_id;
// Resolve the human-readable timeline spec into timeline ID and LSN
let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?;
let port = port.unwrap_or_else(|| self.get_port());
let node = Arc::new(PostgresNode {
name: branch_name.to_owned(),
name: name.to_owned(),
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
is_test: false,
timelineid: timeline_id,
timelineid,
lsn,
tenantid,
uses_wal_proposer: false,
});
node.create_pgdata()?;
node.setup_pg_conf(self.env.auth_type)?;
node.setup_pg_conf(self.env.pageserver.auth_type)?;
self.nodes
.insert((tenantid, node.name.clone()), Arc::clone(&node));
@@ -127,6 +144,7 @@ pub struct PostgresNode {
pageserver: Arc<PageServerNode>,
is_test: bool,
pub timelineid: ZTimelineId,
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
pub tenantid: ZTenantId,
uses_wal_proposer: bool,
}
@@ -161,9 +179,12 @@ impl PostgresNode {
let port: u16 = conf.parse_field("port", &context)?;
let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
let uses_wal_proposer = conf.get("wal_acceptors").is_some();
// parse recovery_target_lsn, if any
let recovery_target_lsn: Option<Lsn> =
conf.parse_field_optional("recovery_target_lsn", &context)?;
// ok now
Ok(PostgresNode {
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
@@ -172,12 +193,13 @@ impl PostgresNode {
pageserver: Arc::clone(pageserver),
is_test: false,
timelineid,
lsn: recovery_target_lsn,
tenantid,
uses_wal_proposer,
})
}
fn sync_walkeepers(&self) -> Result<Lsn> {
fn sync_safekeepers(&self) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir().join("postgres");
let sync_handle = Command::new(pg_path)
.arg("--sync-safekeepers")
@@ -202,7 +224,7 @@ impl PostgresNode {
}
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
println!("Walkeepers synced on {}", lsn);
println!("Safekeepers synced on {}", lsn);
Ok(lsn)
}
@@ -233,7 +255,7 @@ impl PostgresNode {
// Read the archive directly from the `CopyOutReader`
tar::Archive::new(copyreader)
.unpack(&self.pgdata())
.with_context(|| "extracting page backup failed")?;
.with_context(|| "extracting base backup failed")?;
Ok(())
}
@@ -301,11 +323,30 @@ impl PostgresNode {
conf.append("zenith.page_server_connstring", &pageserver_connstr);
conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
if let Some(lsn) = self.lsn {
conf.append("recovery_target_lsn", &lsn.to_string());
}
conf.append_line("");
// Configure the node to stream WAL directly to the pageserver
conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg?
conf.append("zenith.callmemaybe_connstring", &self.connstr());
if !self.env.safekeepers.is_empty() {
// Configure the node to connect to the safekeepers
conf.append("synchronous_standby_names", "walproposer");
let wal_acceptors = self
.env
.safekeepers
.iter()
.map(|sk| format!("localhost:{}", sk.pg_port))
.collect::<Vec<String>>()
.join(",");
conf.append("wal_acceptors", &wal_acceptors);
} else {
// Configure the node to stream WAL directly to the pageserver
// This isn't really a supported configuration, but can be useful for
// testing.
conf.append("synchronous_standby_names", "pageserver");
conf.append("zenith.callmemaybe_connstring", &self.connstr());
}
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
file.write_all(conf.to_string().as_bytes())?;
@@ -314,12 +355,14 @@ impl PostgresNode {
}
fn load_basebackup(&self) -> Result<()> {
let lsn = if self.uses_wal_proposer {
let backup_lsn = if let Some(lsn) = self.lsn {
Some(lsn)
} else if self.uses_wal_proposer {
// LSN 0 means that it is bootstrap and we need to download just
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
// procedure evolves quite actively right now, so let's think about it again
// when things would be more stable (TODO).
let lsn = self.sync_walkeepers()?;
let lsn = self.sync_safekeepers()?;
if lsn == Lsn(0) {
None
} else {
@@ -329,7 +372,7 @@ impl PostgresNode {
None
};
self.do_basebackup(lsn)?;
self.do_basebackup(backup_lsn)?;
Ok(())
}
@@ -406,6 +449,10 @@ impl PostgresNode {
// 3. Load basebackup
self.load_basebackup()?;
if self.lsn.is_some() {
File::create(self.pgdata().join("standby.signal"))?;
}
// 4. Finally start the compute node postgres
println!("Starting postgres node at '{}'", self.connstr());
self.pg_ctl(&["start"], auth_token)

View File

@@ -13,6 +13,7 @@ use std::path::Path;
pub mod compute;
pub mod local_env;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod storage;
/// Read a PID file

View File

@@ -7,46 +7,102 @@
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::env;
use std::fmt::Write;
use std::fs;
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::ZTenantId;
//
// This data structures represent deserialized zenith CLI config
// This data structure represents the zenith CLI config
//
// It is deserialized from the .zenith/config file, or the config file passed
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
// an example.
//
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct LocalEnv {
// Pageserver connection settings
pub pageserver_pg_port: u16,
pub pageserver_http_port: u16,
// Base directory for both pageserver and compute nodes
// Base directory for all the nodes (the pageserver, safekeepers and
// compute nodes).
//
// This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the ZENITH_REPO_DIR env variable or
// '.zenith' if not given.
#[serde(skip)]
pub base_data_dir: PathBuf,
// Path to postgres distribution. It's expected that "bin", "include",
// "lib", "share" from postgres distribution are there. If at some point
// in time we will be able to run against vanilla postgres we may split that
// to four separate paths and match OS-specific installation layout.
#[serde(default)]
pub pg_distrib_dir: PathBuf,
// Path to pageserver binary.
#[serde(default)]
pub zenith_distrib_dir: PathBuf,
// keeping tenant id in config to reduce copy paste when running zenith locally with single tenant
#[serde(with = "hex")]
pub tenantid: ZTenantId,
// Default tenant ID to use with the 'zenith' command line utility, when
// --tenantid is not explicitly specified.
#[serde(with = "opt_tenantid_serde")]
#[serde(default)]
pub default_tenantid: Option<ZTenantId>,
// jwt auth token used for communication with pageserver
pub auth_token: String,
// used to issue tokens during e.g pg start
#[serde(default)]
pub private_key_path: PathBuf,
pub pageserver: PageServerConf,
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(default)]
pub struct PageServerConf {
// Pageserver connection settings
pub pg_port: u16,
pub http_port: u16,
// used to determine which auth type is used
pub auth_type: AuthType,
// used to issue tokens during e.g pg start
pub private_key_path: PathBuf,
// jwt auth token used for communication with pageserver
pub auth_token: String,
}
impl Default for PageServerConf {
fn default() -> Self {
Self {
pg_port: 0,
http_port: 0,
auth_type: AuthType::Trust,
auth_token: "".to_string(),
}
}
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(default)]
pub struct SafekeeperConf {
pub name: String,
pub pg_port: u16,
pub http_port: u16,
pub sync: bool,
}
impl Default for SafekeeperConf {
fn default() -> Self {
Self {
name: "".to_string(),
pg_port: 0,
http_port: 0,
sync: true,
}
}
}
impl LocalEnv {
@@ -62,6 +118,10 @@ impl LocalEnv {
Ok(self.zenith_distrib_dir.join("pageserver"))
}
pub fn safekeeper_bin(&self) -> Result<PathBuf> {
Ok(self.zenith_distrib_dir.join("safekeeper"))
}
pub fn pg_data_dirs_path(&self) -> PathBuf {
self.base_data_dir.join("pgdatadirs").join("tenants")
}
@@ -76,6 +136,187 @@ impl LocalEnv {
pub fn pageserver_data_dir(&self) -> PathBuf {
self.base_data_dir.clone()
}
pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf {
self.base_data_dir.join("safekeepers").join(node_name)
}
/// Create a LocalEnv from a config file.
///
/// Unlike 'load_config', this function fills in any defaults that are missing
/// from the config file.
pub fn create_config(toml: &str) -> Result<LocalEnv> {
let mut env: LocalEnv = toml::from_str(toml)?;
// Find postgres binaries.
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
if env.pg_distrib_dir == Path::new("") {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
env.pg_distrib_dir = postgres_bin.into();
} else {
let cwd = env::current_dir()?;
env.pg_distrib_dir = cwd.join("tmp_install")
}
}
if !env.pg_distrib_dir.join("bin/postgres").exists() {
anyhow::bail!(
"Can't find postgres binary at {}",
env.pg_distrib_dir.display()
);
}
// Find zenith binaries.
if env.zenith_distrib_dir == Path::new("") {
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
if !env.zenith_distrib_dir.join("pageserver").exists() {
anyhow::bail!("Can't find pageserver binary.");
}
if !env.zenith_distrib_dir.join("safekeeper").exists() {
anyhow::bail!("Can't find safekeeper binary.");
}
// If no initial tenant ID was given, generate it.
if env.default_tenantid.is_none() {
env.default_tenantid = Some(ZTenantId::generate());
}
env.base_data_dir = base_path();
Ok(env)
}
/// Locate and load config
pub fn load_config() -> Result<LocalEnv> {
let repopath = base_path();
if !repopath.exists() {
anyhow::bail!(
"Zenith config is not found in {}. You need to run 'zenith init' first",
repopath.to_str().unwrap()
);
}
// TODO: check that it looks like a zenith repository
// load and parse file
let config = fs::read_to_string(repopath.join("config"))?;
let mut env: LocalEnv = toml::from_str(config.as_str())?;
env.base_data_dir = repopath;
Ok(env)
}
// This function is used only for testing purposes in the CLI, e.g. to generate tokens during init
pub fn generate_auth_token(&self, claims: &Claims) -> Result<String> {
let private_key_path = if self.private_key_path.is_absolute() {
self.private_key_path.to_path_buf()
} else {
self.base_data_dir.join(&self.private_key_path)
};
let key_data = fs::read(private_key_path)?;
encode_from_key_file(claims, &key_data)
}
//
// Initialize a new Zenith repository
//
pub fn init(&mut self) -> Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
if base_path == Path::new("") {
anyhow::bail!("repository base path is missing");
}
if base_path.exists() {
anyhow::bail!(
"directory '{}' already exists. Perhaps already initialized?",
base_path.to_str().unwrap()
);
}
fs::create_dir(&base_path)?;
// generate keys for jwt
// openssl genrsa -out private_key.pem 2048
let private_key_path;
if self.private_key_path == PathBuf::new() {
private_key_path = base_path.join("auth_private_key.pem");
let keygen_output = Command::new("openssl")
.arg("genrsa")
.args(&["-out", private_key_path.to_str().unwrap()])
.arg("2048")
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
self.private_key_path = Path::new("auth_private_key.pem").to_path_buf();
let public_key_path = base_path.join("auth_public_key.pem");
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
let keygen_output = Command::new("openssl")
.arg("rsa")
.args(&["-in", private_key_path.to_str().unwrap()])
.arg("-pubout")
.args(&["-outform", "PEM"])
.args(&["-out", public_key_path.to_str().unwrap()])
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
}
self.pageserver.auth_token =
self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
fs::create_dir_all(self.pg_data_dirs_path())?;
for safekeeper in self.safekeepers.iter() {
fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
}
let mut conf_content = String::new();
// Currently, the user first passes a config file with 'zenith init --config=<path>'
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
write!(
&mut conf_content,
r#"# This file describes a locale deployment of the page server
# and safekeeeper node. It is read by the 'zenith' command-line
# utility.
"#
)?;
// Convert the LocalEnv to a toml file.
//
// This could be as simple as this:
//
// conf_content += &toml::to_string_pretty(env)?;
//
// But it results in a "values must be emitted before tables" error. I'm not sure
// why; AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>', is last.
// Maybe rust reorders the fields to avoid padding or something?
// In any case, converting to toml::Value first, and serializing that, works.
// See https://github.com/alexcrichton/toml-rs/issues/142
conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
fs::write(base_path.join("config"), conf_content)?;
Ok(())
}
}
fn base_path() -> PathBuf {
@@ -85,118 +326,29 @@ fn base_path() -> PathBuf {
}
}
//
// Initialize a new Zenith repository
//
pub fn init(
pageserver_pg_port: u16,
pageserver_http_port: u16,
tenantid: ZTenantId,
auth_type: AuthType,
) -> Result<()> {
// check if config already exists
let base_path = base_path();
if base_path.exists() {
anyhow::bail!(
"{} already exists. Perhaps already initialized?",
base_path.to_str().unwrap()
);
/// Serde routines for Option<ZTenantId>. The serialized form is a hex string.
mod opt_tenantid_serde {
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::str::FromStr;
use zenith_utils::zid::ZTenantId;
pub fn serialize<S>(tenantid: &Option<ZTenantId>, ser: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
tenantid.map(|t| t.to_string()).serialize(ser)
}
fs::create_dir(&base_path)?;
// ok, now check that expected binaries are present
// Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
let pg_distrib_dir: PathBuf = {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
postgres_bin.into()
} else {
let cwd = env::current_dir()?;
cwd.join("tmp_install")
pub fn deserialize<'de, D>(des: D) -> Result<Option<ZTenantId>, D::Error>
where
D: Deserializer<'de>,
{
let s: Option<String> = Option::deserialize(des)?;
if let Some(s) = s {
return Ok(Some(
ZTenantId::from_str(&s).map_err(serde::de::Error::custom)?,
));
}
};
if !pg_distrib_dir.join("bin/postgres").exists() {
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
Ok(None)
}
// generate keys for jwt
// openssl genrsa -out private_key.pem 2048
let private_key_path = base_path.join("auth_private_key.pem");
let keygen_output = Command::new("openssl")
.arg("genrsa")
.args(&["-out", private_key_path.to_str().unwrap()])
.arg("2048")
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
let public_key_path = base_path.join("auth_public_key.pem");
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
let keygen_output = Command::new("openssl")
.arg("rsa")
.args(&["-in", private_key_path.to_str().unwrap()])
.arg("-pubout")
.args(&["-outform", "PEM"])
.args(&["-out", public_key_path.to_str().unwrap()])
.stdout(Stdio::null())
.output()
.with_context(|| "failed to generate auth private key")?;
if !keygen_output.status.success() {
anyhow::bail!(
"openssl failed: '{}'",
String::from_utf8_lossy(&keygen_output.stderr)
);
}
let auth_token =
encode_from_key_path(&Claims::new(None, Scope::PageServerApi), &private_key_path)?;
// Find zenith binaries.
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
if !zenith_distrib_dir.join("pageserver").exists() {
anyhow::bail!("Can't find pageserver binary.",);
}
let conf = LocalEnv {
pageserver_pg_port,
pageserver_http_port,
pg_distrib_dir,
zenith_distrib_dir,
base_data_dir: base_path,
tenantid,
auth_token,
auth_type,
private_key_path,
};
fs::create_dir_all(conf.pg_data_dirs_path())?;
let toml = toml::to_string_pretty(&conf)?;
fs::write(conf.base_data_dir.join("config"), toml)?;
Ok(())
}
// Locate and load config
pub fn load_config() -> Result<LocalEnv> {
let repopath = base_path();
if !repopath.exists() {
anyhow::bail!(
"Zenith config is not found in {}. You need to run 'zenith init' first",
repopath.to_str().unwrap()
);
}
// TODO: check that it looks like a zenith repository
// load and parse file
let config = fs::read_to_string(repopath.join("config"))?;
toml::from_str(config.as_str()).map_err(|e| e.into())
}

View File

@@ -83,6 +83,22 @@ impl PostgresConf {
.with_context(|| format!("could not parse '{}' option {}", field_name, context))
}
pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
where
T: FromStr,
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
{
if let Some(val) = self.get(field_name) {
let result = val
.parse::<T>()
.with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
Ok(Some(result))
} else {
Ok(None)
}
}
///
/// Note: if you call this multiple times for the same option, the config
/// file will have a line for each call. It would be nice to have a function

View File

@@ -0,0 +1,277 @@
use std::io::Write;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::sync::Arc;
use std::time::Duration;
use std::{io, result, thread};
use anyhow::bail;
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::Config;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use zenith_utils::http::error::HttpErrorBody;
use zenith_utils::postgres_backend::AuthType;
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::read_pidfile;
use crate::storage::PageServerNode;
use zenith_utils::connstring::connection_address;
use zenith_utils::connstring::connection_host_port;
#[derive(Error, Debug)]
pub enum SafekeeperHttpError {
#[error("Reqwest error: {0}")]
Transport(#[from] reqwest::Error),
#[error("Error: {0}")]
Response(String),
}
type Result<T> = result::Result<T, SafekeeperHttpError>;
pub trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> Result<Self>;
}
impl ResponseErrorMessageExt for Response {
fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = self.url().to_owned();
Err(SafekeeperHttpError::Response(
match self.json::<HttpErrorBody>() {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
))
}
}
//
// Control routines for safekeeper.
//
// Used in CLI and tests.
//
#[derive(Debug)]
pub struct SafekeeperNode {
pub name: String,
pub conf: SafekeeperConf,
pub pg_connection_config: Config,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
pub pageserver: Arc<PageServerNode>,
}
impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let pageserver = Arc::new(PageServerNode::from_env(env));
println!("initializing for {} for {}", conf.name, conf.http_port);
SafekeeperNode {
name: conf.name.clone(),
conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(),
http_client: Client::new(),
http_base_url: format!("http://localhost:{}/v1", conf.http_port),
pageserver,
}
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(port: u16) -> Config {
// TODO safekeeper authentication not implemented yet
format!("postgresql://no_user@localhost:{}/no_db", port)
.parse()
.unwrap()
}
pub fn datadir_path(&self) -> PathBuf {
self.env.safekeeper_data_dir(&self.name)
}
pub fn pid_file(&self) -> PathBuf {
self.datadir_path().join("safekeeper.pid")
}
pub fn start(&self) -> anyhow::Result<()> {
print!(
"Starting safekeeper at '{}' in '{}'",
connection_address(&self.pg_connection_config),
self.datadir_path().display()
);
io::stdout().flush().unwrap();
// Configure connection to page server
//
// FIXME: We extract the host and port from the connection string instead of using
// the connection string directly, because the 'safekeeper' binary expects
// host:port format. That's a bit silly when we already have a full libpq connection
// string at hand.
let pageserver_conn = {
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
format!("{}:{}", host, port)
};
let listen_pg = format!("localhost:{}", self.conf.pg_port);
let listen_http = format!("localhost:{}", self.conf.http_port);
let mut cmd: &mut Command = &mut Command::new(self.env.safekeeper_bin()?);
cmd = cmd
.args(&["-D", self.datadir_path().to_str().unwrap()])
.args(&["--listen-pg", &listen_pg])
.args(&["--listen-http", &listen_http])
.args(&["--pageserver", &pageserver_conn])
.args(&["--recall", "1 second"])
.arg("--daemonize")
.env_clear()
.env("RUST_BACKTRACE", "1");
if !self.conf.sync {
cmd = cmd.arg("--no-sync");
}
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
cmd.env("PAGESERVER_AUTH_TOKEN", &self.env.pageserver.auth_token);
}
if !cmd.status()?.success() {
bail!(
"Safekeeper failed to start. See '{}' for details.",
self.datadir_path().join("safekeeper.log").display()
);
}
// It takes a while for the safekeeper to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(_) => {
println!("\nSafekeeper started");
return Ok(());
}
Err(err) => {
match err {
SafekeeperHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!(
"Safekeeper not responding yet, err {} retrying ({})...",
err, retries
);
}
}
SafekeeperHttpError::Response(msg) => {
bail!("safekeeper failed to start: {} ", msg)
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("safekeeper failed to start in {} seconds", RETRIES);
}
///
/// Stop the server.
///
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
/// Otherwise we use SIGTERM, triggering a clean shutdown
///
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Safekeeper {} is already stopped", self.name);
return Ok(());
}
let pid = read_pidfile(&pid_file)?;
let pid = Pid::from_raw(pid);
let sig = if immediate {
println!("Stop safekeeper immediately");
Signal::SIGQUIT
} else {
println!("Stop safekeeper gracefully");
Signal::SIGTERM
};
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Safekeeper with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to safekeeper with pid {}: {}",
pid,
err.desc()
),
}
let address = connection_address(&self.pg_connection_config);
// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
for _ in 0..100 {
if let Err(_e) = TcpStream::connect(&address) {
println!("Safekeeper stopped receiving connections");
//Now check status
match self.check_status() {
Ok(_) => {
println!("Safekeeper status is OK. Wait a bit.");
thread::sleep(Duration::from_secs(1));
}
Err(err) => {
println!("Safekeeper status is: {}", err);
return Ok(());
}
}
} else {
println!("Safekeeper still receives connections");
thread::sleep(Duration::from_secs(1));
}
}
bail!("Failed to stop safekeeper with pid {}", pid);
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
// TODO: authentication
//if self.env.auth_type == AuthType::ZenithJWT {
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
//}
self.http_client.request(method, url)
}
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
.send()?
.error_from_body()?;
Ok(())
}
}

View File

@@ -6,6 +6,7 @@ use std::time::Duration;
use std::{io, result, thread};
use anyhow::{anyhow, bail};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
@@ -20,6 +21,7 @@ use zenith_utils::zid::ZTenantId;
use crate::local_env::LocalEnv;
use crate::read_pidfile;
use pageserver::branches::BranchInfo;
use pageserver::tenant_mgr::TenantInfo;
use zenith_utils::connstring::connection_address;
#[derive(Error, Debug)]
@@ -62,7 +64,6 @@ impl ResponseErrorMessageExt for Response {
//
#[derive(Debug)]
pub struct PageServerNode {
pub kill_on_exit: bool,
pub pg_connection_config: Config,
pub env: LocalEnv,
pub http_client: Client,
@@ -71,34 +72,34 @@ pub struct PageServerNode {
impl PageServerNode {
pub fn from_env(env: &LocalEnv) -> PageServerNode {
let password = if env.auth_type == AuthType::ZenithJWT {
&env.auth_token
let password = if env.pageserver.auth_type == AuthType::ZenithJWT {
&env.pageserver.auth_token
} else {
""
};
PageServerNode {
kill_on_exit: false,
pg_connection_config: Self::pageserver_connection_config(
password,
env.pageserver_pg_port,
env.pageserver.pg_port,
),
env: env.clone(),
http_client: Client::new(),
http_base_url: format!("http://127.0.0.1:{}/v1", env.pageserver_http_port),
http_base_url: format!("http://localhost:{}/v1", env.pageserver.http_port),
}
}
/// Construct libpq connection string for connecting to the pageserver.
fn pageserver_connection_config(password: &str, port: u16) -> Config {
format!("postgresql://no_user:{}@127.0.0.1:{}/no_db", password, port)
format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
.parse()
.unwrap()
}
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let listen_pg = format!("127.0.0.1:{}", self.env.pageserver_pg_port);
let listen_http = format!("127.0.0.1:{}", self.env.pageserver_http_port);
let listen_pg = format!("localhost:{}", self.env.pageserver.pg_port);
let listen_http = format!("localhost:{}", self.env.pageserver.http_port);
let mut args = vec![
"--init",
"-D",
@@ -111,10 +112,11 @@ impl PageServerNode {
&listen_http,
];
if enable_auth {
let auth_type_str = &self.env.pageserver.auth_type.to_string();
if self.env.pageserver.auth_type != AuthType::Trust {
args.extend(&["--auth-validation-public-key-path", "auth_public_key.pem"]);
args.extend(&["--auth-type", "ZenithJWT"]);
}
args.extend(&["--auth-type", auth_type_str]);
if let Some(tenantid) = create_tenant {
args.extend(&["--create-tenant", tenantid])
@@ -152,7 +154,7 @@ impl PageServerNode {
let mut cmd = Command::new(self.env.pageserver_bin()?);
cmd.args(&["-D", self.repo_path().to_str().unwrap()])
.arg("-d")
.arg("--daemonize")
.env_clear()
.env("RUST_BACKTRACE", "1");
@@ -199,19 +201,43 @@ impl PageServerNode {
bail!("pageserver failed to start in {} seconds", RETRIES);
}
///
/// Stop the server.
///
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
/// Otherwise we use SIGTERM, triggering a clean shutdown
///
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid = read_pidfile(&self.pid_file())?;
let pid = Pid::from_raw(pid);
if immediate {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Pageserver is already stopped");
return Ok(());
}
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
let sig = if immediate {
println!("Stop pageserver immediately");
if kill(pid, Signal::SIGQUIT).is_err() {
bail!("Failed to kill pageserver with pid {}", pid);
}
Signal::SIGQUIT
} else {
println!("Stop pageserver gracefully");
if kill(pid, Signal::SIGTERM).is_err() {
bail!("Failed to stop pageserver with pid {}", pid);
Signal::SIGTERM
};
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Pageserver with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
}
let address = connection_address(&self.pg_connection_config);
@@ -256,8 +282,8 @@ impl PageServerNode {
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
let mut builder = self.http_client.request(method, url);
if self.env.auth_type == AuthType::ZenithJWT {
builder = builder.bearer_auth(&self.env.auth_token)
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
builder = builder.bearer_auth(&self.env.pageserver.auth_token)
}
builder
}
@@ -269,7 +295,7 @@ impl PageServerNode {
Ok(())
}
pub fn tenant_list(&self) -> Result<Vec<String>> {
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
Ok(self
.http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
.send()?
@@ -332,12 +358,3 @@ impl PageServerNode {
.json()?)
}
}
impl Drop for PageServerNode {
fn drop(&mut self) {
// TODO Looks like this flag is never set
if self.kill_on_exit {
let _ = self.stop(true);
}
}
}

View File

@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
log = "0.4.14"
clap = "2.33.0"
daemonize = "0.4.1"
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] }
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt", "io-util"] }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
@@ -37,6 +37,7 @@ async-trait = "0.1"
const_format = "0.2.21"
tracing = "0.1.27"
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
url = "2"
postgres_ffi = { path = "../postgres_ffi" }
zenith_metrics = { path = "../zenith_metrics" }
@@ -45,3 +46,4 @@ workspace_hack = { path = "../workspace_hack" }
[dev-dependencies]
hex-literal = "0.3"
tempfile = "3.2"

View File

@@ -41,7 +41,7 @@ Legend:
+--+
....
. . Component that we will need, but doesn't exist at the moment. A TODO.
. . Component at its early development phase.
....
---> Data flow
@@ -116,13 +116,49 @@ Remove old on-disk layer files that are no longer needed according to the
PITR retention policy
TODO: Backup service
--------------------
### Backup service
The backup service is responsible for periodically pushing the chunks to S3.
The backup service is responsible for storing pageserver recovery data externally.
TODO: How/when do restore from S3? Whenever we get a GetPage@LSN request for
a chunk we don't currently have? Or when an external Control Plane tells us?
Currently, the pageserver stores its files in the filesystem directory it is pointed at.
That working directory can be rather ephemeral in cases such as "a pageserver pod running in k8s with no persistent volumes attached".
Therefore, the server interacts with external, more reliable storage to back up and restore its state.
The storage support code is extensible and can support arbitrary backends as long as they implement a certain Rust trait.
The following implementations are present:
* local filesystem: used mainly in tests
* AWS S3: used in production
Implementation details are covered in the [storage readme](./src/relish_storage/README.md) and corresponding Rust file docs.
The backup service is disabled by default and can be enabled to interact with a single remote storage.
CLI examples:
* Local FS: `${PAGESERVER_BIN} --relish-storage-local-path="/some/local/path/"`
* AWS S3 : `${PAGESERVER_BIN} --relish-storage-s3-bucket="some-sample-bucket" --relish-storage-region="eu-north-1" --relish-storage-access-key="SOMEKEYAAAAASADSAH*#" --relish-storage-secret-access-key="SOMEsEcReTsd292v"`
For Amazon S3, the key id and secret access key can be found in `~/.aws/credentials` (if awscli was ever configured to work with the desired bucket) or on the AWS settings page for the user. Also note that bucket names do not contain any protocol prefix when used on AWS.
For local S3 installations, refer to their documentation for the name format and credentials.
As with other pageserver settings, the toml config file can be used to configure either of the storages as a backup target.
Required sections are:
```toml
[relish_storage]
local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
```
or
```toml
[relish_storage]
bucket_name = 'some-sample-bucket'
bucket_region = 'eu-north-1'
access_key_id = 'SOMEKEYAAAAASADSAH*#'
secret_access_key = 'SOMEsEcReTsd292v'
```
Also, the `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` environment variables can be used to specify the credentials instead of any of the methods above.
TODO: Sharding
--------------------

View File

@@ -13,6 +13,7 @@
use anyhow::Result;
use bytes::{BufMut, BytesMut};
use log::*;
use std::fmt::Write as FmtWrite;
use std::io;
use std::io::Write;
use std::sync::Arc;
@@ -83,7 +84,7 @@ impl<'a> Basebackup<'a> {
info!(
"taking basebackup lsn={}, prev_lsn={}",
backup_prev, backup_lsn
backup_lsn, backup_prev
);
Ok(Basebackup {
@@ -248,13 +249,7 @@ impl<'a> Basebackup<'a> {
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
// Generate new pg_control and WAL needed for bootstrap
let checkpoint_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
let checkpoint_lsn = XLogSegNoOffsetToRecPtr(
checkpoint_segno,
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
pg_constants::WAL_SEGMENT_SIZE,
);
// Generate new pg_control needed for bootstrap
checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;
//reset some fields we don't want to preserve
@@ -263,19 +258,24 @@ impl<'a> Basebackup<'a> {
checkpoint.oldestActiveXid = 0;
//save new values in pg_control
pg_control.checkPoint = checkpoint_lsn;
pg_control.checkPoint = 0;
pg_control.checkPointCopy = checkpoint;
pg_control.state = pg_constants::DB_SHUTDOWNED;
// add zenith.signal file
let xl_prev = if self.prev_record_lsn == Lsn(0) {
0xBAD0 // magic value to indicate that we don't know prev_lsn
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.lsn == self.timeline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")?;
} else {
write!(zenith_signal, "PREV LSN: invalid")?;
}
} else {
self.prev_record_lsn.0
};
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
}
self.ar.append(
&new_tar_header("zenith.signal", 8)?,
&xl_prev.to_le_bytes()[..],
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)?;
//send pg_control
@@ -284,14 +284,15 @@ impl<'a> Basebackup<'a> {
self.ar.append(&header, &pg_control_bytes[..])?;
//send wal segment
let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
let wal_file_name = XLogFileName(
1, // FIXME: always use Postgres timeline 1
checkpoint_segno,
segno,
pg_constants::WAL_SEGMENT_SIZE,
);
let wal_file_path = format!("pg_wal/{}", wal_file_name);
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
let wal_seg = generate_wal_segment(&pg_control);
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
self.ar.append(&header, &wal_seg[..])?;
Ok(())

View File

@@ -2,17 +2,15 @@
// Main entry point for the Page Server executable
//
use pageserver::defaults::*;
use serde::{Deserialize, Serialize};
use std::{
env,
net::TcpListener,
path::{Path, PathBuf},
str::FromStr,
thread,
};
use tracing::*;
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener};
use anyhow::{bail, ensure, Context, Result};
use signal_hook::consts::signal::*;
@@ -28,13 +26,8 @@ use clap::{App, Arg, ArgMatches};
use daemonize::Daemonize;
use pageserver::{
branches,
defaults::{
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
},
http, page_service, relish_storage, tenant_mgr, PageServerConf, RelishStorageConfig,
RelishStorageKind, S3Config, LOG_FILE_NAME,
branches, defaults::*, http, page_service, relish_storage, tenant_mgr, PageServerConf,
RelishStorageConfig, RelishStorageKind, S3Config, LOG_FILE_NAME,
};
use zenith_utils::http::endpoint;
use zenith_utils::postgres_backend;
@@ -42,7 +35,7 @@ use zenith_utils::postgres_backend;
use const_format::formatcp;
/// String arguments that can be declared via CLI or config file
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
struct CfgFileParams {
listen_pg_addr: Option<String>,
listen_http_addr: Option<String>,
@@ -53,12 +46,21 @@ struct CfgFileParams {
pg_distrib_dir: Option<String>,
auth_validation_public_key_path: Option<String>,
auth_type: Option<String>,
// see https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for enum deserialisation examples
relish_storage: Option<RelishStorage>,
relish_storage_max_concurrent_sync: Option<String>,
/////////////////////////////////
//// Don't put `Option<String>` and other "simple" values below.
////
/// `Option<RelishStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
/// Plain values in TOML cannot be defined after tables (other tables can follow),
/// and the [`toml`] crate serializes all fields in the order of their appearance.
////////////////////////////////
relish_storage: Option<RelishStorage>,
}
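To make the ordering constraint above concrete, a minimal illustrative TOML sketch (the `'10'` value is made up): a plain key emitted after the `[relish_storage]` table would be parsed as a member of that table rather than as a top-level value, which would break the serialize/deserialize round-trip exercised in the tests below.
```toml
# Correct ordering: plain values first, then the table
relish_storage_max_concurrent_sync = '10'

[relish_storage]
local_path = '/some/local/path/'

# If the plain value were placed after the table instead, a TOML parser would
# read it as relish_storage.relish_storage_max_concurrent_sync, not as a
# top-level key.
```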
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
// Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!).
// See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples
#[serde(untagged)]
enum RelishStorage {
Local {
local_path: String,
@@ -477,13 +479,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
"Starting pageserver http handler on {}",
conf.listen_http_addr
);
let http_listener = TcpListener::bind(conf.listen_http_addr.clone())?;
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
info!(
"Starting pageserver pg protocol handler on {}",
conf.listen_pg_addr
);
let pageserver_listener = TcpListener::bind(conf.listen_pg_addr.clone())?;
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
if conf.daemonize {
info!("daemonizing...");
@@ -552,7 +554,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode");
exit(111);
}
SIGTERM => {
SIGINT | SIGTERM => {
info!("Got SIGINT/SIGTERM. Terminate gracefully in fast shutdown mode");
// Terminate postgres backends
postgres_backend::set_pgbackend_shutdown_requested();
@@ -577,11 +579,142 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
info!("Pageserver shut down successfully completed");
exit(0);
}
_ => {
debug!("Unknown signal.");
unknown_signal => {
debug!("Unknown signal {}", unknown_signal);
}
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn page_server_conf_toml_serde() {
let params = CfgFileParams {
listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
auth_validation_public_key_path: Some(
"auth_validation_public_key_path_VALUE".to_string(),
),
auth_type: Some("auth_type_VALUE".to_string()),
relish_storage: Some(RelishStorage::Local {
local_path: "relish_storage_local_VALUE".to_string(),
}),
relish_storage_max_concurrent_sync: Some(
"relish_storage_max_concurrent_sync_VALUE".to_string(),
),
};
let toml_string = toml::to_string(&params).expect("Failed to serialize correct config");
let toml_pretty_string =
toml::to_string_pretty(&params).expect("Failed to serialize correct config");
assert_eq!(
r#"listen_pg_addr = 'listen_pg_addr_VALUE'
listen_http_addr = 'listen_http_addr_VALUE'
checkpoint_distance = 'checkpoint_distance_VALUE'
checkpoint_period = 'checkpoint_period_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
auth_type = 'auth_type_VALUE'
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
[relish_storage]
local_path = 'relish_storage_local_VALUE'
"#,
toml_pretty_string
);
let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
.expect("Failed to deserialize the serialization result of the config");
let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
.expect("Failed to deserialize the prettified serialization result of the config");
assert!(
params_from_serialized == params,
"Expected the same config in the end of config -> serialize -> deserialize chain"
);
assert!(
params_from_serialized_pretty == params,
"Expected the same config in the end of config -> serialize pretty -> deserialize chain"
);
}
#[test]
fn credentials_omitted_during_serialization() {
let params = CfgFileParams {
listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
gc_horizon: Some("gc_horizon_VALUE".to_string()),
gc_period: Some("gc_period_VALUE".to_string()),
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
auth_validation_public_key_path: Some(
"auth_validation_public_key_path_VALUE".to_string(),
),
auth_type: Some("auth_type_VALUE".to_string()),
relish_storage: Some(RelishStorage::AwsS3 {
bucket_name: "bucket_name_VALUE".to_string(),
bucket_region: "bucket_region_VALUE".to_string(),
access_key_id: Some("access_key_id_VALUE".to_string()),
secret_access_key: Some("secret_access_key_VALUE".to_string()),
}),
relish_storage_max_concurrent_sync: Some(
"relish_storage_max_concurrent_sync_VALUE".to_string(),
),
};
let toml_string = toml::to_string(&params).expect("Failed to serialize correct config");
let toml_pretty_string =
toml::to_string_pretty(&params).expect("Failed to serialize correct config");
assert_eq!(
r#"listen_pg_addr = 'listen_pg_addr_VALUE'
listen_http_addr = 'listen_http_addr_VALUE'
checkpoint_distance = 'checkpoint_distance_VALUE'
checkpoint_period = 'checkpoint_period_VALUE'
gc_horizon = 'gc_horizon_VALUE'
gc_period = 'gc_period_VALUE'
pg_distrib_dir = 'pg_distrib_dir_VALUE'
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
auth_type = 'auth_type_VALUE'
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
[relish_storage]
bucket_name = 'bucket_name_VALUE'
bucket_region = 'bucket_region_VALUE'
"#,
toml_pretty_string
);
let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
.expect("Failed to deserialize the serialization result of the config");
let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
.expect("Failed to deserialize the prettified serialization result of the config");
let mut expected_params = params;
expected_params.relish_storage = Some(RelishStorage::AwsS3 {
bucket_name: "bucket_name_VALUE".to_string(),
bucket_region: "bucket_region_VALUE".to_string(),
access_key_id: None,
secret_access_key: None,
});
assert!(
params_from_serialized == expected_params,
"Expected the config without credentials in the end of a 'config -> serialize -> deserialize' chain"
);
assert!(
params_from_serialized_pretty == expected_params,
"Expected the config without credentials in the end of a 'config -> serialize pretty -> deserialize' chain"
);
}
}

View File

@@ -4,7 +4,7 @@
// TODO: move all paths construction to conf impl
//
use anyhow::{bail, ensure, Context, Result};
use anyhow::{bail, Context, Result};
use postgres_ffi::ControlFileData;
use serde::{Deserialize, Serialize};
use std::{
@@ -23,6 +23,7 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::tenant_mgr;
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::{repository::Repository, PageServerConf};
use crate::{restore_local_repo, LOG_FILE_NAME};
@@ -35,7 +36,7 @@ pub struct BranchInfo {
pub ancestor_id: Option<String>,
pub ancestor_lsn: Option<String>,
pub current_logical_size: usize,
pub current_logical_size_non_incremental: usize,
pub current_logical_size_non_incremental: Option<usize>,
}
impl BranchInfo {
@@ -44,6 +45,7 @@ impl BranchInfo {
conf: &PageServerConf,
tenantid: &ZTenantId,
repo: &Arc<dyn Repository>,
include_non_incremental_logical_size: bool,
) -> Result<Self> {
let name = path
.as_ref()
@@ -78,6 +80,14 @@ impl BranchInfo {
);
}
// Non-incremental size calculation can be heavy, so it is optional;
// it is only needed in tests to check the size calculation.
let current_logical_size_non_incremental = include_non_incremental_logical_size
.then(|| {
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
})
.transpose()?;
Ok(BranchInfo {
name,
timeline_id,
@@ -85,8 +95,7 @@ impl BranchInfo {
ancestor_id,
ancestor_lsn,
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental: timeline
.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())?,
current_logical_size_non_incremental,
})
}
}
@@ -230,7 +239,7 @@ fn bootstrap_timeline(
timeline.writer().as_ref(),
lsn,
)?;
timeline.checkpoint()?;
timeline.checkpoint(CheckpointConfig::Forced)?;
println!(
"created initial timeline {} timeline.lsn {}",
@@ -248,19 +257,11 @@ fn bootstrap_timeline(
Ok(())
}
pub(crate) fn get_tenants(conf: &PageServerConf) -> Result<Vec<String>> {
let tenants_dir = conf.tenants_path();
std::fs::read_dir(&tenants_dir)?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res?;
ensure!(dir_entry.file_type()?.is_dir());
Ok(dir_entry.file_name().to_str().unwrap().to_owned())
})
.collect()
}
pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
pub(crate) fn get_branches(
conf: &PageServerConf,
tenantid: &ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Vec<BranchInfo>> {
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
// Each branch has a corresponding record (text file) in the refs/branches
@@ -270,7 +271,13 @@ pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Resul
std::fs::read_dir(&branches_dir)?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res?;
BranchInfo::from_path(dir_entry.path(), conf, tenantid, &repo)
BranchInfo::from_path(
dir_entry.path(),
conf,
tenantid,
&repo,
include_non_incremental_logical_size,
)
})
.collect()
}
@@ -332,7 +339,7 @@ pub(crate) fn create_branch(
ancestor_id: None,
ancestor_lsn: None,
current_logical_size: 0,
current_logical_size_non_incremental: 0,
current_logical_size_non_incremental: Some(0),
})
}

View File

@@ -25,6 +25,11 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
@@ -73,6 +78,11 @@ paths:
required: true
schema:
type: string
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get branches for tenant
responses:
@@ -164,13 +174,13 @@ paths:
description: Get tenants list
responses:
"200":
description: OK
description: TenantInfo
content:
application/json:
schema:
type: array
items:
type: string
$ref: "#/components/schemas/TenantInfo"
"401":
description: Unauthorized Error
content:
@@ -243,6 +253,16 @@ components:
scheme: bearer
bearerFormat: JWT
schemas:
TenantInfo:
type: object
required:
- id
- state
properties:
id:
type: string
state:
type: string
BranchInfo:
type: object
required:
@@ -250,7 +270,6 @@ components:
- timeline_id
- latest_valid_lsn
- current_logical_size
- current_logical_size_non_incremental
properties:
name:
type: string

View File

@@ -86,31 +86,59 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
Ok(json_response(StatusCode::CREATED, response_data)?)
}
// Gate non-incremental logical size calculation behind a flag.
// After `pgbench -i -s100` the calculation took 28ms, so multiplied by the number of timelines
// and tenants it can take a noticeable amount of time. The value is also currently used only in tests.
fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
request
.uri()
.query()
.map(|v| {
url::form_urlencoded::parse(v.as_bytes())
.into_owned()
.any(|(param, _)| param == "include-non-incremental-logical-size")
})
.unwrap_or(false)
}
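For illustration, a hypothetical request that enables the flag (the host and route are placeholders, not taken from the diff; the helper above only checks that the parameter is present and ignores its value):
```bash
# Sketch only: substitute the real pageserver HTTP address and branch-list route
curl "http://<pageserver_http_addr>/<branch_list_route>?include-non-incremental-logical-size=1"
```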
async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
check_permission(&request, Some(tenantid))?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
crate::branches::get_branches(get_config(&request), &tenantid)
crate::branches::get_branches(
get_config(&request),
&tenantid,
include_non_incremental_logical_size,
)
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
}
// TODO add to swagger
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
let conf = get_state(&request).conf;
let path = conf.branch_path(&branch_name, &tenantid);
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
BranchInfo::from_path(path, conf, &tenantid, &repo)
BranchInfo::from_path(
path,
conf,
&tenantid,
&repo,
include_non_incremental_logical_size,
)
})
.await
.map_err(ApiError::from_err)??;
@@ -124,7 +152,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_list").entered();
crate::branches::get_tenants(get_config(&request))
crate::tenant_mgr::list_tenants()
})
.await
.map_err(ApiError::from_err)??;

View File

@@ -16,13 +16,11 @@ use bookfile::Book;
use bytes::Bytes;
use lazy_static::lazy_static;
use postgres_ffi::pg_constants::BLCKSZ;
use serde::{Deserialize, Serialize};
use tracing::*;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::collections::{BTreeSet, HashSet};
use std::convert::TryInto;
use std::fs;
use std::fs::{File, OpenOptions};
use std::io::Write;
@@ -30,9 +28,9 @@ use std::ops::{Bound::Included, Deref};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, MutexGuard};
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
use self::metadata::{metadata_path, TimelineMetadata};
use crate::relish::*;
use crate::relish_storage::schedule_timeline_upload;
use crate::repository::{GcResult, Repository, Timeline, TimelineWriter, WALRecord};
@@ -40,6 +38,7 @@ use crate::tenant_mgr;
use crate::walreceiver;
use crate::walreceiver::IS_WAL_RECEIVER;
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
@@ -47,7 +46,6 @@ use zenith_metrics::{
register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec,
};
use zenith_metrics::{register_histogram_vec, HistogramVec};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::crashsafe_dir;
use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn};
use zenith_utils::seqwait::SeqWait;
@@ -59,6 +57,7 @@ mod image_layer;
mod inmemory_layer;
mod interval_tree;
mod layer_map;
pub mod metadata;
mod page_versions;
mod storage_layer;
@@ -111,8 +110,9 @@ lazy_static! {
.expect("failed to define a metric");
}
/// The name of the metadata file pageserver creates per timeline.
pub const METADATA_FILE_NAME: &str = "metadata";
/// Fixed segments of the `.zenith/tenants/<tenantid>/timelines/<timelineid>` directory path.
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
///
/// Repository consists of multiple timelines. Keep them in a hash table.
@@ -142,12 +142,7 @@ impl Repository for LayeredRepository {
// Create the timeline directory, and write initial metadata to file.
crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;
let metadata = TimelineMetadata {
disk_consistent_lsn: Lsn(0),
prev_record_lsn: None,
ancestor_timeline: None,
ancestor_lsn: Lsn(0),
};
let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0));
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
let timeline = LayeredTimeline::new(
@@ -186,12 +181,7 @@ impl Repository for LayeredRepository {
// Create the metadata file, noting the ancestor of the new timeline.
// There is initially no data in it, but all the read-calls know to look
// into the ancestor.
let metadata = TimelineMetadata {
disk_consistent_lsn: start_lsn,
prev_record_lsn: dst_prev,
ancestor_timeline: Some(src),
ancestor_lsn: start_lsn,
};
let metadata = TimelineMetadata::new(start_lsn, dst_prev, Some(src), start_lsn);
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
@@ -216,6 +206,22 @@ impl Repository for LayeredRepository {
})
}
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> {
{
let timelines = self.timelines.lock().unwrap();
for (timelineid, timeline) in timelines.iter() {
let _entered =
info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid)
.entered();
timeline.checkpoint(cconf)?;
}
}
Ok(())
}
// Wait for all threads to complete and persist repository data before pageserver shutdown.
fn shutdown(&self) -> Result<()> {
trace!("LayeredRepository shutdown for tenant {}", self.tenantid);
@@ -225,7 +231,7 @@ impl Repository for LayeredRepository {
walreceiver::stop_wal_receiver(*timelineid);
// Wait for syncing data to disk
trace!("repo shutdown. checkpoint timeline {}", timelineid);
timeline.checkpoint()?;
timeline.checkpoint(CheckpointConfig::Forced)?;
//TODO Wait for walredo process to shutdown too
}
@@ -247,14 +253,14 @@ impl LayeredRepository {
Some(timeline) => Ok(timeline.clone()),
None => {
let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)?;
let disk_consistent_lsn = metadata.disk_consistent_lsn;
let disk_consistent_lsn = metadata.disk_consistent_lsn();
// Recurse to look up the ancestor timeline.
//
// TODO: If you have a very deep timeline history, this could become
// expensive. Perhaps delay this until we need to look up a page in
// ancestor.
let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline {
let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline() {
Some(self.get_timeline_locked(ancestor_timelineid, timelines)?)
} else {
None
@@ -266,7 +272,7 @@ impl LayeredRepository {
let mut timeline = LayeredTimeline::new(
self.conf,
metadata,
metadata.clone(),
ancestor,
timelineid,
self.tenantid,
@@ -276,15 +282,9 @@ impl LayeredRepository {
)?;
// List the layers on disk, and load them into the layer map
let _loaded_layers = timeline.load_layer_map(disk_consistent_lsn)?;
let loaded_layers = timeline.load_layer_map(disk_consistent_lsn)?;
if self.upload_relishes {
schedule_timeline_upload(());
// schedule_timeline_upload(
// self.tenantid,
// timelineid,
// loaded_layers,
// disk_consistent_lsn,
// );
schedule_timeline_upload(self.tenantid, timelineid, loaded_layers, metadata);
}
// needs to be after load_layer_map
@@ -312,90 +312,6 @@ impl LayeredRepository {
}
}
///
/// Launch the checkpointer thread in given repository.
///
pub fn launch_checkpointer_thread(
conf: &'static PageServerConf,
rc: Arc<LayeredRepository>,
) -> JoinHandle<()> {
std::thread::Builder::new()
.name("Checkpointer thread".into())
.spawn(move || {
// FIXME: relaunch it? Panic is not good.
rc.checkpoint_loop(conf).expect("Checkpointer thread died");
})
.unwrap()
}
///
/// Checkpointer thread's main loop
///
fn checkpoint_loop(&self, conf: &'static PageServerConf) -> Result<()> {
while !tenant_mgr::shutdown_requested() {
std::thread::sleep(conf.checkpoint_period);
info!("checkpointer thread for tenant {} waking up", self.tenantid);
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
// bytes of WAL since last checkpoint.
{
let timelines = self.timelines.lock().unwrap();
for (timelineid, timeline) in timelines.iter() {
let _entered =
info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid)
.entered();
STORAGE_TIME
.with_label_values(&["checkpoint_timed"])
.observe_closure_duration(|| {
timeline.checkpoint_internal(conf.checkpoint_distance, false)
})?
}
// release lock on 'timelines'
}
}
trace!("Checkpointer thread shut down");
Ok(())
}
///
/// Launch the GC thread in given repository.
///
pub fn launch_gc_thread(
conf: &'static PageServerConf,
rc: Arc<LayeredRepository>,
) -> JoinHandle<()> {
std::thread::Builder::new()
.name("GC thread".into())
.spawn(move || {
// FIXME: relaunch it? Panic is not good.
rc.gc_loop(conf).expect("GC thread died");
})
.unwrap()
}
///
/// GC thread's main loop
///
fn gc_loop(&self, conf: &'static PageServerConf) -> Result<()> {
while !tenant_mgr::shutdown_requested() {
// Garbage collect old files that are not needed for PITR anymore
if conf.gc_horizon > 0 {
self.gc_iteration(None, conf.gc_horizon, false).unwrap();
}
// TODO Write it in more adequate way using
// condvar.wait_timeout() or something
let mut sleep_time = conf.gc_period.as_secs();
while sleep_time > 0 && !tenant_mgr::shutdown_requested() {
sleep_time -= 1;
std::thread::sleep(Duration::from_secs(1));
}
info!("gc thread for tenant {} waking up", self.tenantid);
}
Ok(())
}
/// Save timeline metadata to file
fn save_metadata(
conf: &'static PageServerConf,
@@ -412,13 +328,7 @@ impl LayeredRepository {
.create_new(first_save)
.open(&path)?;
let mut metadata_bytes = TimelineMetadata::ser(data)?;
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
if file.write(&metadata_bytes)? != metadata_bytes.len() {
bail!("Could not write all the metadata bytes in a single call");
@@ -445,20 +355,7 @@ impl LayeredRepository {
) -> Result<TimelineMetadata> {
let path = metadata_path(conf, timelineid, tenantid);
let metadata_bytes = std::fs::read(&path)?;
ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
let calculated_checksum = crc32c::crc32c(data);
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
ensure!(calculated_checksum == expected_checksum);
let data = TimelineMetadata::des_prefix(data)?;
assert!(data.disk_consistent_lsn.is_aligned());
Ok(data)
TimelineMetadata::from_bytes(&metadata_bytes)
}
//
@@ -568,7 +465,7 @@ impl LayeredRepository {
// so that they too can be garbage collected. That's
// used in tests, so we want as deterministic results as possible.
if checkpoint_before_gc {
timeline.checkpoint()?;
timeline.checkpoint(CheckpointConfig::Forced)?;
info!("timeline {} checkpoint_before_gc done", timelineid);
}
@@ -583,29 +480,6 @@ impl LayeredRepository {
}
}
/// Metadata stored on disk for each timeline
///
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimelineMetadata {
disk_consistent_lsn: Lsn,
// This is only set if we know it. We track it in memory when the page
// server is running, but we only track the value corresponding to
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
// lot. We only store it in the metadata file when we flush *all* the
// in-memory data so that 'last_record_lsn' is the same as
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
// soon as we reprocess at least one record, we will have a valid
// 'prev_record_lsn' value in memory again. This is only really needed when
// doing a clean shutdown, so that there is no more WAL beyond
// 'disk_consistent_lsn'
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<ZTimelineId>,
ancestor_lsn: Lsn,
}
pub struct LayeredTimeline {
conf: &'static PageServerConf,
@@ -678,6 +552,10 @@ pub struct LayeredTimeline {
/// Public interface functions
impl Timeline for LayeredTimeline {
fn get_ancestor_lsn(&self) -> Lsn {
self.ancestor_lsn
}
/// Wait until WAL has been received up to the given LSN.
fn wait_lsn(&self, lsn: Lsn) -> Result<()> {
// This should never be called from the WAL receiver thread, because that could lead
@@ -691,8 +569,8 @@ impl Timeline for LayeredTimeline {
.wait_for_timeout(lsn, TIMEOUT)
.with_context(|| {
format!(
"Timed out while waiting for WAL record at LSN {} to arrive",
lsn
"Timed out while waiting for WAL record at LSN {} to arrive, disk consistent LSN={}",
lsn, self.get_disk_consistent_lsn()
)
})?;
@@ -848,11 +726,15 @@ impl Timeline for LayeredTimeline {
/// Public entry point for checkpoint(). All the logic is in the private
/// checkpoint_internal function, this public facade just wraps it for
/// metrics collection.
fn checkpoint(&self) -> Result<()> {
STORAGE_TIME
.with_label_values(&["checkpoint_force"])
//pass checkpoint_distance=0 to force checkpoint
.observe_closure_duration(|| self.checkpoint_internal(0, true))
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> {
match cconf {
CheckpointConfig::Forced => STORAGE_TIME
.with_label_values(&["forced checkpoint"])
.observe_closure_duration(|| self.checkpoint_internal(0)),
CheckpointConfig::Distance(distance) => STORAGE_TIME
.with_label_values(&["checkpoint"])
.observe_closure_duration(|| self.checkpoint_internal(distance)),
}
}
fn get_last_record_lsn(&self) -> Lsn {
@@ -906,6 +788,10 @@ impl Timeline for LayeredTimeline {
Ok(total_blocks * BLCKSZ as usize)
}
fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn.load()
}
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a> {
Box::new(LayeredTimelineWriter {
tl: self,
@@ -942,13 +828,13 @@ impl LayeredTimeline {
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
last_record_lsn: SeqWait::new(RecordLsn {
last: metadata.disk_consistent_lsn,
prev: metadata.prev_record_lsn.unwrap_or(Lsn(0)),
last: metadata.disk_consistent_lsn(),
prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
}),
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn.0),
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn,
ancestor_lsn: metadata.ancestor_lsn(),
current_logical_size: AtomicUsize::new(current_logical_size),
current_logical_size_gauge,
upload_relishes,
@@ -1227,7 +1113,7 @@ impl LayeredTimeline {
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL.
fn checkpoint_internal(&self, checkpoint_distance: u64, forced: bool) -> Result<()> {
fn checkpoint_internal(&self, checkpoint_distance: u64) -> Result<()> {
let mut write_guard = self.write_lock.lock().unwrap();
let mut layers = self.layers.lock().unwrap();
@@ -1258,10 +1144,6 @@ impl LayeredTimeline {
while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() {
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();
if tenant_mgr::shutdown_requested() && !forced {
return Ok(());
}
// Does this layer need freezing?
//
// Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE.
@@ -1340,50 +1222,48 @@ impl LayeredTimeline {
timeline_dir.sync_all()?;
}
// Save the metadata, with updated 'disk_consistent_lsn', to a
// file in the timeline dir. After crash, we will restart WAL
// streaming and processing from that point.
// If we were able to advance 'disk_consistent_lsn', save it in the metadata file.
// After crash, we will restart WAL streaming and processing from that point.
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
// We can only save a valid 'prev_record_lsn' value on disk if we
// flushed *all* in-memory changes to disk. We only track
// 'prev_record_lsn' in memory for the latest processed record, so we
// don't remember what the correct value that corresponds to some old
// LSN is. But if we flush everything, then the value corresponding
// current 'last_record_lsn' is correct and we can store it on disk.
let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn {
Some(prev_record_lsn)
} else {
None
};
// We can only save a valid 'prev_record_lsn' value on disk if we
// flushed *all* in-memory changes to disk. We only track
// 'prev_record_lsn' in memory for the latest processed record, so we
// don't remember what the correct value that corresponds to some old
// LSN is. But if we flush everything, then the value corresponding
// current 'last_record_lsn' is correct and we can store it on disk.
let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn {
Some(prev_record_lsn)
} else {
None
};
let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid);
let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid);
let metadata = TimelineMetadata {
disk_consistent_lsn,
prev_record_lsn: ondisk_prev_record_lsn,
ancestor_timeline: ancestor_timelineid,
ancestor_lsn: self.ancestor_lsn,
};
LayeredRepository::save_metadata(
self.conf,
self.timelineid,
self.tenantid,
&metadata,
false,
)?;
if self.upload_relishes {
schedule_timeline_upload(())
// schedule_timeline_upload(
// self.tenantid,
// self.timelineid,
// layer_uploads,
// disk_consistent_lsn,
// });
let metadata = TimelineMetadata::new(
disk_consistent_lsn,
ondisk_prev_record_lsn,
ancestor_timelineid,
self.ancestor_lsn,
);
LayeredRepository::save_metadata(
self.conf,
self.timelineid,
self.tenantid,
&metadata,
false,
)?;
if self.upload_relishes {
schedule_timeline_upload(self.tenantid, self.timelineid, layer_uploads, metadata);
}
// Also update the in-memory copy
self.disk_consistent_lsn.store(disk_consistent_lsn);
}
// Also update the in-memory copy
self.disk_consistent_lsn.store(disk_consistent_lsn);
Ok(())
}
@@ -1935,15 +1815,6 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> {
Ok(())
}
fn metadata_path(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> PathBuf {
conf.timeline_path(&timelineid, &tenantid)
.join(METADATA_FILE_NAME)
}
/// Add a suffix to a layer file's name: .{num}.old
/// Uses the first available num (starts at 0)
fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {

View File

@@ -442,12 +442,7 @@ impl DeltaLayer {
}
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
let path = Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&self.layer_name(),
);
let path = self.path();
let file = File::open(&path)?;
let book = Book::new(file)?;

View File

@@ -13,7 +13,7 @@ use anyhow::Result;
use log::*;
use zenith_utils::lsn::Lsn;
use super::METADATA_FILE_NAME;
use super::metadata::METADATA_FILE_NAME;
// Note: LayeredTimeline::load_layer_map() relies on this sort order
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]

View File

@@ -17,6 +17,7 @@ use anyhow::Result;
use lazy_static::lazy_static;
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap};
use std::sync::atomic::{self, AtomicU64};
use std::sync::Arc;
use zenith_metrics::{register_int_gauge, IntGauge};
use zenith_utils::lsn::Lsn;
@@ -30,6 +31,17 @@ lazy_static! {
.expect("failed to define a metric");
}
static NEXT_LAYER_ID: AtomicU64 = AtomicU64::new(0);
#[derive(PartialEq, Eq, Hash, Clone, Copy)]
pub struct LayerId(u64);
impl LayerId {
fn next() -> LayerId {
Self(NEXT_LAYER_ID.fetch_add(1, atomic::Ordering::Relaxed))
}
}
///
/// LayerMap tracks what layers exist on a timeline.
///
@@ -43,6 +55,8 @@ pub struct LayerMap {
/// contains the oldest WAL record.
open_layers: BinaryHeap<OpenLayerEntry>,
open_layers_by_id: HashMap<LayerId, Arc<InMemoryLayer>>,
/// Generation number, used to distinguish newly inserted entries in the
/// binary heap from older entries during checkpoint.
current_generation: u64,
@@ -71,10 +85,15 @@ impl LayerMap {
segentry.open.as_ref().map(Arc::clone)
}
#[allow(dead_code)]
pub fn get_open_by_id(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
self.open_layers_by_id.get(layer_id).cloned()
}
///
/// Insert an open in-memory layer
///
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
segentry.update_open(Arc::clone(&layer));
@@ -86,15 +105,23 @@ impl LayerMap {
// in the middle of a WAL record.
assert!(oldest_pending_lsn.is_aligned());
let id = LayerId::next();
// Also add it to the binary heap
let open_layer_entry = OpenLayerEntry {
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
layer,
layer: Arc::clone(&layer),
generation: self.current_generation,
id,
};
self.open_layers.push(open_layer_entry);
let old_layer = self.open_layers_by_id.insert(id, layer);
assert!(old_layer.is_none());
NUM_INMEMORY_LAYERS.inc();
id
}
/// Remove the oldest in-memory layer
@@ -114,6 +141,8 @@ impl LayerMap {
assert!(oldest_entry.layer.is_dropped());
}
self.open_layers_by_id.remove(&oldest_entry.id).unwrap();
NUM_INMEMORY_LAYERS.dec();
}
@@ -319,6 +348,7 @@ struct OpenLayerEntry {
pub oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
pub generation: u64,
pub layer: Arc<InMemoryLayer>,
id: LayerId,
}
impl Ord for OpenLayerEntry {
fn cmp(&self, other: &Self) -> Ordering {

View File

@@ -0,0 +1,202 @@
//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
//! has metadata that needs to be stored persistently.
//!
//! Later, the file is used in [`crate::relish_storage::storage_sync`] as a part of
//! external storage import and export operations.
//!
//! The module contains all structs and helper methods related to timeline metadata.
use std::{convert::TryInto, path::PathBuf};
use anyhow::ensure;
use zenith_utils::{
bin_ser::BeSer,
lsn::Lsn,
zid::{ZTenantId, ZTimelineId},
};
use crate::{
layered_repository::{METADATA_CHECKSUM_SIZE, METADATA_MAX_DATA_SIZE, METADATA_MAX_SAFE_SIZE},
PageServerConf,
};
/// The name of the metadata file pageserver creates per timeline.
pub const METADATA_FILE_NAME: &str = "metadata";
/// Metadata stored on disk for each timeline
///
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct TimelineMetadata {
disk_consistent_lsn: Lsn,
// This is only set if we know it. We track it in memory when the page
// server is running, but we only track the value corresponding to
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
// lot. We only store it in the metadata file when we flush *all* the
// in-memory data so that 'last_record_lsn' is the same as
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
// soon as we reprocess at least one record, we will have a valid
// 'prev_record_lsn' value in memory again. This is only really needed when
// doing a clean shutdown, so that there is no more WAL beyond
// 'disk_consistent_lsn'
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<ZTimelineId>,
ancestor_lsn: Lsn,
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> PathBuf {
conf.timeline_path(&timelineid, &tenantid)
.join(METADATA_FILE_NAME)
}
impl TimelineMetadata {
pub fn new(
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<ZTimelineId>,
ancestor_lsn: Lsn,
) -> Self {
Self {
disk_consistent_lsn,
prev_record_lsn,
ancestor_timeline,
ancestor_lsn,
}
}
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
ensure!(
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
"metadata bytes size is wrong"
);
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
let calculated_checksum = crc32c::crc32c(data);
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
ensure!(
calculated_checksum == expected_checksum,
"metadata checksum mismatch"
);
let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
assert!(data.disk_consistent_lsn.is_aligned());
Ok(data)
}
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
Ok(metadata_bytes)
}
/// [`Lsn`] up to which the corresponding timeline directory contents,
/// stored locally in the pageserver workdir, are consistent.
pub fn disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn
}
pub fn prev_record_lsn(&self) -> Option<Lsn> {
self.prev_record_lsn
}
pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
self.ancestor_timeline
}
pub fn ancestor_lsn(&self) -> Lsn {
self.ancestor_lsn
}
}
/// This module handles the direct conversion of metadata to bytes and back.
/// Besides the conversion itself, a few verification steps have to be performed
/// for a given metadata, so all serde derives are hidden from the user to avoid
/// accidental creation of unverified metadata.
mod serialize {
use serde::{Deserialize, Serialize};
use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
use super::TimelineMetadata;
#[derive(Serialize)]
pub(super) struct SeTimelineMetadata<'a> {
disk_consistent_lsn: &'a Lsn,
prev_record_lsn: &'a Option<Lsn>,
ancestor_timeline: &'a Option<ZTimelineId>,
ancestor_lsn: &'a Lsn,
}
impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
fn from(other: &'a TimelineMetadata) -> Self {
Self {
disk_consistent_lsn: &other.disk_consistent_lsn,
prev_record_lsn: &other.prev_record_lsn,
ancestor_timeline: &other.ancestor_timeline,
ancestor_lsn: &other.ancestor_lsn,
}
}
}
#[derive(Deserialize)]
pub(super) struct DeTimelineMetadata {
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<ZTimelineId>,
ancestor_lsn: Lsn,
}
impl From<DeTimelineMetadata> for TimelineMetadata {
fn from(other: DeTimelineMetadata) -> Self {
Self {
disk_consistent_lsn: other.disk_consistent_lsn,
prev_record_lsn: other.prev_record_lsn,
ancestor_timeline: other.ancestor_timeline,
ancestor_lsn: other.ancestor_lsn,
}
}
}
}
#[cfg(test)]
mod tests {
use crate::repository::repo_harness::TIMELINE_ID;
use super::*;
#[test]
fn metadata_serializes_correctly() {
let original_metadata = TimelineMetadata {
disk_consistent_lsn: Lsn(0x200),
prev_record_lsn: Some(Lsn(0x100)),
ancestor_timeline: Some(TIMELINE_ID),
ancestor_lsn: Lsn(0),
};
let metadata_bytes = original_metadata
.to_bytes()
.expect("Should serialize correct metadata to bytes");
let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
.expect("Should deserialize its own bytes");
assert_eq!(
deserialized_metadata, original_metadata,
"Metadata that was serialized to bytes and deserialized back should not change"
);
}
}

View File

@@ -1,3 +1,4 @@
use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
@@ -17,6 +18,7 @@ pub mod relish_storage;
pub mod repository;
pub mod restore_local_repo;
pub mod tenant_mgr;
pub mod tenant_threads;
pub mod waldecoder;
pub mod walreceiver;
pub mod walredo;
@@ -91,7 +93,7 @@ impl PageServerConf {
//
fn tenants_path(&self) -> PathBuf {
self.workdir.join("tenants")
self.workdir.join(TENANTS_SEGMENT_NAME)
}
fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
@@ -115,7 +117,7 @@ impl PageServerConf {
}
fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
self.tenant_path(tenantid).join("timelines")
self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
}
fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
@@ -163,6 +165,15 @@ impl PageServerConf {
}
}
/// Config for the Repository checkpointer
#[derive(Debug, Clone, Copy)]
pub enum CheckpointConfig {
// Flush in-memory data that is older than this
Distance(u64),
// Flush all in-memory data
Forced,
}
/// External relish storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone)]
pub struct RelishStorageConfig {

View File

@@ -630,14 +630,16 @@ impl postgres_backend::Handler for PageServerHandler {
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let branches = crate::branches::get_branches(self.conf, &tenantid)?;
// Since these handlers for tenant/branch commands are deprecated (in favor of the http-based ones),
// just pass `false` for `include_non_incremental_logical_size`.
let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
let branches_buf = serde_json::to_vec(&branches)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("tenant_list") {
let tenants = crate::branches::get_tenants(self.conf)?;
let tenants = crate::tenant_mgr::list_tenants()?;
let tenants_buf = serde_json::to_vec(&tenants)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?

View File

@@ -1,60 +1,138 @@
//! Abstractions for the page server to store its relish layer data in the external storage.
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
//! This particular module serves as a public API boundary between the pageserver and the internal storage machinery.
//! No other modules from this tree are supposed to be used directly by the external code.
//!
//! Main purpose of this module subtree is to provide a set of abstractions to manage the storage state
//! in a way, optimal for page server.
//! The storage machinery consists of a few components:
//! * the [`RelishStorage`] trait, a CRUD-like generic abstraction used for adapting external storages, with a few implementations:
//! * [`local_fs`] allows using the local file system as an external storage
//! * [`rust_s3`] uses an AWS S3 bucket as an external storage
//!
//! The abstractions hide multiple custom external storage API implementations,
//! such as AWS S3, local filesystem, etc., located in the submodules.
//! * the synchronization logic in the [`storage_sync`] module that keeps the pageserver state (both the runtime state and the workdir files) and the storage state in sync.
//!
//! * the public API to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_upload`]
//!
//! Here's a schematic overview of the interactions between the relish storage and the rest of the pageserver:
//!
//! +------------------------+ +--------->-------+
//! | | - - - (init async loop) - - - -> | |
//! | | | |
//! | | -------------------------------> | async |
//! | pageserver | (schedule frozen layer upload) | upload/download |
//! | | | loop |
//! | | <------------------------------- | |
//! | | (register downloaded layers) | |
//! +------------------------+ +---------<-------+
//! |
//! |
//! CRUD layer file operations |
//! (upload/download/delete/list, etc.) |
//! V
//! +------------------------+
//! | |
//! | [`RelishStorage`] impl |
//! | |
//! | pageserver assumes it |
//! | owns exclusive write |
//! | access to this storage |
//! +------------------------+
//!
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialized, if so configured.
//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
//! If the storage sync loop was successfully started before, the pageserver schedules new image uploads after every checkpoint.
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
//!
//! The storage logic considers an `image` to be a set of local files that fully represents a certain timeline at a given moment (identified by `disk_consistent_lsn`).
//! A timeline can change its state by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
//! by a storage upload, if enabled.
//! When a certain image gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same image state.
//! No files are deleted from either local or remote storage; only the files missing locally/remotely get downloaded/uploaded. The local metadata file will be overwritten
//! when a newer timeline is downloaded.
//!
//! Meanwhile, the loop initializes the storage connection and checks which files are stored remotely.
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
//! Based on the remote image data, the storage sync logic queues image downloads, while accepting any potential upload tasks from pageserver and managing the tasks by their priority.
//! On the image download, a [`crate::tenant_mgr::register_relish_download`] function is called to register the new image in pageserver, initializing all related threads and internal state.
//!
//! When the pageserver terminates, the upload loop finishes the current image sync task (if any) and exits.
//!
//! NOTES:
//! * pageserver assumes it has exclusive write access to the relish storage. Multiple pageservers may be separated within the same storage
//! (e.g. by using different directories in the local filesystem storage), but that is entirely up to the storage implementation and not covered by the trait API.
//!
//! * the uploads do not happen right after pageserver startup; they are registered when
//! 1. pageserver does a checkpoint, which happens some time after the server start
//! 2. pageserver loads the timeline from disk for the first time
//!
//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with higher priority could already be waiting
//!
//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happen on an image scale: a big set of relish files,
//! enough to represent (and recover, if needed) a certain timeline state. In contrast, all internal storage CRUD calls are made per relish file from those images.
//! This way, the synchronization is able to download an image partially, if some state was synced before, but exposes only correctly synced images.
mod local_fs;
mod rust_s3;
/// A queue-based storage with the background machinery behind it to synchronize
/// local page server layer files with external storage.
mod synced_storage;
mod storage_sync;
use std::{path::Path, thread};
use std::{
path::{Path, PathBuf},
thread,
};
use anyhow::Context;
use anyhow::{anyhow, ensure, Context};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
pub use self::synced_storage::schedule_timeline_upload;
use self::{local_fs::LocalFs, rust_s3::RustS3};
use crate::{PageServerConf, RelishStorageKind};
pub use self::storage_sync::schedule_timeline_upload;
use self::{local_fs::LocalFs, rust_s3::S3};
use crate::{
layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
PageServerConf, RelishStorageKind,
};
/// Based on the config, initiates the remote storage connection and starts a separate thread
/// that ensures that pageserver and the remote storage are in sync with each other.
/// If no external storage configuration is given, no thread or storage initialization is done.
pub fn run_storage_sync_thread(
config: &'static PageServerConf,
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
match &config.relish_storage_config {
Some(relish_storage_config) => {
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
match &relish_storage_config.storage {
RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread(
let handle = match &relish_storage_config.storage {
RelishStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
config,
LocalFs::new(root.clone())?,
LocalFs::new(root.clone(), &config.workdir)?,
max_concurrent_sync,
),
RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread(
RelishStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
config,
RustS3::new(s3_config)?,
S3::new(s3_config, &config.workdir)?,
max_concurrent_sync,
),
}
};
handle.map(Some)
}
None => Ok(None),
}
}
/// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations with storage files.
#[async_trait::async_trait]
pub trait RelishStorage: Send + Sync {
trait RelishStorage: Send + Sync {
/// A way to uniquely reference relish in the remote storage.
type RelishStoragePath;
fn derive_destination(
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<Self::RelishStoragePath>;
/// Attempts to derive the storage path out of the local path, if the latter is correct.
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath>;
/// Gets the layered storage information about the given entry.
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo>;
/// Lists all items the storage has right now.
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
async fn download_relish<W: 'static + std::io::Write + Send>(
&self,
from: &Self::RelishStoragePath,
@@ -65,6 +143,7 @@ pub trait RelishStorage: Send + Sync {
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
/// Streams the local file contents into the remote storage entry.
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
&self,
from: &mut tokio::io::BufReader<R>,
@@ -72,16 +151,173 @@ pub trait RelishStorage: Send + Sync {
) -> anyhow::Result<()>;
}
fn strip_workspace_prefix<'a>(
page_server_workdir: &'a Path,
relish_local_path: &'a Path,
) -> anyhow::Result<&'a Path> {
relish_local_path
.strip_prefix(page_server_workdir)
.with_context(|| {
/// Information about a certain remote storage entry.
#[derive(Debug, PartialEq, Eq)]
struct RemoteRelishInfo {
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
/// Path in the pageserver workdir where the file should go.
download_destination: PathBuf,
is_metadata: bool,
}
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
if prefix == path {
anyhow::bail!(
"Prefix and the path are equal, cannot strip: '{}'",
prefix.display()
)
} else {
path.strip_prefix(prefix).with_context(|| {
format!(
"Unexpected: relish local path '{}' is not relevant to server workdir",
relish_local_path.display(),
"Path '{}' is not prefixed with '{}'",
path.display(),
prefix.display(),
)
})
}
}
fn parse_ids_from_path<'a, R: std::fmt::Display>(
path_segments: impl Iterator<Item = &'a str>,
path_log_representation: &R,
) -> anyhow::Result<(ZTenantId, ZTimelineId)> {
let mut segments = path_segments.skip_while(|&segment| segment != TENANTS_SEGMENT_NAME);
let tenants_segment = segments.next().ok_or_else(|| {
anyhow!(
"Found no '{}' segment in the storage path '{}'",
TENANTS_SEGMENT_NAME,
path_log_representation
)
})?;
ensure!(
tenants_segment == TENANTS_SEGMENT_NAME,
"Failed to extract '{}' segment from storage path '{}'",
TENANTS_SEGMENT_NAME,
path_log_representation
);
let tenant_id = segments
.next()
.ok_or_else(|| {
anyhow!(
"Found no tenant id in the storage path '{}'",
path_log_representation
)
})?
.parse::<ZTenantId>()
.with_context(|| {
format!(
"Failed to parse tenant id from storage path '{}'",
path_log_representation
)
})?;
let timelines_segment = segments.next().ok_or_else(|| {
anyhow!(
"Found no '{}' segment in the storage path '{}'",
TIMELINES_SEGMENT_NAME,
path_log_representation
)
})?;
ensure!(
timelines_segment == TIMELINES_SEGMENT_NAME,
"Failed to extract '{}' segment from storage path '{}'",
TIMELINES_SEGMENT_NAME,
path_log_representation
);
let timeline_id = segments
.next()
.ok_or_else(|| {
anyhow!(
"Found no timeline id in the storage path '{}'",
path_log_representation
)
})?
.parse::<ZTimelineId>()
.with_context(|| {
format!(
"Failed to parse timeline id from storage path '{}'",
path_log_representation
)
})?;
Ok((tenant_id, timeline_id))
}
/// A set of common test utils to share in unit tests inside the module tree.
#[cfg(test)]
mod test_utils {
use std::path::{Path, PathBuf};
use anyhow::ensure;
use crate::{
layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};
/// Gives a timeline path with pageserver workdir stripped off.
pub fn relative_timeline_path(harness: &RepoHarness) -> anyhow::Result<PathBuf> {
let timeline_path = harness.timeline_path(&TIMELINE_ID);
Ok(timeline_path
.strip_prefix(&harness.conf.workdir)?
.to_path_buf())
}
/// Creates a path with custom tenant id in one of its segments.
/// Useful for emulating paths with wrong ids.
pub fn custom_tenant_id_path(
path_with_tenant_id: &Path,
new_tenant_id: &str,
) -> anyhow::Result<PathBuf> {
let mut new_path = PathBuf::new();
let mut is_tenant_id = false;
let mut tenant_id_replaced = false;
for segment in path_with_tenant_id {
match segment.to_str() {
Some(TENANTS_SEGMENT_NAME) => is_tenant_id = true,
Some(_tenant_id_str) if is_tenant_id => {
is_tenant_id = false;
new_path.push(new_tenant_id);
tenant_id_replaced = true;
continue;
}
_ => {}
}
new_path.push(segment)
}
ensure!(tenant_id_replaced, "Found no tenant id segment to replace");
Ok(new_path)
}
/// Creates a path with custom timeline id in one of its segments.
/// Useful for emulating paths with wrong ids.
pub fn custom_timeline_id_path(
path_with_timeline_id: &Path,
new_timeline_id: &str,
) -> anyhow::Result<PathBuf> {
let mut new_path = PathBuf::new();
let mut is_timeline_id = false;
let mut timeline_id_replaced = false;
for segment in path_with_timeline_id {
match segment.to_str() {
Some(TIMELINES_SEGMENT_NAME) => is_timeline_id = true,
Some(_timeline_id_str) if is_timeline_id => {
is_timeline_id = false;
new_path.push(new_timeline_id);
timeline_id_replaced = true;
continue;
}
_ => {}
}
new_path.push(segment)
}
ensure!(
timeline_id_replaced,
"Found no timeline id segment to replace"
);
Ok(new_path)
}
}

View File

@@ -0,0 +1,82 @@
# Non-implementation details
This document describes the current state of the backup system in the pageserver, its existing limitations and concerns, why some things are done the way they are, and the future development plans.
A detailed description of how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../relish_storage.rs) and its submodules.
Ideally, this document should disappear after the current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
## Approach
Backup functionality is a new component that appeared well after the core DB functionality was implemented.
Pageserver layer functionality is also quite volatile at the moment, and there is a risk that its local file management will change over time.
To avoid adding more churn there, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop.
This way, backups are managed in the background without directly affecting other pageserver parts: the backup and restoration process may lag behind, but eventually catches up with reality. To track that, a set of Prometheus metrics is exposed from the pageserver.
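To make the shape of this concrete, below is a minimal sketch of the queue-plus-worker idea. It is not the actual pageserver code: the names `SyncTask`, `SYNC_QUEUE`, `schedule` and `sync_loop` are illustrative, and it assumes the `lazy_static` crate (already used in the project). The real loop lives in the storage sync module and drives async operations against the relish storage.

```rust
// Sketch only: other threads enqueue work, a single worker drains the queue,
// so uploads/downloads never block the rest of the pageserver directly.
use std::collections::VecDeque;
use std::path::PathBuf;
use std::sync::Mutex;
use std::thread;
use std::time::Duration;

use lazy_static::lazy_static;

#[derive(Debug)]
enum SyncTask {
    Upload(PathBuf),
    Download(String),
}

lazy_static! {
    static ref SYNC_QUEUE: Mutex<VecDeque<SyncTask>> = Mutex::new(VecDeque::new());
}

/// Called from other pageserver threads (e.g. the checkpointer) to schedule work.
fn schedule(task: SyncTask) {
    SYNC_QUEUE.lock().unwrap().push_back(task);
}

/// The background loop: drain the queue, sleep when there is nothing to do.
fn sync_loop(shutdown_requested: impl Fn() -> bool) {
    while !shutdown_requested() {
        let next_task = SYNC_QUEUE.lock().unwrap().pop_front();
        match next_task {
            // The real loop drives async S3/local FS transfers on a tokio runtime
            // and updates Prometheus counters; here we only log the intent.
            Some(task) => println!("processing {:?}", task),
            None => thread::sleep(Duration::from_secs(1)),
        }
    }
}

fn main() {
    schedule(SyncTask::Upload(PathBuf::from(
        "tenants/t1/timelines/tl1/some_layer",
    )));
    // For the sketch, stop the worker as soon as the queue is drained.
    let worker = thread::spawn(|| sync_loop(|| SYNC_QUEUE.lock().unwrap().is_empty()));
    worker.join().unwrap();
}
```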
## What's done
The current implementation
* provides remote storage wrappers for AWS S3 and the local FS
* uploads layers frozen by the pageserver checkpoint thread
* downloads and registers layers that are found in the remote storage but missing locally
No serious optimisation or performance testing has been done yet; the feature is disabled by default and is being polished over time.
The plan is to address the questions that are currently open and to prepare the feature to be enabled by default in cloud environments.
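As an illustration of the last bullet, here is a hedged sketch of the "missing locally" reconciliation against a simplified, synchronous stand-in for the relish storage API. `RemoteEntry` and `missing_locally` are made-up names; the real code goes through the async `RelishStorage` trait and its `RemoteRelishInfo`.

```rust
// Sketch: a remote entry needs downloading exactly when its download
// destination does not yet exist in the pageserver workdir.
use std::path::{Path, PathBuf};

struct RemoteEntry {
    /// Key of the entry in the remote storage.
    storage_path: String,
    /// Path in the pageserver workdir where the file should be placed when downloaded.
    download_destination: PathBuf,
}

fn missing_locally(remote_entries: &[RemoteEntry]) -> Vec<&RemoteEntry> {
    remote_entries
        .iter()
        .filter(|entry| !entry.download_destination.exists())
        .collect()
}

fn main() {
    let workdir = Path::new("/tmp/pageserver-workdir");
    let entries = vec![RemoteEntry {
        storage_path: "/tenants/t1/timelines/tl1/some_layer".to_string(),
        download_destination: workdir.join("tenants/t1/timelines/tl1/some_layer"),
    }];
    for entry in missing_locally(&entries) {
        println!(
            "would download '{}' to '{}'",
            entry.storage_path,
            entry.download_destination.display()
        );
    }
}
```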
### Peculiarities
As mentioned, the backup component is rather new and currently under development, so not everything is done properly from the start.
Here's the list of known compromises, with comments:
* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage (see the key-mapping sketch after this list).
This is relatively simple to implement, but may be costly to use with AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
AWS charges both per API call and for traffic, and layers are expected to be updated frequently, so this model is most probably inefficient.
Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
The storage sync API operates on whole images when backing up or restoring, so we are free to repack the layer contents however we want, which will most probably be done later.
* no proper file comparison
Currently, every layer contains an `Lsn` in its name, to map the data it holds to a certain DB state.
Images with the same ids but different `Lsn`s are then compared; files are considered equal if their local file paths are equal (for remote files, the "local file path" is their download destination).
No file contents verification is done currently, but it should be.
AWS S3 returns file checksums during the `list` operation, so they could be used to ensure backup consistency, but that needs further research, since the current pageserver implementation would also need to deal with layer file checksums.
For now, due to this, we treat local workdir files as the source of truth: we never remove them and adjust the remote files instead whenever image files mismatch.
* no proper retry management
Currently, the storage sync simply reattempts the upload/download operation for image files that failed.
No proper task eviction or backpressure is implemented yet: the tasks stay in the queue forever, reattempting the transfers.
This will be fixed once more details of the file consistency model are agreed on.
* sad rust-s3 API
rust-s3 is not very pleasant to use:
1. it returns `anyhow::Result`, so it's hard to distinguish "missing file" cases from "no connection" ones, for instance
2. at least one function in its API that we need (`get_object_stream`) is marked `async` yet blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091)
3. it's a prerelease library with an unclear maintenance status
4. it is noisy on the debug level
But it's already used in the project, so for now it's reused to avoid bloating the dependency tree.
Based on a previous evaluation, even `rusoto-s3` might be a better choice than this library, but that needs further benchmarking.
* gc and branches are ignored
So far, we don't consider non-main images and don't adjust the remote storage based on the GC thread loop results.
Only the checkpointer loop affects the remote storage.
* more layers should be downloaded on demand
Since we download and load remote layers into the pageserver, a need for those layers' ancestors may arise.
Most probably, a downloaded image's ancestors are not present locally either, but currently there is no logic for downloading such ancestors and their metadata,
so the pageserver is unable to respond properly to requests touching those ancestors.
To implement the downloading, more `tenant_mgr` refactoring is needed to properly handle web requests for layers and the related state changes.
[Here](https://github.com/zenithdb/zenith/pull/689#issuecomment-931216193) are the details about the initial state management updates needed.
* no integration tests
Automated S3 testing is currently lacking, because there is no convenient way to enable backups during the tests.
After that is fixed, benchmark runs should also be carried out to find bottlenecks.
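
A key-mapping sketch for the first item above, with illustrative paths; the real code builds keys the same way through `storage_path()` and `strip_path_prefix()`.

```rust
// Sketch of the local-path -> remote-key mapping implied by mirroring the
// `tenants/` layout: strip the workdir prefix, join the remaining segments with '/'.
use std::path::Path;

fn storage_key(workdir: &Path, local_path: &Path) -> Option<String> {
    let relative = local_path.strip_prefix(workdir).ok()?;
    let mut key = String::new();
    for segment in relative {
        key.push('/');
        key.push_str(segment.to_str()?);
    }
    Some(key)
}

fn main() {
    let workdir = Path::new("/data/pageserver");
    let layer = workdir.join("tenants/ten1/timelines/tl1/layer_at_some_lsn");
    // Prints "/tenants/ten1/timelines/tl1/layer_at_some_lsn"
    println!("{}", storage_key(workdir, &layer).unwrap());
}
```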

View File

@@ -1,13 +1,11 @@
//! Local filesystem relish storage.
//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
//!
//! Page server already stores layer data on the server, when freezing it.
//! This storage serves a way to
//!
//! * test things locally simply
//! * allow to compabre both binary sets
//! * help validating the relish storage API
//! This storage used in pageserver tests, but can also be used in cases when a certain persistent
//! volume is mounted to the local FS.
use std::{
ffi::OsStr,
future::Future,
io::Write,
path::{Path, PathBuf},
@@ -16,25 +14,32 @@ use std::{
use anyhow::{bail, Context};
use tokio::{fs, io};
use tracing::*;
use super::{strip_workspace_prefix, RelishStorage};
use crate::layered_repository::metadata::METADATA_FILE_NAME;
use super::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo};
pub struct LocalFs {
pageserver_workdir: &'static Path,
root: PathBuf,
}
impl LocalFs {
/// Atetmpts to create local FS relish storage, also creates the directory provided, if not exists.
pub fn new(root: PathBuf) -> anyhow::Result<Self> {
/// Attempts to create local FS relish storage, along with the storage root directory.
pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
if !root.exists() {
std::fs::create_dir_all(&root).with_context(|| {
format!(
"Failed to create all directories in the given root path {}",
"Failed to create all directories in the given root path '{}'",
root.display(),
)
})?;
}
Ok(Self { root })
Ok(Self {
pageserver_workdir,
root,
})
}
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
@@ -55,11 +60,29 @@ impl LocalFs {
impl RelishStorage for LocalFs {
type RelishStoragePath = PathBuf;
fn derive_destination(
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<Self::RelishStoragePath> {
Ok(strip_workspace_prefix(page_server_workdir, relish_local_path)?.to_path_buf())
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
Ok(self.root.join(
strip_path_prefix(self.pageserver_workdir, local_path)
.context("local path does not belong to this storage")?,
))
}
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
let is_metadata =
storage_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME);
let relative_path = strip_path_prefix(&self.root, storage_path)
.context("local path does not belong to this storage")?;
let download_destination = self.pageserver_workdir.join(relative_path);
let (tenant_id, timeline_id) = parse_ids_from_path(
relative_path.iter().filter_map(|segment| segment.to_str()),
&relative_path.display(),
)?;
Ok(RemoteRelishInfo {
tenant_id,
timeline_id,
download_destination,
is_metadata,
})
}
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
@@ -72,6 +95,7 @@ impl RelishStorage for LocalFs {
mut to: std::io::BufWriter<W>,
) -> anyhow::Result<std::io::BufWriter<W>> {
let file_path = self.resolve_in_storage(from)?;
if file_path.exists() && file_path.is_file() {
let updated_buffer = tokio::task::spawn_blocking(move || {
let mut source = std::io::BufReader::new(
@@ -104,7 +128,7 @@ impl RelishStorage for LocalFs {
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
let file_path = self.resolve_in_storage(path)?;
if file_path.exists() && file_path.is_file() {
Ok(tokio::fs::remove_file(file_path).await?)
Ok(fs::remove_file(file_path).await?)
} else {
bail!(
"File '{}' either does not exist or is not a file",
@@ -152,12 +176,12 @@ where
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = tokio::fs::read_dir(directory_path).await?;
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path = dir_entry.path();
if file_type.is_symlink() {
log::debug!("{:?} us a symlink, skipping", entry_path)
debug!("{:?} us a symlink, skipping", entry_path)
} else if file_type.is_dir() {
paths.extend(get_all_files(entry_path).await?.into_iter())
} else {
@@ -183,7 +207,370 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
),
};
if !target_dir.exists() {
tokio::fs::create_dir_all(target_dir).await?;
fs::create_dir_all(target_dir).await?;
}
Ok(())
}
#[cfg(test)]
mod pure_tests {
use crate::{
layered_repository::metadata::METADATA_FILE_NAME,
relish_storage::test_utils::{
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};
use super::*;
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("storage_path_positive")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root.clone(),
};
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("relish_name");
let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
assert_eq!(
expected_path,
storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
"Relish paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
);
Ok(())
}
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
match storage.storage_path(mismatching_path) {
Ok(wrong_path) => panic!(
"Expected path '{}' to error, but got storage path: {:?}",
mismatching_path.display(),
wrong_path,
),
Err(e) => format!("{:?}", e),
}
}
let repo_harness = RepoHarness::create("storage_path_negatives")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root,
};
let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
assert!(error_string.contains("does not belong to this storage"));
assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
let mismatching_path_str = "/something/else";
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
assert!(
error_message.contains(mismatching_path_str),
"Error should mention wrong path"
);
assert!(
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
"Error should mention server workdir"
);
assert!(error_message.contains("does not belong to this storage"));
Ok(())
}
#[test]
fn info_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("info_positive")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root.clone(),
};
let name = "not a metadata";
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: local_path.clone(),
is_metadata: false,
},
storage
.info(&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?))
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote delta relish"
);
let local_metadata_path = repo_harness
.timeline_path(&TIMELINE_ID)
.join(METADATA_FILE_NAME);
let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: local_metadata_path,
is_metadata: true,
},
storage
.info(&remote_metadata_path)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote metadata file"
);
Ok(())
}
#[test]
fn info_negatives() -> anyhow::Result<()> {
#[track_caller]
#[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.info` parameter requirements
fn storage_info_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
match storage.info(storage_path) {
Ok(wrong_info) => panic!(
"Expected storage path input {:?} to cause an error, but got relish info: {:?}",
storage_path, wrong_info,
),
Err(e) => format!("{:?}", e),
}
}
let repo_harness = RepoHarness::create("info_negatives")?;
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root.clone(),
};
let totally_wrong_path = "wrong_wrong_wrong";
let error_message = storage_info_error(&storage, &PathBuf::from(totally_wrong_path));
assert!(error_message.contains(totally_wrong_path));
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
let relative_relish_path =
custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?
.join("wrong_tenant_id_name");
let wrong_tenant_id_path = storage_root.join(&relative_relish_path);
let error_message = storage_info_error(&storage, &wrong_tenant_id_path);
assert!(
error_message.contains(relative_relish_path.to_str().unwrap()),
"Error message '{}' does not contain the expected substring",
error_message
);
let relative_relish_path =
custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?
.join("wrong_timeline_id_name");
let wrong_timeline_id_path = storage_root.join(&relative_relish_path);
let error_message = storage_info_error(&storage, &wrong_timeline_id_path);
assert!(
error_message.contains(relative_relish_path.to_str().unwrap()),
"Error message '{}' does not contain the expected substring",
error_message
);
Ok(())
}
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
let storage_root = PathBuf::from("somewhere").join("else");
let dummy_storage = LocalFs {
pageserver_workdir: &repo_harness.conf.workdir,
root: storage_root,
};
let storage_path = dummy_storage.storage_path(&original_path)?;
let download_destination = dummy_storage.info(&storage_path)?.download_destination;
assert_eq!(
original_path, download_destination,
"'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
);
Ok(())
}
}
#[cfg(test)]
mod fs_tests {
use crate::{
relish_storage::test_utils::relative_timeline_path, repository::repo_harness::RepoHarness,
};
use super::*;
use tempfile::tempdir;
#[tokio::test]
async fn upload_relish() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("upload_relish")?;
let storage = create_storage()?;
let mut source = create_file_for_upload(
&storage.pageserver_workdir.join("whatever"),
"whatever_contents",
)
.await?;
let target_path = PathBuf::from("/").join("somewhere").join("else");
match storage.upload_relish(&mut source, &target_path).await {
Ok(()) => panic!("Should not allow storing files with wrong target path"),
Err(e) => {
let message = format!("{:?}", e);
assert!(message.contains(&target_path.display().to_string()));
assert!(message.contains("does not belong to the current storage"));
}
}
assert!(storage.list_relishes().await?.is_empty());
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
assert_eq!(
storage.list_relishes().await?,
vec![target_path_1.clone()],
"Should list a single file after first upload"
);
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
assert_eq!(
list_relishes_sorted(&storage).await?,
vec![target_path_1.clone(), target_path_2.clone()],
"Should list a two different files after second upload"
);
// match storage.upload_relish(&mut source, &target_path_1).await {
// Ok(()) => panic!("Should not allow reuploading storage files"),
// Err(e) => {
// let message = format!("{:?}", e);
// assert!(message.contains(&target_path_1.display().to_string()));
// assert!(message.contains("File exists"));
// }
// }
assert_eq!(
list_relishes_sorted(&storage).await?,
vec![target_path_1, target_path_2],
"Should list a two different files after all upload attempts"
);
Ok(())
}
fn create_storage() -> anyhow::Result<LocalFs> {
let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
Ok(storage)
}
#[tokio::test]
async fn download_relish() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_relish")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
let contents_bytes = storage
.download_relish(&upload_target, std::io::BufWriter::new(Vec::new()))
.await?
.into_inner()?;
let contents = String::from_utf8(contents_bytes)?;
assert_eq!(
dummy_contents(upload_name),
contents,
"We should upload and download the same contents"
);
let non_existing_path = PathBuf::from("somewhere").join("else");
match storage
.download_relish(&non_existing_path, std::io::BufWriter::new(Vec::new()))
.await
{
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&non_existing_path.display().to_string()));
}
}
Ok(())
}
#[tokio::test]
async fn delete_relish() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("delete_relish")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
storage.delete_relish(&upload_target).await?;
assert!(storage.list_relishes().await?.is_empty());
match storage.delete_relish(&upload_target).await {
Ok(()) => panic!("Should not allow deleting non-existing storage files"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&upload_target.display().to_string()));
}
}
Ok(())
}
async fn upload_dummy_file(
harness: &RepoHarness,
storage: &LocalFs,
name: &str,
) -> anyhow::Result<PathBuf> {
let storage_path = storage
.root
.join(relative_timeline_path(harness)?)
.join(name);
storage
.upload_relish(
&mut create_file_for_upload(
&storage.pageserver_workdir.join(name),
&dummy_contents(name),
)
.await?,
&storage_path,
)
.await?;
Ok(storage_path)
}
async fn create_file_for_upload(
path: &Path,
contents: &str,
) -> anyhow::Result<io::BufReader<fs::File>> {
std::fs::create_dir_all(path.parent().unwrap())?;
let mut file_for_writing = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.open(path)?;
write!(file_for_writing, "{}", contents)?;
drop(file_for_writing);
Ok(io::BufReader::new(
fs::OpenOptions::new().read(true).open(&path).await?,
))
}
fn dummy_contents(name: &str) -> String {
format!("contents for {}", name)
}
async fn list_relishes_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
let mut relishes = storage.list_relishes().await?;
relishes.sort();
Ok(relishes)
}
}

View File

@@ -1,35 +1,45 @@
//! A wrapper around AWS S3 client library `rust_s3` to be used a relish storage.
//! AWS S3 relish storage wrapper around `rust_s3` library.
//! Currently does not allow multiple pageservers to use the same bucket concurrently: relishes are
//! placed in the root of the bucket.
use std::io::Write;
use std::path::Path;
use std::{
io::Write,
path::{Path, PathBuf},
};
use anyhow::Context;
use s3::{bucket::Bucket, creds::Credentials, region::Region};
use crate::{
relish_storage::{strip_workspace_prefix, RelishStorage},
layered_repository::metadata::METADATA_FILE_NAME,
relish_storage::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo},
S3Config,
};
const S3_FILE_SEPARATOR: char = '/';
#[derive(Debug)]
#[derive(Debug, Eq, PartialEq)]
pub struct S3ObjectKey(String);
impl S3ObjectKey {
fn key(&self) -> &str {
&self.0
}
fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf {
pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>())
}
}
/// AWS S3 relish storage.
pub struct RustS3 {
pub struct S3 {
pageserver_workdir: &'static Path,
bucket: Bucket,
}
impl RustS3 {
impl S3 {
/// Creates the relish storage, errors if incorrect AWS S3 configuration provided.
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
let region = aws_config
.bucket_region
.parse::<Region>()
@@ -49,19 +59,17 @@ impl RustS3 {
credentials,
)
.context("Failed to create the s3 bucket")?,
pageserver_workdir,
})
}
}
#[async_trait::async_trait]
impl RelishStorage for RustS3 {
impl RelishStorage for S3 {
type RelishStoragePath = S3ObjectKey;
fn derive_destination(
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<Self::RelishStoragePath> {
let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
let mut key = String::new();
for segment in relative_path {
key.push(S3_FILE_SEPARATOR);
@@ -70,6 +78,21 @@ impl RelishStorage for RustS3 {
Ok(S3ObjectKey(key))
}
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
let storage_path_key = &storage_path.0;
let is_metadata =
storage_path_key.ends_with(&format!("{}{}", S3_FILE_SEPARATOR, METADATA_FILE_NAME));
let download_destination = storage_path.download_destination(self.pageserver_workdir);
let (tenant_id, timeline_id) =
parse_ids_from_path(storage_path_key.split(S3_FILE_SEPARATOR), storage_path_key)?;
Ok(RemoteRelishInfo {
tenant_id,
timeline_id,
download_destination,
is_metadata,
})
}
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
let list_response = self
.bucket
@@ -101,11 +124,11 @@ impl RelishStorage for RustS3 {
))
} else {
tokio::task::spawn_blocking(move || {
to.flush().context("Failed to fluch the downoad buffer")?;
to.flush().context("Failed to flush the download buffer")?;
Ok::<_, anyhow::Error>(to)
})
.await
.context("Failed to joim the download buffer flush task")?
.context("Failed to join the download buffer flush task")?
}
}
@@ -115,9 +138,9 @@ impl RelishStorage for RustS3 {
.delete_object(path.key())
.await
.with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
if code != 200 {
if code != 204 {
Err(anyhow::format_err!(
"Received non-200 exit code during deleting object with key '{}', code: {}",
"Received non-204 exit code during deleting object with key '{}', code: {}",
path.key(),
code
))
@@ -147,3 +170,226 @@ impl RelishStorage for RustS3 {
}
}
}
#[cfg(test)]
mod tests {
use crate::{
relish_storage::test_utils::{
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
},
repository::repo_harness::{RepoHarness, TIMELINE_ID},
};
use super::*;
#[test]
fn download_destination() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_destination")?;
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
let key = S3ObjectKey(format!(
"{}{}",
S3_FILE_SEPARATOR,
relative_path
.iter()
.map(|segment| segment.to_str().unwrap())
.collect::<Vec<_>>()
.join(&S3_FILE_SEPARATOR.to_string()),
));
assert_eq!(
local_path,
key.download_destination(&repo_harness.conf.workdir),
"Download destination should consist of s3 path joined with the pageserver workdir prefix"
);
Ok(())
}
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("storage_path_positive")?;
let segment_1 = "matching";
let segment_2 = "relish";
let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
let expected_key = S3ObjectKey(format!(
"{SEPARATOR}{}{SEPARATOR}{}",
segment_1,
segment_2,
SEPARATOR = S3_FILE_SEPARATOR,
));
let actual_key = dummy_storage(&repo_harness.conf.workdir)
.storage_path(local_path)
.expect("Matching path should map to S3 path normally");
assert_eq!(
expected_key,
actual_key,
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
);
Ok(())
}
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String {
match storage.storage_path(mismatching_path) {
Ok(wrong_key) => panic!(
"Expected path '{}' to error, but got S3 key: {:?}",
mismatching_path.display(),
wrong_key,
),
Err(e) => e.to_string(),
}
}
let repo_harness = RepoHarness::create("storage_path_negatives")?;
let storage = dummy_storage(&repo_harness.conf.workdir);
let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
assert!(
error_message.contains("Prefix and the path are equal"),
"Message '{}' does not contain the required string",
error_message
);
let mismatching_path = PathBuf::from("somewhere").join("else");
let error_message = storage_path_error(&storage, &mismatching_path);
assert!(
error_message.contains(mismatching_path.to_str().unwrap()),
"Error should mention wrong path"
);
assert!(
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
"Error should mention server workdir"
);
assert!(
error_message.contains("is not prefixed with"),
"Message '{}' does not contain a required string",
error_message
);
Ok(())
}
#[test]
fn info_positive() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("info_positive")?;
let storage = dummy_storage(&repo_harness.conf.workdir);
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata"));
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
is_metadata: false,
},
storage
.info(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote delta relish"
);
let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME));
assert_eq!(
RemoteRelishInfo {
tenant_id: repo_harness.tenant_id,
timeline_id: TIMELINE_ID,
download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
is_metadata: true,
},
storage
.info(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote metadata file"
);
Ok(())
}
#[test]
fn info_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_info_error(storage: &S3, s3_key: &S3ObjectKey) -> String {
match storage.info(s3_key) {
Ok(wrong_info) => panic!(
"Expected key {:?} to error, but got relish info: {:?}",
s3_key, wrong_info,
),
Err(e) => e.to_string(),
}
}
let repo_harness = RepoHarness::create("info_negatives")?;
let storage = dummy_storage(&repo_harness.conf.workdir);
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
let totally_wrong_path = "wrong_wrong_wrong";
let error_message =
storage_info_error(&storage, &S3ObjectKey(totally_wrong_path.to_string()));
assert!(error_message.contains(totally_wrong_path));
let wrong_tenant_id = create_s3_key(
&custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?.join("name"),
);
let error_message = storage_info_error(&storage, &wrong_tenant_id);
assert!(error_message.contains(&wrong_tenant_id.0));
let wrong_timeline_id = create_s3_key(
&custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?.join("name"),
);
let error_message = storage_info_error(&storage, &wrong_timeline_id);
assert!(error_message.contains(&wrong_timeline_id.0));
Ok(())
}
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
let key = dummy_storage.storage_path(&original_path)?;
let download_destination = dummy_storage.info(&key)?.download_destination;
assert_eq!(
original_path, download_destination,
"'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
);
Ok(())
}
fn dummy_storage(pageserver_workdir: &'static Path) -> S3 {
S3 {
pageserver_workdir,
bucket: Bucket::new(
"dummy-bucket",
"us-east-1".parse().unwrap(),
Credentials::anonymous().unwrap(),
)
.unwrap(),
}
}
fn create_s3_key(relative_relish_path: &Path) -> S3ObjectKey {
S3ObjectKey(
relative_relish_path
.iter()
.fold(String::new(), |mut path_string, segment| {
path_string.push(S3_FILE_SEPARATOR);
path_string.push_str(segment.to_str().unwrap());
path_string
}),
)
}
}

File diff suppressed because it is too large

View File

@@ -1,57 +0,0 @@
use std::time::Duration;
use std::{collections::BinaryHeap, sync::Mutex, thread};
use crate::tenant_mgr;
use crate::{relish_storage::RelishStorage, PageServerConf};
lazy_static::lazy_static! {
static ref UPLOAD_QUEUE: Mutex<BinaryHeap<SyncTask>> = Mutex::new(BinaryHeap::new());
}
pub fn schedule_timeline_upload(_local_timeline: ()) {
// UPLOAD_QUEUE
// .lock()
// .unwrap()
// .push(SyncTask::Upload(local_timeline))
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
enum SyncTask {}
pub fn run_storage_sync_thread<
P: std::fmt::Debug,
S: 'static + RelishStorage<RelishStoragePath = P>,
>(
config: &'static PageServerConf,
relish_storage: S,
max_concurrent_sync: usize,
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let handle = thread::Builder::new()
.name("Queue based relish storage sync".to_string())
.spawn(move || {
while !tenant_mgr::shutdown_requested() {
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
log::debug!("Upload queue length: {}", queue_accessor.len());
let next_task = queue_accessor.pop();
drop(queue_accessor);
match next_task {
Some(task) => runtime.block_on(async {
// suppress warnings
let _ = (config, task, &relish_storage, max_concurrent_sync);
todo!("omitted for brevity")
}),
None => {
thread::sleep(Duration::from_secs(1));
continue;
}
}
}
log::debug!("Queue based relish storage sync thread shut down");
Ok(())
})?;
Ok(Some(handle))
}

View File

@@ -1,4 +1,5 @@
use crate::relish::*;
use crate::CheckpointConfig;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use serde::{Deserialize, Serialize};
@@ -24,9 +25,9 @@ pub trait Repository: Send + Sync {
/// Branch a timeline
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
/// perform one garbage collection iteration.
/// garbage collection is periodically performed by gc thread,
/// but it can be explicitly requested through page server api.
/// perform one garbage collection iteration, removing old data files from disk.
/// this function is periodically called by the gc thread.
/// also it can be explicitly requested through page server api 'do_gc' command.
///
/// 'timelineid' specifies the timeline to GC, or None for all.
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
@@ -39,6 +40,10 @@ pub trait Repository: Send + Sync {
horizon: u64,
checkpoint_before_gc: bool,
) -> Result<GcResult>;
/// perform one checkpoint iteration, flushing in-memory data on disk.
/// this function is periodically called by the checkpointer thread.
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
}
///
@@ -119,6 +124,9 @@ pub trait Timeline: Send + Sync {
/// Get a list of all existing non-relational objects
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
/// Get the LSN where this branch was created
fn get_ancestor_lsn(&self) -> Lsn;
//------------------------------------------------------------------------------
// Public PUT functions, to update the repository with new page versions.
//
@@ -131,6 +139,7 @@ pub trait Timeline: Send + Sync {
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
fn get_disk_consistent_lsn(&self) -> Lsn;
/// Mutate the timeline with a [`TimelineWriter`].
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
@@ -140,7 +149,7 @@ pub trait Timeline: Send + Sync {
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
fn checkpoint(&self) -> Result<()>;
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
/// Retrieve current logical size of the timeline
///
@@ -210,27 +219,115 @@ impl WALRecord {
}
}
#[cfg(test)]
pub mod repo_harness {
use std::{fs, path::PathBuf};
use crate::{
layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
walredo::{WalRedoError, WalRedoManager},
PageServerConf,
};
use super::*;
use hex_literal::hex;
use zenith_utils::zid::ZTenantId;
pub const TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
pub const NEW_TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
pub fn TEST_IMG(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(8192, 0);
buf.freeze()
}
pub struct RepoHarness {
pub conf: &'static PageServerConf,
pub tenant_id: ZTenantId,
}
impl RepoHarness {
pub fn create(test_name: &'static str) -> Result<Self> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
Ok(Self { conf, tenant_id })
}
pub fn load(&self) -> Box<dyn Repository> {
let walredo_mgr = Arc::new(TestRedoManager);
Box::new(LayeredRepository::new(
self.conf,
walredo_mgr,
self.tenant_id,
false,
))
}
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
self.conf.timeline_path(timeline_id, &self.tenant_id)
}
}
// Mock WAL redo manager that doesn't do much
struct TestRedoManager;
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for {} blk {} to get to {}, with {} and {} records",
rel,
blknum,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{}", s);
Ok(TEST_IMG(&s))
}
}
}
///
/// Tests that should work the same with any Repository/Timeline implementation.
///
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
use super::*;
use crate::layered_repository::{LayeredRepository, METADATA_FILE_NAME};
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use hex_literal::hex;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
use std::fs;
use std::path::PathBuf;
use zenith_utils::zid::ZTenantId;
use crate::layered_repository::metadata::METADATA_FILE_NAME;
const TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
const NEW_TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
use super::repo_harness::*;
use super::*;
use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
/// Arbitrary relation tag, for testing.
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
@@ -246,16 +343,6 @@ mod tests {
forknum: 0,
});
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
fn TEST_IMG(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(8192, 0);
buf.freeze()
}
fn assert_current_logical_size(timeline: &Arc<dyn Timeline>, lsn: Lsn) {
let incremental = timeline.get_current_logical_size();
let non_incremental = timeline
@@ -267,45 +354,6 @@ mod tests {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
struct RepoHarness {
conf: &'static PageServerConf,
tenant_id: ZTenantId,
}
impl RepoHarness {
fn create(test_name: &'static str) -> Result<Self> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
Ok(Self { conf, tenant_id })
}
fn load(&self) -> Box<dyn Repository> {
let walredo_mgr = Arc::new(TestRedoManager);
Box::new(LayeredRepository::new(
self.conf,
walredo_mgr,
self.tenant_id,
false,
))
}
fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
self.conf.timeline_path(timeline_id, &self.tenant_id)
}
}
#[test]
fn test_relsize() -> Result<()> {
let repo = RepoHarness::create("test_relsize")?.load();
@@ -672,7 +720,7 @@ mod tests {
.contains(&TESTREL_A));
// Run checkpoint and garbage collection and check that it's still not visible
newtline.checkpoint()?;
newtline.checkpoint(CheckpointConfig::Forced)?;
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
assert!(!newtline
@@ -817,33 +865,4 @@ mod tests {
Ok(())
}
// Mock WAL redo manager that doesn't do much
struct TestRedoManager;
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for {} blk {} to get to {}, with {} and {} records",
rel,
blknum,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{}", s);
Ok(TEST_IMG(&s))
}
}
}

View File

@@ -7,10 +7,10 @@ use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
use std::cmp::min;
use std::fs;
use std::fs::File;
use std::io::Read;
use std::path::Path;
use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use anyhow::{bail, Result};
use anyhow::{anyhow, bail, Result};
use bytes::{Buf, Bytes};
use tracing::*;
@@ -37,6 +37,8 @@ pub fn import_timeline_from_postgres_datadir(
writer: &dyn TimelineWriter,
lsn: Lsn,
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
// Scan 'global'
for direntry in fs::read_dir(path.join("global"))? {
let direntry = direntry?;
@@ -44,7 +46,7 @@ pub fn import_timeline_from_postgres_datadir(
None => continue,
Some("pg_control") => {
import_control_file(writer, lsn, &direntry.path())?;
pg_control = Some(import_control_file(writer, lsn, &direntry.path())?);
}
Some("pg_filenode.map") => import_nonrel_file(
writer,
@@ -127,6 +129,18 @@ pub fn import_timeline_from_postgres_datadir(
writer.advance_last_record_lsn(lsn);
// Import WAL. This is needed even when starting from a shutdown checkpoint, because
// this reads the checkpoint record itself, advancing the tip of the timeline to
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'
let pg_control = pg_control.ok_or_else(|| anyhow!("pg_control file not found"))?;
import_wal(
&path.join("pg_wal"),
writer,
Lsn(pg_control.checkPointCopy.redo),
lsn,
&mut pg_control.checkPointCopy.clone(),
)?;
Ok(())
}
@@ -212,7 +226,11 @@ fn import_nonrel_file(
///
/// The control file is imported as is, but we also extract the checkpoint record
/// from it and store it separated.
fn import_control_file(timeline: &dyn TimelineWriter, lsn: Lsn, path: &Path) -> Result<()> {
fn import_control_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
path: &Path,
) -> Result<ControlFileData> {
let mut file = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
@@ -233,7 +251,7 @@ fn import_control_file(timeline: &dyn TimelineWriter, lsn: Lsn, path: &Path) ->
let checkpoint_bytes = pg_control.checkPointCopy.encode();
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;
Ok(())
Ok(pg_control)
}
///
@@ -285,6 +303,104 @@ fn import_slru_file(
Ok(())
}
/// Scan PostgreSQL WAL files in given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
fn import_wal(
walpath: &Path,
timeline: &dyn TimelineWriter,
startpoint: Lsn,
endpoint: Lsn,
checkpoint: &mut CheckPoint,
) -> Result<()> {
let mut waldecoder = WalStreamDecoder::new(startpoint);
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
let mut last_lsn = startpoint;
while last_lsn <= endpoint {
// FIXME: assume postgresql tli 1 for now
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
let mut buf = Vec::new();
// Read local file
let mut path = walpath.join(&filename);
// It could be as .partial
if !PathBuf::from(&path).exists() {
path = walpath.join(filename + ".partial");
}
// Slurp the WAL file
let mut file = File::open(&path)?;
if offset > 0 {
file.seek(SeekFrom::Start(offset as u64))?;
}
let nread = file.read_to_end(&mut buf)?;
if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
// Maybe allow this for .partial files?
error!("read only {} bytes from WAL file", nread);
}
waldecoder.feed_bytes(&buf);
let mut nrecords = 0;
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let mut checkpoint_modified = false;
let decoded = decode_wal_record(recdata.clone());
save_decoded_record(
checkpoint,
&mut checkpoint_modified,
timeline,
&decoded,
recdata,
lsn,
)?;
last_lsn = lsn;
if checkpoint_modified {
let checkpoint_bytes = checkpoint.encode();
timeline.put_page_image(
RelishTag::Checkpoint,
0,
last_lsn,
checkpoint_bytes,
)?;
}
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
timeline.advance_last_record_lsn(last_lsn);
nrecords += 1;
trace!("imported record at {} (end {})", lsn, endpoint);
}
}
debug!("imported {} records up to {}", nrecords, last_lsn);
segno += 1;
offset = 0;
}
if last_lsn != startpoint {
debug!(
"reached end of WAL at {}, updating checkpoint info",
last_lsn
);
timeline.advance_last_record_lsn(last_lsn);
} else {
info!("no WAL to import at {}", last_lsn);
}
Ok(())
}
///
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
/// relations/pages that the record affects.

View File

@@ -4,54 +4,89 @@
use crate::branches;
use crate::layered_repository::LayeredRepository;
use crate::repository::{Repository, Timeline};
use crate::tenant_threads;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use anyhow::{anyhow, bail, Context, Result};
use lazy_static::lazy_static;
use log::{debug, info};
use std::collections::hash_map::Entry;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use std::fs;
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, MutexGuard};
use std::thread::JoinHandle;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
lazy_static! {
static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
Mutex::new(HashMap::new());
static ref TENANTS: Mutex<HashMap<ZTenantId, Tenant>> = Mutex::new(HashMap::new());
}
fn access_repository() -> MutexGuard<'static, HashMap<ZTenantId, Arc<dyn Repository>>> {
REPOSITORY.lock().unwrap()
}
struct TenantHandleEntry {
checkpointer_handle: Option<JoinHandle<()>>,
gc_handle: Option<JoinHandle<()>>,
struct Tenant {
state: TenantState,
repo: Option<Arc<dyn Repository>>,
}
// Logically these handles belong to Repository,
// but it's just simpler to store them separately
lazy_static! {
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
Mutex::new(HashMap::new());
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
pub enum TenantState {
// This tenant only exists in cloud storage. It cannot be accessed.
CloudOnly,
// This tenant exists in cloud storage, and we are currently downloading it to local disk.
// It cannot be accessed yet, not until it's been fully downloaded to local disk.
Downloading,
// All data for this tenant is complete on local disk, but we haven't loaded the Repository,
// Timeline and Layer structs into memory yet, so it cannot be accessed yet.
//Ready,
// This tenant exists on local disk, and the layer map has been loaded into memory.
// The local disk might have some newer files that don't exist in cloud storage yet.
Active,
// Tenant is active, but there is no walreceiver connection.
Idle,
// This tenant exists on local disk, and the layer map has been loaded into memory.
// The local disk might have some newer files that don't exist in cloud storage yet.
// The tenant cannot be accessed anymore for anything except a graceful shutdown.
Stopping,
}
impl fmt::Display for TenantState {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TenantState::CloudOnly => f.write_str("CloudOnly"),
TenantState::Downloading => f.write_str("Downloading"),
TenantState::Active => f.write_str("Active"),
TenantState::Idle => f.write_str("Idle"),
TenantState::Stopping => f.write_str("Stopping"),
}
}
}
fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
TENANTS.lock().unwrap()
}
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
pub fn init(conf: &'static PageServerConf) {
let mut m = access_repository();
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
let tenantid =
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
let repo = init_repo(conf, tenantid);
{
let mut m = access_tenants();
let tenant = Tenant {
state: TenantState::CloudOnly,
repo: None,
};
m.insert(tenantid, tenant);
}
init_repo(conf, tenantid);
info!("initialized storage for tenant: {}", &tenantid);
m.insert(tenantid, repo);
}
}
fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc<LayeredRepository> {
fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
@@ -63,22 +98,15 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc<Layered
true,
));
let checkpointer_handle = LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
let gc_handle = LayeredRepository::launch_gc_thread(conf, repo.clone());
let mut m = access_tenants();
let tenant = m.get_mut(&tenant_id).unwrap();
tenant.repo = Some(repo);
tenant.state = TenantState::Active;
let mut handles = TENANT_HANDLES.lock().unwrap();
let h = TenantHandleEntry {
checkpointer_handle: Some(checkpointer_handle),
gc_handle: Some(gc_handle),
};
handles.insert(tenant_id, h);
repo
// TODO Start these threads only if tenant actively receives some WAL
tenant_threads::start_tenant_threads(conf, tenant_id);
}
// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer.
// Relevant PR: https://github.com/zenithdb/zenith/pull/686
pub fn register_relish_download(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
@@ -89,15 +117,29 @@ pub fn register_relish_download(
tenant_id,
timeline_id
);
match access_repository().entry(tenant_id) {
Entry::Occupied(o) => init_timeline(o.get().as_ref(), timeline_id),
Entry::Vacant(v) => {
log::info!("New repo initialized");
let new_repo = init_repo(conf, tenant_id);
init_timeline(new_repo.as_ref(), timeline_id);
v.insert(new_repo);
{
let mut m = access_tenants();
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
state: TenantState::Downloading,
repo: None,
});
tenant.state = TenantState::Downloading;
match &tenant.repo {
Some(repo) => {
init_timeline(repo.as_ref(), timeline_id);
tenant.state = TenantState::Active;
return;
}
None => log::warn!("Initialize new repo"),
}
tenant.state = TenantState::Active;
}
// init repo updates Tenant state
init_repo(conf, tenant_id);
let new_repo = get_repository_for_tenant(tenant_id).unwrap();
init_timeline(new_repo.as_ref(), timeline_id);
}
fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
@@ -112,27 +154,23 @@ pub fn shutdown_requested() -> bool {
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
}
pub fn stop_tenant_threads(tenantid: ZTenantId) {
let mut handles = TENANT_HANDLES.lock().unwrap();
if let Some(h) = handles.get_mut(&tenantid) {
h.checkpointer_handle.take().map(JoinHandle::join);
debug!("checkpointer for tenant {} has stopped", tenantid);
h.gc_handle.take().map(JoinHandle::join);
debug!("gc for tenant {} has stopped", tenantid);
}
}
pub fn shutdown_all_tenants() -> Result<()> {
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
let tenants = list_tenants()?;
for tenantid in tenants {
stop_tenant_threads(tenantid);
let tenantids = list_tenantids()?;
for tenantid in &tenantids {
set_tenant_state(*tenantid, TenantState::Stopping)?;
}
for tenantid in tenantids {
// Wait for checkpointer and GC to finish their job
tenant_threads::wait_for_tenant_threads_to_stop(tenantid);
let repo = get_repository_for_tenant(tenantid)?;
debug!("shutdown tenant {}", tenantid);
repo.shutdown()?;
}
Ok(())
}
@@ -140,25 +178,67 @@ pub fn create_repository_for_tenant(
conf: &'static PageServerConf,
tenantid: ZTenantId,
) -> Result<()> {
let mut m = access_repository();
// First check that the tenant doesn't exist already
if m.get(&tenantid).is_some() {
bail!("tenant {} already exists", tenantid);
{
let mut m = access_tenants();
// First check that the tenant doesn't exist already
if m.get(&tenantid).is_some() {
bail!("tenant {} already exists", tenantid);
}
let tenant = Tenant {
state: TenantState::CloudOnly,
repo: None,
};
m.insert(tenantid, tenant);
}
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
m.insert(tenantid, repo);
let mut m = access_tenants();
let tenant = m.get_mut(&tenantid).unwrap();
tenant.repo = Some(repo);
tenant.state = TenantState::Active;
Ok(())
}
// If tenant is not found in the repository, return CloudOnly state
pub fn get_tenant_state(tenantid: ZTenantId) -> TenantState {
let m = access_tenants();
match m.get(&tenantid) {
Some(tenant) => tenant.state,
None => TenantState::CloudOnly,
}
}
pub fn set_tenant_state(tenantid: ZTenantId, state: TenantState) -> Result<TenantState> {
let mut m = access_tenants();
let tenant = m.get_mut(&tenantid);
match tenant {
Some(tenant) => {
if state == TenantState::Idle && tenant.state != TenantState::Active {
// Only Active tenant can become Idle
return Ok(tenant.state);
}
info!("set_tenant_state: {} -> {}", tenant.state, state);
tenant.state = state;
Ok(tenant.state)
}
None => bail!("Tenant not found for tenant {}", tenantid),
}
}
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
access_repository()
let m = access_tenants();
let tenant = m
.get(&tenantid)
.map(Arc::clone)
.ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid))
.ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid))?;
match &tenant.repo {
Some(repo) => Ok(Arc::clone(repo)),
None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid),
}
}
pub fn get_timeline_for_tenant(
@@ -170,13 +250,32 @@ pub fn get_timeline_for_tenant(
.with_context(|| format!("cannot fetch timeline {}", timelineid))
}
fn list_tenants() -> Result<Vec<ZTenantId>> {
let o = &mut REPOSITORY.lock().unwrap();
o.iter()
.map(|tenant| {
let (tenantid, _) = tenant;
fn list_tenantids() -> Result<Vec<ZTenantId>> {
let m = access_tenants();
m.iter()
.map(|v| {
let (tenantid, _) = v;
Ok(*tenantid)
})
.collect()
}
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
#[serde(with = "hex")]
pub id: ZTenantId,
pub state: TenantState,
}
pub fn list_tenants() -> Result<Vec<TenantInfo>> {
let m = access_tenants();
m.iter()
.map(|v| {
let (id, tenant) = v;
Ok(TenantInfo {
id: *id,
state: tenant.state,
})
})
.collect()
}

View File

@@ -0,0 +1,125 @@
//! This module contains functions to serve per-tenant background processes,
//! such as checkpointer and GC
use crate::tenant_mgr;
use crate::tenant_mgr::TenantState;
use crate::CheckpointConfig;
use crate::PageServerConf;
use anyhow::Result;
use lazy_static::lazy_static;
use std::collections::HashMap;
use std::sync::Mutex;
use std::thread::JoinHandle;
use std::time::Duration;
use tracing::*;
use zenith_utils::zid::ZTenantId;
struct TenantHandleEntry {
checkpointer_handle: Option<JoinHandle<()>>,
gc_handle: Option<JoinHandle<()>>,
}
// Preserve handles to wait for thread completion
// at shutdown
lazy_static! {
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
Mutex::new(HashMap::new());
}
pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
// Ensure that old threads are stopped
wait_for_tenant_threads_to_stop(tenantid);
let checkpointer_handle = std::thread::Builder::new()
.name("Checkpointer thread".into())
.spawn(move || {
checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
})
.ok();
let gc_handle = std::thread::Builder::new()
.name("GC thread".into())
.spawn(move || {
gc_loop(tenantid, conf).expect("GC thread died");
})
.ok();
// TODO handle thread errors if any
let mut handles = TENANT_HANDLES.lock().unwrap();
let h = TenantHandleEntry {
checkpointer_handle,
gc_handle,
};
handles.insert(tenantid, h);
}
pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
let mut handles = TENANT_HANDLES.lock().unwrap();
if let Some(h) = handles.get_mut(&tenantid) {
h.checkpointer_handle.take().map(JoinHandle::join);
trace!("checkpointer for tenant {} has stopped", tenantid);
h.gc_handle.take().map(JoinHandle::join);
trace!("gc for tenant {} has stopped", tenantid);
}
handles.remove(&tenantid);
}
///
/// Checkpointer thread's main loop
///
fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
break;
}
std::thread::sleep(conf.checkpoint_period);
trace!("checkpointer thread for tenant {} waking up", tenantid);
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
// bytes of WAL since last checkpoint.
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
}
trace!(
"checkpointer thread stopped for tenant {} state is {}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}
///
/// GC thread's main loop
///
fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
break;
}
trace!("gc thread for tenant {} waking up", tenantid);
// Garbage collect old files that are not needed for PITR anymore
if conf.gc_horizon > 0 {
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.gc_iteration(None, conf.gc_horizon, false).unwrap();
}
// TODO: rewrite this in a more adequate way using
// condvar.wait_timeout() or something
let mut sleep_time = conf.gc_period.as_secs();
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == TenantState::Active {
sleep_time -= 1;
std::thread::sleep(Duration::from_secs(1));
}
}
trace!(
"GC thread stopped for tenant {} state is {}",
tenantid,
tenant_mgr::get_tenant_state(tenantid)
);
Ok(())
}
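
The loops above poll the tenant state once per second, as the TODO about `condvar.wait_timeout()` notes. A minimal sketch of what that could look like, assuming a hypothetical `ShutdownSignal` shared between the tenant manager and the background loops (this type and its wiring are not part of the patch):

```rust
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;

/// Hypothetical shutdown flag shared between the tenant manager and a background loop.
struct ShutdownSignal {
    stopped: Mutex<bool>,
    condvar: Condvar,
}

impl ShutdownSignal {
    fn new() -> Arc<Self> {
        Arc::new(Self {
            stopped: Mutex::new(false),
            condvar: Condvar::new(),
        })
    }

    /// Called when the tenant leaves the Active state.
    fn request_stop(&self) {
        *self.stopped.lock().unwrap() = true;
        self.condvar.notify_all();
    }

    /// Sleep for `period`, waking up early if a stop was requested.
    /// Returns true if the loop should keep running.
    fn sleep_or_stop(&self, period: Duration) -> bool {
        let guard = self.stopped.lock().unwrap();
        let (guard, _timed_out) = self
            .condvar
            .wait_timeout_while(guard, period, |stopped| !*stopped)
            .unwrap();
        !*guard
    }
}
```

With something along these lines, `gc_loop` could replace the one-second polling with a single `signal.sleep_or_stop(conf.gc_period)` call and still react to state changes promptly.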

View File

@@ -284,12 +284,14 @@ fn walreceiver_main(
if let Some(last_lsn) = status_update {
// TODO: More thought should go into what values are sent here.
let last_lsn = PgLsn::from(u64::from(last_lsn));
let write_lsn = last_lsn;
// We are using the disk consistent LSN as `write_lsn`, i.e. the LSN up to which the page
// server can guarantee persistence of all received data. The safekeeper is now free to
// remove WAL preceding `write_lsn`: it will not be requested by this page server.
let write_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
let flush_lsn = last_lsn;
let apply_lsn = PgLsn::from(0);
let ts = SystemTime::now();
const NO_REPLY: u8 = 0;
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
}

View File

@@ -153,6 +153,13 @@ struct WalRedoRequest {
records: Vec<(Lsn, WALRecord)>,
}
impl WalRedoRequest {
// Can this request be served by zenith redo functions,
// or do we need to pass it to the wal-redo postgres process?
fn can_apply_in_zenith(&self) -> bool {
!matches!(self.rel, RelishTag::Relation(_))
}
}
/// An error happened in WAL redo
#[derive(Debug, thiserror::Error)]
pub enum WalRedoError {
@@ -161,6 +168,8 @@ pub enum WalRedoError {
#[error("cannot perform WAL redo now")]
InvalidState,
#[error("cannot perform WAL redo for this request")]
InvalidRequest,
}
///
@@ -182,7 +191,6 @@ impl WalRedoManager for PostgresRedoManager {
records: Vec<(Lsn, WALRecord)>,
) -> Result<Bytes, WalRedoError> {
let start_time;
let lock_time;
let end_time;
let request = WalRedoRequest {
@@ -194,9 +202,16 @@ impl WalRedoManager for PostgresRedoManager {
};
start_time = Instant::now();
let result = {
let result;
if request.can_apply_in_zenith() {
result = self.handle_apply_request_zenith(&request);
end_time = Instant::now();
WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
} else {
let mut process_guard = self.process.lock().unwrap();
lock_time = Instant::now();
let lock_time = Instant::now();
// launch the WAL redo process on first use
if process_guard.is_none() {
@@ -207,13 +222,14 @@ impl WalRedoManager for PostgresRedoManager {
}
let process = process_guard.as_mut().unwrap();
self.runtime
.block_on(self.handle_apply_request(process, &request))
};
end_time = Instant::now();
result = self
.runtime
.block_on(self.handle_apply_request_postgres(process, &request));
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
end_time = Instant::now();
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
}
result
}
@@ -242,13 +258,47 @@ impl PostgresRedoManager {
}
///
/// Process one request for WAL redo.
/// Process one request for WAL redo using wal-redo postgres
///
async fn handle_apply_request(
async fn handle_apply_request_postgres(
&self,
process: &mut PostgresRedoProcess,
request: &WalRedoRequest,
) -> Result<Bytes, WalRedoError> {
let blknum = request.blknum;
let lsn = request.lsn;
let base_img = request.base_img.clone();
let records = &request.records;
let nrecords = records.len();
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
if let RelishTag::Relation(rel) = request.rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
let duration = start.elapsed();
debug!(
"postgres applied {} WAL records in {} ms to reconstruct page image at LSN {}",
nrecords,
duration.as_millis(),
lsn
);
apply_result.map_err(WalRedoError::IoError)
} else {
Err(WalRedoError::InvalidRequest)
}
}
///
/// Process one request for WAL redo using custom zenith code
///
fn handle_apply_request_zenith(&self, request: &WalRedoRequest) -> Result<Bytes, WalRedoError> {
let rel = request.rel;
let blknum = request.blknum;
let lsn = request.lsn;
@@ -260,178 +310,158 @@ impl PostgresRedoManager {
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
if let RelishTag::Relation(rel) = rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
// If full-page image is provided, then use it...
page.extend_from_slice(&fpi[..]);
} else {
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
// If full-page image is provided, then use it...
page.extend_from_slice(&fpi[..]);
} else {
// otherwise initialize page with zeros
page.extend_from_slice(&ZERO_PAGE);
// otherwise initialize page with zeros
page.extend_from_slice(&ZERO_PAGE);
}
// Apply all collected WAL records
for (_lsn, record) in records {
let mut buf = record.rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
}
// Apply all collected WAL records
for (_lsn, record) in records {
let mut buf = record.rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
}
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let rec_segno = match rel {
RelishTag::Slru { slru, segno } => {
assert!(
slru == SlruKind::Clog,
"Not valid XACT relish tag {:?}",
rel
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let rec_segno = match rel {
RelishTag::Slru { slru, segno } => {
assert!(
slru == SlruKind::Clog,
"Not valid XACT relish tag {:?}",
rel
);
segno
}
_ => panic!("Not valid XACT relish tag {:?}", rel),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
segno
}
_ => panic!("Not valid XACT relish tag {:?}", rel),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
}
}
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
}
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multixact operations
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
if let RelishTag::Slru {
slru,
segno: rec_segno,
} = rel
{
if slru == SlruKind::MultiXactMembers {
for i in 0..xlrec.nmembers {
let pageno =
i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if segno == rec_segno && rpageno == blknum {
// update only target block
let offset = xlrec.moff + i;
let memberoff = mx_offset_to_member_offset(offset);
let flagsoff = mx_offset_to_flags_offset(offset);
let bshift = mx_offset_to_flags_bitshift(offset);
let mut flagsval =
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &= !(((1
<< pg_constants::MXACT_MEMBER_BITS_PER_XACT)
- 1)
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
// only update xids on the requested page
if rec_segno == segno && blknum == rpageno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
}
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multixact operations
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
if let RelishTag::Slru {
slru,
segno: rec_segno,
} = rel
{
if slru == SlruKind::MultiXactMembers {
for i in 0..xlrec.nmembers {
let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
if segno == rec_segno && rpageno == blknum {
// update only target block
let offset = xlrec.moff + i;
let memberoff = mx_offset_to_member_offset(offset);
let flagsoff = mx_offset_to_flags_offset(offset);
let bshift = mx_offset_to_flags_bitshift(offset);
let mut flagsval =
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &=
!(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
<< bshift);
flagsval |= xlrec.members[i as usize].status << bshift;
LittleEndian::write_u32(
&mut page[flagsoff..flagsoff + 4],
flagsval,
);
LittleEndian::write_u32(
&mut page[memberoff..memberoff + 4],
xlrec.members[i as usize].xid,
);
}
flagsval |= xlrec.members[i as usize].status << bshift;
LittleEndian::write_u32(
&mut page[flagsoff..flagsoff + 4],
flagsval,
);
LittleEndian::write_u32(
&mut page[memberoff..memberoff + 4],
xlrec.members[i as usize].xid,
);
}
} else {
// Multixact offsets SLRU
let offs = (xlrec.mid
% pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
* 4) as usize;
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
}
} else {
panic!();
// Multixact offsets SLRU
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
* 4) as usize;
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
}
} else {
panic!();
}
} else {
panic!();
}
}
apply_result = Ok::<Bytes, Error>(page.freeze());
}
apply_result = Ok::<Bytes, Error>(page.freeze());
let duration = start.elapsed();
let result: Result<Bytes, WalRedoError>;
debug!(
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
"zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}",
nrecords,
duration.as_millis(),
lsn
);
if let Err(e) = apply_result {
error!("could not apply WAL records: {:#}", e);
result = Err(WalRedoError::IoError(e));
} else {
let img = apply_result.unwrap();
result = Ok(img);
}
// The caller is responsible for sending the response
result
apply_result.map_err(WalRedoError::IoError)
}
}

View File

@@ -9,7 +9,6 @@
use crate::pg_constants;
use crate::CheckPoint;
use crate::ControlFileData;
use crate::FullTransactionId;
use crate::XLogLongPageHeaderData;
use crate::XLogPageHeaderData;
@@ -18,8 +17,8 @@ use crate::XLOG_PAGE_MAGIC;
use anyhow::{bail, Result};
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use bytes::{Buf, Bytes};
use bytes::{BufMut, BytesMut};
use crc32c::*;
use log::*;
use std::cmp::max;
@@ -410,27 +409,25 @@ impl CheckPoint {
}
//
// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record.
// Generate new, empty WAL segment.
// We need this segment to start the compute node.
// In order to minimize changes in Postgres core, we prefer to
// provide a WAL segment from which it can extract the checkpoint record in the standard way,
// rather than implement some alternative mechanism.
//
pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);
let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
let hdr = XLogLongPageHeaderData {
std: {
XLogPageHeaderData {
xlp_magic: XLOG_PAGE_MAGIC as u16,
xlp_info: pg_constants::XLP_LONG_HEADER,
xlp_tli: 1, // FIXME: always use Postgres timeline 1
xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64,
xlp_pageaddr: pageaddr,
xlp_rem_len: 0,
..Default::default() // Put 0 in padding fields.
}
},
xlp_sysid: pg_control.system_identifier,
xlp_sysid: system_id,
xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32,
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
};
@@ -438,36 +435,6 @@ pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
let hdr_bytes = hdr.encode();
seg_buf.extend_from_slice(&hdr_bytes);
let rec_hdr = XLogRecord {
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD
+ SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT
+ SIZEOF_CHECKPOINT) as u32,
xl_xid: 0, //0 is for InvalidTransactionId
xl_prev: 0,
xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN,
xl_rmid: pg_constants::RM_XLOG_ID,
xl_crc: 0,
..Default::default() // Put 0 in padding fields.
};
let mut rec_shord_hdr_bytes = BytesMut::new();
rec_shord_hdr_bytes.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT);
rec_shord_hdr_bytes.put_u8(SIZEOF_CHECKPOINT as u8);
let rec_bytes = rec_hdr.encode();
let checkpoint_bytes = pg_control.checkPointCopy.encode();
//calculate record checksum
let mut crc = 0;
crc = crc32c_append(crc, &rec_shord_hdr_bytes[..]);
crc = crc32c_append(crc, &checkpoint_bytes[..]);
crc = crc32c_append(crc, &rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
seg_buf.extend_from_slice(&rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
seg_buf.put_u32_le(crc);
seg_buf.extend_from_slice(&rec_shord_hdr_bytes);
seg_buf.extend_from_slice(&checkpoint_bytes);
//zero out the rest of the file
seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
seg_buf.freeze()
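
A minimal usage sketch of the new signature, under the assumption that a caller still takes the system identifier from the control file as the old code did (the segment number and the helper function below are illustrative, not part of this patch):

```rust
// Illustrative only: build an empty segment for a hypothetical segment number,
// reusing the system identifier that the old signature read from pg_control.
fn example_empty_segment(pg_control: &ControlFileData) -> Bytes {
    let segno: u64 = 1; // hypothetical; now chosen by the caller
    let seg = generate_wal_segment(segno, pg_control.system_identifier);
    assert_eq!(seg.len(), pg_constants::WAL_SEGMENT_SIZE);
    seg
}
```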

View File

@@ -12,7 +12,14 @@ pub struct DatabaseInfo {
pub port: u16,
pub dbname: String,
pub user: String,
pub password: String,
pub password: Option<String>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct ProxyAuthResult {
pub ready: bool,
pub error: Option<String>,
pub conn_info: Option<DatabaseInfo>,
}
impl DatabaseInfo {
@@ -24,12 +31,23 @@ impl DatabaseInfo {
.next()
.ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
}
}
pub fn conn_string(&self) -> String {
format!(
"dbname={} user={} password={}",
self.dbname, self.user, self.password
)
impl From<DatabaseInfo> for tokio_postgres::Config {
fn from(db_info: DatabaseInfo) -> Self {
let mut config = tokio_postgres::Config::new();
config
.host(&db_info.host)
.port(db_info.port)
.dbname(&db_info.dbname)
.user(&db_info.user);
if let Some(password) = db_info.password {
config.password(password);
}
config
}
}
@@ -44,22 +62,25 @@ impl CPlaneApi {
database: &str,
md5_response: &[u8],
salt: &[u8; 4],
) -> Result<DatabaseInfo> {
psql_session_id: &str,
) -> Result<ProxyAuthResult> {
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
url.query_pairs_mut()
.append_pair("login", user)
.append_pair("database", database)
.append_pair("md5response", std::str::from_utf8(md5_response)?)
.append_pair("salt", &hex::encode(salt));
.append_pair("salt", &hex::encode(salt))
.append_pair("psql_session_id", psql_session_id);
println!("cplane request: {}", url.as_str());
let resp = reqwest::blocking::get(url)?;
if resp.status().is_success() {
let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
println!("got conn info: #{:?}", conn_info);
Ok(conn_info)
let auth_info: ProxyAuthResult = serde_json::from_str(resp.text()?.as_str())?;
println!("got auth info: #{:?}", auth_info);
Ok(auth_info)
} else {
bail!("Auth failed")
}

View File

@@ -7,7 +7,7 @@
///
use std::{
collections::HashMap,
net::{SocketAddr, TcpListener},
net::SocketAddr,
sync::{mpsc, Arc, Mutex},
thread,
};
@@ -17,6 +17,7 @@ use clap::{App, Arg, ArgMatches};
use cplane_api::DatabaseInfo;
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
use zenith_utils::tcp_listener;
mod cplane_api;
mod mgmt;
@@ -99,7 +100,7 @@ fn main() -> anyhow::Result<()> {
.long("uri")
.takes_value(true)
.help("redirect unauthenticated users to given uri")
.default_value("http://127.0.0.1:3000/psql_session/"),
.default_value("http://localhost:3000/psql_session/"),
)
.arg(
Arg::with_name("auth-endpoint")
@@ -107,7 +108,7 @@ fn main() -> anyhow::Result<()> {
.long("auth-endpoint")
.takes_value(true)
.help("redirect unauthenticated users to given uri")
.default_value("http://127.0.0.1:3000/authenticate_proxy_request/"),
.default_value("http://localhost:3000/authenticate_proxy_request/"),
)
.arg(
Arg::with_name("ssl-key")
@@ -140,23 +141,23 @@ fn main() -> anyhow::Result<()> {
// Check that we can bind to address before further initialization
println!("Starting proxy on {}", state.conf.proxy_address);
let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;
let pageserver_listener = tcp_listener::bind(state.conf.proxy_address)?;
println!("Starting mgmt on {}", state.conf.mgmt_address);
let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?;
let threads = vec![
let threads = [
// Spawn a thread to listen for connections. It will spawn further threads
// for each connection.
thread::Builder::new()
.name("Proxy thread".into())
.name("Listener thread".into())
.spawn(move || proxy::thread_main(state, pageserver_listener))?,
thread::Builder::new()
.name("Mgmt thread".into())
.spawn(move || mgmt::thread_main(state, mgmt_listener))?,
];
for t in threads.into_iter() {
for t in threads {
t.join().unwrap()?;
}

View File

@@ -6,7 +6,6 @@ use anyhow::bail;
use tokio_postgres::NoTls;
use rand::Rng;
use std::io::Write;
use std::{io, sync::mpsc::channel, thread};
use zenith_utils::postgres_backend::Stream;
use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
@@ -28,11 +27,13 @@ pub fn thread_main(
println!("accepted connection from {}", peer_addr);
socket.set_nodelay(true).unwrap();
thread::spawn(move || {
if let Err(err) = proxy_conn_main(state, socket) {
println!("error: {}", err);
}
});
thread::Builder::new()
.name("Proxy thread".into())
.spawn(move || {
if let Err(err) = proxy_conn_main(state, socket) {
println!("error: {}", err);
}
})?;
}
}
@@ -74,8 +75,12 @@ pub fn proxy_conn_main(
// This will set conn.existing_user and we can decide on next actions
conn.handle_startup()?;
let mut psql_session_id_buf = [0u8; 8];
rand::thread_rng().fill(&mut psql_session_id_buf);
conn.psql_session_id = hex::encode(psql_session_id_buf);
// Both scenarios here should end up producing a database connection string
let db_info = if conn.is_existing_user() {
let conn_info = if conn.is_existing_user() {
conn.handle_existing_user()?
} else {
conn.handle_new_user()?
@@ -83,7 +88,7 @@ pub fn proxy_conn_main(
// XXX: move that inside handle_new_user/handle_existing_user to be able to
// report wrong connection error.
proxy_pass(conn.pgb, db_info)
proxy_pass(conn.pgb, conn_info)
}
impl ProxyConnection {
@@ -155,9 +160,25 @@ impl ProxyConnection {
Ok(())
}
// Wait for a proxy kick from the console with conninfo
fn wait_for_conninfo(&mut self) -> anyhow::Result<DatabaseInfo> {
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
let _ = self
.state
.waiters
.lock()
.unwrap()
.insert(self.psql_session_id.clone(), tx);
// Wait for web console response
// TODO: respond with error to client
rx.recv()?
}
fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
// ask password
rand::thread_rng().fill(&mut self.md5_salt);
self.pgb
.write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
self.pgb.state = ProtoState::Authentication; // XXX
@@ -180,14 +201,41 @@ impl ProxyConnection {
self.database.as_str(),
md5_response,
&self.md5_salt,
&self.psql_session_id,
) {
Err(e) => {
self.pgb
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
"cannot authenticate proxy: {}",
e
)))?;
bail!("auth failed: {}", e);
}
Ok(conn_info) => {
Ok(auth_info) => {
let conn_info = if auth_info.ready {
// Cluster is ready, so just take `conn_info` and respond to the client.
auth_info
.conn_info
.expect("conn_info should be provided with ready cluster")
} else {
match auth_info.error {
Some(e) => {
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
"cannot authenticate proxy: {}",
e
)))?;
bail!("auth failed: {}", e);
}
None => {
// Cluster exists but isn't active yet; await its start and the proxy kick
// with `conn_info`.
self.wait_for_conninfo()?
}
}
};
self.pgb
.write_message_noflush(&BeMessage::AuthenticationOk)?;
self.pgb
@@ -203,10 +251,6 @@ impl ProxyConnection {
}
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
let mut psql_session_id_buf = [0u8; 8];
rand::thread_rng().fill(&mut psql_session_id_buf);
self.psql_session_id = hex::encode(psql_session_id_buf);
let hello_message = format!("☀️ Welcome to Zenith!
To proceed with database creation, open the following link:
@@ -225,76 +269,83 @@ databases without opening the browser.
self.pgb
.write_message(&BeMessage::NoticeResponse(hello_message))?;
// await for database creation
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
let _ = self
.state
.waiters
.lock()
.unwrap()
.insert(self.psql_session_id.clone(), tx);
// Wait for web console response
// XXX: respond with error to client
let dbinfo = rx.recv()??;
// We requested the DB creation from the console. Now wait for conninfo
let conn_info = self.wait_for_conninfo()?;
self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
"Connecting to database.".to_string(),
))?;
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
Ok(dbinfo)
Ok(conn_info)
}
}
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
let config = tokio_postgres::Config::from(db_info);
let _ = config.connect_raw(&mut socket, NoTls).await?;
Ok(socket)
}
/// Concurrently proxy both directions of the client and server connections
fn proxy(
client_read: ReadStream,
client_write: WriteStream,
server_read: ReadStream,
server_write: WriteStream,
(client_read, client_write): (ReadStream, WriteStream),
(server_read, server_write): (ReadStream, WriteStream),
) -> anyhow::Result<()> {
fn do_proxy(mut reader: ReadStream, mut writer: WriteStream) -> io::Result<()> {
std::io::copy(&mut reader, &mut writer)?;
writer.flush()?;
writer.shutdown(std::net::Shutdown::Both)
fn do_proxy(mut reader: impl io::Read, mut writer: WriteStream) -> io::Result<u64> {
/// FlushWriter will make sure that every message is sent as soon as possible
struct FlushWriter<W>(W);
impl<W: io::Write> io::Write for FlushWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
// `std::io::copy` is guaranteed to exit if we return an error,
// so we can afford to lose `res` in case `flush` fails
let res = self.0.write(buf);
if res.is_ok() {
self.0.flush()?;
}
res
}
fn flush(&mut self) -> io::Result<()> {
self.0.flush()
}
}
let res = std::io::copy(&mut reader, &mut FlushWriter(&mut writer));
writer.shutdown(std::net::Shutdown::Both)?;
res
}
let client_to_server_jh = thread::spawn(move || do_proxy(client_read, server_write));
let res1 = do_proxy(server_read, client_write);
let res2 = client_to_server_jh.join().unwrap();
res1?;
res2?;
do_proxy(server_read, client_write)?;
client_to_server_jh.join().unwrap()?;
Ok(())
}
/// Proxy a client connection to a postgres database
fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let db_stream = runtime.block_on(connect_to_db(db_info))?;
let db_stream = db_stream.into_std()?;
db_stream.set_nonblocking(false)?;
let db_stream = {
// We'll get rid of this once migration to async is complete
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let db_stream = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
let (db_read, db_write) = db_stream.split();
let stream = runtime.block_on(connect_to_db(db_info))?.into_std()?;
stream.set_nonblocking(false)?;
stream
};
let stream = match pgb.into_stream() {
let db = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
let client = match pgb.into_stream() {
Stream::Bidirectional(bidi_stream) => bidi_stream,
_ => bail!("invalid stream"),
};
let (client_read, client_write) = stream.split();
proxy(client_read, client_write, db_read, db_write)
proxy(client.split(), db.split())
}
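
The `wait_for_conninfo()` helper above parks the proxy thread on a per-session channel. A minimal, hypothetical sketch of the other side of that hand-off (the real kick lives in the mgmt module, which is not shown in this diff; the `Waiters` alias, the `kick` function, and the import path are illustrative assumptions):

```rust
use std::collections::HashMap;
use std::sync::{mpsc, Mutex};

use crate::cplane_api::DatabaseInfo; // path assumed for this sketch

/// Waiters keyed by psql_session_id, matching how wait_for_conninfo() registers itself.
type Waiters = Mutex<HashMap<String, mpsc::Sender<anyhow::Result<DatabaseInfo>>>>;

/// Hypothetical: called from the mgmt side when the console reports the
/// cluster is ready, resolving the waiting proxy connection.
fn kick(waiters: &Waiters, psql_session_id: &str, db_info: DatabaseInfo) {
    if let Some(tx) = waiters.lock().unwrap().remove(psql_session_id) {
        // Ignore send errors: the client may have disconnected while waiting.
        let _ = tx.send(Ok(db_info));
    }
}
```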

View File

@@ -11,12 +11,19 @@ pyjwt = {extras = ["crypto"], version = "*"}
requests = "*"
pytest-xdist = "*"
asyncpg = "*"
cached-property = "*"
[dev-packages]
yapf = "*"
# Behavior may change slightly between versions. These are run continuously,
# so we pin exact versions to avoid surprising breaks. Update if comfortable.
yapf = "==0.31.0"
mypy = "==0.910"
# Non-pinned packages follow.
pipenv = "*"
flake8 = "*"
mypy = "*"
types-requests = "*"
types-psycopg2 = "*"
[requires]
# we need at least 3.6, but pipenv doesn't allow to say this directly
# we need at least 3.7, but pipenv doesn't allow to say this directly
python_version = "3"

352
test_runner/Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "3cdc048691824d0b93912b6b78a0aa01dc98f278212c1badb0cc2edbd2103c3a"
"sha256": "63b72760ef37375186a638066ba0ad5804dbace99ddc503ea654e9749070ab24"
},
"pipfile-spec": 6,
"requires": {
@@ -43,94 +43,108 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.2.0"
},
"cached-property": {
"hashes": [
"sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
],
"index": "pypi",
"version": "==1.5.2"
},
"certifi": {
"hashes": [
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.5.30"
"version": "==2021.10.8"
},
"cffi": {
"hashes": [
"sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d",
"sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771",
"sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872",
"sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c",
"sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc",
"sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762",
"sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202",
"sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5",
"sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548",
"sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a",
"sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f",
"sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20",
"sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218",
"sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c",
"sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e",
"sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56",
"sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224",
"sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a",
"sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2",
"sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a",
"sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819",
"sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346",
"sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b",
"sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e",
"sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534",
"sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb",
"sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0",
"sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156",
"sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd",
"sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87",
"sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc",
"sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195",
"sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33",
"sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f",
"sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d",
"sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd",
"sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728",
"sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7",
"sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca",
"sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99",
"sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf",
"sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e",
"sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c",
"sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5",
"sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69"
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
],
"version": "==1.14.6"
"version": "==1.15.0"
},
"charset-normalizer": {
"hashes": [
"sha256:5d209c0a931f215cee683b6445e2d77677e7e75e159f78def0db09d68fafcaa6",
"sha256:5ec46d183433dcbd0ab716f2d7f29d8dee50505b3fdb40c6b985c7c4f5a3591f"
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
],
"markers": "python_version >= '3'",
"version": "==2.0.6"
"version": "==2.0.7"
},
"cryptography": {
"hashes": [
"sha256:0a7dcbcd3f1913f664aca35d47c1331fce738d44ec34b7be8b9d332151b0b01e",
"sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b",
"sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7",
"sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085",
"sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc",
"sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d",
"sha256:3fa3a7ccf96e826affdf1a0a9432be74dc73423125c8f96a909e3835a5ef194a",
"sha256:5b0fbfae7ff7febdb74b574055c7466da334a5371f253732d7e2e7525d570498",
"sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89",
"sha256:8695456444f277af73a4877db9fc979849cd3ee74c198d04fc0776ebc3db52b9",
"sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c",
"sha256:94fff993ee9bc1b2440d3b7243d488c6a3d9724cc2b09cdb297f6a886d040ef7",
"sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb",
"sha256:a00cf305f07b26c351d8d4e1af84ad7501eca8a342dedf24a7acb0e7b7406e14",
"sha256:a305600e7a6b7b855cd798e00278161b681ad6e9b7eca94c721d5f588ab212af",
"sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e",
"sha256:d2a6e5ef66503da51d2110edf6c403dc6b494cc0082f85db12f54e9c5d4c3ec5",
"sha256:d9ec0e67a14f9d1d48dd87a2531009a9b251c02ea42851c060b25c782516ff06",
"sha256:f44d141b8c4ea5eb4dbc9b3ad992d45580c1d22bf5e24363f2fbf50c2d7ae8a7"
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
],
"version": "==3.4.8"
"version": "==35.0.0"
},
"execnet": {
"hashes": [
@@ -142,11 +156,19 @@
},
"idna": {
"hashes": [
"sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
"sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.2"
"version": "==3.3"
},
"importlib-metadata": {
"hashes": [
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
],
"markers": "python_version < '3.8'",
"version": "==4.8.1"
},
"iniconfig": {
"hashes": [
@@ -207,11 +229,11 @@
"crypto"
],
"hashes": [
"sha256:934d73fbba91b0483d3857d1aff50e96b2a892384ee2c17417ed3203f173fca1",
"sha256:fba44e7898bbca160a2b2b501f492824fc8382485d3a6f11ba5d0c1937ce6130"
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
"sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
],
"index": "pypi",
"version": "==2.1.0"
"version": "==2.3.0"
},
"pyparsing": {
"hashes": [
@@ -272,21 +294,67 @@
},
"urllib3": {
"hashes": [
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.6"
"version": "==1.26.7"
},
"zipp": {
"hashes": [
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
],
"markers": "python_version >= '3.6'",
"version": "==3.6.0"
}
},
"develop": {
"backports.entry-points-selectable": {
"hashes": [
"sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
"sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
],
"markers": "python_version >= '2.7'",
"version": "==1.1.0"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.10.8"
},
"distlib": {
"hashes": [
"sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
"sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
],
"version": "==0.3.3"
},
"filelock": {
"hashes": [
"sha256:2b5eb3589e7fdda14599e7eb1a50e09b4cc14f34ed98b8ba56d33bfaafcbef2f",
"sha256:34a9f35f95c441e7b38209775d6e0337f9a3759f3565f6c5798f19618527c76f"
],
"markers": "python_version >= '3.6'",
"version": "==3.3.1"
},
"flake8": {
"hashes": [
"sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b",
"sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
"sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
],
"index": "pypi",
"version": "==3.9.2"
"version": "==4.0.1"
},
"importlib-metadata": {
"hashes": [
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
],
"markers": "python_version < '3.8'",
"version": "==4.8.1"
},
"mccabe": {
"hashes": [
@@ -331,21 +399,45 @@
],
"version": "==0.4.3"
},
"pipenv": {
"hashes": [
"sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
"sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
],
"index": "pypi",
"version": "==2021.5.29"
},
"platformdirs": {
"hashes": [
"sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
"sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
],
"markers": "python_version >= '3.6'",
"version": "==2.4.0"
},
"pycodestyle": {
"hashes": [
"sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068",
"sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
"sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.7.0"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==2.8.0"
},
"pyflakes": {
"hashes": [
"sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3",
"sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
"sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.3.1"
"version": "==2.4.0"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"toml": {
"hashes": [
@@ -355,6 +447,58 @@
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.2"
},
"typed-ast": {
"hashes": [
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
],
"markers": "python_version < '3.8'",
"version": "==1.4.3"
},
"types-psycopg2": {
"hashes": [
"sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
"sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
],
"index": "pypi",
"version": "==2.9.1"
},
"types-requests": {
"hashes": [
"sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
"sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
],
"index": "pypi",
"version": "==2.25.11"
},
"typing-extensions": {
"hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
@@ -364,6 +508,22 @@
"index": "pypi",
"version": "==3.10.0.2"
},
"virtualenv": {
"hashes": [
"sha256:10062e34c204b5e4ec5f62e6ef2473f8ba76513a9a617e873f1f8fb4a519d300",
"sha256:bcc17f0b3a29670dd777d6f0755a4c04f28815395bca279cdcb213b97199a6b8"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==20.8.1"
},
"virtualenv-clone": {
"hashes": [
"sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
"sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.5.7"
},
"yapf": {
"hashes": [
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
@@ -371,6 +531,14 @@
],
"index": "pypi",
"version": "==0.31.0"
},
"zipp": {
"hashes": [
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
],
"markers": "python_version >= '3.6'",
"version": "==3.6.0"
}
}
}

View File

@@ -3,10 +3,13 @@
This directory contains integration tests.
Prerequisites:
- Python 3.6 or later
- Python 3.7 or later
- Development headers may also be needed to build `psycopg2` from source.
- Python 3.7 is recommended if you want to update tests.
- Dependencies: install them via `pipenv install`. Note that Debian/Ubuntu
packages of these dependencies are commonly stale, so installing them through the
system package manager is not recommended.
The exact version of `pipenv` is not important unless you change dependencies.
Run `pipenv shell` to activate the venv or use `pipenv run` to run a single
command in the venv, e.g. `pipenv run pytest`.
- Zenith and Postgres binaries
@@ -53,8 +56,8 @@ Useful environment variables:
should go.
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
Let stdout and stderr go to the terminal instead of capturing them:
`pytest -s ...`
Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them:
`pytest -s --log-cli-level=INFO ...`
(Note many tests capture subprocess outputs separately, so this may not
show much.)
@@ -62,44 +65,87 @@ Exit after the first test failure:
`pytest -x ...`
(there are many more pytest options; run `pytest -h` to see them.)
### Writing a test
### Building new tests
Every test needs a Zenith Environment, or ZenithEnv to operate in. A Zenith Environment
is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and
compute Postgres nodes. The connections between them can be configured to use JWT
authentication tokens, and some other configuration options can be tweaked too.
The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env`
fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
or make other destructive changes in that environment. Also don't assume that
there are no tenants or branches or data in the cluster. For convenience, there is a
branch called `empty`, though. The convention is to create a test-specific branch of
that and load any test data there, instead of the 'main' branch.
Essentially, this means that each time you see a fixture named as an input parameter, the function with that name will be run and passed as a parameter to the function.
So this code:
For more complicated cases, you can build a custom Zenith Environment, with the `zenith_env`
fixture:
```python
def test_something(zenith_cli, pg_bin):
pass
def test_foobar(zenith_env_builder: ZenithEnvBuilder):
# Prescribe the environment.
# We want to have 3 safekeeper nodes, and use JWT authentication in the
# connections to the page server
zenith_env_builder.num_safekeepers = 3
zenith_env_builder.set_pageserver_auth(True)
# Now create the environment. This initializes the repository, and starts
# up the page server and the safekeepers
env = zenith_env_builder.init()
# Run the test
...
```
... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html
Fixtures can't be imported using the normal python syntax. Instead, use this:
At the end of a test, all the nodes in the environment are automatically stopped, so you
don't need to worry about cleaning up. Logs and test data are preserved for analysis
in a directory under `../test_output/<testname>`.
```python
pytest_plugins = ("fixtures.something")
### Before submitting a patch
#### Obligatory checks
Install dev dependencies via `pipenv --python 3.7 install --dev` (better)
or `pipenv install --dev` (if you don't have Python 3.7 and don't need to change dependencies).
We force code formatting via yapf and type hints via mypy.
Run the following commands in the `test_runner/` directory:
```bash
pipenv run yapf -ri . # All code is reformatted
pipenv run mypy . # Ensure there are no typing errors
```
That will make all the fixtures in the `fixtures/something.py` file available.
Anything that's likely to be used in multiple tests should be built into a fixture.
Note that fixtures can clean up after themselves if they use the `yield` syntax.
Cleanup will happen even if the test fails (raises an unhandled exception).
Python destructors, e.g. `__del__()`, aren't recommended for cleanup.
### Code quality
Before submitting a patch, please consider:
#### Advisable actions
* Writing a couple of docstrings to clarify the reasoning behind a new test.
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
* Formatting the code with `yapf -r -i .` (TODO: implement an opt-in pre-commit hook for that).
* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
* Adding more type hints to your code to avoid `Any`, especially:
* For fixture parameters, they are not automatically deduced.
* For function arguments and return values.
The tools can be installed with `pipenv install --dev`.
#### Changing dependencies
You have to update `Pipfile.lock` if you have changed `Pipfile`:
```bash
pipenv --python 3.7 install --dev # Re-create venv for Python 3.7 and install recent pipenv inside
pipenv run pipenv --version # Should be at least 2021.5.29
pipenv run pipenv lock # Regenerate Pipfile.lock
```
As the minimum supported version is Python 3.7 and we use it in CI,
you have to use a Python 3.7 environment when updating `Pipfile.lock`.
Otherwise some backwards-compatibility packages will be missing.
It is also important to run a recent `pipenv`;
older versions remove markers from `Pipfile.lock`.
If you don't have Python 3.7, you should install it and its headers (for `psycopg2`)
separately, e.g.:
```bash
# On Ubuntu
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install python3.7 python3.7-dev
```

View File

@@ -1,21 +1,22 @@
from contextlib import closing
from typing import Iterator
from uuid import uuid4
import psycopg2
from fixtures.zenith_fixtures import PortDistributor, Postgres, ZenithCli, ZenithPageserver, PgBin
from fixtures.zenith_fixtures import ZenithEnvBuilder
import pytest
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
ps = pageserver_auth_enabled
def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.pageserver_auth_enabled = True
env = zenith_env_builder.init()
tenant_token = ps.auth_keys.generate_tenant_token(ps.initial_tenant)
invalid_tenant_token = ps.auth_keys.generate_tenant_token(uuid4().hex)
management_token = ps.auth_keys.generate_management_token()
ps = env.pageserver
tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant)
invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex)
management_token = env.auth_keys.generate_management_token()
# this does not invoke auth check and only decodes jwt and checks it for validity
# check both tokens
@@ -23,57 +24,41 @@ def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
ps.safe_psql("status", password=management_token)
# tenant can create branches
ps.safe_psql(f"branch_create {ps.initial_tenant} new1 main", password=tenant_token)
ps.safe_psql(f"branch_create {env.initial_tenant} new1 main", password=tenant_token)
# console can create branches for tenant
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=management_token)
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=management_token)
# fail to create branch using token with different tenantid
with pytest.raises(psycopg2.DatabaseError, match='Tenant id mismatch. Permission denied'):
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=invalid_tenant_token)
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=invalid_tenant_token)
# create tenant using management token
ps.safe_psql(f"tenant_create {uuid4().hex}", password=management_token)
# fail to create tenant using tenant token
with pytest.raises(psycopg2.DatabaseError, match='Attempt to access management api with tenant scope. Permission denied'):
with pytest.raises(
psycopg2.DatabaseError,
match='Attempt to access management api with tenant scope. Permission denied'):
ps.safe_psql(f"tenant_create {uuid4().hex}", password=tenant_token)
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_compute_auth_to_pageserver(
zenith_cli: ZenithCli,
wa_factory,
pageserver_auth_enabled: ZenithPageserver,
repo_dir: str,
with_wal_acceptors: bool,
pg_bin: PgBin,
port_distributor: PortDistributor,
):
ps = pageserver_auth_enabled
# since we are in progress of refactoring protocols between compute safekeeper and page server
# use hardcoded management token in safekeeper
management_token = ps.auth_keys.generate_management_token()
def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
zenith_env_builder.pageserver_auth_enabled = True
if with_wal_acceptors:
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}"
zenith_cli.run(["branch", branch, "empty"])
if with_wal_acceptors:
wa_factory.start_n_new(3, management_token)
env.zenith_cli(["branch", branch, "main"])
with Postgres(
zenith_cli=zenith_cli,
repo_dir=repo_dir,
pg_bin=pg_bin,
tenant_id=ps.initial_tenant,
port=port_distributor.get_port(),
).create_start(
branch,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
) as pg:
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute('CREATE TABLE t(key int primary key, value text)')
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT sum(key) FROM t')
assert cur.fetchone() == (5000050000, )
pg = env.postgres.create_start(branch)
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute('CREATE TABLE t(key int primary key, value text)')
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT sum(key) FROM t')
assert cur.fetchone() == (5000050000, )

View File

@@ -1,5 +1,5 @@
import subprocess
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -8,11 +8,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Create a couple of branches off the main branch, at a historical point in time.
#
def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_branch_behind(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Branch at the point where only 100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind", "empty"])
env.zenith_cli(["branch", "test_branch_behind", "empty"])
pgmain = postgres.create_start('test_branch_behind')
pgmain = env.postgres.create_start('test_branch_behind')
log.info("postgres is running on 'test_branch_behind' branch")
main_pg_conn = pgmain.connect()
@@ -40,7 +41,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
log.info(f'LSN after 200100 rows: {lsn_b}')
# Branch at the point where only 100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
env.zenith_cli(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
# Insert many more rows. This generates enough WAL to fill a few segments.
main_cur.execute('''
@@ -55,10 +56,10 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
log.info(f'LSN after 400100 rows: {lsn_c}')
# Branch at the point where only 200100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
env.zenith_cli(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
pg_hundred = postgres.create_start("test_branch_behind_hundred")
pg_more = postgres.create_start("test_branch_behind_more")
pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
pg_more = env.postgres.create_start("test_branch_behind_more")
# On the 'hundred' branch, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
@@ -79,14 +80,17 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
# Check bad lsn's for branching
# branch at segment boundary
zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = postgres.create_start("test_branch_segment_boundary")
env.zenith_cli(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = env.postgres.create_start("test_branch_segment_boundary")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
# branch at pre-initdb lsn
#
# FIXME: This works currently, but probably shouldn't be allowed
try:
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
# FIXME: assert false, "branch with invalid LSN should have failed"
except subprocess.CalledProcessError:
log.info("Branch creation with pre-initdb LSN failed (as expected)")

View File

@@ -3,7 +3,7 @@ import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -12,19 +12,23 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test compute node start after clog truncation
#
def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_clog_truncate(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_clog_truncate", "empty"])
env.zenith_cli(["branch", "test_clog_truncate", "empty"])
# set agressive autovacuum to make sure that truncation will happen
config = [
'autovacuum_max_workers=10', 'autovacuum_vacuum_threshold=0',
'autovacuum_vacuum_insert_threshold=0', 'autovacuum_vacuum_cost_delay=0',
'autovacuum_vacuum_cost_limit=10000', 'autovacuum_naptime =1s',
'autovacuum_max_workers=10',
'autovacuum_vacuum_threshold=0',
'autovacuum_vacuum_insert_threshold=0',
'autovacuum_vacuum_cost_delay=0',
'autovacuum_vacuum_cost_limit=10000',
'autovacuum_naptime =1s',
'autovacuum_freeze_max_age=100000'
]
pg = postgres.create_start('test_clog_truncate', config_lines=config)
pg = env.postgres.create_start('test_clog_truncate', config_lines=config)
log.info('postgres is running on test_clog_truncate branch')
# Install extension containing function needed for test
@@ -61,10 +65,10 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
# create new branch after clog truncation and start a compute node on it
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
zenith_cli.run(
env.zenith_cli(
["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])
pg2 = postgres.create_start('test_clog_truncate_new')
pg2 = env.postgres.create_start('test_clog_truncate_new')
log.info('postgres is running on test_clog_truncate_new branch')
# check that new node doesn't contain truncated segment

View File

@@ -1,6 +1,6 @@
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,12 +9,13 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test starting Postgres with custom options
#
def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_config(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_config", "empty"])
env.zenith_cli(["branch", "test_config", "empty"])
# change config
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
log.info('postgres is running on test_config branch')
with closing(pg.connect()) as conn:

View File

@@ -2,7 +2,7 @@ import os
import pathlib
from contextlib import closing
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -11,15 +11,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test CREATE DATABASE when there have been relmapper changes
#
def test_createdb(
zenith_cli: ZenithCli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
):
zenith_cli.run(["branch", "test_createdb", "empty"])
def test_createdb(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_createdb", "empty"])
pg = postgres.create_start('test_createdb')
pg = env.postgres.create_start('test_createdb')
log.info("postgres is running on 'test_createdb' branch")
with closing(pg.connect()) as conn:
@@ -33,27 +29,23 @@ def test_createdb(
lsn = cur.fetchone()[0]
# Create a branch
zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])
env.zenith_cli(["branch", "test_createdb2", "test_createdb@" + lsn])
pg2 = postgres.create_start('test_createdb2')
pg2 = env.postgres.create_start('test_createdb2')
# Test that you can connect to the new database on both branches
for db in (pg, pg2):
db.connect(dbname='foodb').close()
#
# Test DROP DATABASE
#
def test_dropdb(
zenith_cli: ZenithCli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
test_output_dir
):
zenith_cli.run(["branch", "test_dropdb", "empty"])
def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
env.zenith_cli(["branch", "test_dropdb", "empty"])
pg = postgres.create_start('test_dropdb')
pg = env.postgres.create_start('test_dropdb')
log.info("postgres is running on 'test_dropdb' branch")
with closing(pg.connect()) as conn:
@@ -66,7 +58,6 @@ def test_dropdb(
cur.execute("SELECT oid FROM pg_database WHERE datname='foodb';")
dboid = cur.fetchone()[0]
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute('DROP DATABASE foodb')
@@ -76,28 +67,29 @@ def test_dropdb(
cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_after_drop = cur.fetchone()[0]
# Create two branches before and after database drop.
zenith_cli.run(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
pg_before = postgres.create_start('test_before_dropdb')
env.zenith_cli(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
pg_before = env.postgres.create_start('test_before_dropdb')
zenith_cli.run(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
pg_after = postgres.create_start('test_after_dropdb')
env.zenith_cli(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
pg_after = env.postgres.create_start('test_after_dropdb')
# Test that database exists on the branch before drop
pg_before.connect(dbname='foodb').close()
# Test that database subdir exists on the branch before drop
assert pg_before.pgdata_dir
dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
log.info(dbpath)
assert os.path.isdir(dbpath) == True
# Test that database subdir doesn't exist on the branch after drop
assert pg_after.pgdata_dir
dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
log.info(dbpath)
assert os.path.isdir(dbpath) == False
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
check_restored_datadir_content(test_output_dir, env, pg)

View File

@@ -1,6 +1,6 @@
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,10 +9,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test CREATE USER to check shared catalog restore
#
def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
zenith_cli.run(["branch", "test_createuser", "empty"])
def test_createuser(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_createuser", "empty"])
pg = postgres.create_start('test_createuser')
pg = env.postgres.create_start('test_createuser')
log.info("postgres is running on 'test_createuser' branch")
with closing(pg.connect()) as conn:
@@ -26,9 +27,9 @@ def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: Postgres
lsn = cur.fetchone()[0]
# Create a branch
zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])
env.zenith_cli(["branch", "test_createuser2", "test_createuser@" + lsn])
pg2 = postgres.create_start('test_createuser2')
pg2 = env.postgres.create_start('test_createuser2')
# Test that you can connect to new branch as a new user
assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]

View File

@@ -1,4 +1,4 @@
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -10,11 +10,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# it only checks next_multixact_id field in restored pg_control,
# since we don't have functions to check multixact internals.
#
def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
pg_bin, zenith_cli, base_dir, test_output_dir):
def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_multixact", "empty"])
pg = postgres.create_start('test_multixact')
env.zenith_cli(["branch", "test_multixact", "empty"])
pg = env.postgres.create_start('test_multixact')
log.info("postgres is running on 'test_multixact' branch")
pg_conn = pg.connect()
@@ -53,8 +53,8 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
assert int(next_multixact_id) > int(next_multixact_id_old)
# Branch at this point
zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
pg_new = postgres.create_start('test_multixact_new')
env.zenith_cli(["branch", "test_multixact_new", "test_multixact@" + lsn])
pg_new = env.postgres.create_start('test_multixact_new')
log.info("postgres is running on 'test_multixact_new' branch")
pg_new_conn = pg_new.connect()
@@ -67,4 +67,4 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
assert next_multixact_id_new == next_multixact_id
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg_new, pageserver.service_port.pg)
check_restored_datadir_content(test_output_dir, env, pg_new)

View File

@@ -1,10 +1,11 @@
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test where Postgres generates a lot of WAL, and it's garbage collected away, but
# no pages are evicted so that Postgres uses an old LSN in a GetPage request.
@@ -15,10 +16,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# just a hint that the page hasn't been modified since that LSN, and the page
# server should return the latest page version regardless of the LSN.
#
def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_old_request_lsn(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
pg = postgres.create_start('test_old_request_lsn')
env.zenith_cli(["branch", "test_old_request_lsn", "empty"])
pg = env.postgres.create_start('test_old_request_lsn')
log.info('postgres is running on test_old_request_lsn branch')
pg_conn = pg.connect()
@@ -28,7 +30,7 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
psconn = pageserver.connect()
psconn = env.pageserver.connect()
pscur = psconn.cursor()
# Create table, and insert some rows. Make it big enough that it doesn't fit in
@@ -47,20 +49,20 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
from pg_settings where name = 'shared_buffers'
''')
row = cur.fetchone()
log.info(f'shared_buffers is {row[0]}, table size {row[1]}');
log.info(f'shared_buffers is {row[0]}, table size {row[1]}')
assert int(row[0]) < int(row[1])
cur.execute('VACUUM foo');
cur.execute('VACUUM foo')
# Make a lot of updates on a single row, generating a lot of WAL. Trigger
# garbage collections so that the page server will remove old page versions.
for i in range(10):
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
for j in range(100):
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;');
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')
# All (or at least most of) the updates should've been on the same page, so
# that we haven't had to evict any dirty pages for a long time. Now run
# a query that sends GetPage@LSN requests with the old LSN.
cur.execute("SELECT COUNT(*), SUM(val) FROM foo");
cur.execute("SELECT COUNT(*), SUM(val) FROM foo")
assert cur.fetchone() == (100000, 101000)

View File

@@ -3,25 +3,28 @@ from uuid import uuid4
import pytest
import psycopg2
import requests
from fixtures.zenith_fixtures import ZenithPageserver, ZenithPageserverHttpClient
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
from typing import cast
pytest_plugins = ("fixtures.zenith_fixtures")
def test_status_psql(pageserver):
assert pageserver.safe_psql('status') == [
def test_status_psql(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
assert env.pageserver.safe_psql('status') == [
('hello world', ),
]
def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
def test_branch_list_psql(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_branch_list_main", "empty"])
env.zenith_cli(["branch", "test_branch_list_main", "empty"])
conn = pageserver.connect()
conn = env.pageserver.connect()
cur = conn.cursor()
cur.execute(f'branch_list {pageserver.initial_tenant}')
cur.execute(f'branch_list {env.initial_tenant}')
branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
branches = [x for x in branches if x['name'].startswith('test_branch_list')]
@@ -34,10 +37,10 @@ def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
assert 'ancestor_lsn' in branches[0]
# Create another branch, and start Postgres on it
zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
env.zenith_cli(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
env.zenith_cli(['pg', 'create', 'test_branch_list_experimental'])
cur.execute(f'branch_list {pageserver.initial_tenant}')
cur.execute(f'branch_list {env.initial_tenant}')
new_branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
@@ -53,18 +56,22 @@ def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
conn.close()
def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
res = zenith_cli.run(["tenant", "list"])
res.check_returncode()
tenants = res.stdout.splitlines()
assert tenants == [pageserver.initial_tenant]
def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
# don't use zenith_simple_env, because there might be other tenants there,
# left over from other tests.
env = zenith_env_builder.init()
conn = pageserver.connect()
res = env.zenith_cli(["tenant", "list"])
res.check_returncode()
tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert tenants == [env.initial_tenant]
conn = env.pageserver.connect()
cur = conn.cursor()
# check same tenant cannot be created twice
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {pageserver.initial_tenant} already exists'):
cur.execute(f'tenant_create {pageserver.initial_tenant}')
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {env.initial_tenant} already exists'):
cur.execute(f'tenant_create {env.initial_tenant}')
# create one more tenant
tenant1 = uuid4().hex
@@ -73,20 +80,20 @@ def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
cur.execute('tenant_list')
# compare tenants list
new_tenants = sorted(json.loads(cur.fetchone()[0]))
assert sorted([pageserver.initial_tenant, tenant1]) == new_tenants
new_tenants = sorted(map(lambda t: cast(str, t['id']), json.loads(cur.fetchone()[0])))
assert sorted([env.initial_tenant, tenant1]) == new_tenants
def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
client.check_status()
# check initial tenant is there
assert initial_tenant in set(client.tenant_list())
assert initial_tenant in {t['id'] for t in client.tenant_list()}
# create new tenant and check it is also there
tenant_id = uuid4()
client.tenant_create(tenant_id)
assert tenant_id.hex in set(client.tenant_list())
assert tenant_id.hex in {t['id'] for t in client.tenant_list()}
# create branch
branch_name = uuid4().hex
@@ -96,11 +103,17 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
assert branch_name in {b['name'] for b in client.branch_list(tenant_id)}
def test_pageserver_http_api_client(pageserver: ZenithPageserver):
client = pageserver.http_client()
check_client(client, pageserver.initial_tenant)
def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
client = env.pageserver.http_client()
check_client(client, env.initial_tenant)
def test_pageserver_http_api_client_auth_enabled(pageserver_auth_enabled: ZenithPageserver):
client = pageserver_auth_enabled.http_client(auth_token=pageserver_auth_enabled.auth_keys.generate_management_token())
check_client(client, pageserver_auth_enabled.initial_tenant)
def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.pageserver_auth_enabled = True
env = zenith_env_builder.init()
management_token = env.auth_keys.generate_management_token()
client = env.pageserver.http_client(auth_token=management_token)
check_client(client, env.initial_tenant)

View File

@@ -4,22 +4,22 @@ import time
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
# times, with fault_probability chance of getting a wal acceptor down or up
# along the way. 2 of 3 are always alive, so the work keeps going.
def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
# One safekeeper is enough for this test.
wa_factory.start_n_new(1)
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init()
zenith_cli.run(["branch", "test_pageserver_restart", "empty"])
pg = postgres.create_start('test_pageserver_restart',
wal_acceptors=wa_factory.get_connstrs())
env.zenith_cli(["branch", "test_pageserver_restart", "main"])
pg = env.postgres.create_start('test_pageserver_restart')
pg_conn = pg.connect()
cur = pg_conn.cursor()
@@ -41,14 +41,14 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres:
from pg_settings where name = 'shared_buffers'
''')
row = cur.fetchone()
log.info(f"shared_buffers is {row[0]}, table size {row[1]}");
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
assert int(row[0]) < int(row[1])
# Stop and restart pageserver. This is a more or less graceful shutdown, although
# the page server doesn't currently have a shutdown routine so there's no difference
# between stopping and crashing.
pageserver.stop();
pageserver.start();
env.pageserver.stop()
env.pageserver.start()
# Stopping the pageserver breaks the connection from the postgres backend to
# the page server, and causes the next query on the connection to fail. Start a new
@@ -62,6 +62,5 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres:
assert cur.fetchone() == (100000, )
# Stop the page server by force, and restart it
pageserver.stop();
pageserver.start();
env.pageserver.stop()
env.pageserver.start()

View File

@@ -1,14 +1,15 @@
from fixtures.zenith_fixtures import PostgresFactory
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pgbench(postgres: PostgresFactory, pg_bin, zenith_cli):
def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_pgbench", "empty"])
env.zenith_cli(["branch", "test_pgbench", "empty"])
pg = postgres.create_start('test_pgbench')
pg = env.postgres.create_start('test_pgbench')
log.info("postgres is running on 'test_pgbench' branch")
connstr = pg.connstr()

View File

@@ -0,0 +1,91 @@
import subprocess
from fixtures.zenith_fixtures import ZenithEnv
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Create read-only compute nodes, anchored at historical points in time.
#
# This is very similar to the 'test_branch_behind' test, but instead of
# creating branches, creates read-only nodes.
#
def test_readonly_node(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_readonly_node", "empty"])
pgmain = env.postgres.create_start('test_readonly_node')
print("postgres is running on 'test_readonly_node' branch")
main_pg_conn = pgmain.connect()
main_cur = main_pg_conn.cursor()
# Create table, and insert the first 100 rows
main_cur.execute('CREATE TABLE foo (t text)')
main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100) g
''')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_a = main_cur.fetchone()[0]
print('LSN after 100 rows: ' + lsn_a)
# Insert some more rows. (This generates enough WAL to fill a few segments.)
main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 200000) g
''')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_b = main_cur.fetchone()[0]
print('LSN after 200100 rows: ' + lsn_b)
# Insert many more rows. This generates enough WAL to fill a few segments.
main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 200000) g
''')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn_c = main_cur.fetchone()[0]
print('LSN after 400100 rows: ' + lsn_c)
# Create first read-only node at the point where only 100 rows were inserted
pg_hundred = env.postgres.create_start("test_readonly_node_hundred",
branch=f'test_readonly_node@{lsn_a}')
# And another at the point where 200100 rows were inserted
pg_more = env.postgres.create_start("test_readonly_node_more",
branch=f'test_readonly_node@{lsn_b}')
# On the 'hundred' node, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
hundred_cur = hundred_pg_conn.cursor()
hundred_cur.execute('SELECT count(*) FROM foo')
assert hundred_cur.fetchone() == (100, )
# On the 'more' node, we should see 100200 rows
more_pg_conn = pg_more.connect()
more_cur = more_pg_conn.cursor()
more_cur.execute('SELECT count(*) FROM foo')
assert more_cur.fetchone() == (200100, )
# All the rows are visible on the main branch
main_cur.execute('SELECT count(*) FROM foo')
assert main_cur.fetchone() == (400100, )
# Check creating a node at segment boundary
pg = env.postgres.create_start("test_branch_segment_boundary",
branch="test_readonly_node@0/3000000")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )
# Create node at pre-initdb lsn
try:
env.zenith_cli(["pg", "start", "test_branch_preinitdb", "test_readonly_node@0/42"])
assert False, "compute node startup with invalid LSN should have failed"
except Exception:
print("Node creation with pre-initdb LSN failed (as expected)")

View File

@@ -1,7 +1,7 @@
import pytest
from contextlib import closing
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -11,23 +11,15 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# Test restarting and recreating a postgres instance
#
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_restart_compute(
zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
wa_factory,
with_wal_acceptors: bool,
):
wal_acceptor_connstrs = None
zenith_cli.run(["branch", "test_restart_compute", "empty"])
def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
zenith_env_builder.pageserver_auth_enabled = True
if with_wal_acceptors:
wa_factory.start_n_new(3)
wal_acceptor_connstrs = wa_factory.get_connstrs()
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
pg = postgres.create_start('test_restart_compute',
wal_acceptors=wal_acceptor_connstrs)
env.zenith_cli(["branch", "test_restart_compute", "main"])
pg = env.postgres.create_start('test_restart_compute')
log.info("postgres is running on 'test_restart_compute' branch")
with closing(pg.connect()) as conn:
@@ -40,9 +32,7 @@ def test_restart_compute(
log.info(f"res = {r}")
# Remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute',
wal_acceptors=wal_acceptor_connstrs)
pg.stop_and_destroy().create_start('test_restart_compute')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -61,8 +51,7 @@ def test_restart_compute(
log.info(f"res = {r}")
# Again remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute',
wal_acceptors=wal_acceptor_connstrs)
pg.stop_and_destroy().create_start('test_restart_compute')
# That select causes lots of FPI's and increases probability of wakeepers
# lagging behind after query completion
@@ -76,8 +65,7 @@ def test_restart_compute(
log.info(f"res = {r}")
# And again remove data directory and restart
pg.stop_and_destroy().create_start('test_restart_compute',
wal_acceptors=wal_acceptor_connstrs)
pg.stop_and_destroy().create_start('test_restart_compute')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:

View File

@@ -1,14 +1,20 @@
from contextlib import closing
import psycopg2.extras
import time
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
def print_gc_result(row):
log.info("GC duration {elapsed} ms".format_map(row));
log.info(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
log.info(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
log.info("GC duration {elapsed} ms".format_map(row))
log.info(
" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}"
.format_map(row))
log.info(
" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}"
.format_map(row))
#
@@ -17,14 +23,15 @@ def print_gc_result(row):
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_layerfiles_gc", "empty"])
pg = postgres.create_start('test_layerfiles_gc')
def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_layerfiles_gc", "empty"])
pg = env.postgres.create_start('test_layerfiles_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
@@ -34,9 +41,9 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
cur.execute("CREATE TABLE foo(x integer)")
cur.execute("INSERT INTO foo VALUES (1)")
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass");
row = cur.fetchone();
log.info(f"relfilenode is {row[0]}");
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass")
row = cur.fetchone()
log.info(f"relfilenode is {row[0]}")
# Run GC, to clear out any garbage left behind in the catalogs by
# the CREATE TABLE command. We want to have a clean slate with no garbage
@@ -52,11 +59,12 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
cur.execute("DELETE FROM foo")
log.info("Running GC before test")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
print_gc_result(row)
# remember the number of files
layer_relfiles_remain = row['layer_relfiles_total'] - row['layer_relfiles_removed']
layer_relfiles_remain = (row['layer_relfiles_total'] -
row['layer_relfiles_removed'])
assert layer_relfiles_remain > 0
# Insert a row and run GC. Checkpoint should freeze the layer
@@ -64,9 +72,9 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
# removing the old image and delta layer.
log.info("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
assert row['layer_relfiles_removed'] == 2
assert row['layer_relfiles_dropped'] == 0
@@ -78,9 +86,9 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
assert row['layer_relfiles_removed'] == 2
assert row['layer_relfiles_dropped'] == 0
@@ -90,18 +98,18 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
assert row['layer_relfiles_removed'] == 2
assert row['layer_relfiles_dropped'] == 0
# Run GC again, with no changes in the database. Should not remove anything.
log.info("Run GC again, with nothing to do")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
print_gc_result(row)
assert row['layer_relfiles_total'] == layer_relfiles_remain
assert row['layer_relfiles_removed'] == 0
assert row['layer_relfiles_dropped'] == 0
@@ -109,12 +117,12 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
#
log.info("Drop table and run GC again");
log.info("Drop table and run GC again")
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row);
print_gc_result(row)
# We still cannot remove the latest layers
# because they serve as tombstones for earlier layers.

View File

@@ -2,39 +2,41 @@ from contextlib import closing
import pytest
from fixtures.zenith_fixtures import (
TenantFactory,
ZenithCli,
PostgresFactory,
)
from fixtures.zenith_fixtures import ZenithEnvBuilder
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_tenants_normal_work(
zenith_cli: ZenithCli,
tenant_factory: TenantFactory,
postgres: PostgresFactory,
wa_factory,
with_wal_acceptors: bool,
):
"""Tests tenants with and without wal acceptors"""
tenant_1 = tenant_factory.create()
tenant_2 = tenant_factory.create()
zenith_cli.run(["branch", f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", "main", f"--tenantid={tenant_1}"])
zenith_cli.run(["branch", f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", "main", f"--tenantid={tenant_2}"])
def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
if with_wal_acceptors:
wa_factory.start_n_new(3)
zenith_env_builder.num_safekeepers = 3
pg_tenant1 = postgres.create_start(
env = zenith_env_builder.init()
"""Tests tenants with and without wal acceptors"""
tenant_1 = env.create_tenant()
tenant_2 = env.create_tenant()
env.zenith_cli([
"branch",
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
f"--tenantid={tenant_1}"
])
env.zenith_cli([
"branch",
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
"main",
f"--tenantid={tenant_2}"
])
pg_tenant1 = env.postgres.create_start(
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_1,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
)
pg_tenant2 = postgres.create_start(
pg_tenant2 = env.postgres.create_start(
f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
None, # branch name, None means same as node name
tenant_2,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
)
for pg in [pg_tenant1, pg_tenant2]:
@@ -45,4 +47,4 @@ def test_tenants_normal_work(
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute("SELECT sum(key) FROM t")
assert cur.fetchone() == (5000050000,)
assert cur.fetchone() == (5000050000, )

View File

@@ -1,20 +1,20 @@
from contextlib import closing
from uuid import UUID
import psycopg2.extras
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
def test_timeline_size(
zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin
):
# Branch at the point where only 100 rows were inserted
zenith_cli.run(["branch", "test_timeline_size", "empty"])
client = pageserver.http_client()
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
def test_timeline_size(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Branch at the point where only 100 rows were inserted
env.zenith_cli(["branch", "test_timeline_size", "empty"])
client = env.pageserver.http_client()
res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
pgmain = postgres.create_start("test_timeline_size")
pgmain = env.postgres.create_start("test_timeline_size")
log.info("postgres is running on 'test_timeline_size' branch")
with closing(pgmain.connect()) as conn:
@@ -23,17 +23,15 @@ def test_timeline_size(
# Create table, and insert the first 100 rows
cur.execute("CREATE TABLE foo (t text)")
cur.execute(
"""
cur.execute("""
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 10) g
"""
)
""")
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
cur.execute("TRUNCATE foo")
res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]

View File

@@ -1,6 +1,6 @@
import os
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, PgBin
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,10 +9,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test branching, when a transaction is in prepared state
#
def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin: PgBin):
zenith_cli.run(["branch", "test_twophase", "empty"])
def test_twophase(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_twophase", "empty"])
pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
log.info("postgres is running on 'test_twophase' branch")
conn = pg.connect()
@@ -57,10 +58,10 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
assert len(twophase_files) == 2
# Create a branch with the transaction in prepared state
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
env.zenith_cli(["branch", "test_twophase_prepared", "test_twophase"])
# Start compute on the new branch
pg2 = postgres.create_start(
pg2 = env.postgres.create_start(
'test_twophase_prepared',
config_lines=['max_prepared_transactions=5'],
)
@@ -79,8 +80,8 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
cur2.execute("ROLLBACK PREPARED 'insert_two'")
cur2.execute('SELECT * FROM foo')
assert cur2.fetchall() == [('one',), ('three',)]
assert cur2.fetchall() == [('one', ), ('three', )]
# Only one committed insert is visible on the original branch
cur.execute('SELECT * FROM foo')
assert cur.fetchall() == [('three',)]
assert cur.fetchall() == [('three', )]

View File

@@ -1,16 +1,19 @@
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test that the VM bit is cleared correctly at a HEAP_DELETE and
# HEAP_UPDATE record.
#
def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, base_dir):
def test_vm_bit_clear(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_vm_bit_clear", "empty"])
pg = postgres.create_start('test_vm_bit_clear')
env.zenith_cli(["branch", "test_vm_bit_clear", "empty"])
pg = env.postgres.create_start('test_vm_bit_clear')
log.info("postgres is running on 'test_vm_bit_clear' branch")
pg_conn = pg.connect()
@@ -33,7 +36,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1')
# Branch at this point, to test that later
zenith_cli.run(["branch", "test_vm_bit_clear_new", "test_vm_bit_clear"])
env.zenith_cli(["branch", "test_vm_bit_clear_new", "test_vm_bit_clear"])
# Clear the buffer cache, to force the VM page to be re-fetched from
# the page server
@@ -49,20 +52,19 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
''')
cur.execute('SELECT * FROM vmtest_delete WHERE id = 1')
assert(cur.fetchall() == []);
assert (cur.fetchall() == [])
cur.execute('SELECT * FROM vmtest_update WHERE id = 1')
assert(cur.fetchall() == []);
assert (cur.fetchall() == [])
cur.close()
# Check the same thing on the branch that we created right after the DELETE
#
# As of this writing, the code in smgrwrite() creates a full-page image whenever
# a dirty VM page is evicted. If the VM bit was not correctly cleared by the
# earlier WAL record, the full-page image hides the problem. Starting a new
# server at the right point-in-time avoids that full-page image.
pg_new = postgres.create_start('test_vm_bit_clear_new')
pg_new = env.postgres.create_start('test_vm_bit_clear_new')
log.info("postgres is running on 'test_vm_bit_clear_new' branch")
pg_new_conn = pg_new.connect()
@@ -75,6 +77,6 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
''')
cur_new.execute('SELECT * FROM vmtest_delete WHERE id = 1')
assert(cur_new.fetchall() == []);
assert (cur_new.fetchall() == [])
cur_new.execute('SELECT * FROM vmtest_update WHERE id = 1')
assert(cur_new.fetchall() == []);
assert (cur_new.fetchall() == [])

View File

@@ -7,7 +7,7 @@ import uuid
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory, PgBin
from fixtures.zenith_fixtures import PgBin, ZenithEnv, ZenithEnvBuilder
from fixtures.utils import lsn_to_hex, mkdir_if_needed
from fixtures.log_helper import log
@@ -16,11 +16,13 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# basic test, write something in setup with wal acceptors, ensure that commits
# succeed and data is written
def test_normal_work(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory):
zenith_cli.run(["branch", "test_wal_acceptors_normal_work", "empty"])
wa_factory.start_n_new(3)
pg = postgres.create_start('test_wal_acceptors_normal_work',
wal_acceptors=wa_factory.get_connstrs())
def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
env.zenith_cli(["branch", "test_wal_acceptors_normal_work", "main"])
pg = env.postgres.create_start('test_wal_acceptors_normal_work')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
@@ -34,18 +36,19 @@ def test_normal_work(zenith_cli, pageserver: ZenithPageserver, postgres: Postgre
# Run page server and multiple acceptors, and multiple compute nodes running
# against different timelines.
def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory):
n_timelines = 2
def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
wa_factory.start_n_new(3)
n_timelines = 2
branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]
# start postgres on each timeline
pgs = []
for branch in branches:
zenith_cli.run(["branch", branch, "empty"])
pgs.append(postgres.create_start(branch, wal_acceptors=wa_factory.get_connstrs()))
env.zenith_cli(["branch", branch, "main"])
pgs.append(env.postgres.create_start(branch))
# Do everything in different loops to have actions on different timelines
# interleaved.
@@ -66,16 +69,16 @@ def test_many_timelines(zenith_cli, pageserver: ZenithPageserver, postgres: Post
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
# times, with fault_probability chance of getting a wal acceptor down or up
# along the way. 2 of 3 are always alive, so the work keeps going.
def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
def test_restarts(zenith_env_builder: ZenithEnvBuilder):
fault_probability = 0.01
n_inserts = 1000
n_acceptors = 3
wa_factory.start_n_new(n_acceptors)
zenith_env_builder.num_safekeepers = n_acceptors
env = zenith_env_builder.init()
zenith_cli.run(["branch", "test_wal_acceptors_restarts", "empty"])
pg = postgres.create_start('test_wal_acceptors_restarts',
wal_acceptors=wa_factory.get_connstrs())
env.zenith_cli(["branch", "test_wal_acceptors_restarts", "main"])
pg = env.postgres.create_start('test_wal_acceptors_restarts')
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -89,7 +92,7 @@ def test_restarts(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
if random.random() <= fault_probability:
if failed_node is None:
failed_node = wa_factory.instances[random.randrange(0, n_acceptors)]
failed_node = env.safekeepers[random.randrange(0, n_acceptors)]
failed_node.stop()
else:
failed_node.start()
@@ -107,12 +110,12 @@ def delayed_wal_acceptor_start(wa):
# When majority of acceptors is offline, commits are expected to be frozen
def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
wa_factory.start_n_new(2)
def test_unavailability(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 2
env = zenith_env_builder.init()
zenith_cli.run(["branch", "test_wal_acceptors_unavailability", "empty"])
pg = postgres.create_start('test_wal_acceptors_unavailability',
wal_acceptors=wa_factory.get_connstrs())
env.zenith_cli(["branch", "test_wal_acceptors_unavailability", "main"])
pg = env.postgres.create_start('test_wal_acceptors_unavailability')
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -124,9 +127,9 @@ def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
cur.execute("INSERT INTO t values (1, 'payload')")
# shutdown one of two acceptors, that is, majority
wa_factory.instances[0].stop()
env.safekeepers[0].stop()
proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[0], ))
proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[0], ))
proc.start()
start = time.time()
@@ -136,9 +139,9 @@ def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
proc.join()
# for the world's balance, do the same with second acceptor
wa_factory.instances[1].stop()
env.safekeepers[1].stop()
proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[1], ))
proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[1], ))
proc.start()
start = time.time()
@@ -177,13 +180,13 @@ def stop_value():
# do inserts while concurrently getting up/down subsets of acceptors
def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory, stop_value):
def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):
wa_factory.start_n_new(3)
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
zenith_cli.run(["branch", "test_wal_acceptors_race_conditions", "empty"])
pg = postgres.create_start('test_wal_acceptors_race_conditions',
wal_acceptors=wa_factory.get_connstrs())
env.zenith_cli(["branch", "test_wal_acceptors_race_conditions", "main"])
pg = env.postgres.create_start('test_wal_acceptors_race_conditions')
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -192,7 +195,7 @@ def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
cur.execute('CREATE TABLE t(key int primary key, value text)')
proc = Process(target=xmas_garland, args=(wa_factory.instances, stop_value))
proc = Process(target=xmas_garland, args=(env.safekeepers, stop_value))
proc.start()
for i in range(1000):
@@ -207,7 +210,8 @@ def test_race_conditions(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
class ProposerPostgres:
"""Object for running safekeepers sync with walproposer"""
def __init__(self, pgdata_dir: str, pg_bin: PgBin, timeline_id: str, tenant_id: str):
def __init__(self, env: ZenithEnv, pgdata_dir: str, pg_bin, timeline_id: str, tenant_id: str):
self.env = env
self.pgdata_dir: str = pgdata_dir
self.pg_bin: PgBin = pg_bin
self.timeline_id: str = timeline_id
@@ -253,16 +257,20 @@ class ProposerPostgres:
# insert wal in all safekeepers and run sync on proposer
def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorFactory):
wa_factory.start_n_new(3)
def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin):
# We don't really need the full environment for this test, just the
# safekeepers would be enough.
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
timeline_id = uuid.uuid4().hex
tenant_id = uuid.uuid4().hex
# write config for proposer
pgdata_dir = os.path.join(repo_dir, "proposer_pgdata")
pg = ProposerPostgres(pgdata_dir, pg_bin, timeline_id, tenant_id)
pg.create_dir_config(wa_factory.get_connstrs())
pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata")
pg = ProposerPostgres(env, pgdata_dir, pg_bin, timeline_id, tenant_id)
pg.create_dir_config(env.get_safekeeper_connstrs())
# a valid LSN, which is neither at a segment start nor in the zero segment
epoch_start_lsn = 0x16B9188 # 0/16B9188
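For context, a quick arithmetic check of why this particular value satisfies the comment above, assuming the default 16 MiB WAL segment size (an assumption, not something stated in this diff):

WAL_SEG_SIZE = 16 * 1024 * 1024          # assumed default segment size (0x1000000)
epoch_start_lsn = 0x16B9188              # 0/16B9188, same value as in the test

segno = epoch_start_lsn // WAL_SEG_SIZE  # == 1, so not in the zero segment
offset = epoch_start_lsn % WAL_SEG_SIZE  # == 0x6B9188, so not at a segment start
assert segno == 1 and offset != 0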
@@ -271,7 +279,7 @@ def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorF
# append and commit WAL
lsn_after_append = []
for i in range(3):
res = wa_factory.instances[i].append_logical_message(
res = env.safekeepers[i].append_logical_message(
tenant_id,
timeline_id,
{
@@ -295,13 +303,15 @@ def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorF
assert all(lsn_after_sync == lsn for lsn in lsn_after_append)
def test_timeline_status(zenith_cli, pageserver, postgres, wa_factory: WalAcceptorFactory):
wa_factory.start_n_new(1)
def test_timeline_status(zenith_env_builder: ZenithEnvBuilder):
zenith_cli.run(["branch", "test_timeline_status", "empty"])
pg = postgres.create_start('test_timeline_status', wal_acceptors=wa_factory.get_connstrs())
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init()
wa = wa_factory.instances[0]
env.zenith_cli(["branch", "test_timeline_status", "main"])
pg = env.postgres.create_start('test_timeline_status')
wa = env.safekeepers[0]
wa_http_cli = wa.http_client()
wa_http_cli.check_status()
@@ -318,6 +328,5 @@ def test_timeline_status(zenith_cli, pageserver, postgres, wa_factory: WalAccept
pg.stop().start()
pg.safe_psql("insert into t values(10)")
epoch_after_reboot = wa_http_cli.timeline_status(tenant_id,
timeline_id).acceptor_epoch
epoch_after_reboot = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch
assert epoch_after_reboot > epoch

View File

@@ -1,9 +1,11 @@
import asyncio
import asyncpg
import random
import time
from fixtures.zenith_fixtures import WalAcceptor, WalAcceptorFactory, ZenithPageserver, PostgresFactory, Postgres
from fixtures.zenith_fixtures import ZenithEnvBuilder, Postgres, Safekeeper
from fixtures.log_helper import getLogger
from fixtures.utils import lsn_from_hex, lsn_to_hex
from typing import List
log = getLogger('root.wal_acceptor_async')
@@ -19,13 +21,16 @@ class BankClient(object):
async def initdb(self):
await self.conn.execute('DROP TABLE IF EXISTS bank_accs')
await self.conn.execute('CREATE TABLE bank_accs(uid int primary key, amount int)')
await self.conn.execute('''
await self.conn.execute(
'''
INSERT INTO bank_accs
SELECT *, $1 FROM generate_series(0, $2)
''', self.init_amount, self.n_accounts - 1)
''',
self.init_amount,
self.n_accounts - 1)
await self.conn.execute('DROP TABLE IF EXISTS bank_log')
await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)')
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
await self.conn.execute('ALTER TABLE bank_accs SET (autovacuum_enabled = false)')
await self.conn.execute('ALTER TABLE bank_log SET (autovacuum_enabled = false)')
@@ -34,6 +39,7 @@ class BankClient(object):
row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs')
assert row['sum'] == self.n_accounts * self.init_amount
async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount):
# avoid deadlocks by sorting uids
if from_uid > to_uid:
@@ -42,16 +48,22 @@ async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount):
async with conn.transaction():
await conn.execute(
'UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2',
amount, to_uid,
amount,
to_uid,
)
await conn.execute(
'UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2',
amount, from_uid,
amount,
from_uid,
)
await conn.execute('INSERT INTO bank_log VALUES ($1, $2, $3)',
from_uid, to_uid, amount,
await conn.execute(
'INSERT INTO bank_log VALUES ($1, $2, $3)',
from_uid,
to_uid,
amount,
)
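The uid sorting a few lines above is a lock-ordering trick: two concurrent transfers between the same pair of accounts must acquire their row locks in the same order, otherwise they can deadlock on each other's UPDATEs. A self-contained sketch of the same pattern (a hypothetical helper for illustration; the actual swap line is not shown in this hunk):

async def ordered_transfer(conn, from_uid, to_uid, amount):
    # Normalize so that from_uid < to_uid; flipping the sign preserves the
    # semantics while making the row-lock acquisition order deterministic.
    if from_uid > to_uid:
        from_uid, to_uid, amount = to_uid, from_uid, -amount
    async with conn.transaction():
        await conn.execute('UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2',
                           amount, to_uid)
        await conn.execute('UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2',
                           amount, from_uid)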
class WorkerStats(object):
def __init__(self, n_workers):
self.counters = [0] * n_workers
@@ -92,11 +104,43 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou
await pg_conn.close()
async def wait_for_lsn(safekeeper: Safekeeper,
tenant_id: str,
timeline_id: str,
wait_lsn: str,
polling_interval=1,
timeout=600):
"""
Poll flush_lsn from the safekeeper until it is greater than or equal to
the provided wait_lsn. To do that, timeline_status is fetched from the
safekeeper every polling_interval seconds.
"""
started_at = time.time()
client = safekeeper.http_client()
flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn
log.info(
f'Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}'
)
while lsn_from_hex(wait_lsn) > lsn_from_hex(flush_lsn):
elapsed = time.time() - started_at
if elapsed > timeout:
raise RuntimeError(
f"timed out waiting for safekeeper at port {safekeeper.port.pg} to reach {wait_lsn}, current lsn is {flush_lsn}"
)
await asyncio.sleep(polling_interval)
flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn
log.debug(f'safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}')
# This test runs several iterations and checks progress in each of them.
# On each iteration one acceptor is stopped, and the two others should allow
# the background workers to execute transactions. In the end, the state should
# remain consistent.
async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_workers=10):
async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_workers=10):
n_accounts = 100
init_amount = 100000
max_transfer = 100
@@ -104,6 +148,9 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
iterations = 6
pg_conn = await pg.connect_async()
tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant")
timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline")
bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount)
# create tables and initial balances
await bank.initdb()
@@ -114,19 +161,19 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer)
workers.append(asyncio.create_task(worker))
for it in range(iterations):
victim = acceptors[it % len(acceptors)]
victim_idx = it % len(acceptors)
victim = acceptors[victim_idx]
victim.stop()
# Wait till previous victim recovers so it is ready for the next
# iteration by making any writing xact.
conn = await pg.connect_async()
await conn.execute(
'UPDATE bank_accs SET amount = amount WHERE uid = 1',
timeout=120
)
await conn.close()
flush_lsn = await pg_conn.fetchval('SELECT pg_current_wal_flush_lsn()')
flush_lsn = lsn_to_hex(flush_lsn)
log.info(f'Postgres flush_lsn {flush_lsn}')
# Wait until alive safekeepers catch up with postgres
for idx, safekeeper in enumerate(acceptors):
if idx != victim_idx:
await wait_for_lsn(safekeeper, tenant_id, timeline_id, flush_lsn)
stats.reset()
await asyncio.sleep(period_time)
@@ -145,16 +192,14 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
# restart acceptors one by one, while executing and validating bank transactions
def test_restarts_under_load(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory,
wa_factory: WalAcceptorFactory):
def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
wa_factory.start_n_new(3)
env.zenith_cli(["branch", "test_wal_acceptors_restarts_under_load", "main"])
pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load')
zenith_cli.run(["branch", "test_wal_acceptors_restarts_under_load", "empty"])
pg = postgres.create_start('test_wal_acceptors_restarts_under_load',
wal_acceptors=wa_factory.get_connstrs())
asyncio.run(run_restarts_under_load(pg, wa_factory.instances))
asyncio.run(run_restarts_under_load(pg, env.safekeepers))
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
pg.stop()

View File

@@ -1,97 +1,107 @@
import json
import uuid
from fixtures.zenith_fixtures import ZenithCli, ZenithPageserver
from psycopg2.extensions import cursor as PgCursor
from fixtures.zenith_fixtures import ZenithEnv
from typing import cast
pytest_plugins = ("fixtures.zenith_fixtures")
def helper_compare_branch_list(page_server_cur, zenith_cli, initial_tenant: str):
def helper_compare_branch_list(page_server_cur: PgCursor, env: ZenithEnv, initial_tenant: str):
"""
Compare the branch list returned by the CLI with the one fetched directly via the API.
Filters out branches created by other tests.
"""
page_server_cur.execute(f'branch_list {initial_tenant}')
branches_api = sorted(map(lambda b: b['name'], json.loads(page_server_cur.fetchone()[0])))
branches_api = sorted(
map(lambda b: cast(str, b['name']), json.loads(page_server_cur.fetchone()[0])))
branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]
res = zenith_cli.run(["branch"])
res = env.zenith_cli(["branch"])
res.check_returncode()
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
res = zenith_cli.run(["branch", f"--tenantid={initial_tenant}"])
res = env.zenith_cli(["branch", f"--tenantid={initial_tenant}"])
res.check_returncode()
branches_cli_with_tenant_arg = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli_with_tenant_arg = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
branches_cli_with_tenant_arg = sorted(
map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli_with_tenant_arg = [
b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')
]
assert branches_api == branches_cli == branches_cli_with_tenant_arg
def test_cli_branch_list(pageserver: ZenithPageserver, zenith_cli):
page_server_conn = pageserver.connect()
def test_cli_branch_list(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
page_server_conn = env.pageserver.connect()
page_server_cur = page_server_conn.cursor()
# Initial sanity check
helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
helper_compare_branch_list(page_server_cur, env, env.initial_tenant)
# Create a branch for us
res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"])
res = env.zenith_cli(["branch", "test_cli_branch_list_main", "empty"])
assert res.stderr == ''
helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
helper_compare_branch_list(page_server_cur, env, env.initial_tenant)
# Create a nested branch
res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
res = env.zenith_cli(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
assert res.stderr == ''
helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
helper_compare_branch_list(page_server_cur, env, env.initial_tenant)
# Check that all new branches are visible via CLI
res = zenith_cli.run(["branch"])
res = env.zenith_cli(["branch"])
assert res.stderr == ''
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
assert 'test_cli_branch_list_main' in branches_cli
assert 'test_cli_branch_list_nested' in branches_cli
def helper_compare_tenant_list(page_server_cur, zenith_cli: ZenithCli):
page_server_cur.execute(f'tenant_list')
tenants_api = sorted(json.loads(page_server_cur.fetchone()[0]))
res = zenith_cli.run(["tenant", "list"])
def helper_compare_tenant_list(page_server_cur: PgCursor, env: ZenithEnv):
page_server_cur.execute(f'tenant_list')
tenants_api = sorted(
map(lambda t: cast(str, t['id']), json.loads(page_server_cur.fetchone()[0])))
res = env.zenith_cli(["tenant", "list"])
assert res.stderr == ''
tenants_cli = sorted(res.stdout.splitlines())
tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert tenants_api == tenants_cli
def test_cli_tenant_list(pageserver: ZenithPageserver, zenith_cli: ZenithCli):
page_server_conn = pageserver.connect()
def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
page_server_conn = env.pageserver.connect()
page_server_cur = page_server_conn.cursor()
# Initial sanity check
helper_compare_tenant_list(page_server_cur, zenith_cli)
helper_compare_tenant_list(page_server_cur, env)
# Create new tenant
tenant1 = uuid.uuid4().hex
res = zenith_cli.run(["tenant", "create", tenant1])
res = env.zenith_cli(["tenant", "create", tenant1])
res.check_returncode()
# check tenant1 appeared
helper_compare_tenant_list(page_server_cur, zenith_cli)
helper_compare_tenant_list(page_server_cur, env)
# Create new tenant
tenant2 = uuid.uuid4().hex
res = zenith_cli.run(["tenant", "create", tenant2])
res = env.zenith_cli(["tenant", "create", tenant2])
res.check_returncode()
# check tenant2 appeared
helper_compare_tenant_list(page_server_cur, zenith_cli)
helper_compare_tenant_list(page_server_cur, env)
res = zenith_cli.run(["tenant", "list"])
res = env.zenith_cli(["tenant", "list"])
res.check_returncode()
tenants = sorted(res.stdout.splitlines())
tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert pageserver.initial_tenant in tenants
assert env.initial_tenant in tenants
assert tenant1 in tenants
assert tenant2 in tenants

View File

@@ -1,20 +1,20 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir
pytest_plugins = ("fixtures.zenith_fixtures")
def test_isolation(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
base_dir, capsys):
def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_isolation", "empty"])
env.zenith_cli(["branch", "test_isolation", "empty"])
# Connect to postgres and create a database called "regression".
# isolation tests use prepared transactions, so enable them
pg = postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100'])
pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100'])
pg.safe_psql('CREATE DATABASE isolation_regression')
# Create some local directories for pg_isolation_regress to run in.
@@ -38,7 +38,7 @@ def test_isolation(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_b
'--schedule={}'.format(schedule),
]
env = {
env_vars = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
@@ -48,4 +48,4 @@ def test_isolation(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_b
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_isolation_regress_command, env=env, cwd=runpath)
pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)

View File

@@ -1,19 +1,19 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, base_dir, pg_distrib_dir
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
base_dir, capsys):
def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_pg_regress", "empty"])
env.zenith_cli(["branch", "test_pg_regress", "empty"])
# Connect to postgres and create a database called "regression".
pg = postgres.create_start('test_pg_regress')
pg = env.postgres.create_start('test_pg_regress')
pg.safe_psql('CREATE DATABASE regression')
# Create some local directories for pg_regress to run in.
@@ -38,7 +38,7 @@ def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_
'--inputdir={}'.format(src_path),
]
env = {
env_vars = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
@@ -48,11 +48,11 @@ def test_pg_regress(pageserver: ZenithPageserver, postgres: PostgresFactory, pg_
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
check_restored_datadir_content(test_output_dir, env, pg)

View File

@@ -1,20 +1,23 @@
import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import PageserverPort, PostgresFactory, check_restored_datadir_content
from fixtures.zenith_fixtures import (ZenithEnv,
check_restored_datadir_content,
base_dir,
pg_distrib_dir)
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir,
base_dir, capsys, pageserver_port: PageserverPort):
def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_zenith_regress", "empty"])
env.zenith_cli(["branch", "test_zenith_regress", "empty"])
# Connect to postgres and create a database called "regression".
pg = postgres.create_start('test_zenith_regress')
pg = env.postgres.create_start('test_zenith_regress')
pg.safe_psql('CREATE DATABASE regression')
# Create some local directories for pg_regress to run in.
@@ -40,7 +43,7 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
]
log.info(pg_regress_command)
env = {
env_vars = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
@@ -50,11 +53,11 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
# checkpoint one more time to ensure that the lsn we get is the latest one
pg.safe_psql('CHECKPOINT')
lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]
# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver_port.pg)
check_restored_datadir_content(test_output_dir, env, pg)

View File

@@ -24,7 +24,6 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast
from typing_extensions import Literal
from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture)
"""
This file contains fixtures for micro-benchmarks.
@@ -32,11 +31,11 @@ To use, declare the 'zenbenchmark' fixture in the test function. Run the
benchmark, and then record the result by calling zenbenchmark.record. For example:
import timeit
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
def test_mybench(postgres: PostgresFactory, pageserver: ZenithPageserver, zenbenchmark):
def test_mybench(zenith_simple_env: ZenithEnv, zenbenchmark):
# Initialize the test
...
@@ -56,15 +55,9 @@ in the test initialization, or measure disk usage after the test query.
"""
# All the results are collected in this list, as a tuple:
# (test_name: str, metric_name: str, metric_value: float, unit: str)
#
# TODO: It would perhaps be better to store the results as additional
# properties in the pytest TestReport objects, to make them visible to
# other pytest tools.
global zenbenchmark_results
zenbenchmark_results = []
class ZenithBenchmarkResults:
""" An object for recording benchmark results. """
def __init__(self):
@@ -77,6 +70,11 @@ class ZenithBenchmarkResults:
self.results.append((test_name, metric_name, metric_value, unit))
# Will be recreated in each session.
zenbenchmark_results: ZenithBenchmarkResults = ZenithBenchmarkResults()
# Session scope fixture that initializes the results object
@pytest.fixture(autouse=True, scope='session')
def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
@@ -88,6 +86,7 @@ def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
yield zenbenchmark_results
class ZenithBenchmarker:
"""
An object for recording benchmark results. This is created for each test
@@ -103,7 +102,6 @@ class ZenithBenchmarker:
"""
self.results.record(self.request.node.name, metric_name, metric_value, unit)
@contextmanager
def record_duration(self, metric_name):
"""
@@ -134,8 +132,10 @@ class ZenithBenchmarker:
# The metric should be an integer, as it's a number of bytes. But in general
# all prometheus metrics are floats. So to be pedantic, read it as a float
# and round to integer.
matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', all_metrics,
matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$',
all_metrics,
re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))
def get_peak_mem(self, pageserver) -> int:
@@ -145,8 +145,8 @@ class ZenithBenchmarker:
# Fetch all the exposed prometheus metrics from page server
all_metrics = pageserver.http_client().get_metrics()
# See comment in get_io_writes()
matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics,
re.MULTILINE)
matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))
def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str):
@@ -171,7 +171,11 @@ class ZenithBenchmarker:
yield
after = self.get_io_writes(pageserver)
self.results.record(self.request.node.name, metric_name, round((after - before) / (1024 * 1024)), 'MB')
self.results.record(self.request.node.name,
metric_name,
round((after - before) / (1024 * 1024)),
'MB')
@pytest.fixture(scope='function')
def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
@@ -185,9 +189,7 @@ def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
# Hook to print the results at the end
@pytest.hookimpl(hookwrapper=True)
def pytest_terminal_summary(
terminalreporter: TerminalReporter, exitstatus: int, config: Config
):
def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config):
yield
global zenbenchmark_results
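As a side note on the get_io_writes and get_peak_mem helpers earlier in this file: both scrape the pageserver's Prometheus text output with a multiline regex and round the float value back to an integer. A self-contained illustration with a made-up metrics payload (the real text comes from pageserver.http_client().get_metrics()):

import re

all_metrics = ('pageserver_disk_io_bytes{io_operation="read"} 1024\n'
               'pageserver_disk_io_bytes{io_operation="write"} 123456.0\n'
               'pageserver_maxrss_kb 2048\n')

matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$',
                    all_metrics,
                    re.MULTILINE)
assert matches
# Prometheus exposes everything as a float; round back to an integer byte count.
assert int(round(float(matches.group(1)))) == 123456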

View File

@@ -1,6 +1,5 @@
import logging
import logging.config
"""
This file configures logging to use in python tests.
Logs are automatically captured and shown in their
@@ -27,17 +26,19 @@ LOGGING = {
"level": "INFO"
},
"root.wal_acceptor_async": {
"level": "INFO" # a lot of logs on DEBUG level
"level": "INFO" # a lot of logs on DEBUG level
}
}
}
def getLogger(name='root') -> logging.Logger:
"""Method to get logger for tests.
Should be used to get correctly initialized logger. """
return logging.getLogger(name)
# default logger for tests
log = getLogger()

View File

@@ -4,6 +4,7 @@ import subprocess
from typing import Any, List
from fixtures.log_helper import log
def get_self_dir() -> str:
""" Get the path to the directory where this script lives. """
return os.path.dirname(os.path.abspath(__file__))
@@ -58,6 +59,13 @@ def global_counter() -> int:
_global_counter += 1
return _global_counter
def lsn_to_hex(num: int) -> str:
""" Convert lsn from int to standard hex notation. """
return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)
def lsn_from_hex(lsn_hex: str) -> int:
""" Convert lsn from hex notation to int. """
l, r = lsn_hex.split('/')
return (int(l, 16) << 32) + int(r, 16)
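A small worked example of these two helpers, reusing the 0/16B9188 value from the safekeeper test above; it also shows why callers such as wait_for_lsn compare LSNs numerically rather than as hex strings:

from fixtures.utils import lsn_from_hex, lsn_to_hex

assert lsn_to_hex(0x16B9188) == '0/16B9188'
assert lsn_from_hex('0/16B9188') == 0x16B9188

# Comparing the hex strings lexicographically would be wrong:
assert '0/FF' > '0/1000'
assert lsn_from_hex('0/FF') < lsn_from_hex('0/1000')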

File diff suppressed because it is too large

View File

@@ -1,8 +0,0 @@
#!/bin/bash
while true; do
echo -n "==== CURRENT TIME:" >> /tmp/test_output/netstat.stdout
date +"%T.%N" >> /tmp/test_output/netstat.stdout
sudo netstat -vpnoa | grep tcp | sort >> /tmp/test_output/netstat.stdout
sleep 0.5
done

View File

@@ -1,10 +1,11 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
#
# Run bulk INSERT test.
#
@@ -15,16 +16,17 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
# 3. Disk space used
# 4. Peak memory usage
#
def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_bulk_insert", "empty"])
env.zenith_cli(["branch", "test_bulk_insert", "empty"])
pg = postgres.create_start('test_bulk_insert')
pg = env.postgres.create_start('test_bulk_insert')
log.info("postgres is running on 'test_bulk_insert' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = pageserver.connect();
psconn = env.pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -36,17 +38,19 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg
cur.execute("create table huge (i int, j int);")
# Run INSERT, recording the time and I/O it takes
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('insert'):
cur.execute("insert into huge values (generate_series(1, 5000000), 0);")
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
# Record peak memory usage
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB')
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
env.initial_tenant,
timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -1,11 +1,7 @@
import timeit
import pytest
from fixtures.zenith_fixtures import (
TenantFactory,
ZenithCli,
PostgresFactory,
)
from fixtures.zenith_fixtures import ZenithEnvBuilder
pytest_plugins = ("fixtures.benchmark_fixture")
@@ -20,34 +16,37 @@ pytest_plugins = ("fixtures.benchmark_fixture")
@pytest.mark.parametrize('tenants_count', [1, 5, 10])
@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa'])
def test_bulk_tenant_create(
zenith_cli: ZenithCli,
tenant_factory: TenantFactory,
postgres: PostgresFactory,
wa_factory,
zenith_env_builder: ZenithEnvBuilder,
use_wal_acceptors: str,
tenants_count: int,
zenbenchmark,
):
"""Measure tenant creation time (with and without wal acceptors)"""
if use_wal_acceptors == 'with_wa':
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()
time_slices = []
for i in range(tenants_count):
start = timeit.default_timer()
tenant = tenant_factory.create()
zenith_cli.run([
"branch", f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", "main",
tenant = env.create_tenant()
env.zenith_cli([
"branch",
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
"main",
f"--tenantid={tenant}"
])
if use_wal_acceptors == 'with_wa':
wa_factory.start_n_new(3)
# FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now?
#if use_wal_acceptors == 'with_wa':
# wa_factory.start_n_new(3)
pg_tenant = postgres.create_start(
pg_tenant = env.postgres.create_start(
f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
None, # branch name, None means same as node name
tenant,
wal_acceptors=wa_factory.get_connstrs() if use_wal_acceptors == 'with_wa' else None,
)
end = timeit.default_timer()

View File

@@ -1,25 +1,27 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
#
# Test buffering GiST build. It WAL-logs the whole relation, in 32-page chunks.
# As of this writing, we duplicate those giant WAL records for each page,
# which makes the delta layer about 32x larger than it needs to be.
#
def test_gist_buffering_build(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
def test_gist_buffering_build(zenith_simple_env: ZenithEnv, zenbenchmark):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_gist_buffering_build", "empty"])
env.zenith_cli(["branch", "test_gist_buffering_build", "empty"])
pg = postgres.create_start('test_gist_buffering_build')
pg = env.postgres.create_start('test_gist_buffering_build')
log.info("postgres is running on 'test_gist_buffering_build' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = pageserver.connect();
psconn = env.pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -29,21 +31,27 @@ def test_gist_buffering_build(postgres: PostgresFactory, pageserver: ZenithPages
timeline = cur.fetchone()[0]
# Create test table.
cur.execute("create table gist_point_tbl(id int4, p point)");
cur.execute("insert into gist_point_tbl select g, point(g, g) from generate_series(1, 1000000) g;");
cur.execute("create table gist_point_tbl(id int4, p point)")
cur.execute(
"insert into gist_point_tbl select g, point(g, g) from generate_series(1, 1000000) g;"
)
# Build the index.
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('build'):
cur.execute("create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)");
cur.execute(
"create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)"
)
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 1000000")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 1000000")
# Record peak memory usage
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB')
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
env.initial_tenant,
timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -1,10 +1,11 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
#
# Run a very short pgbench test.
#
@@ -14,16 +15,17 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
# 2. Time to run 5000 pgbench transactions
# 3. Disk space used
#
def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_pgbench_perf", "empty"])
env.zenith_cli(["branch", "test_pgbench_perf", "empty"])
pg = postgres.create_start('test_pgbench_perf')
pg = env.postgres.create_start('test_pgbench_perf')
log.info("postgres is running on 'test_pgbench_perf' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = pageserver.connect();
psconn = env.pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -35,13 +37,13 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
connstr = pg.connstr()
# Initialize pgbench database, recording the time and I/O it takes
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('init'):
pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
# Run pgbench for 5000 transactions
with zenbenchmark.record_duration('5000_xacts'):
@@ -49,8 +51,8 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
# Flush the layers to disk again. This is *not* included in the reported time,
# though.
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -12,21 +12,23 @@
# Amplification problem at its finest.
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
# Create a branch for us
zenith_cli.run(["branch", "test_write_amplification", "empty"])
pg = postgres.create_start('test_write_amplification')
def test_write_amplification(zenith_simple_env: ZenithEnv, zenbenchmark):
env = zenith_simple_env
# Create a branch for us
env.zenith_cli(["branch", "test_write_amplification", "empty"])
pg = env.postgres.create_start('test_write_amplification')
log.info("postgres is running on 'test_write_amplification' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = pageserver.connect();
psconn = env.pageserver.connect()
pscur = psconn.cursor()
with closing(pg.connect()) as conn:
@@ -35,7 +37,7 @@ def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPagese
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('run'):
# NOTE: Because each iteration updates every table already created,
@@ -68,8 +70,10 @@ def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPagese
# slower, adding some delays in this loop. But forcing
# the checkpointing and GC makes the test go faster,
# with the same total I/O effect.
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
env.initial_tenant,
timeline)
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

View File

@@ -2,3 +2,4 @@
minversion = 6.0
log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
log_date_format = %Y-%m-%d %H:%M:%S
log_cli = true

View File

@@ -10,6 +10,7 @@ max-line-length = 100
[yapf]
based_on_style = pep8
column_limit = 100
split_all_top_level_comma_separated_values = true
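This single yapf option accounts for much of the mechanical churn in the Python hunks above: roughly speaking, once yapf decides a call has to be wrapped, every top-level comma-separated argument is placed on its own line instead of being packed. Two examples taken verbatim from this diff:

# Still short enough, left on one line:
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')

# Wrapped, with one argument per line:
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
                                               env.initial_tenant,
                                               timeline)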
[mypy]
# some tests don't typecheck when this flag is set
@@ -21,7 +22,11 @@ disallow_untyped_decorators = false
disallow_untyped_defs = false
strict = true
[mypy-psycopg2.*]
[mypy-asyncpg.*]
# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
ignore_missing_imports = true
[mypy-cached_property.*]
ignore_missing_imports = true
[mypy-pytest.*]

View File

@@ -1,6 +1,7 @@
import pytest
import os
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
pytest_plugins = ("fixtures.zenith_fixtures")
@@ -19,11 +20,13 @@ run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None,
@run_broken
def test_broken(zenith_cli, pageserver, postgres, pg_bin):
# Create a branch for us
zenith_cli.run(["branch", "test_broken", "empty"])
def test_broken(zenith_simple_env: ZenithEnv, pg_bin):
env = zenith_simple_env
postgres.create_start("test_broken")
# Create a branch for us
env.zenith_cli(["branch", "test_broken", "empty"])
env.postgres.create_start("test_broken")
log.info('postgres is running')
log.info('THIS NEXT COMMAND WILL FAIL:')

View File

@@ -7,17 +7,16 @@ use const_format::formatcp;
use daemonize::Daemonize;
use log::*;
use std::env;
use std::net::TcpListener;
use std::path::{Path, PathBuf};
use std::thread;
use zenith_utils::http::endpoint;
use zenith_utils::logging;
use zenith_utils::{logging, tcp_listener};
use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
use walkeeper::http;
use walkeeper::s3_offload;
use walkeeper::wal_service;
use walkeeper::WalAcceptorConf;
use walkeeper::SafeKeeperConf;
fn main() -> Result<()> {
zenith_metrics::set_common_metrics_prefix("safekeeper");
@@ -54,7 +53,7 @@ fn main() -> Result<()> {
Arg::with_name("ttl")
.long("ttl")
.takes_value(true)
.help("interval for keeping WAL as walkeeper node, after which them will be uploaded to S3 and removed locally"),
.help("interval for keeping WAL at safekeeper node, after which them will be uploaded to S3 and removed locally"),
)
.arg(
Arg::with_name("recall")
@@ -78,8 +77,11 @@ fn main() -> Result<()> {
)
.get_matches();
let mut conf = WalAcceptorConf {
data_dir: PathBuf::from("./"),
let mut conf = SafeKeeperConf {
// Always set to './'. We will chdir into the directory specified on the
// command line, so that when the server is running, all paths are relative
// to that.
workdir: PathBuf::from("./"),
daemonize: false,
no_sync: false,
pageserver_addr: None,
@@ -91,10 +93,8 @@ fn main() -> Result<()> {
};
if let Some(dir) = arg_matches.value_of("datadir") {
conf.data_dir = PathBuf::from(dir);
// change into the data directory.
std::env::set_current_dir(&conf.data_dir)?;
std::env::set_current_dir(PathBuf::from(dir))?;
}
if arg_matches.is_present("no-sync") {
@@ -125,20 +125,19 @@ fn main() -> Result<()> {
conf.recall_period = Some(humantime::parse_duration(recall)?);
}
start_wal_acceptor(conf)
start_safekeeper(conf)
}
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
let log_filename = conf.data_dir.join("safekeeper.log");
let log_file = logging::init(log_filename, conf.daemonize)?;
fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let log_file = logging::init("safekeeper.log", conf.daemonize)?;
let http_listener = TcpListener::bind(conf.listen_http_addr.clone()).map_err(|e| {
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
e
})?;
info!("Starting safekeeper on {}", conf.listen_pg_addr);
let pg_listener = TcpListener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
e
})?;

View File

@@ -10,7 +10,7 @@ use zenith_utils::lsn::Lsn;
use crate::safekeeper::AcceptorState;
use crate::timeline::CreateControlFile;
use crate::timeline::GlobalTimelines;
use crate::WalAcceptorConf;
use crate::SafeKeeperConf;
use zenith_utils::http::endpoint;
use zenith_utils::http::error::ApiError;
use zenith_utils::http::json::json_response;
@@ -22,9 +22,9 @@ async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
Ok(json_response(StatusCode::OK, "")?)
}
fn get_conf(request: &Request<Body>) -> &WalAcceptorConf {
fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
request
.data::<Arc<WalAcceptorConf>>()
.data::<Arc<SafeKeeperConf>>()
.expect("unknown state type")
.as_ref()
}
@@ -49,6 +49,8 @@ struct TimelineStatus {
commit_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
truncate_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
flush_lsn: Lsn,
}
/// Report info about timeline.
@@ -64,6 +66,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
)
.map_err(ApiError::from_err)?;
let sk_state = tli.get_info();
let (flush_lsn, _) = tli.get_end_of_wal();
let status = TimelineStatus {
tenant_id,
@@ -71,12 +74,13 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
acceptor_state: sk_state.acceptor_state,
commit_lsn: sk_state.commit_lsn,
truncate_lsn: sk_state.truncate_lsn,
flush_lsn,
};
Ok(json_response(StatusCode::OK, status)?)
}
/// Safekeeper http router.
pub fn make_router(conf: WalAcceptorConf) -> RouterBuilder<hyper::Body, ApiError> {
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
let router = endpoint::make_router();
router
.data(Arc::new(conf))

View File

@@ -23,8 +23,15 @@ pub mod defaults {
}
#[derive(Debug, Clone)]
pub struct WalAcceptorConf {
pub data_dir: PathBuf,
pub struct SafeKeeperConf {
// Repository directory, relative to current working directory.
// Normally, the safekeeper changes the current working directory
// to the repository, and 'workdir' is always '.'. But we don't do
// that during unit testing, because the current directory is global
// to the process but different unit tests work on different
// data directories to avoid clashing with each other.
pub workdir: PathBuf,
pub daemonize: bool,
pub no_sync: bool,
pub listen_pg_addr: String,

View File

@@ -16,7 +16,7 @@ use crate::safekeeper::ProposerAcceptorMessage;
use crate::send_wal::SendWalHandler;
use crate::timeline::TimelineTools;
use crate::WalAcceptorConf;
use crate::SafeKeeperConf;
use zenith_utils::connstring::connection_host_port;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{BeMessage, FeMessage};
@@ -33,7 +33,7 @@ pub struct ReceiveWalConn<'pg> {
/// Periodically request pageserver to call back.
/// If pageserver already has replication channel, it will just ignore this request
///
fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
fn request_callback(conf: SafeKeeperConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
let ps_addr = conf.pageserver_addr.unwrap();
let ps_connstr = format!(
"postgresql://no_user:{}@{}/no_db",

View File

@@ -2,7 +2,7 @@
//! with the "START_REPLICATION" message.
use crate::send_wal::SendWalHandler;
use crate::timeline::{Timeline, TimelineTools};
use crate::timeline::{ReplicaState, Timeline, TimelineTools};
use anyhow::{anyhow, Context, Result};
use bytes::Bytes;
use log::*;
@@ -20,7 +20,7 @@ use std::{str, thread};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{BeMessage, FeMessage, XLogDataBody};
use zenith_utils::pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody};
use zenith_utils::sock_split::ReadStream;
pub const END_REPLICATION_MARKER: Lsn = Lsn::MAX;
@@ -32,7 +32,7 @@ const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r';
type FullTransactionId = u64;
/// Hot standby feedback received from replica
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct HotStandbyFeedback {
pub ts: TimestampTz,
pub xmin: FullTransactionId,
@@ -49,6 +49,16 @@ impl HotStandbyFeedback {
}
}
/// Standby status update
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StandbyReply {
pub write_lsn: Lsn, // disk consistent LSN
pub flush_lsn: Lsn, // LSN committed by quorum
pub apply_lsn: Lsn, // not used
pub reply_ts: TimestampTz,
pub reply_requested: bool,
}
/// A network connection that's speaking the replication protocol.
pub struct ReplicationConn {
/// This is an `Option` because we will spawn a background thread that will
@@ -56,16 +66,15 @@ pub struct ReplicationConn {
stream_in: Option<ReadStream>,
}
// TODO: move this to crate::timeline when there's more users
// TODO: design a proper Timeline mock api
trait HsFeedbackSubscriber {
fn add_hs_feedback(&self, _feedback: HotStandbyFeedback) {}
/// Scope guard to unregister replication connection from timeline
struct ReplicationConnGuard {
replica: usize, // replica internal ID assigned by timeline
timeline: Arc<Timeline>,
}
impl HsFeedbackSubscriber for Arc<Timeline> {
#[inline(always)]
fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
Timeline::add_hs_feedback(self, feedback);
impl Drop for ReplicationConnGuard {
fn drop(&mut self) {
self.timeline.update_replica_state(self.replica, None);
}
}
@@ -79,26 +88,33 @@ impl ReplicationConn {
/// Handle incoming messages from the network.
/// This is spawned into the background by `handle_start_replication`.
fn background_thread(
mut stream_in: impl Read,
subscriber: impl HsFeedbackSubscriber,
) -> Result<()> {
fn background_thread(mut stream_in: impl Read, timeline: Arc<Timeline>) -> Result<()> {
let mut state = ReplicaState::new();
let replica = timeline.add_replica(state);
let _guard = ReplicationConnGuard {
replica,
timeline: timeline.clone(),
};
// Wait for replica's feedback.
while let Some(msg) = FeMessage::read(&mut stream_in)? {
match &msg {
FeMessage::CopyData(m) => {
// There are two possible data messages that the client is supposed to send here:
// `HotStandbyFeedback` and `StandbyStatusUpdate`. We only handle hot standby
// feedback.
// `HotStandbyFeedback` and `StandbyStatusUpdate`.
match m.first().cloned() {
Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
// Note: deserializing is on m[1..] because we skip the tag byte.
let feedback = HotStandbyFeedback::des(&m[1..])
state.hs_feedback = HotStandbyFeedback::des(&m[1..])
.context("failed to deserialize HotStandbyFeedback")?;
subscriber.add_hs_feedback(feedback);
timeline.update_replica_state(replica, Some(state));
}
Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
let reply = StandbyReply::des(&m[1..])
.context("failed to deserialize StandbyReply")?;
state.disk_consistent_lsn = reply.write_lsn;
timeline.update_replica_state(replica, Some(state));
}
Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => (),
_ => warn!("unexpected message {:?}", msg),
}
}
@@ -187,7 +203,7 @@ impl ReplicationConn {
// switch to copy
pgb.write_message(&BeMessage::CopyBothResponse)?;
let mut end_pos: Lsn;
let mut end_pos = Lsn(0);
let mut wal_file: Option<File> = None;
loop {
@@ -202,7 +218,18 @@ impl ReplicationConn {
} else {
/* normal mode */
let timeline = swh.timeline.get();
end_pos = timeline.wait_for_lsn(start_pos);
if let Some(lsn) = timeline.wait_for_lsn(start_pos) {
end_pos = lsn
} else {
// timeout expired: request pageserver status
pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
sent_ptr: end_pos.0,
timestamp: get_current_timestamp(),
request_reply: true,
}))
.context("Failed to send KeepAlive message")?;
continue;
}
}
if end_pos == END_REPLICATION_MARKER {
break;
@@ -216,7 +243,7 @@ impl ReplicationConn {
let segno = start_pos.segment_number(wal_seg_size);
let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
let timeline_id = swh.timeline.get().timelineid.to_string();
let wal_file_path = swh.conf.data_dir.join(timeline_id).join(wal_file_name);
let wal_file_path = swh.conf.workdir.join(timeline_id).join(wal_file_name);
Self::open_wal_file(&wal_file_path)?
}
};
@@ -257,18 +284,3 @@ impl ReplicationConn {
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
// A no-op impl for tests
impl HsFeedbackSubscriber for () {}
#[test]
fn test_replication_conn_background_thread_eof() {
// Test that background_thread recognizes EOF
let stream: &[u8] = &[];
ReplicationConn::background_thread(stream, ()).unwrap();
}
}

View File

@@ -18,9 +18,9 @@ use tokio::runtime;
use tokio::time::sleep;
use walkdir::WalkDir;
use crate::WalAcceptorConf;
use crate::SafeKeeperConf;
pub fn thread_main(conf: WalAcceptorConf) {
pub fn thread_main(conf: SafeKeeperConf) {
// Create a new thread pool
//
// FIXME: keep it single-threaded for now, make it easier to debug with gdb,
@@ -42,7 +42,7 @@ async fn offload_files(
bucket: &Bucket,
listing: &HashSet<String>,
dir_path: &Path,
conf: &WalAcceptorConf,
conf: &SafeKeeperConf,
) -> Result<u64> {
let horizon = SystemTime::now() - conf.ttl.unwrap();
let mut n: u64 = 0;
@@ -54,7 +54,7 @@ async fn offload_files(
&& IsXLogFileName(entry.file_name().to_str().unwrap())
&& entry.metadata().unwrap().created().unwrap() <= horizon
{
let relpath = path.strip_prefix(&conf.data_dir).unwrap();
let relpath = path.strip_prefix(&conf.workdir).unwrap();
let s3path = String::from("walarchive/") + relpath.to_str().unwrap();
if !listing.contains(&s3path) {
let mut file = File::open(&path)?;
@@ -70,7 +70,7 @@ async fn offload_files(
Ok(n)
}
async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
async fn main_loop(conf: &SafeKeeperConf) -> Result<()> {
let region = Region::Custom {
region: env::var("S3_REGION").unwrap(),
endpoint: env::var("S3_ENDPOINT").unwrap(),
@@ -97,7 +97,7 @@ async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
.flat_map(|b| b.contents.iter().map(|o| o.key.clone()))
.collect();
let n = offload_files(&bucket, &listing, &conf.data_dir, conf).await?;
let n = offload_files(&bucket, &listing, &conf.workdir, conf).await?;
info!("Offload {} files to S3", n);
sleep(conf.ttl.unwrap()).await;
}

View File

@@ -19,7 +19,10 @@ use lazy_static::lazy_static;
use crate::replication::HotStandbyFeedback;
use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec};
use zenith_metrics::{
register_gauge_vec, register_histogram_vec, Gauge, GaugeVec, Histogram, HistogramVec,
DISK_WRITE_SECONDS_BUCKETS,
};
use zenith_utils::bin_ser::LeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::pq_proto::SystemId;
@@ -191,6 +194,8 @@ pub struct AppendResponse {
// We report back our awareness about which WAL is committed, as this is
// a criterion for walproposer --sync mode exit
pub commit_lsn: Lsn,
// Min disk consistent lsn of pageservers (portion of WAL applied and written to the disk by pageservers)
pub disk_consistent_lsn: Lsn,
pub hs_feedback: HotStandbyFeedback,
}
@@ -297,11 +302,27 @@ lazy_static! {
&["ztli"]
)
.expect("Failed to register safekeeper_commit_lsn gauge vec");
static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!(
"safekeeper_write_wal_bytes",
"Bytes written to WAL in a single request, grouped by timeline",
&["timeline_id"],
vec![1.0, 10.0, 100.0, 1024.0, 8192.0, 128.0 * 1024.0, 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0]
)
.expect("Failed to register safekeeper_write_wal_bytes histogram vec");
static ref WRITE_WAL_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_write_wal_seconds",
"Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline",
&["timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_write_wal_seconds histogram vec");
}
struct SafeKeeperMetrics {
flush_lsn: Gauge,
commit_lsn: Gauge,
write_wal_bytes: Histogram,
write_wal_seconds: Histogram,
}
impl SafeKeeperMetrics {
@@ -310,6 +331,8 @@ impl SafeKeeperMetrics {
SafeKeeperMetrics {
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&ztli_str]),
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&ztli_str]),
write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&ztli_str]),
write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&ztli_str]),
}
}
@@ -317,6 +340,8 @@ impl SafeKeeperMetrics {
SafeKeeperMetrics {
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&["n/a"]),
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&["n/a"]),
write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&["n/a"]),
write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&["n/a"]),
}
}
}
@@ -458,6 +483,7 @@ where
epoch: self.s.acceptor_state.epoch,
commit_lsn: Lsn(0),
flush_lsn: Lsn(0),
disk_consistent_lsn: Lsn(0),
hs_feedback: HotStandbyFeedback::empty(),
};
return Ok(AcceptorProposerMessage::AppendResponse(resp));
@@ -469,8 +495,14 @@ where
// do the job
let mut last_rec_lsn = Lsn(0);
if !msg.wal_data.is_empty() {
self.storage
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
self.metrics
.write_wal_bytes
.observe(msg.wal_data.len() as f64);
{
let _timer = self.metrics.write_wal_seconds.start_timer();
self.storage
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
}
// figure out last record's end lsn for reporting (if we got the
// whole record)
@@ -567,6 +599,7 @@ where
epoch: self.s.acceptor_state.epoch,
flush_lsn: self.flush_lsn,
commit_lsn: self.s.commit_lsn,
disk_consistent_lsn: Lsn(0),
// will be filled by caller code to avoid bothering safekeeper
hs_feedback: HotStandbyFeedback::empty(),
};

View File

@@ -1,4 +1,4 @@
//! Part of WAL acceptor pretending to be Postgres, streaming xlog to
//! Part of Safekeeper pretending to be Postgres, streaming xlog to
//! pageserver/any other consumer.
//!
@@ -6,7 +6,7 @@ use crate::json_ctrl::handle_json_ctrl;
use crate::receive_wal::ReceiveWalConn;
use crate::replication::ReplicationConn;
use crate::timeline::{Timeline, TimelineTools};
use crate::WalAcceptorConf;
use crate::SafeKeeperConf;
use anyhow::{anyhow, bail, Result};
use bytes::Bytes;
use std::str::FromStr;
@@ -20,7 +20,7 @@ use crate::timeline::CreateControlFile;
/// Handler for streaming WAL from acceptor
pub struct SendWalHandler {
pub conf: WalAcceptorConf,
pub conf: SafeKeeperConf,
/// assigned application name
pub appname: Option<String>,
pub tenantid: Option<ZTenantId>,
@@ -85,7 +85,7 @@ impl postgres_backend::Handler for SendWalHandler {
}
impl SendWalHandler {
pub fn new(conf: WalAcceptorConf) -> Self {
pub fn new(conf: SafeKeeperConf) -> Self {
SendWalHandler {
conf,
appname: None,

View File

@@ -11,9 +11,10 @@ use std::collections::HashMap;
use std::fs::{self, File, OpenOptions};
use std::io::{Seek, SeekFrom, Write};
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;
use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS};
use zenith_utils::bin_ser::LeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER};
@@ -21,10 +22,39 @@ use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo,
Storage, SK_FORMAT_VERSION, SK_MAGIC,
};
use crate::WalAcceptorConf;
use crate::SafeKeeperConf;
use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};
const CONTROL_FILE_NAME: &str = "safekeeper.control";
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
/// Replica status: hot standby feedback + disk consistent LSN
#[derive(Debug, Clone, Copy)]
pub struct ReplicaState {
/// combined disk_consistent_lsn of pageservers
pub disk_consistent_lsn: Lsn,
/// combined hot standby feedback from all replicas
pub hs_feedback: HotStandbyFeedback,
}
impl Default for ReplicaState {
fn default() -> Self {
Self::new()
}
}
impl ReplicaState {
pub fn new() -> ReplicaState {
ReplicaState {
disk_consistent_lsn: Lsn(u64::MAX),
hs_feedback: HotStandbyFeedback {
ts: 0,
xmin: u64::MAX,
catalog_xmin: u64::MAX,
},
}
}
}
/// Shared state associated with database instance (tenant)
struct SharedState {
@@ -33,8 +63,8 @@ struct SharedState {
/// For receiving-sending wal cooperation
/// quorum commit LSN we've notified walsenders about
notified_commit_lsn: Lsn,
/// combined hot standby feedback from all replicas
hs_feedback: HotStandbyFeedback,
/// State of replicas
replicas: Vec<Option<ReplicaState>>,
}
// A named boolean.
@@ -44,23 +74,70 @@ pub enum CreateControlFile {
False,
}
lazy_static! {
static ref PERSIST_SYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_persist_sync_control_file_seconds",
"Seconds to persist and sync control file, grouped by timeline",
&["timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_persist_sync_control_file_seconds histogram vec");
static ref PERSIST_NOSYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_persist_nosync_control_file_seconds",
"Seconds to persist control file without sync, grouped by timeline",
&["timeline_id"],
DISK_WRITE_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_persist_nosync_control_file_seconds histogram vec");
}
impl SharedState {
/// Get the combined state of all live replicas
pub fn get_replicas_state(&self) -> ReplicaState {
let mut acc = ReplicaState::new();
for state in self.replicas.iter().flatten() {
acc.hs_feedback.ts = max(acc.hs_feedback.ts, state.hs_feedback.ts);
acc.hs_feedback.xmin = min(acc.hs_feedback.xmin, state.hs_feedback.xmin);
acc.hs_feedback.catalog_xmin =
min(acc.hs_feedback.catalog_xmin, state.hs_feedback.catalog_xmin);
acc.disk_consistent_lsn = Lsn::min(acc.disk_consistent_lsn, state.disk_consistent_lsn);
}
acc
}
/// Assign a new replica ID. We choose the first empty slot in the replicas vector,
/// or extend the vector if there are no free slots.
pub fn add_replica(&mut self, state: ReplicaState) -> usize {
if let Some(pos) = self.replicas.iter().position(|r| r.is_none()) {
self.replicas[pos] = Some(state);
return pos;
}
let pos = self.replicas.len();
self.replicas.push(Some(state));
pos
}
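
The min/max rules in get_replicas_state() above keep the most conservative value across all live replicas: the oldest xmin and catalog_xmin, the smallest disk_consistent_lsn, and the newest feedback timestamp. A standalone sketch of the same aggregation, with field types simplified to u64 (these are not the project's structs):

#[derive(Clone, Copy)]
struct ReplicaSnapshot {
    ts: u64,
    xmin: u64,
    catalog_xmin: u64,
    disk_consistent_lsn: u64,
}

fn combine(replicas: &[ReplicaSnapshot]) -> ReplicaSnapshot {
    // Start from the identity elements of the folds below.
    let mut acc = ReplicaSnapshot {
        ts: 0,
        xmin: u64::MAX,
        catalog_xmin: u64::MAX,
        disk_consistent_lsn: u64::MAX,
    };
    for r in replicas {
        acc.ts = acc.ts.max(r.ts);
        acc.xmin = acc.xmin.min(r.xmin);
        acc.catalog_xmin = acc.catalog_xmin.min(r.catalog_xmin);
        acc.disk_consistent_lsn = acc.disk_consistent_lsn.min(r.disk_consistent_lsn);
    }
    acc
}
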
/// Restore SharedState from control file. Locks the control file along the
/// way to prevent running more than one instance of safekeeper on the same
/// data dir.
/// If create=false and file doesn't exist, bails out.
fn create_restore(
conf: &WalAcceptorConf,
conf: &SafeKeeperConf,
timelineid: ZTimelineId,
create: CreateControlFile,
) -> Result<Self> {
let (cf, state) = SharedState::load_control_file(conf, timelineid, create)?;
let timelineid_str = format!("{}", timelineid);
let storage = FileStorage {
control_file: cf,
conf: conf.clone(),
persist_sync_control_file_seconds: PERSIST_SYNC_CONTROL_FILE_SECONDS
.with_label_values(&[&timelineid_str]),
persist_nosync_control_file_seconds: PERSIST_NOSYNC_CONTROL_FILE_SECONDS
.with_label_values(&[&timelineid_str]),
};
let (flush_lsn, tli) = if state.server.wal_seg_size != 0 {
let wal_dir = conf.data_dir.join(format!("{}", timelineid));
let wal_dir = conf.workdir.join(format!("{}", timelineid));
find_end_of_wal(
&wal_dir,
state.server.wal_seg_size as usize,
@@ -74,30 +151,19 @@ impl SharedState {
Ok(Self {
notified_commit_lsn: Lsn(0),
sk: SafeKeeper::new(Lsn(flush_lsn), tli, storage, state),
hs_feedback: HotStandbyFeedback {
ts: 0,
xmin: u64::MAX,
catalog_xmin: u64::MAX,
},
replicas: Vec::new(),
})
}
/// Accumulate hot standby feedbacks from replicas
pub fn add_hs_feedback(&mut self, feedback: HotStandbyFeedback) {
self.hs_feedback.xmin = min(self.hs_feedback.xmin, feedback.xmin);
self.hs_feedback.catalog_xmin = min(self.hs_feedback.catalog_xmin, feedback.catalog_xmin);
self.hs_feedback.ts = max(self.hs_feedback.ts, feedback.ts);
}
/// Fetch and lock control file (prevent running more than one instance of safekeeper)
/// If create=false and file doesn't exist, bails out.
fn load_control_file(
conf: &WalAcceptorConf,
conf: &SafeKeeperConf,
timelineid: ZTimelineId,
create: CreateControlFile,
) -> Result<(File, SafeKeeperState)> {
let control_file_path = conf
.data_dir
.workdir
.join(timelineid.to_string())
.join(CONTROL_FILE_NAME);
info!(
@@ -178,20 +244,27 @@ impl Timeline {
}
}
/// Wait for an LSN to be committed.
/// Timed wait for an LSN to be committed.
///
/// Returns the last committed LSN, which will be at least
/// as high as the LSN waited for.
/// as high as the LSN waited for, or None if timeout expired.
///
pub fn wait_for_lsn(&self, lsn: Lsn) -> Lsn {
pub fn wait_for_lsn(&self, lsn: Lsn) -> Option<Lsn> {
let mut shared_state = self.mutex.lock().unwrap();
loop {
let commit_lsn = shared_state.notified_commit_lsn;
// This must be `>`, not `>=`.
if commit_lsn > lsn {
return commit_lsn;
return Some(commit_lsn);
}
shared_state = self.cond.wait(shared_state).unwrap();
let result = self
.cond
.wait_timeout(shared_state, POLL_STATE_TIMEOUT)
.unwrap();
if result.1.timed_out() {
return None;
}
shared_state = result.0
}
}
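
wait_for_lsn() now polls with a bounded wait instead of blocking on the condition variable indefinitely, returning None when POLL_STATE_TIMEOUT elapses. A minimal sketch of the same loop built on std::sync::Condvar::wait_timeout (types simplified; this is not the project's Timeline):

use std::sync::{Condvar, Mutex};
use std::time::Duration;

struct Notifier {
    commit_lsn: Mutex<u64>,
    cond: Condvar,
}

impl Notifier {
    /// Return the committed position once it advances past `lsn`,
    /// or None if the timeout expires first.
    fn wait_past(&self, lsn: u64, timeout: Duration) -> Option<u64> {
        let mut cur = self.commit_lsn.lock().unwrap();
        loop {
            if *cur > lsn {
                return Some(*cur);
            }
            let (guard, res) = self.cond.wait_timeout(cur, timeout).unwrap();
            if res.timed_out() {
                return None;
            }
            cur = guard;
        }
    }
}
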
@@ -219,9 +292,11 @@ impl Timeline {
// commit_lsn if we are catching up safekeeper.
commit_lsn = shared_state.sk.commit_lsn;
// if this is AppendResponse, fill in proper hot standby feedback
// if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
if let AcceptorProposerMessage::AppendResponse(ref mut resp) = rmsg {
resp.hs_feedback = shared_state.hs_feedback.clone();
let state = shared_state.get_replicas_state();
resp.hs_feedback = state.hs_feedback;
resp.disk_consistent_lsn = state.disk_consistent_lsn;
}
}
// Ping wal sender that new data might be available.
@@ -233,15 +308,14 @@ impl Timeline {
self.mutex.lock().unwrap().sk.s.clone()
}
// Accumulate hot standby feedbacks from replicas
pub fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
pub fn add_replica(&self, state: ReplicaState) -> usize {
let mut shared_state = self.mutex.lock().unwrap();
shared_state.add_hs_feedback(feedback);
shared_state.add_replica(state)
}
pub fn get_hs_feedback(&self) -> HotStandbyFeedback {
let shared_state = self.mutex.lock().unwrap();
shared_state.hs_feedback.clone()
pub fn update_replica_state(&self, id: usize, state: Option<ReplicaState>) {
let mut shared_state = self.mutex.lock().unwrap();
shared_state.replicas[id] = state;
}
pub fn get_end_of_wal(&self) -> (Lsn, u32) {
@@ -254,7 +328,7 @@ impl Timeline {
pub trait TimelineTools {
fn set(
&mut self,
conf: &WalAcceptorConf,
conf: &SafeKeeperConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
create: CreateControlFile,
@@ -266,7 +340,7 @@ pub trait TimelineTools {
impl TimelineTools for Option<Arc<Timeline>> {
fn set(
&mut self,
conf: &WalAcceptorConf,
conf: &SafeKeeperConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
create: CreateControlFile,
@@ -295,7 +369,7 @@ impl GlobalTimelines {
/// Get a timeline with control file loaded from the global TIMELINES map.
/// If control file doesn't exist and create=false, bails out.
pub fn get(
conf: &WalAcceptorConf,
conf: &SafeKeeperConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
create: CreateControlFile,
@@ -324,11 +398,19 @@ impl GlobalTimelines {
#[derive(Debug)]
struct FileStorage {
control_file: File,
conf: WalAcceptorConf,
conf: SafeKeeperConf,
persist_sync_control_file_seconds: Histogram,
persist_nosync_control_file_seconds: Histogram,
}
impl Storage for FileStorage {
fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()> {
let _timer = if sync {
&self.persist_sync_control_file_seconds
} else {
&self.persist_nosync_control_file_seconds
}
.start_timer();
self.control_file.seek(SeekFrom::Start(0))?;
s.ser_into(&mut self.control_file)?;
if sync {
@@ -368,12 +450,12 @@ impl Storage for FileStorage {
let wal_file_name = XLogFileName(server.tli, segno, wal_seg_size);
let wal_file_path = self
.conf
.data_dir
.workdir
.join(ztli.to_string())
.join(wal_file_name.clone());
let wal_file_partial_path = self
.conf
.data_dir
.workdir
.join(ztli.to_string())
.join(wal_file_name.clone() + ".partial");

View File

@@ -8,11 +8,11 @@ use std::net::{TcpListener, TcpStream};
use std::thread;
use crate::send_wal::SendWalHandler;
use crate::WalAcceptorConf;
use crate::SafeKeeperConf;
use zenith_utils::postgres_backend::{AuthType, PostgresBackend};
/// Accept incoming TCP connections and spawn them into a background thread.
pub fn thread_main(conf: WalAcceptorConf, listener: TcpListener) -> Result<()> {
pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
loop {
match listener.accept() {
Ok((socket, peer_addr)) => {
@@ -31,7 +31,7 @@ pub fn thread_main(conf: WalAcceptorConf, listener: TcpListener) -> Result<()> {
/// This is run by `thread_main` above, inside a background thread.
///
fn handle_socket(socket: TcpStream, conf: WalAcceptorConf) -> Result<()> {
fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> {
socket.set_nodelay(true)?;
let mut conn_handler = SendWalHandler::new(conf);
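
The hunk above only renames the config type; the surrounding accept loop keeps the same shape: accept a connection, then hand it to a background thread. A self-contained sketch of that pattern (handler body elided; names are illustrative):

use std::net::{TcpListener, TcpStream};
use std::thread;

fn accept_loop(listener: TcpListener) -> std::io::Result<()> {
    loop {
        let (socket, peer_addr) = listener.accept()?;
        thread::spawn(move || {
            if let Err(e) = handle_socket(socket) {
                eprintln!("connection from {} failed: {}", peer_addr, e);
            }
        });
    }
}

fn handle_socket(socket: TcpStream) -> std::io::Result<()> {
    socket.set_nodelay(true)?;
    // ... per-connection protocol handling would go here ...
    Ok(())
}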

View File

@@ -15,6 +15,7 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbf
# FIXME: 'pageserver' is needed for BranchInfo. Refactor
pageserver = { path = "../pageserver" }
control_plane = { path = "../control_plane" }
walkeeper = { path = "../walkeeper" }
postgres_ffi = { path = "../postgres_ffi" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }

View File

@@ -1,19 +1,54 @@
use anyhow::anyhow;
use anyhow::{anyhow, bail};
use anyhow::{Context, Result};
use clap::{App, AppSettings, Arg, ArgMatches, SubCommand};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env;
use control_plane::local_env::LocalEnv;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage::PageServerNode;
use pageserver::defaults::{DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_PORT};
use pageserver::defaults::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
};
use std::collections::HashMap;
use std::process::exit;
use std::str::FromStr;
use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
use walkeeper::defaults::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
};
use zenith_utils::auth::{Claims, Scope};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use pageserver::branches::BranchInfo;
// Default name of a safekeeper node, if not specified on the command line.
const DEFAULT_SAFEKEEPER_NAME: &str = "single";
fn default_conf() -> String {
format!(
r#"
# Default built-in configuration, defined in main.rs
[pageserver]
pg_port = {pageserver_pg_port}
http_port = {pageserver_http_port}
auth_type = '{pageserver_auth_type}'
[[safekeepers]]
name = '{safekeeper_name}'
pg_port = {safekeeper_pg_port}
http_port = {safekeeper_http_port}
"#,
pageserver_pg_port = DEFAULT_PAGESERVER_PG_PORT,
pageserver_http_port = DEFAULT_PAGESERVER_HTTP_PORT,
pageserver_auth_type = AuthType::Trust,
safekeeper_name = DEFAULT_SAFEKEEPER_NAME,
safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT,
safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT,
)
}
///
/// Branches tree element used as a value in the HashMap.
///
@@ -32,12 +67,21 @@ struct BranchTreeEl {
// * Providing CLI api to the pageserver
// * TODO: export/import to/from usual postgres
fn main() -> Result<()> {
let timeline_arg = Arg::with_name("timeline")
.short("n")
let pg_node_arg = Arg::with_name("node")
.index(1)
.help("Timeline name")
.help("Node name")
.required(true);
let safekeeper_node_arg = Arg::with_name("node")
.index(1)
.help("Node name")
.required(false);
let timeline_arg = Arg::with_name("timeline")
.index(2)
.help("Branch name or a point-in time specification")
.required(false);
let tenantid_arg = Arg::with_name("tenantid")
.long("tenantid")
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
@@ -49,29 +93,25 @@ fn main() -> Result<()> {
.required(false)
.value_name("port");
let stop_mode_arg = Arg::with_name("stop-mode")
.short("m")
.takes_value(true)
.possible_values(&["fast", "immediate"])
.help("If 'immediate', don't flush repository data at shutdown")
.required(false)
.value_name("stop-mode");
let matches = App::new("Zenith CLI")
.setting(AppSettings::ArgRequiredElseHelp)
.subcommand(
SubCommand::with_name("init")
.about("Initialize a new Zenith repository")
.arg(
Arg::with_name("pageserver-pg-port")
.long("pageserver-pg-port")
Arg::with_name("config")
.long("config")
.required(false)
.value_name("pageserver-pg-port"),
.value_name("config"),
)
.arg(
Arg::with_name("pageserver-http-port")
.long("pageserver-http-port")
.required(false)
.value_name("pageserver-http-port"),
)
.arg(
Arg::with_name("enable-auth")
.long("enable-auth")
.takes_value(false)
.help("Enable authentication using ZenithJWT")
),
)
.subcommand(
SubCommand::with_name("branch")
@@ -86,15 +126,35 @@ fn main() -> Result<()> {
.subcommand(SubCommand::with_name("list"))
.subcommand(SubCommand::with_name("create").arg(Arg::with_name("tenantid").required(false).index(1)))
)
.subcommand(SubCommand::with_name("status"))
.subcommand(SubCommand::with_name("start").about("Start local pageserver"))
.subcommand(SubCommand::with_name("stop").about("Stop local pageserver")
.arg(Arg::with_name("immediate")
.help("Don't flush repository data at shutdown")
.required(false)
)
.subcommand(
SubCommand::with_name("pageserver")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage pageserver")
.subcommand(SubCommand::with_name("status"))
.subcommand(SubCommand::with_name("start").about("Start local pageserver"))
.subcommand(SubCommand::with_name("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
.subcommand(SubCommand::with_name("restart").about("Restart local pageserver"))
)
.subcommand(
SubCommand::with_name("safekeeper")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage safekeepers")
.subcommand(SubCommand::with_name("start")
.about("Start local safekeeper")
.arg(safekeeper_node_arg.clone())
)
.subcommand(SubCommand::with_name("stop")
.about("Stop local safekeeper")
.arg(safekeeper_node_arg.clone())
.arg(stop_mode_arg.clone())
)
.subcommand(SubCommand::with_name("restart")
.about("Restart local safekeeper")
.arg(safekeeper_node_arg.clone())
.arg(stop_mode_arg.clone())
)
)
.subcommand(SubCommand::with_name("restart").about("Restart local pageserver"))
.subcommand(
SubCommand::with_name("pg")
.setting(AppSettings::ArgRequiredElseHelp)
@@ -102,7 +162,10 @@ fn main() -> Result<()> {
.subcommand(SubCommand::with_name("list").arg(tenantid_arg.clone()))
.subcommand(SubCommand::with_name("create")
.about("Create a postgres compute node")
.arg(timeline_arg.clone()).arg(tenantid_arg.clone()).arg(port_arg.clone())
.arg(pg_node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(port_arg.clone())
.arg(
Arg::with_name("config-only")
.help("Don't do basebackup, create compute node with only config files")
@@ -111,13 +174,13 @@ fn main() -> Result<()> {
))
.subcommand(SubCommand::with_name("start")
.about("Start a postgres compute node.\n This command actually creates a new node from scratch, but preserves existing config files")
.arg(
timeline_arg.clone()
).arg(
tenantid_arg.clone()
).arg(port_arg.clone()))
.arg(pg_node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(port_arg.clone()))
.subcommand(
SubCommand::with_name("stop")
.arg(pg_node_arg.clone())
.arg(timeline_arg.clone())
.arg(tenantid_arg.clone())
.arg(
@@ -129,116 +192,49 @@ fn main() -> Result<()> {
)
)
.subcommand(
SubCommand::with_name("start")
.about("Start page server and safekeepers")
)
.subcommand(
SubCommand::with_name("stop")
.about("Stop page server and safekeepers")
.arg(stop_mode_arg.clone())
)
.get_matches();
// Create config file
if let ("init", Some(init_match)) = matches.subcommand() {
let tenantid = ZTenantId::generate();
let pageserver_pg_port = match init_match.value_of("pageserver-pg-port") {
Some(v) => v.parse()?,
None => DEFAULT_PG_LISTEN_PORT,
};
let pageserver_http_port = match init_match.value_of("pageserver-http-port") {
Some(v) => v.parse()?,
None => DEFAULT_HTTP_LISTEN_PORT,
let (sub_name, sub_args) = matches.subcommand();
let sub_args = sub_args.expect("no subcommand");
// Check for 'zenith init' command first.
let subcmd_result = if sub_name == "init" {
handle_init(sub_args)
} else {
// all other commands need an existing config
let env = match LocalEnv::load_config() {
Ok(conf) => conf,
Err(e) => {
eprintln!("Error loading config: {}", e);
exit(1);
}
};
let auth_type = if init_match.is_present("enable-auth") {
AuthType::ZenithJWT
} else {
AuthType::Trust
};
local_env::init(
pageserver_pg_port,
pageserver_http_port,
tenantid,
auth_type,
)
.with_context(|| "Failed to create config file")?;
match sub_name {
"tenant" => handle_tenant(sub_args, &env),
"branch" => handle_branch(sub_args, &env),
"start" => handle_start_all(sub_args, &env),
"stop" => handle_stop_all(sub_args, &env),
"pageserver" => handle_pageserver(sub_args, &env),
"pg" => handle_pg(sub_args, &env),
"safekeeper" => handle_safekeeper(sub_args, &env),
_ => bail!("unexpected subcommand {}", sub_name),
}
};
if let Err(e) = subcmd_result {
eprintln!("command failed: {}", e);
exit(1);
}
// all other commands would need config
let env = match local_env::load_config() {
Ok(conf) => conf,
Err(e) => {
eprintln!("Error loading config: {}", e);
exit(1);
}
};
match matches.subcommand() {
("init", Some(init_match)) => {
let pageserver = PageServerNode::from_env(&env);
if let Err(e) = pageserver.init(
Some(&env.tenantid.to_string()),
init_match.is_present("enable-auth"),
) {
eprintln!("pageserver init failed: {}", e);
exit(1);
}
}
("tenant", Some(args)) => {
if let Err(e) = handle_tenant(args, &env) {
eprintln!("tenant command failed: {}", e);
exit(1);
}
}
("branch", Some(sub_args)) => {
if let Err(e) = handle_branch(sub_args, &env) {
eprintln!("branch command failed: {}", e);
exit(1);
}
}
("start", Some(_sub_m)) => {
let pageserver = PageServerNode::from_env(&env);
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
("stop", Some(stop_match)) => {
let pageserver = PageServerNode::from_env(&env);
let immediate = stop_match.is_present("immediate");
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
}
("restart", Some(_sub_m)) => {
let pageserver = PageServerNode::from_env(&env);
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
("status", Some(_sub_m)) => {}
("pg", Some(pg_match)) => {
if let Err(e) = handle_pg(pg_match, &env) {
eprintln!("pg operation failed: {:?}", e);
exit(1);
}
}
_ => {}
};
Ok(())
}
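
The rewritten main() above replaces per-subcommand if-let blocks with a single (name, args) dispatch and routes errors through one exit path. A stripped-down sketch of that clap 2.x pattern (subcommand names here are placeholders):

use clap::{App, AppSettings, SubCommand};

fn main() {
    let matches = App::new("example")
        .setting(AppSettings::ArgRequiredElseHelp)
        .subcommand(SubCommand::with_name("start"))
        .subcommand(SubCommand::with_name("stop"))
        .get_matches();

    // subcommand() yields the chosen name plus its own ArgMatches.
    let (sub_name, _sub_args) = matches.subcommand();
    match sub_name {
        "start" => println!("starting"),
        "stop" => println!("stopping"),
        other => eprintln!("unexpected subcommand {}", other),
    }
}
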
@@ -373,12 +369,52 @@ fn get_branch_infos(
Ok(branch_infos)
}
// Helper function to parse --tenantid option, or get the default from config file
fn get_tenantid(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<ZTenantId> {
if let Some(tenantid_cmd) = sub_match.value_of("tenantid") {
Ok(ZTenantId::from_str(tenantid_cmd)?)
} else if let Some(tenantid_conf) = env.default_tenantid {
Ok(tenantid_conf)
} else {
bail!("No tenantid. Use --tenantid, or set 'default_tenantid' in the config file");
}
}
fn handle_init(init_match: &ArgMatches) -> Result<()> {
// Create config file
let toml_file: String = if let Some(config_path) = init_match.value_of("config") {
// load and parse the file
std::fs::read_to_string(std::path::Path::new(config_path))
.with_context(|| format!("Could not read configuration file \"{}\"", config_path))?
} else {
// Built-in default config
default_conf()
};
let mut env = LocalEnv::create_config(&toml_file)
.with_context(|| "Failed to create zenith configuration")?;
env.init()
.with_context(|| "Failed to initialize zenith repository")?;
// Call 'pageserver init'.
let pageserver = PageServerNode::from_env(&env);
if let Err(e) = pageserver.init(
// default_tenantid was generated by the `env.init()` call above
Some(&env.default_tenantid.unwrap().to_string()),
) {
eprintln!("pageserver init failed: {}", e);
exit(1);
}
Ok(())
}
fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
match tenant_match.subcommand() {
("list", Some(_)) => {
for tenant in pageserver.tenant_list()? {
println!("{}", tenant);
for t in pageserver.tenant_list()? {
println!("{} {}", t.id, t.state);
}
}
("create", Some(create_match)) => {
@@ -390,7 +426,10 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
pageserver.tenant_create(tenantid)?;
println!("tenant successfully created on the pageserver");
}
_ => {}
(sub_name, _) => {
bail!("Unexpected tenant subcommand '{}'", sub_name)
}
}
Ok(())
}
@@ -398,22 +437,18 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
let tenantid = get_tenantid(branch_match, env)?;
if let Some(branchname) = branch_match.value_of("branchname") {
let startpoint_str = branch_match
.value_of("start-point")
.ok_or_else(|| anyhow!("Missing start-point"))?;
let tenantid: ZTenantId = branch_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let branch = pageserver.branch_create(branchname, startpoint_str, &tenantid)?;
println!(
"Created branch '{}' at {:?} for tenant: {}",
branch.name, branch.latest_valid_lsn, tenantid,
);
} else {
let tenantid: ZTenantId = branch_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
// No arguments, list branches for tenant
let branches = pageserver.branch_list(&tenantid)?;
print_branches_tree(branches)?;
@@ -423,74 +458,79 @@ fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
}
fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = pg_match.subcommand();
let sub_args = sub_args.expect("no pg subcommand");
let mut cplane = ComputeControlPlane::load(env.clone())?;
match pg_match.subcommand() {
("list", Some(list_match)) => {
let tenantid: ZTenantId = list_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
// All subcommands take an optional --tenantid option
let tenantid = get_tenantid(sub_args, env)?;
match sub_name {
"list" => {
let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| {
eprintln!("Failed to load branch info: {}", e);
HashMap::new()
});
println!("BRANCH\tADDRESS\t\tLSN\t\tSTATUS");
for ((_, timeline_name), node) in cplane
println!("NODE\tADDRESS\t\tBRANCH\tLSN\t\tSTATUS");
for ((_, node_name), node) in cplane
.nodes
.iter()
.filter(|((node_tenantid, _), _)| node_tenantid == &tenantid)
{
// FIXME: This shows the LSN at the end of the timeline. It's not the
// right thing to do for read-only nodes that might be anchored at an
// older point in time, or following but lagging behind the primary.
let lsn_str = branch_infos
.get(&node.timelineid)
.map(|bi| bi.latest_valid_lsn.to_string())
.unwrap_or_else(|| "?".to_string());
println!(
"{}\t{}\t{}\t{}",
timeline_name,
"{}\t{}\t{}\t{}\t{}",
node_name,
node.address,
branch_infos
.get(&node.timelineid)
.map(|bi| bi.latest_valid_lsn.to_string())
.unwrap_or_else(|| "?".to_string()),
node.timelineid, // FIXME: resolve human-friendly branch name
lsn_str,
node.status(),
);
}
}
("create", Some(create_match)) => {
let tenantid: ZTenantId = create_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let timeline_name = create_match.value_of("timeline").unwrap_or("main");
"create" => {
let node_name = sub_args.value_of("node").unwrap_or("main");
let timeline_name = sub_args.value_of("timeline").unwrap_or(node_name);
let port: Option<u16> = match create_match.value_of("port") {
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
cplane.new_node(tenantid, timeline_name, port)?;
cplane.new_node(tenantid, node_name, timeline_name, port)?;
}
("start", Some(start_match)) => {
let tenantid: ZTenantId = start_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
let timeline_name = start_match.value_of("timeline").unwrap_or("main");
"start" => {
let node_name = sub_args.value_of("node").unwrap_or("main");
let timeline_name = sub_args.value_of("timeline");
let port: Option<u16> = match start_match.value_of("port") {
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
let node = cplane.nodes.get(&(tenantid, timeline_name.to_owned()));
let node = cplane.nodes.get(&(tenantid, node_name.to_owned()));
let auth_token = if matches!(env.auth_type, AuthType::ZenithJWT) {
let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) {
let claims = Claims::new(Some(tenantid), Scope::Tenant);
Some(encode_from_key_path(&claims, &env.private_key_path)?)
Some(env.generate_auth_token(&claims)?)
} else {
None
};
println!(
"Starting {} postgres on timeline {}...",
if node.is_some() { "existing" } else { "new" },
timeline_name
);
if let Some(node) = node {
if timeline_name.is_some() {
println!("timeline name ignored because node exists already");
}
println!("Starting existing postgres {}...", node_name);
node.start(&auth_token)?;
} else {
// when used with a custom port this results in non-obvious behaviour
@@ -498,26 +538,170 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
// start --port X
// stop
// start <-- will also use port X even without explicit port argument
let node = cplane.new_node(tenantid, timeline_name, port)?;
let timeline_name = timeline_name.unwrap_or(node_name);
println!(
"Starting new postgres {} on {}...",
node_name, timeline_name
);
let node = cplane.new_node(tenantid, node_name, timeline_name, port)?;
node.start(&auth_token)?;
}
}
("stop", Some(stop_match)) => {
let timeline_name = stop_match.value_of("timeline").unwrap_or("main");
let destroy = stop_match.is_present("destroy");
let tenantid: ZTenantId = stop_match
.value_of("tenantid")
.map_or(Ok(env.tenantid), |value| value.parse())?;
"stop" => {
let node_name = sub_args.value_of("node").unwrap_or("main");
let destroy = sub_args.is_present("destroy");
let node = cplane
.nodes
.get(&(tenantid, timeline_name.to_owned()))
.ok_or_else(|| anyhow!("postgres {} is not found", timeline_name))?;
.get(&(tenantid, node_name.to_owned()))
.ok_or_else(|| anyhow!("postgres {} is not found", node_name))?;
node.stop(destroy)?;
}
_ => {}
_ => {
bail!("Unexpected pg subcommand '{}'", sub_name)
}
}
Ok(())
}
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
match sub_match.subcommand() {
("start", Some(_sub_m)) => {
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
("stop", Some(stop_match)) => {
let immediate = stop_match.value_of("stop-mode") == Some("immediate");
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
}
("restart", Some(_sub_m)) => {
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
}
(sub_name, _) => {
bail!("Unexpected pageserver subcommand '{}'", sub_name)
}
}
Ok(())
}
fn get_safekeeper(env: &local_env::LocalEnv, name: &str) -> Result<SafekeeperNode> {
if let Some(node) = env.safekeepers.iter().find(|node| node.name == name) {
Ok(SafekeeperNode::from_env(env, node))
} else {
bail!("could not find safekeeper '{}'", name)
}
}
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = sub_match.subcommand();
let sub_args = sub_args.expect("no safekeeper subcommand");
// All the commands take an optional safekeeper name argument
let node_name = sub_args.value_of("node").unwrap_or(DEFAULT_SAFEKEEPER_NAME);
let safekeeper = get_safekeeper(env, node_name)?;
match sub_name {
"start" => {
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
}
"stop" => {
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper stop failed: {}", e);
exit(1);
}
}
"restart" => {
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper stop failed: {}", e);
exit(1);
}
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
}
_ => {
bail!("Unexpected safekeeper subcommand '{}'", sub_name)
}
}
Ok(())
}
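
Based on the argument definitions above, the resulting invocations look like 'zenith safekeeper start' (which falls back to the safekeeper named 'single'), 'zenith safekeeper stop <node> -m immediate', and 'zenith safekeeper restart <node>'; treat these as illustrative, since the exact flags come from the clap definitions earlier in this file.
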
fn handle_start_all(_sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let pageserver = PageServerNode::from_env(env);
// Postgres nodes are not started automatically
if let Err(e) = pageserver.start() {
eprintln!("pageserver start failed: {}", e);
exit(1);
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper '{}' start failed: {}", safekeeper.name, e);
exit(1);
}
}
Ok(())
}
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let immediate = sub_match.value_of("stop-mode") == Some("immediate");
let pageserver = PageServerNode::from_env(env);
// Stop all compute nodes
let cplane = ComputeControlPlane::load(env.clone())?;
for (_k, node) in cplane.nodes {
if let Err(e) = node.stop(false) {
eprintln!("postgres stop failed: {}", e);
}
}
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
}
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper '{}' stop failed: {}", safekeeper.name, e);
}
}
Ok(())
}

View File

@@ -74,6 +74,10 @@ lazy_static! {
.expect("Failed to register maxrss_kb int gauge");
}
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
];
// Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
// An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned.

View File

@@ -11,7 +11,6 @@ byteorder = "1.4.3"
bytes = "1.0.1"
hyper = { version = "0.14.7", features = ["full"] }
lazy_static = "1.4.0"
log = "0.4.14"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
routerify = "2"
serde = { version = "1.0", features = ["derive"] }
@@ -19,8 +18,8 @@ serde_json = "1"
thiserror = "1.0"
tokio = "1.11"
tracing = "0.1"
tracing-log = "0.1"
tracing-subscriber = "0.2"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
nix = "0.23.0"
zenith_metrics = { path = "../zenith_metrics" }
workspace_hack = { path = "../workspace_hack" }
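
The dependency change above moves to tracing-subscriber 0.3 with the env-filter feature. A minimal sketch of an initialization that exercises that feature (illustrative; not necessarily how this crate wires up logging):

use tracing_subscriber::EnvFilter;

fn init_logging() {
    // Honors RUST_LOG-style filter directives read from the environment.
    tracing_subscriber::fmt()
        .with_env_filter(EnvFilter::from_default_env())
        .init();
}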

View File

@@ -104,8 +104,8 @@ impl JwtAuth {
}
pub fn from_key_path(key_path: &Path) -> Result<Self> {
let public_key = fs::read_to_string(key_path)?;
Ok(Self::new(DecodingKey::from_rsa_pem(public_key.as_bytes())?))
let public_key = fs::read(key_path)?;
Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?))
}
pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
@@ -114,8 +114,7 @@ impl JwtAuth {
}
// this function is used only for testing purposes in the CLI, e.g. to generate tokens during init
pub fn encode_from_key_path(claims: &Claims, key_path: &Path) -> Result<String> {
let key_data = fs::read_to_string(key_path)?;
let key = EncodingKey::from_rsa_pem(key_data.as_bytes())?;
pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
let key = EncodingKey::from_rsa_pem(key_data)?;
Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?)
}
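
The auth change above switches key loading from strings to raw bytes, matching the from_rsa_pem(&[u8]) signatures, and replaces encode_from_key_path with encode_from_key_file. A small sketch of an RS256 encode/decode round trip with the jsonwebtoken crate (the claims struct and key handling here are illustrative, not the project's):

use jsonwebtoken::{decode, encode, Algorithm, DecodingKey, EncodingKey, Header, Validation};
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct ExampleClaims {
    sub: String,
    exp: usize,
}

fn roundtrip(private_pem: &[u8], public_pem: &[u8]) -> anyhow::Result<String> {
    let claims = ExampleClaims { sub: "tenant".into(), exp: 2_000_000_000 };
    // Both key constructors take raw PEM bytes, hence fs::read over read_to_string.
    let token = encode(
        &Header::new(Algorithm::RS256),
        &claims,
        &EncodingKey::from_rsa_pem(private_pem)?,
    )?;
    let data = decode::<ExampleClaims>(
        &token,
        &DecodingKey::from_rsa_pem(public_pem)?,
        &Validation::new(Algorithm::RS256),
    )?;
    Ok(data.claims.sub)
}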

Some files were not shown because too many files have changed in this diff.