WIP: Cache latest image of each page

Just an experiment. I'm not sure if this makes sense, given that we cache materialized page versions in the page cache now. This is different from that, though, in that it stores the image in the ephemeral file, so it will get spilled to disk. (While the ephemeral file page stays in memory, we are caching it twice, so at least that needs some work.)
Use the page cache for the "ephemeral files" that back the open layers.
2026-05-22 23:50:39 +00:00 · 2021-11-15 09:25:05 +02:00 · 2021-11-15 09:25:03 +02:00 · 2021-11-12 21:52:03 +02:00 · 2021-11-12 11:02:12 -08:00 · 2021-11-12 19:59:31 +02:00
135 changed files with 12207 additions and 4244 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,16 +1,16 @@
 version: 2.1

-orbs:
-  python: circleci/python@1.4.0
-
 executors:
  zenith-build-executor:
    resource_class: xlarge
    docker:
      - image: cimg/rust:1.55.0
+  zenith-python-executor:
+    docker:
+      - image: cimg/python:3.7.10  # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CircleCI

 jobs:
-  check-codestyle:
+  check-codestyle-rust:
    executor: zenith-build-executor
    steps:
      - checkout
@@ -182,24 +182,24 @@ jobs:
          paths:
            - "*"

-  check-python:
-    executor: python/default
+  check-codestyle-python:
+    executor: zenith-python-executor
    steps:
      - checkout
      - run:
-          name: Install pipenv & deps
-          working_directory: test_runner
-          command: |
-            pip install pipenv
-            pipenv install --dev
+          name: Install deps
+          command: pipenv --python 3.7 install --dev
      - run:
          name: Run yapf to ensure code format
-          working_directory: test_runner
+          when: always
          command: pipenv run yapf --recursive --diff .
+      - run:
+          name: Run mypy to check types
+          when: always
+          command: pipenv run mypy .

  run-pytest:
-    #description: "Run pytest"
-    executor: python/default
+    executor: zenith-python-executor
    parameters:
      # pytest args to specify the tests to run.
      #
@@ -234,14 +234,16 @@ jobs:
          steps:
            - run: git submodule update --init --depth 1
      - run:
-          name: Install pipenv & deps
-          working_directory: test_runner
-          command: |
-            pip install pipenv
-            pipenv install
+          name: Install deps
+          command: pipenv --python 3.7 install
      - run:
          name: Run pytest
          working_directory: test_runner
+          # pytest doesn't output test logs in real time, so the CI job may fail with a
+          # `Too long with no output` error if a test runs for a long time.
+          # To avoid that, tests should have internal timeouts that are shorter than
+          # the no_output_timeout specified here.
+          no_output_timeout: 10m
          environment:
            - ZENITH_BIN: /tmp/zenith/bin
            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
@@ -266,7 +268,7 @@ jobs:
            # -n4 uses four processes to run tests via pytest-xdist
            # -s is not used to prevent pytest from capturing output, because tests are running
            # in parallel and logs are mixed between different tests
-            pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
+            pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -m "not remote_cluster" -rA $TEST_SELECTION $EXTRA_PARAMS
      - run:
          # CircleCI artifacts are preserved one file at a time, so skipping
          # this step isn't a good idea. If you want to extract the
@@ -298,7 +300,7 @@ jobs:
          name: Build and push Docker image
          command: |
            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
-            docker build -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
+            docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest

  # Trigger a new remote CI job
  remote-ci-trigger:
@@ -347,8 +349,8 @@ jobs:
 workflows:
  build_and_test:
    jobs:
-      - check-codestyle
-      - check-python
+      - check-codestyle-rust
+      - check-codestyle-python
      - build-postgres:
          name: build-postgres-<< matrix.build_type >>
          matrix:
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -0,0 +1,125 @@
+name: benchmarking
+
+on:
+  # uncomment to run on push for debugging your PR
+  # push:
+  #   branches: [ mybranch ]
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '36 7 * * *' # run once a day, timezone is utc
+
+  workflow_dispatch: # adds ability to run this manually
+
+env:
+  BASE_URL: "https://console.zenith.tech"
+
+jobs:
+  bench:
+    # this workflow runs on a self-hosted runner
+    # its environment is quite different from the usual GitHub runner
+    # probably the most important difference is that it doesn't start from a clean workspace each time
+    # e.g. if you install system packages they are not cleaned up, since you install them directly on the host machine,
+    # not in a container or something
+    # See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
+    runs-on: [self-hosted, zenith-benchmarker]
+
+    env:
+      PG_BIN: "/usr/pgsql-13/bin"
+
+    steps:
+    - name: Checkout zenith repo
+      uses: actions/checkout@v2
+
+    - name: Checkout zenith-perf-data repo
+      uses: actions/checkout@v2
+      with:
+        repository: zenithdb/zenith-perf-data
+        token: ${{ secrets.VIP_VAP_ACCESS_TOKEN }}
+        ref: master
+        path: zenith-perf-data
+
+    # actions/setup-python@v2 is not working correctly on self-hosted runners
+    # see https://github.com/actions/setup-python/issues/162
+    # and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
+    # so the simplest solution to me is to use the already installed system python and spin up virtualenvs for job runs.
+    # there is Python 3.7.10 already installed on the machine, so use it to install pipenv and then use pipenv's virtualenvs
+    - name: Install pipenv & deps
+      run: |
+        python3 -m pip install --upgrade pipenv wheel
+        # since pip/pipenv caches are reused there shouldn't be any troubles with install every time
+        pipenv install
+
+    - name: Show versions
+      run: |
+        echo Python
+        python3 --version
+        pipenv run python3 --version
+        echo Pipenv
+        pipenv --version
+        echo Pgbench
+        $PG_BIN/pgbench --version
+
+    # FIXME cluster setup is skipped due to various changes in the console API
+    # for now a pre-created cluster is used. When the API gains some stability
+    # after the massive changes, dynamic cluster setup will be revived.
+    # So use the pre-created cluster. It needs to be started manually, but it stops automatically after 5 minutes of inactivity
+    - name: Setup cluster
+      env:
+        BENCHMARK_CONSOLE_USER_PASSWORD: "${{ secrets.BENCHMARK_CONSOLE_USER_PASSWORD }}"
+        BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
+        BENCHMARK_CLUSTER_ID: "${{ secrets.BENCHMARK_CLUSTER_ID }}"
+      shell: bash
+      run: |
+        set -e
+
+        echo "Starting cluster"
+        CLUSTER=$(curl -s --fail --show-error -X POST $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID/start \
+            -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
+        echo $CLUSTER | python -m json.tool
+
+        echo "Waiting for cluster to become ready"
+        sleep 10
+
+        echo "CLUSTER_ID=$BENCHMARK_CLUSTER_ID" >> $GITHUB_ENV
+        CLUSTER=$(curl -s --fail --show-error -X GET $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID.json \
+            -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
+        echo $CLUSTER | python -m json.tool
+
+    - name: Run benchmark
+      # pgbench is installed system wide from official repo
+      # https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
+      # via
+      # sudo tee /etc/yum.repos.d/pgdg.repo<<EOF
+      # [pgdg13]
+      # name=PostgreSQL 13 for RHEL/CentOS 7 - x86_64
+      # baseurl=https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
+      # enabled=1
+      # gpgcheck=0
+      # EOF
+      # sudo yum makecache
+      # sudo yum install postgresql13-contrib
+      # actual binaries are located in /usr/pgsql-13/bin/
+      env:
+        TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000"
+        TEST_PG_BENCH_SCALES_MATRIX: "10,15"
+        PLATFORM: "zenith-staging"
+        BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
+        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
+        REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
+      run: |
+        mkdir -p zenith-perf-data/data/staging
+        pipenv run pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir zenith-perf-data/data/staging
+
+    - name: Submit result
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+      run: |
+        cd zenith-perf-data
+        git add data
+        git commit --author="vipvap <vipvap@zenith.tech>" -m "add performance test result for $GITHUB_SHA zenith revision"
+        git push https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git master
--- a/.yapfignore
+++ b/.yapfignore
@@ -0,0 +1,10 @@
+# This file is only read when `yapf` is run from this directory.
+# Hence we only list top-level directories here to avoid confusion.
+# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43
+vendor/
+target/
+tmp_install/
+__pycache__/
+test_output/
+.zenith/
+.git/
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/6
+++ b/6
@@ -21,11 +21,15 @@ RUN rm -rf postgres_install/build
 # net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
 #
 FROM zenithdb/build:buster AS build
+
+ARG GIT_VERSION
+RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi
+
 WORKDIR /zenith
 COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server

 COPY . .
-RUN cargo build --release
+RUN GIT_VERSION=$GIT_VERSION cargo build --release

 #
 # Copy binaries to resulting image.
--- a/1
+++ b/1
@@ -1 +0,0 @@
-./test_runner/Pipfile
--- a/30
+++ b/30
@@ -0,0 +1,30 @@
+[[source]]
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+pytest = ">=6.0.0"
+typing-extensions = "*"
+pyjwt = {extras = ["crypto"], version = "*"}
+requests = "*"
+pytest-xdist = "*"
+asyncpg = "*"
+cached-property = "*"
+psycopg2-binary = "*"
+jinja2 = "*"
+
+[dev-packages]
+# Behavior may change slightly between versions. These are run continuously,
+# so we pin exact versions to avoid surprising breaks. Update if comfortable.
+yapf = "==0.31.0"
+mypy = "==0.910"
+# Non-pinned packages follow.
+pipenv = "*"
+flake8 = "*"
+types-requests = "*"
+types-psycopg2 = "*"
+
+[requires]
+# we need at least 3.7, but pipenv doesn't allow us to say this directly
+python_version = "3"
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1 +0,0 @@
-./test_runner/Pipfile.lock
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -0,0 +1,652 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.python.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "asyncpg": {
+            "hashes": [
+                "sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
+                "sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
+                "sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
+                "sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
+                "sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
+                "sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
+                "sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
+                "sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
+                "sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
+                "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
+                "sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
+                "sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
+                "sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
+            ],
+            "index": "pypi",
+            "version": "==0.24.0"
+        },
+        "attrs": {
+            "hashes": [
+                "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
+                "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==21.2.0"
+        },
+        "cached-property": {
+            "hashes": [
+                "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
+                "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
+            ],
+            "index": "pypi",
+            "version": "==1.5.2"
+        },
+        "certifi": {
+            "hashes": [
+                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
+                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
+            ],
+            "version": "==2021.10.8"
+        },
+        "cffi": {
+            "hashes": [
+                "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
+                "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
+                "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
+                "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
+                "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
+                "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
+                "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
+                "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
+                "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
+                "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
+                "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
+                "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
+                "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
+                "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
+                "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
+                "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
+                "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
+                "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
+                "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
+                "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
+                "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
+                "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
+                "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
+                "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
+                "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
+                "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
+                "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
+                "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
+                "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
+                "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
+                "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
+                "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
+                "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
+                "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
+                "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
+                "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
+                "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
+                "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
+                "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
+                "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
+                "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
+                "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
+                "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
+                "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
+                "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
+                "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
+                "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
+                "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
+                "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
+                "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
+            ],
+            "version": "==1.15.0"
+        },
+        "charset-normalizer": {
+            "hashes": [
+                "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
+                "sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
+            ],
+            "markers": "python_version >= '3'",
+            "version": "==2.0.7"
+        },
+        "cryptography": {
+            "hashes": [
+                "sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
+                "sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
+                "sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
+                "sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
+                "sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
+                "sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
+                "sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
+                "sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
+                "sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
+                "sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
+                "sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
+                "sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
+                "sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
+                "sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
+                "sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
+                "sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
+                "sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
+                "sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
+                "sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
+                "sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
+            ],
+            "version": "==35.0.0"
+        },
+        "execnet": {
+            "hashes": [
+                "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
+                "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==1.9.0"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
+                "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
+            ],
+            "markers": "python_version >= '3'",
+            "version": "==3.3"
+        },
+        "importlib-metadata": {
+            "hashes": [
+                "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
+                "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==4.8.1"
+        },
+        "iniconfig": {
+            "hashes": [
+                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
+                "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
+            ],
+            "version": "==1.1.1"
+        },
+        "jinja2": {
+            "hashes": [
+                "sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
+                "sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
+            ],
+            "index": "pypi",
+            "version": "==3.0.2"
+        },
+        "markupsafe": {
+            "hashes": [
+                "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
+                "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64",
+                "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b",
+                "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194",
+                "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567",
+                "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff",
+                "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724",
+                "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74",
+                "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646",
+                "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35",
+                "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6",
+                "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a",
+                "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6",
+                "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad",
+                "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26",
+                "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38",
+                "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac",
+                "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7",
+                "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6",
+                "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047",
+                "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75",
+                "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f",
+                "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b",
+                "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135",
+                "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8",
+                "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a",
+                "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a",
+                "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1",
+                "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9",
+                "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864",
+                "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914",
+                "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee",
+                "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f",
+                "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18",
+                "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8",
+                "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2",
+                "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d",
+                "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b",
+                "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b",
+                "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86",
+                "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6",
+                "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f",
+                "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb",
+                "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833",
+                "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28",
+                "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e",
+                "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415",
+                "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902",
+                "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f",
+                "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d",
+                "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9",
+                "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d",
+                "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145",
+                "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066",
+                "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c",
+                "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1",
+                "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a",
+                "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207",
+                "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f",
+                "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53",
+                "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd",
+                "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134",
+                "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85",
+                "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9",
+                "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5",
+                "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94",
+                "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509",
+                "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51",
+                "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2.0.1"
+        },
+        "packaging": {
+            "hashes": [
+                "sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966",
+                "sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==21.2"
+        },
+        "pluggy": {
+            "hashes": [
+                "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
+                "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==1.0.0"
+        },
+        "psycopg2-binary": {
+            "hashes": [
+                "sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975",
+                "sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd",
+                "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616",
+                "sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2",
+                "sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90",
+                "sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a",
+                "sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e",
+                "sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d",
+                "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f",
+                "sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed",
+                "sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a",
+                "sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140",
+                "sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32",
+                "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759",
+                "sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31",
+                "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e",
+                "sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a",
+                "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c",
+                "sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917",
+                "sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf",
+                "sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7",
+                "sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0",
+                "sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72",
+                "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698",
+                "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773",
+                "sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68",
+                "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76",
+                "sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4",
+                "sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f",
+                "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a",
+                "sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34",
+                "sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce",
+                "sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a",
+                "sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e"
+            ],
+            "index": "pypi",
+            "version": "==2.9.1"
+        },
+        "py": {
+            "hashes": [
+                "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
+                "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==1.10.0"
+        },
+        "pycparser": {
+            "hashes": [
+                "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
+                "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.20"
+        },
+        "pyjwt": {
+            "extras": [
+                "crypto"
+            ],
+            "hashes": [
+                "sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
+                "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
+            ],
+            "index": "pypi",
+            "version": "==2.3.0"
+        },
+        "pyparsing": {
+            "hashes": [
+                "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
+                "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.4.7"
+        },
+        "pytest": {
+            "hashes": [
+                "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
+                "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
+            ],
+            "index": "pypi",
+            "version": "==6.2.5"
+        },
+        "pytest-forked": {
+            "hashes": [
+                "sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
+                "sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==1.3.0"
+        },
+        "pytest-xdist": {
+            "hashes": [
+                "sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
+                "sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
+            ],
+            "index": "pypi",
+            "version": "==2.4.0"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
+                "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
+            ],
+            "index": "pypi",
+            "version": "==2.26.0"
+        },
+        "toml": {
+            "hashes": [
+                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.10.2"
+        },
+        "typing-extensions": {
+            "hashes": [
+                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
+                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
+                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
+            ],
+            "index": "pypi",
+            "version": "==3.10.0.2"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
+                "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
+            "version": "==1.26.7"
+        },
+        "zipp": {
+            "hashes": [
+                "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
+                "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==3.6.0"
+        }
+    },
+    "develop": {
+        "backports.entry-points-selectable": {
+            "hashes": [
+                "sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
+                "sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
+            ],
+            "markers": "python_version >= '2.7'",
+            "version": "==1.1.0"
+        },
+        "certifi": {
+            "hashes": [
+                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
+                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
+            ],
+            "version": "==2021.10.8"
+        },
+        "distlib": {
+            "hashes": [
+                "sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
+                "sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
+            ],
+            "version": "==0.3.3"
+        },
+        "filelock": {
+            "hashes": [
+                "sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
+                "sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==3.3.2"
+        },
+        "flake8": {
+            "hashes": [
+                "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
+                "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
+            ],
+            "index": "pypi",
+            "version": "==4.0.1"
+        },
+        "importlib-metadata": {
+            "hashes": [
+                "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
+                "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==4.8.1"
+        },
+        "mccabe": {
+            "hashes": [
+                "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
+                "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+            ],
+            "version": "==0.6.1"
+        },
+        "mypy": {
+            "hashes": [
+                "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
+                "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
+                "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
+                "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
+                "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
+                "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
+                "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
+                "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
+                "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
+                "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
+                "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
+                "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
+                "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
+                "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
+                "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
+                "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
+                "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
+                "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
+                "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
+                "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
+                "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
+                "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
+                "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
+            ],
+            "index": "pypi",
+            "version": "==0.910"
+        },
+        "mypy-extensions": {
+            "hashes": [
+                "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
+                "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
+            ],
+            "version": "==0.4.3"
+        },
+        "pipenv": {
+            "hashes": [
+                "sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
+                "sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
+            ],
+            "index": "pypi",
+            "version": "==2021.5.29"
+        },
+        "platformdirs": {
+            "hashes": [
+                "sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
+                "sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2.4.0"
+        },
+        "pycodestyle": {
+            "hashes": [
+                "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
+                "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==2.8.0"
+        },
+        "pyflakes": {
+            "hashes": [
+                "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
+                "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.4.0"
+        },
+        "six": {
+            "hashes": [
+                "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+                "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==1.16.0"
+        },
+        "toml": {
+            "hashes": [
+                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.10.2"
+        },
+        "typed-ast": {
+            "hashes": [
+                "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
+                "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
+                "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
+                "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
+                "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
+                "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
+                "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
+                "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
+                "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
+                "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
+                "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
+                "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
+                "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
+                "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
+                "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
+                "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
+                "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
+                "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
+                "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
+                "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
+                "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
+                "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
+                "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
+                "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
+                "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
+                "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
+                "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
+                "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
+                "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
+                "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==1.4.3"
+        },
+        "types-psycopg2": {
+            "hashes": [
+                "sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
+                "sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
+            ],
+            "index": "pypi",
+            "version": "==2.9.1"
+        },
+        "types-requests": {
+            "hashes": [
+                "sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
+                "sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
+            ],
+            "index": "pypi",
+            "version": "==2.25.11"
+        },
+        "typing-extensions": {
+            "hashes": [
+                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
+                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
+                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
+            ],
+            "index": "pypi",
+            "version": "==3.10.0.2"
+        },
+        "virtualenv": {
+            "hashes": [
+                "sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814",
+                "sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==20.10.0"
+        },
+        "virtualenv-clone": {
+            "hashes": [
+                "sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
+                "sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.5.7"
+        },
+        "yapf": {
+            "hashes": [
+                "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
+                "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
+            ],
+            "index": "pypi",
+            "version": "==0.31.0"
+        },
+        "zipp": {
+            "hashes": [
+                "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
+                "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==3.6.0"
+        }
+    }
+}
--- a/README.md
+++ b/README.md
@@ -32,8 +32,8 @@ libssl-dev clang pkg-config libpq-dev

 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.

-To run the integration tests (not required to use the code), install
-Python (3.6 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
+To run the integration tests or Python scripts (not required to use the code), install
+Python (3.7 or higher), and install python3 packages using `pipenv install` in the project directory.

 2. Build zenith and patched postgres
 ```sh
@@ -47,17 +47,26 @@ make -j5
 # Create repository in .zenith with proper paths to binaries and data
 # Later that would be responsibility of a package install script
 > ./target/debug/zenith init
+initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
+created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
+created main branch
 pageserver init succeeded

-# start pageserver
+# start pageserver and safekeeper
 > ./target/debug/zenith start
-Starting pageserver at '127.0.0.1:64000' in .zenith
+Starting pageserver at 'localhost:64000' in '.zenith'
 Pageserver started
+initializing for single for 7676
+Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
+Safekeeper started

-# start postgres on top on the pageserver
+# start postgres compute node
 > ./target/debug/zenith pg start main
-Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
+Starting new postgres main on main...
+Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
+Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
 waiting for server to start.... done
+server started

 # check list of running postgres instances
 > ./target/debug/zenith pg list
@@ -108,10 +117,9 @@ postgres=# insert into t values(2,2);
 INSERT 0 1
 ```

-6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started:
+6. If you want to run tests afterwards (see below), you have to stop all the running pageserver, safekeeper and postgres instances
+   you have just started. You can stop them all with one command:
 ```sh
-> ./target/debug/zenith pg stop migration_check
-> ./target/debug/zenith pg stop main
 > ./target/debug/zenith stop
 ```

@@ -121,7 +129,7 @@ INSERT 0 1
 git clone --recursive https://github.com/zenithdb/zenith.git
 make # builds also postgres and installs it to ./tmp_install
 cd test_runner
-pytest
+pipenv run pytest
 ```

 ## Documentation
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -18,7 +18,7 @@ regex = "1"
 anyhow = "1.0"
 thiserror = "1"
 bytes = "1.0.1"
-nix = "0.20"
+nix = "0.23"
 url = "2.2.2"
 hex = { version = "0.4.3", features = ["serde"] }
 reqwest = { version = "0.11", features = ["blocking", "json"] }
--- a/control_plane/safekeepers.conf
+++ b/control_plane/safekeepers.conf
@@ -0,0 +1,20 @@
+# Page server and three safekeepers.
+[pageserver]
+pg_port = 64000
+http_port = 9898
+auth_type = 'Trust'
+
+[[safekeepers]]
+name = 'sk1'
+pg_port = 5454
+http_port = 7676
+
+[[safekeepers]]
+name = 'sk2'
+pg_port = 5455
+http_port = 7677
+
+[[safekeepers]]
+name = 'sk3'
+pg_port = 5456
+http_port = 7678
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -0,0 +1,11 @@
+# Minimal zenith environment with one safekeeper. This is equivalent to the built-in
+# defaults that you get with no --config
+[pageserver]
+pg_port = 64000
+http_port = 9898
+auth_type = 'Trust'
+
+[[safekeepers]]
+name = 'single'
+pg_port = 5454
+http_port = 7676
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -39,8 +39,6 @@ impl ComputeControlPlane {
    // |  |- <tenant_id>
    // |  |   |- <branch name>
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
-        // TODO: since pageserver do not have config file yet we believe here that
-        // it is running on default port. Change that when pageserver will have config.
        let pageserver = Arc::new(PageServerNode::from_env(&env));

        let mut nodes = BTreeMap::default();
@@ -75,15 +73,6 @@ impl ComputeControlPlane {
            .unwrap_or(self.base_port)
    }

-    pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
-        ComputeControlPlane {
-            base_port: 65431,
-            pageserver: Arc::clone(pageserver),
-            nodes: BTreeMap::new(),
-            env: local_env.clone(),
-        }
-    }
-
    // FIXME: see also parse_point_in_time in branches.rs.
    fn parse_point_in_time(
        &self,
@@ -136,7 +125,7 @@ impl ComputeControlPlane {
        });

        node.create_pgdata()?;
-        node.setup_pg_conf(self.env.auth_type)?;
+        node.setup_pg_conf(self.env.pageserver.auth_type)?;

        self.nodes
            .insert((tenantid, node.name.clone()), Arc::clone(&node));
@@ -210,7 +199,7 @@ impl PostgresNode {
        })
    }

-    fn sync_walkeepers(&self) -> Result<Lsn> {
+    fn sync_safekeepers(&self) -> Result<Lsn> {
        let pg_path = self.env.pg_bin_dir().join("postgres");
        let sync_handle = Command::new(pg_path)
            .arg("--sync-safekeepers")
@@ -235,7 +224,7 @@ impl PostgresNode {
        }

        let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
-        println!("Walkeepers synced on {}", lsn);
+        println!("Safekeepers synced on {}", lsn);
        Ok(lsn)
    }

@@ -339,9 +328,25 @@ impl PostgresNode {
        }
        conf.append_line("");

-        // Configure the node to stream WAL directly to the pageserver
-        conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg?
-        conf.append("zenith.callmemaybe_connstring", &self.connstr());
+        if !self.env.safekeepers.is_empty() {
+            // Configure the node to connect to the safekeepers
+            conf.append("synchronous_standby_names", "walproposer");
+
+            let wal_acceptors = self
+                .env
+                .safekeepers
+                .iter()
+                .map(|sk| format!("localhost:{}", sk.pg_port))
+                .collect::<Vec<String>>()
+                .join(",");
+            conf.append("wal_acceptors", &wal_acceptors);
+        } else {
+            // Configure the node to stream WAL directly to the pageserver
+            // This isn't really a supported configuration, but can be useful for
+            // testing.
+            conf.append("synchronous_standby_names", "pageserver");
+            conf.append("zenith.callmemaybe_connstring", &self.connstr());
+        }

        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
        file.write_all(conf.to_string().as_bytes())?;
@@ -357,7 +362,7 @@ impl PostgresNode {
            // latest data from the pageserver. That is a bit clumsy but whole bootstrap
            // procedure evolves quite actively right now, so let's think about it again
            // when things would be more stable (TODO).
-            let lsn = self.sync_walkeepers()?;
+            let lsn = self.sync_safekeepers()?;
            if lsn == Lsn(0) {
                None
            } else {
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -13,6 +13,7 @@ use std::path::Path;
 pub mod compute;
 pub mod local_env;
 pub mod postgresql_conf;
+pub mod safekeeper;
 pub mod storage;

 /// Read a PID file
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -7,46 +7,102 @@
 use anyhow::{Context, Result};
 use serde::{Deserialize, Serialize};
 use std::env;
+use std::fmt::Write;
 use std::fs;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
-use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
+use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
 use zenith_utils::postgres_backend::AuthType;
 use zenith_utils::zid::ZTenantId;

 //
-// This data structures represent deserialized zenith CLI config
+// This data structure represents the zenith CLI config
+//
+// It is deserialized from the .zenith/config file, or the config file passed
+// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
+// an example.
 //
 #[derive(Serialize, Deserialize, Clone, Debug)]
 pub struct LocalEnv {
-    // Pageserver connection settings
-    pub pageserver_pg_port: u16,
-    pub pageserver_http_port: u16,
-
-    // Base directory for both pageserver and compute nodes
+    // Base directory for all the nodes (the pageserver, safekeepers and
+    // compute nodes).
+    //
+    // This is not stored in the config file. Rather, this is the path where the
+    // config file itself is. It is read from the ZENITH_REPO_DIR env variable or
+    // '.zenith' if not given.
+    #[serde(skip)]
    pub base_data_dir: PathBuf,

    // Path to postgres distribution. It's expected that "bin", "include",
    // "lib", "share" from postgres distribution are there. If at some point
    // in time we will be able to run against vanilla postgres we may split that
    // to four separate paths and match OS-specific installation layout.
+    #[serde(default)]
    pub pg_distrib_dir: PathBuf,

    // Path to pageserver binary.
+    #[serde(default)]
    pub zenith_distrib_dir: PathBuf,

-    // keeping tenant id in config to reduce copy paste when running zenith locally with single tenant
-    #[serde(with = "hex")]
-    pub tenantid: ZTenantId,
+    // Default tenant ID to use with the 'zenith' command line utility, when
+    // --tenantid is not explicitly specified.
+    #[serde(with = "opt_tenantid_serde")]
+    #[serde(default)]
+    pub default_tenantid: Option<ZTenantId>,

-    // jwt auth token used for communication with pageserver
-    pub auth_token: String,
+    // used to issue tokens during e.g. pg start
+    #[serde(default)]
+    pub private_key_path: PathBuf,
+
+    pub pageserver: PageServerConf,
+
+    #[serde(default)]
+    pub safekeepers: Vec<SafekeeperConf>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+#[serde(default)]
+pub struct PageServerConf {
+    // Pageserver connection settings
+    pub pg_port: u16,
+    pub http_port: u16,

    // used to determine which auth type is used
    pub auth_type: AuthType,

-    // used to issue tokens during e.g pg start
-    pub private_key_path: PathBuf,
+    // jwt auth token used for communication with pageserver
+    pub auth_token: String,
+}
+
+impl Default for PageServerConf {
+    fn default() -> Self {
+        Self {
+            pg_port: 0,
+            http_port: 0,
+            auth_type: AuthType::Trust,
+            auth_token: "".to_string(),
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+#[serde(default)]
+pub struct SafekeeperConf {
+    pub name: String,
+    pub pg_port: u16,
+    pub http_port: u16,
+    pub sync: bool,
+}
+
+impl Default for SafekeeperConf {
+    fn default() -> Self {
+        Self {
+            name: "".to_string(),
+            pg_port: 0,
+            http_port: 0,
+            sync: true,
+        }
+    }
 }

 impl LocalEnv {
@@ -62,6 +118,10 @@ impl LocalEnv {
        Ok(self.zenith_distrib_dir.join("pageserver"))
    }

+    pub fn safekeeper_bin(&self) -> Result<PathBuf> {
+        Ok(self.zenith_distrib_dir.join("safekeeper"))
+    }
+
    pub fn pg_data_dirs_path(&self) -> PathBuf {
        self.base_data_dir.join("pgdatadirs").join("tenants")
    }
@@ -76,6 +136,187 @@ impl LocalEnv {
    pub fn pageserver_data_dir(&self) -> PathBuf {
        self.base_data_dir.clone()
    }
+
+    pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf {
+        self.base_data_dir.join("safekeepers").join(node_name)
+    }
+
+    /// Create a LocalEnv from a config file.
+    ///
+    /// Unlike 'load_config', this function fills in any defaults that are missing
+    /// from the config file.
+    pub fn create_config(toml: &str) -> Result<LocalEnv> {
+        let mut env: LocalEnv = toml::from_str(toml)?;
+
+        // Find postgres binaries.
+        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
+        if env.pg_distrib_dir == Path::new("") {
+            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
+                env.pg_distrib_dir = postgres_bin.into();
+            } else {
+                let cwd = env::current_dir()?;
+                env.pg_distrib_dir = cwd.join("tmp_install")
+            }
+        }
+        if !env.pg_distrib_dir.join("bin/postgres").exists() {
+            anyhow::bail!(
+                "Can't find postgres binary at {}",
+                env.pg_distrib_dir.display()
+            );
+        }
+
+        // Find zenith binaries.
+        if env.zenith_distrib_dir == Path::new("") {
+            env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
+        }
+        if !env.zenith_distrib_dir.join("pageserver").exists() {
+            anyhow::bail!("Can't find pageserver binary.");
+        }
+        if !env.zenith_distrib_dir.join("safekeeper").exists() {
+            anyhow::bail!("Can't find safekeeper binary.");
+        }
+
+        // If no initial tenant ID was given, generate it.
+        if env.default_tenantid.is_none() {
+            env.default_tenantid = Some(ZTenantId::generate());
+        }
+
+        env.base_data_dir = base_path();
+
+        Ok(env)
+    }
+
+    /// Locate and load config
+    pub fn load_config() -> Result<LocalEnv> {
+        let repopath = base_path();
+
+        if !repopath.exists() {
+            anyhow::bail!(
+                "Zenith config is not found in {}. You need to run 'zenith init' first",
+                repopath.to_str().unwrap()
+            );
+        }
+
+        // TODO: check that it looks like a zenith repository
+
+        // load and parse file
+        let config = fs::read_to_string(repopath.join("config"))?;
+        let mut env: LocalEnv = toml::from_str(config.as_str())?;
+
+        env.base_data_dir = repopath;
+
+        Ok(env)
+    }
+
+    // This function is used only for testing purposes in the CLI, e.g. to generate tokens during init.
+    pub fn generate_auth_token(&self, claims: &Claims) -> Result<String> {
+        let private_key_path = if self.private_key_path.is_absolute() {
+            self.private_key_path.to_path_buf()
+        } else {
+            self.base_data_dir.join(&self.private_key_path)
+        };
+
+        let key_data = fs::read(private_key_path)?;
+        encode_from_key_file(claims, &key_data)
+    }
+
+    /// Initialize a new Zenith repository in `base_data_dir`: create the directory
+    /// tree, generate the JWT key pair unless `private_key_path` is already set,
+    /// issue the pageserver auth token and write the resulting config file.
+    pub fn init(&mut self) -> Result<()> {
+        // check if config already exists
+        let base_path = &self.base_data_dir;
+        if base_path == Path::new("") {
+            anyhow::bail!("repository base path is missing");
+        }
+        if base_path.exists() {
+            anyhow::bail!(
+                "directory '{}' already exists. Perhaps already initialized?",
+                base_path.display()
+            );
+        }
+
+        fs::create_dir(&base_path)?;
+
+        // generate keys for jwt
+        // openssl genrsa -out private_key.pem 2048
+        let private_key_path;
+        if self.private_key_path == PathBuf::new() {
+            private_key_path = base_path.join("auth_private_key.pem");
+            let keygen_output = Command::new("openssl")
+                .arg("genrsa")
+                .args(&["-out", private_key_path.to_str().unwrap()])
+                .arg("2048")
+                .stdout(Stdio::null())
+                .output()
+                .with_context(|| "failed to generate auth private key")?;
+            if !keygen_output.status.success() {
+                anyhow::bail!(
+                    "openssl failed: '{}'",
+                    String::from_utf8_lossy(&keygen_output.stderr)
+                );
+            }
+            self.private_key_path = Path::new("auth_private_key.pem").to_path_buf();
+
+            let public_key_path = base_path.join("auth_public_key.pem");
+            // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
+            let keygen_output = Command::new("openssl")
+                .arg("rsa")
+                .args(&["-in", private_key_path.to_str().unwrap()])
+                .arg("-pubout")
+                .args(&["-outform", "PEM"])
+                .args(&["-out", public_key_path.to_str().unwrap()])
+                .stdout(Stdio::null())
+                .output()
+                .with_context(|| "failed to generate auth public key")?;
+            if !keygen_output.status.success() {
+                anyhow::bail!(
+                    "openssl failed: '{}'",
+                    String::from_utf8_lossy(&keygen_output.stderr)
+                );
+            }
+        }
+
+        self.pageserver.auth_token =
+            self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
+
+        fs::create_dir_all(self.pg_data_dirs_path())?;
+
+        for safekeeper in self.safekeepers.iter() {
+            fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
+        }
+
+        let mut conf_content = String::new();
+
+        // Currently, the user first passes a config file with 'zenith init --config=<path>'
+        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
+        // to .zenith/config. TODO: We lose any formatting and comments along the way, which is
+        // a bit sad.
+        write!(
+            &mut conf_content,
+            r#"# This file describes a local deployment of the page server
+# and safekeeper node. It is read by the 'zenith' command-line
+# utility.
+"#
+        )?;
+
+        // Convert the LocalEnv to a toml file.
+        //
+        // This could be as simple as this:
+        //
+        // conf_content += &toml::to_string_pretty(env)?;
+        //
+        // But it results in a "values must be emitted before tables" error. I'm not
+        // sure why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
+        // Maybe rust reorders the fields to avoid padding or something?
+        // In any case, converting to toml::Value first, and serializing that, works.
+        // See https://github.com/alexcrichton/toml-rs/issues/142
+        conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
+
+        fs::write(base_path.join("config"), conf_content)?;
+
+        Ok(())
+    }
 }

 fn base_path() -> PathBuf {
@@ -85,118 +326,29 @@ fn base_path() -> PathBuf {
    }
 }

-//
-// Initialize a new Zenith repository
-//
-pub fn init(
-    pageserver_pg_port: u16,
-    pageserver_http_port: u16,
-    tenantid: ZTenantId,
-    auth_type: AuthType,
-) -> Result<()> {
-    // check if config already exists
-    let base_path = base_path();
-    if base_path.exists() {
-        anyhow::bail!(
-            "{} already exists. Perhaps already initialized?",
-            base_path.to_str().unwrap()
-        );
+/// Serde routines for Option<ZTenantId>. The serialized form is a hex string.
+mod opt_tenantid_serde {
+    use serde::{Deserialize, Deserializer, Serialize, Serializer};
+    use std::str::FromStr;
+    use zenith_utils::zid::ZTenantId;
+
+    pub fn serialize<S>(tenantid: &Option<ZTenantId>, ser: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        tenantid.map(|t| t.to_string()).serialize(ser)
    }
-    fs::create_dir(&base_path)?;

-    // ok, now check that expected binaries are present
-
-    // Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
-    let pg_distrib_dir: PathBuf = {
-        if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
-            postgres_bin.into()
-        } else {
-            let cwd = env::current_dir()?;
-            cwd.join("tmp_install")
+    pub fn deserialize<'de, D>(des: D) -> Result<Option<ZTenantId>, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        let s: Option<String> = Option::deserialize(des)?;
+        if let Some(s) = s {
+            return Ok(Some(
+                ZTenantId::from_str(&s).map_err(serde::de::Error::custom)?,
+            ));
        }
-    };
-    if !pg_distrib_dir.join("bin/postgres").exists() {
-        anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
+        Ok(None)
    }
-
-    // generate keys for jwt
-    // openssl genrsa -out private_key.pem 2048
-    let private_key_path = base_path.join("auth_private_key.pem");
-    let keygen_output = Command::new("openssl")
-        .arg("genrsa")
-        .args(&["-out", private_key_path.to_str().unwrap()])
-        .arg("2048")
-        .stdout(Stdio::null())
-        .output()
-        .with_context(|| "failed to generate auth private key")?;
-    if !keygen_output.status.success() {
-        anyhow::bail!(
-            "openssl failed: '{}'",
-            String::from_utf8_lossy(&keygen_output.stderr)
-        );
-    }
-
-    let public_key_path = base_path.join("auth_public_key.pem");
-    // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
-    let keygen_output = Command::new("openssl")
-        .arg("rsa")
-        .args(&["-in", private_key_path.to_str().unwrap()])
-        .arg("-pubout")
-        .args(&["-outform", "PEM"])
-        .args(&["-out", public_key_path.to_str().unwrap()])
-        .stdout(Stdio::null())
-        .output()
-        .with_context(|| "failed to generate auth private key")?;
-    if !keygen_output.status.success() {
-        anyhow::bail!(
-            "openssl failed: '{}'",
-            String::from_utf8_lossy(&keygen_output.stderr)
-        );
-    }
-
-    let auth_token =
-        encode_from_key_path(&Claims::new(None, Scope::PageServerApi), &private_key_path)?;
-
-    // Find zenith binaries.
-    let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
-    if !zenith_distrib_dir.join("pageserver").exists() {
-        anyhow::bail!("Can't find pageserver binary.",);
-    }
-
-    let conf = LocalEnv {
-        pageserver_pg_port,
-        pageserver_http_port,
-        pg_distrib_dir,
-        zenith_distrib_dir,
-        base_data_dir: base_path,
-        tenantid,
-        auth_token,
-        auth_type,
-        private_key_path,
-    };
-
-    fs::create_dir_all(conf.pg_data_dirs_path())?;
-
-    let toml = toml::to_string_pretty(&conf)?;
-    fs::write(conf.base_data_dir.join("config"), toml)?;
-
-    Ok(())
-}
-
-// Locate and load config
-pub fn load_config() -> Result<LocalEnv> {
-    let repopath = base_path();
-
-    if !repopath.exists() {
-        anyhow::bail!(
-            "Zenith config is not found in {}. You need to run 'zenith init' first",
-            repopath.to_str().unwrap()
-        );
-    }
-
-    // TODO: check that it looks like a zenith repository
-
-    // load and parse file
-    let config = fs::read_to_string(repopath.join("config"))?;
-    toml::from_str(config.as_str()).map_err(|e| e.into())
 }
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -0,0 +1,277 @@
+use std::io::Write;
+use std::net::TcpStream;
+use std::path::PathBuf;
+use std::process::Command;
+use std::sync::Arc;
+use std::time::Duration;
+use std::{io, result, thread};
+
+use anyhow::bail;
+use nix::errno::Errno;
+use nix::sys::signal::{kill, Signal};
+use nix::unistd::Pid;
+use postgres::Config;
+use reqwest::blocking::{Client, RequestBuilder, Response};
+use reqwest::{IntoUrl, Method};
+use thiserror::Error;
+use zenith_utils::http::error::HttpErrorBody;
+use zenith_utils::postgres_backend::AuthType;
+
+use crate::local_env::{LocalEnv, SafekeeperConf};
+use crate::read_pidfile;
+use crate::storage::PageServerNode;
+use zenith_utils::connstring::connection_address;
+use zenith_utils::connstring::connection_host_port;
+
+#[derive(Error, Debug)]
+pub enum SafekeeperHttpError {
+    #[error("Reqwest error: {0}")]
+    Transport(#[from] reqwest::Error),
+
+    #[error("Error: {0}")]
+    Response(String),
+}
+
+type Result<T> = result::Result<T, SafekeeperHttpError>;
+
+pub trait ResponseErrorMessageExt: Sized {
+    fn error_from_body(self) -> Result<Self>;
+}
+
+impl ResponseErrorMessageExt for Response {
+    fn error_from_body(self) -> Result<Self> {
+        let status = self.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            return Ok(self);
+        }
+
+        // reqwest does not export its error construction utility functions, so let's craft the message ourselves
+        let url = self.url().to_owned();
+        Err(SafekeeperHttpError::Response(
+            match self.json::<HttpErrorBody>() {
+                Ok(err_body) => format!("Error: {}", err_body.msg),
+                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
+            },
+        ))
+    }
+}
+
+//
+// Control routines for safekeeper.
+//
+// Used in CLI and tests.
+//
+#[derive(Debug)]
+pub struct SafekeeperNode {
+    pub name: String,
+
+    pub conf: SafekeeperConf,
+
+    pub pg_connection_config: Config,
+    pub env: LocalEnv,
+    pub http_client: Client,
+    pub http_base_url: String,
+
+    pub pageserver: Arc<PageServerNode>,
+}
+
+impl SafekeeperNode {
+    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
+        let pageserver = Arc::new(PageServerNode::from_env(env));
+
+        println!("initializing for {} for {}", conf.name, conf.http_port);
+
+        SafekeeperNode {
+            name: conf.name.clone(),
+            conf: conf.clone(),
+            pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
+            env: env.clone(),
+            http_client: Client::new(),
+            http_base_url: format!("http://localhost:{}/v1", conf.http_port),
+            pageserver,
+        }
+    }
+
+    /// Construct libpq connection string for connecting to this safekeeper.
+    fn safekeeper_connection_config(port: u16) -> Config {
+        // TODO safekeeper authentication not implemented yet
+        format!("postgresql://no_user@localhost:{}/no_db", port)
+            .parse()
+            .unwrap()
+    }
+
+    pub fn datadir_path(&self) -> PathBuf {
+        self.env.safekeeper_data_dir(&self.name)
+    }
+
+    pub fn pid_file(&self) -> PathBuf {
+        self.datadir_path().join("safekeeper.pid")
+    }
+
+    pub fn start(&self) -> anyhow::Result<()> {
+        print!(
+            "Starting safekeeper at '{}' in '{}'",
+            connection_address(&self.pg_connection_config),
+            self.datadir_path().display()
+        );
+        io::stdout().flush().unwrap();
+
+        // Configure connection to page server
+        //
+        // FIXME: We extract the host and port from the connection string instead of using
+        // the connection string directly, because the 'safekeeper' binary expects
+        // host:port format. That's a bit silly when we already have a full libpq connection
+        // string at hand.
+        let pageserver_conn = {
+            let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
+            format!("{}:{}", host, port)
+        };
+
+        let listen_pg = format!("localhost:{}", self.conf.pg_port);
+        let listen_http = format!("localhost:{}", self.conf.http_port);
+
+        let mut cmd: &mut Command = &mut Command::new(self.env.safekeeper_bin()?);
+        cmd = cmd
+            .args(&["-D", self.datadir_path().to_str().unwrap()])
+            .args(&["--listen-pg", &listen_pg])
+            .args(&["--listen-http", &listen_http])
+            .args(&["--pageserver", &pageserver_conn])
+            .args(&["--recall", "1 second"])
+            .arg("--daemonize")
+            .env_clear()
+            .env("RUST_BACKTRACE", "1");
+        if !self.conf.sync {
+            cmd = cmd.arg("--no-sync");
+        }
+
+        if self.env.pageserver.auth_type == AuthType::ZenithJWT {
+            cmd.env("PAGESERVER_AUTH_TOKEN", &self.env.pageserver.auth_token);
+        }
+
+        if !cmd.status()?.success() {
+            bail!(
+                "Safekeeper failed to start. See '{}' for details.",
+                self.datadir_path().join("safekeeper.log").display()
+            );
+        }
+
+        // It takes a while for the safekeeper to start up. Poll its status once a
+        // second, up to RETRIES times, until it is open for business.
+        const RETRIES: i8 = 15;
+        for retries in 1..=RETRIES {
+            match self.check_status() {
+                Ok(_) => {
+                    println!("\nSafekeeper started");
+                    return Ok(());
+                }
+                Err(err) => {
+                    match err {
+                        SafekeeperHttpError::Transport(err) => {
+                            if err.is_connect() && retries < 5 {
+                                print!(".");
+                                io::stdout().flush().unwrap();
+                            } else {
+                                if retries == 5 {
+                                    println!() // put a line break after dots for second message
+                                }
+                                println!(
+                                    "Safekeeper not responding yet, err {} retrying ({})...",
+                                    err, retries
+                                );
+                            }
+                        }
+                        SafekeeperHttpError::Response(msg) => {
+                            bail!("safekeeper failed to start: {} ", msg)
+                        }
+                    }
+                    thread::sleep(Duration::from_secs(1));
+                }
+            }
+        }
+        bail!("safekeeper failed to start in {} seconds", RETRIES);
+    }
+
+    ///
+    /// Stop the server.
+    ///
+    /// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
+    /// Otherwise we use SIGTERM, triggering a clean shutdown
+    ///
+    /// If the server is not running, returns success
+    ///
+    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        let pid_file = self.pid_file();
+        if !pid_file.exists() {
+            println!("Safekeeper {} is already stopped", self.name);
+            return Ok(());
+        }
+        let pid = read_pidfile(&pid_file)?;
+        let pid = Pid::from_raw(pid);
+
+        let sig = if immediate {
+            println!("Stop safekeeper immediately");
+            Signal::SIGQUIT
+        } else {
+            println!("Stop safekeeper gracefully");
+            Signal::SIGTERM
+        };
+        match kill(pid, sig) {
+            Ok(_) => (),
+            Err(Errno::ESRCH) => {
+                println!(
+                    "Safekeeper with pid {} does not exist, but a PID file was found",
+                    pid
+                );
+                return Ok(());
+            }
+            Err(err) => bail!(
+                "Failed to send signal to safekeeper with pid {}: {}",
+                pid,
+                err.desc()
+            ),
+        }
+
+        let address = connection_address(&self.pg_connection_config);
+
+        // TODO Remove this "timeout" and handle it on caller side instead.
+        // Shutting down may take a long time,
+        // if safekeeper flushes a lot of data
+        for _ in 0..100 {
+            if let Err(_e) = TcpStream::connect(&address) {
+                println!("Safekeeper stopped receiving connections");
+
+                //Now check status
+                match self.check_status() {
+                    Ok(_) => {
+                        println!("Safekeeper status is OK. Wait a bit.");
+                        thread::sleep(Duration::from_secs(1));
+                    }
+                    Err(err) => {
+                        println!("Safekeeper status is: {}", err);
+                        return Ok(());
+                    }
+                }
+            } else {
+                println!("Safekeeper still receives connections");
+                thread::sleep(Duration::from_secs(1));
+            }
+        }
+
+        bail!("Failed to stop safekeeper with pid {}", pid);
+    }
+
+    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
+        // TODO: authentication
+        //if self.env.auth_type == AuthType::ZenithJWT {
+        //    builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
+        //}
+        self.http_client.request(method, url)
+    }
+
+    pub fn check_status(&self) -> Result<()> {
+        self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
+            .send()?
+            .error_from_body()?;
+        Ok(())
+    }
+}
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -6,6 +6,7 @@ use std::time::Duration;
 use std::{io, result, thread};

 use anyhow::{anyhow, bail};
+use nix::errno::Errno;
 use nix::sys::signal::{kill, Signal};
 use nix::unistd::Pid;
 use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
@@ -20,6 +21,7 @@ use zenith_utils::zid::ZTenantId;
 use crate::local_env::LocalEnv;
 use crate::read_pidfile;
 use pageserver::branches::BranchInfo;
+use pageserver::tenant_mgr::TenantInfo;
 use zenith_utils::connstring::connection_address;

 #[derive(Error, Debug)]
@@ -62,7 +64,6 @@ impl ResponseErrorMessageExt for Response {
 //
 #[derive(Debug)]
 pub struct PageServerNode {
-    pub kill_on_exit: bool,
    pub pg_connection_config: Config,
    pub env: LocalEnv,
    pub http_client: Client,
@@ -71,34 +72,34 @@ pub struct PageServerNode {

 impl PageServerNode {
    pub fn from_env(env: &LocalEnv) -> PageServerNode {
-        let password = if env.auth_type == AuthType::ZenithJWT {
-            &env.auth_token
+        let password = if env.pageserver.auth_type == AuthType::ZenithJWT {
+            &env.pageserver.auth_token
        } else {
            ""
        };

        PageServerNode {
-            kill_on_exit: false,
            pg_connection_config: Self::pageserver_connection_config(
                password,
-                env.pageserver_pg_port,
+                env.pageserver.pg_port,
            ),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://localhost:{}/v1", env.pageserver_http_port),
+            http_base_url: format!("http://localhost:{}/v1", env.pageserver.http_port),
        }
    }

+    /// Construct libpq connection string for connecting to the pageserver.
    fn pageserver_connection_config(password: &str, port: u16) -> Config {
        format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
            .parse()
            .unwrap()
    }

-    pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
+    pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> {
        let mut cmd = Command::new(self.env.pageserver_bin()?);
-        let listen_pg = format!("localhost:{}", self.env.pageserver_pg_port);
-        let listen_http = format!("localhost:{}", self.env.pageserver_http_port);
+        let listen_pg = format!("localhost:{}", self.env.pageserver.pg_port);
+        let listen_http = format!("localhost:{}", self.env.pageserver.http_port);
        let mut args = vec![
            "--init",
            "-D",
@@ -111,10 +112,11 @@ impl PageServerNode {
            &listen_http,
        ];

-        if enable_auth {
+        let auth_type_str = &self.env.pageserver.auth_type.to_string();
+        if self.env.pageserver.auth_type != AuthType::Trust {
            args.extend(&["--auth-validation-public-key-path", "auth_public_key.pem"]);
-            args.extend(&["--auth-type", "ZenithJWT"]);
        }
+        args.extend(&["--auth-type", auth_type_str]);

        if let Some(tenantid) = create_tenant {
            args.extend(&["--create-tenant", tenantid])
@@ -152,7 +154,7 @@ impl PageServerNode {

        let mut cmd = Command::new(self.env.pageserver_bin()?);
        cmd.args(&["-D", self.repo_path().to_str().unwrap()])
-            .arg("-d")
+            .arg("--daemonize")
            .env_clear()
            .env("RUST_BACKTRACE", "1");

@@ -199,19 +201,43 @@ impl PageServerNode {
        bail!("pageserver failed to start in {} seconds", RETRIES);
    }

+    ///
+    /// Stop the server.
+    ///
+    /// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
+    /// Otherwise we use SIGTERM, triggering a clean shutdown
+    ///
+    /// If the server is not running, returns success
+    ///
    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        let pid = read_pidfile(&self.pid_file())?;
-        let pid = Pid::from_raw(pid);
-        if immediate {
+        let pid_file = self.pid_file();
+        if !pid_file.exists() {
+            println!("Pageserver is already stopped");
+            return Ok(());
+        }
+        let pid = Pid::from_raw(read_pidfile(&pid_file)?);
+
+        let sig = if immediate {
            println!("Stop pageserver immediately");
-            if kill(pid, Signal::SIGQUIT).is_err() {
-                bail!("Failed to kill pageserver with pid {}", pid);
-            }
+            Signal::SIGQUIT
        } else {
            println!("Stop pageserver gracefully");
-            if kill(pid, Signal::SIGTERM).is_err() {
-                bail!("Failed to stop pageserver with pid {}", pid);
+            Signal::SIGTERM
+        };
+        match kill(pid, sig) {
+            Ok(_) => (),
+            Err(Errno::ESRCH) => {
+                println!(
+                    "Pageserver with pid {} does not exist, but a PID file was found",
+                    pid
+                );
+                return Ok(());
            }
+            Err(err) => bail!(
+                "Failed to send signal to pageserver with pid {}: {}",
+                pid,
+                err.desc()
+            ),
        }

        let address = connection_address(&self.pg_connection_config);
@@ -256,8 +282,8 @@ impl PageServerNode {

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
        let mut builder = self.http_client.request(method, url);
-        if self.env.auth_type == AuthType::ZenithJWT {
-            builder = builder.bearer_auth(&self.env.auth_token)
+        if self.env.pageserver.auth_type == AuthType::ZenithJWT {
+            builder = builder.bearer_auth(&self.env.pageserver.auth_token)
        }
        builder
    }
@@ -269,7 +295,7 @@ impl PageServerNode {
        Ok(())
    }

-    pub fn tenant_list(&self) -> Result<Vec<String>> {
+    pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
        Ok(self
            .http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
            .send()?
@@ -332,12 +358,3 @@ impl PageServerNode {
            .json()?)
    }
 }
-
-impl Drop for PageServerNode {
-    fn drop(&mut self) {
-        // TODO Looks like this flag is never set
-        if self.kill_on_exit {
-            let _ = self.stop(true);
-        }
-    }
-}
--- a/docs/glossary.md
+++ b/docs/glossary.md
@@ -51,11 +51,14 @@ Each PostgreSQL fork is considered a separate relish.

 ### Layer

-Each layer corresponds to the specific version of a relish Segment in a range of LSNs.
+A layer contains data needed to reconstruct any page versions within the
+layer's Segment and range of LSNs.
+
 There are two kinds of layers, in-memory and on-disk layers. In-memory
 layers are used to ingest incoming WAL, and provide fast access
 to the recent page versions. On-disk layers are stored as files on disk, and
-are immutable.
+are immutable. See pageserver/src/layered_repository/README.md for more.
+
 ### Layer file (on-disk layer)

 Layered repository on-disk format is based on immutable files.  The
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -79,3 +79,61 @@ Helpers for exposing Prometheus metrics from the server.
 `/zenith_utils`:

 Helpers that are shared between other crates in this repository.
+
+## Using Python
+Note that Debian/Ubuntu Python packages are often stale,
+so manual installation of dependencies is not recommended.
+
+A single virtual environment with all dependencies is described in the single `Pipfile`.
+
+### Prerequisites
+- Install Python 3.7 (the minimal supported version)
+    - Later version (e.g. 3.8) is ok if you don't write Python code
+    - You can install Python 3.7 separately, e.g.:
+      ```bash
+      # In Ubuntu
+      sudo add-apt-repository ppa:deadsnakes/ppa
+      sudo apt update
+      sudo apt install python3.7
+      ```
+- Install `pipenv`
+    - Exact version of `pipenv` is not important, you can use Debian/Ubuntu package `pipenv`.
+- Install dependencies via either
+  * `pipenv --python 3.7 install --dev` if you will write Python code, or
+  * `pipenv install` if you only want to run Python scripts and don't have Python 3.7.
+
+Run `pipenv shell` to activate the virtual environment.
+Alternatively, use `pipenv run` to run a single command in the venv, e.g. `pipenv run pytest`.
+
+### Obligatory checks
+We enforce code formatting via `yapf` and type checking via `mypy`.
+Run the following commands in the repository's root (next to `setup.cfg`):
+
+```bash
+pipenv run yapf -ri .  # All code is reformatted
+pipenv run mypy .  # Ensure there are no typing errors
+```
+
+**WARNING**: do not run `mypy` from a directory other than the root of the repository.
+Otherwise it will not find its configuration.
+
+Also consider:
+
+* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
+* Adding more type hints to your code to avoid `Any`.
+
+### Changing dependencies
+You have to update `Pipfile.lock` if you have changed `Pipfile`:
+
+```bash
+pipenv --python 3.7 install --dev  # Re-create venv for Python 3.7 and install recent pipenv inside
+pipenv run pipenv --version  # Should be at least 2021.5.29
+pipenv run pipenv lock  # Regenerate Pipfile.lock
+```
+
+As the minimal supported version is Python 3.7 and we use it in CI,
+you have to use a Python 3.7 environment when updating `Pipfile.lock`.
+Otherwise some back-compatibility packages will be missing.
+
+It is also important to run recent `pipenv`.
+Older versions remove markers from `Pipfile.lock`.
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -5,7 +5,7 @@ authors = ["Stas Kelvich <stas@zenith.tech>"]
 edition = "2018"

 [dependencies]
-bookfile = "^0.3"
+bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" }
 chrono = "0.4.19"
 rand = "0.8.3"
 regex = "1.4.5"
@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
 log = "0.4.14"
 clap = "2.33.0"
 daemonize = "0.4.1"
-tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] }
+tokio = { version = "1.11", features = ["process", "macros", "fs", "rt", "io-util"] }
 postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
 postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
@@ -37,6 +37,9 @@ async-trait = "0.1"
 const_format = "0.2.21"
 tracing = "0.1.27"
 signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
+url = "2"
+nix = "0.23"
+once_cell = "1.8.0"

 postgres_ffi = { path = "../postgres_ffi" }
 zenith_metrics = { path = "../zenith_metrics" }
@@ -45,3 +48,4 @@ workspace_hack = { path = "../workspace_hack" }

 [dev-dependencies]
 hex-literal = "0.3"
+tempfile = "3.2"
--- a/pageserver/README.md
+++ b/pageserver/README.md
@@ -41,7 +41,7 @@ Legend:
 +--+

 ....
-.  .   Component that we will need, but doesn't exist at the moment. A TODO.
+.  .   Component at its early development phase.
 ....

 --->   Data flow
@@ -116,13 +116,49 @@ Remove old on-disk layer files that are no longer needed according to the
 PITR retention policy


-TODO: Backup service
--------------------
+### Backup service

-The backup service is responsible for periodically pushing the chunks to S3.
+The backup service is responsible for storing pageserver recovery data externally.

-TODO: How/when do restore from S3? Whenever we get a GetPage@LSN request for
-a chunk we don't currently have? Or when an external Control Plane tells us?
+Currently, pageserver stores its files in a filesystem directory it's pointed to.
+That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
+Therefore, the server interacts with external, more reliable storage to back up and restore its state.
+
+The storage support code is extensible and can work with arbitrary storage backends, as long as they implement a certain Rust trait.
+The following implementations are present:
+* local filesystem — to use in tests mainly
+* AWS S3           — to use in production
+
+Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs.
+
+The backup service is disabled by default and can be enabled to interact with a single remote storage.
+
+CLI examples:
+* Local FS: `${PAGESERVER_BIN} --relish-storage-local-path="/some/local/path/"`
+* AWS S3  : `${PAGESERVER_BIN} --relish-storage-s3-bucket="some-sample-bucket" --relish-storage-region="eu-north-1" --relish-storage-access-key="SOMEKEYAAAAASADSAH*#" --relish-storage-secret-access-key="SOMEsEcReTsd292v"`
+
+For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, or on the AWS Settings page for a certain user. Also note that bucket names do not contain any protocol prefix when used on AWS.
+For local S3 installations, refer to their documentation for name format and credentials.
+
+Similar to other pageserver settings, a toml config file can be used to configure either of the storages as a backup target.
+Required sections are:
+
+```toml
+[remote_storage]
+local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
+```
+
+or
+
+```toml
+[remote_storage]
+bucket_name = 'some-sample-bucket'
+bucket_region = 'eu-north-1'
+access_key_id = 'SOMEKEYAAAAASADSAH*#'
+secret_access_key = 'SOMEsEcReTsd292v'
+```
+
+Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.

 TODO: Sharding
 --------------------
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -285,11 +285,7 @@ impl<'a> Basebackup<'a> {

        //send wal segment
        let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
-        let wal_file_name = XLogFileName(
-            1, // FIXME: always use Postgres timeline 1
-            segno,
-            pg_constants::WAL_SEGMENT_SIZE,
-        );
+        let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
        let wal_file_path = format!("pg_wal/{}", wal_file_name);
        let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
        let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
--- a/pageserver/src/bin/dump_layerfile.rs
+++ b/pageserver/src/bin/dump_layerfile.rs
@@ -5,10 +5,12 @@ use anyhow::Result;
 use clap::{App, Arg};
 use pageserver::layered_repository::dump_layerfile_from_path;
 use std::path::PathBuf;
+use zenith_utils::GIT_VERSION;

 fn main() -> Result<()> {
    let arg_matches = App::new("Zenith dump_layerfile utility")
        .about("Dump contents of one layer file, for debugging")
+        .version(GIT_VERSION)
        .arg(
            Arg::with_name("path")
                .help("Path to file to dump")
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -5,13 +5,12 @@
 use serde::{Deserialize, Serialize};
 use std::{
    env,
-    net::TcpListener,
    path::{Path, PathBuf},
    str::FromStr,
    thread,
 };
 use tracing::*;
-use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
+use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};

 use anyhow::{bail, ensure, Context, Result};
 use signal_hook::consts::signal::*;
@@ -27,8 +26,8 @@ use clap::{App, Arg, ArgMatches};
 use daemonize::Daemonize;

 use pageserver::{
-    branches, defaults::*, http, page_service, relish_storage, tenant_mgr, PageServerConf,
-    RelishStorageConfig, RelishStorageKind, S3Config, LOG_FILE_NAME,
+    branches, defaults::*, http, page_cache, page_service, remote_storage, tenant_mgr,
+    virtual_file, PageServerConf, RemoteStorageConfig, RemoteStorageKind, S3Config, LOG_FILE_NAME,
 };
 use zenith_utils::http::endpoint;
 use zenith_utils::postgres_backend;
@@ -44,25 +43,28 @@ struct CfgFileParams {
    checkpoint_period: Option<String>,
    gc_horizon: Option<String>,
    gc_period: Option<String>,
+    open_mem_limit: Option<String>,
+    page_cache_size: Option<String>,
+    max_file_descriptors: Option<String>,
    pg_distrib_dir: Option<String>,
    auth_validation_public_key_path: Option<String>,
    auth_type: Option<String>,
-    relish_storage_max_concurrent_sync: Option<String>,
+    remote_storage_max_concurrent_sync: Option<String>,
    /////////////////////////////////
    //// Don't put `Option<String>` and other "simple" values below.
    ////
-    /// `Option<RelishStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
+    /// `Option<RemoteStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
    /// Values in TOML cannot be defined after tables (other tables can),
    /// and [`toml`] crate serializes all fields in the order of their appearance.
    ////////////////////////////////
-    relish_storage: Option<RelishStorage>,
+    remote_storage: Option<RemoteStorage>,
 }

 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
 // Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!).
 // See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples
 #[serde(untagged)]
-enum RelishStorage {
+enum RemoteStorage {
    Local {
        local_path: String,
    },
@@ -83,12 +85,12 @@ impl CfgFileParams {
            arg_matches.value_of(arg_name).map(str::to_owned)
        };

-        let relish_storage = if let Some(local_path) = get_arg("relish-storage-local-path") {
-            Some(RelishStorage::Local { local_path })
+        let remote_storage = if let Some(local_path) = get_arg("relish-storage-local-path") {
+            Some(RemoteStorage::Local { local_path })
        } else if let Some((bucket_name, bucket_region)) =
            get_arg("relish-storage-s3-bucket").zip(get_arg("relish-storage-region"))
        {
-            Some(RelishStorage::AwsS3 {
+            Some(RemoteStorage::AwsS3 {
                bucket_name,
                bucket_region,
                access_key_id: get_arg("relish-storage-access-key"),
@@ -105,11 +107,14 @@ impl CfgFileParams {
            checkpoint_period: get_arg("checkpoint_period"),
            gc_horizon: get_arg("gc_horizon"),
            gc_period: get_arg("gc_period"),
+            open_mem_limit: get_arg("open_mem_limit"),
+            page_cache_size: get_arg("page_cache_size"),
+            max_file_descriptors: get_arg("max_file_descriptors"),
            pg_distrib_dir: get_arg("postgres-distrib"),
            auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
            auth_type: get_arg("auth-type"),
-            relish_storage,
-            relish_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
+            remote_storage,
+            remote_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
        }
    }

@@ -123,15 +128,18 @@ impl CfgFileParams {
            checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
            gc_horizon: self.gc_horizon.or(other.gc_horizon),
            gc_period: self.gc_period.or(other.gc_period),
+            open_mem_limit: self.open_mem_limit.or(other.open_mem_limit),
+            page_cache_size: self.page_cache_size.or(other.page_cache_size),
+            max_file_descriptors: self.max_file_descriptors.or(other.max_file_descriptors),
            pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
            auth_validation_public_key_path: self
                .auth_validation_public_key_path
                .or(other.auth_validation_public_key_path),
            auth_type: self.auth_type.or(other.auth_type),
-            relish_storage: self.relish_storage.or(other.relish_storage),
-            relish_storage_max_concurrent_sync: self
-                .relish_storage_max_concurrent_sync
-                .or(other.relish_storage_max_concurrent_sync),
+            remote_storage: self.remote_storage.or(other.remote_storage),
+            remote_storage_max_concurrent_sync: self
+                .remote_storage_max_concurrent_sync
+                .or(other.remote_storage_max_concurrent_sync),
        }
    }

@@ -167,6 +175,21 @@ impl CfgFileParams {
            None => DEFAULT_GC_PERIOD,
        };

+        let open_mem_limit: usize = match self.open_mem_limit.as_ref() {
+            Some(open_mem_limit_str) => open_mem_limit_str.parse()?,
+            None => DEFAULT_OPEN_MEM_LIMIT,
+        };
+
+        let page_cache_size: usize = match self.page_cache_size.as_ref() {
+            Some(page_cache_size_str) => page_cache_size_str.parse()?,
+            None => DEFAULT_PAGE_CACHE_SIZE,
+        };
+
+        let max_file_descriptors: usize = match self.max_file_descriptors.as_ref() {
+            Some(max_file_descriptors_str) => max_file_descriptors_str.parse()?,
+            None => DEFAULT_MAX_FILE_DESCRIPTORS,
+        };
+
        let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
            Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
            None => env::current_dir()?.join("tmp_install"),
@@ -200,30 +223,28 @@ impl CfgFileParams {
            );
        }

-        let max_concurrent_sync = match self.relish_storage_max_concurrent_sync.as_deref() {
-            Some(relish_storage_max_concurrent_sync) => {
-                relish_storage_max_concurrent_sync.parse()?
-            }
-            None => DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
+        let max_concurrent_sync = match self.remote_storage_max_concurrent_sync.as_deref() {
+            Some(number_str) => number_str.parse()?,
+            None => DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
        };
-        let relish_storage_config = self.relish_storage.as_ref().map(|storage_params| {
+        let remote_storage_config = self.remote_storage.as_ref().map(|storage_params| {
            let storage = match storage_params.clone() {
-                RelishStorage::Local { local_path } => {
-                    RelishStorageKind::LocalFs(PathBuf::from(local_path))
+                RemoteStorage::Local { local_path } => {
+                    RemoteStorageKind::LocalFs(PathBuf::from(local_path))
                }
-                RelishStorage::AwsS3 {
+                RemoteStorage::AwsS3 {
                    bucket_name,
                    bucket_region,
                    access_key_id,
                    secret_access_key,
-                } => RelishStorageKind::AwsS3(S3Config {
+                } => RemoteStorageKind::AwsS3(S3Config {
                    bucket_name,
                    bucket_region,
                    access_key_id,
                    secret_access_key,
                }),
            };
-            RelishStorageConfig {
+            RemoteStorageConfig {
                max_concurrent_sync,
                storage,
            }
@@ -238,6 +259,9 @@ impl CfgFileParams {
            checkpoint_period,
            gc_horizon,
            gc_period,
+            open_mem_limit,
+            page_cache_size,
+            max_file_descriptors,

            superuser: String::from(DEFAULT_SUPERUSER),

@@ -247,7 +271,7 @@ impl CfgFileParams {

            auth_validation_public_key_path,
            auth_type,
-            relish_storage_config,
+            remote_storage_config,
        })
    }
 }
@@ -256,6 +280,7 @@ fn main() -> Result<()> {
    zenith_metrics::set_common_metrics_prefix("pageserver");
    let arg_matches = App::new("Zenith page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
+        .version(GIT_VERSION)
        .arg(
            Arg::with_name("listen-pg")
                .short("l")
@@ -308,6 +333,25 @@ fn main() -> Result<()> {
                .takes_value(true)
                .help("Interval between garbage collector iterations"),
        )
+        .arg(
+            Arg::with_name("open_mem_limit")
+                .long("open_mem_limit")
+                .takes_value(true)
+                .help("Amount of memory reserved for buffering incoming WAL"),
+        )
+        .arg(
+
+            Arg::with_name("page_cache_size")
+                .long("page_cache_size")
+                .takes_value(true)
+                .help("Number of pages in the page cache"),
+        )
+        .arg(
+            Arg::with_name("max_file_descriptors")
+                .long("max_file_descriptors")
+                .takes_value(true)
+                .help("Max number of file descriptors to keep open for files"),
+        )
        .arg(
            Arg::with_name("workdir")
                .short("D")
@@ -439,6 +483,11 @@ fn main() -> Result<()> {
    // as a ref.
    let conf: &'static PageServerConf = Box::leak(Box::new(conf));

+    // Basic initialization of things that don't change after startup
+    virtual_file::init(conf.max_file_descriptors);
+
+    page_cache::init(conf);
+
    // Create repo and exit if init was requested
    if init {
        branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
@@ -462,6 +511,8 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
    // Initialize logger
    let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;

+    info!("version: {}", GIT_VERSION);
+
    let term_now = Arc::new(AtomicBool::new(false));
    for sig in TERM_SIGNALS {
        // When terminated by a second term signal, exit with exit code 1.
@@ -480,13 +531,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
        "Starting pageserver http handler on {}",
        conf.listen_http_addr
    );
-    let http_listener = TcpListener::bind(conf.listen_http_addr.clone())?;
+    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;

    info!(
        "Starting pageserver pg protocol handler on {}",
        conf.listen_pg_addr
    );
-    let pageserver_listener = TcpListener::bind(conf.listen_pg_addr.clone())?;
+    let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;

    if conf.daemonize {
        info!("daemonizing...");
@@ -512,7 +563,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
    // don't spawn threads before daemonizing
    let mut join_handles = Vec::new();

-    if let Some(handle) = relish_storage::run_storage_sync_thread(conf)? {
+    if let Some(handle) = remote_storage::run_storage_sync_thread(conf)? {
        join_handles.push(handle);
    }
    // Initialize tenant manager.
@@ -602,16 +653,19 @@ mod tests {
            checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
            gc_horizon: Some("gc_horizon_VALUE".to_string()),
            gc_period: Some("gc_period_VALUE".to_string()),
+            open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
+            page_cache_size: Some("page_cache_size_VALUE".to_string()),
+            max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
            pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
            auth_validation_public_key_path: Some(
                "auth_validation_public_key_path_VALUE".to_string(),
            ),
            auth_type: Some("auth_type_VALUE".to_string()),
-            relish_storage: Some(RelishStorage::Local {
-                local_path: "relish_storage_local_VALUE".to_string(),
+            remote_storage: Some(RemoteStorage::Local {
+                local_path: "remote_storage_local_VALUE".to_string(),
            }),
-            relish_storage_max_concurrent_sync: Some(
-                "relish_storage_max_concurrent_sync_VALUE".to_string(),
+            remote_storage_max_concurrent_sync: Some(
+                "remote_storage_max_concurrent_sync_VALUE".to_string(),
            ),
        };

@@ -625,13 +679,16 @@ checkpoint_distance = 'checkpoint_distance_VALUE'
 checkpoint_period = 'checkpoint_period_VALUE'
 gc_horizon = 'gc_horizon_VALUE'
 gc_period = 'gc_period_VALUE'
+open_mem_limit = 'open_mem_limit_VALUE'
+page_cache_size = 'page_cache_size_VALUE'
+max_file_descriptors = 'max_file_descriptors_VALUE'
 pg_distrib_dir = 'pg_distrib_dir_VALUE'
 auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
 auth_type = 'auth_type_VALUE'
-relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
+remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'

-[relish_storage]
-local_path = 'relish_storage_local_VALUE'
+[remote_storage]
+local_path = 'remote_storage_local_VALUE'
 "#,
            toml_pretty_string
        );
@@ -659,19 +716,22 @@ local_path = 'relish_storage_local_VALUE'
            checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
            gc_horizon: Some("gc_horizon_VALUE".to_string()),
            gc_period: Some("gc_period_VALUE".to_string()),
+            open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
+            page_cache_size: Some("page_cache_size_VALUE".to_string()),
+            max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
            pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
            auth_validation_public_key_path: Some(
                "auth_validation_public_key_path_VALUE".to_string(),
            ),
            auth_type: Some("auth_type_VALUE".to_string()),
-            relish_storage: Some(RelishStorage::AwsS3 {
+            remote_storage: Some(RemoteStorage::AwsS3 {
                bucket_name: "bucket_name_VALUE".to_string(),
                bucket_region: "bucket_region_VALUE".to_string(),
                access_key_id: Some("access_key_id_VALUE".to_string()),
                secret_access_key: Some("secret_access_key_VALUE".to_string()),
            }),
-            relish_storage_max_concurrent_sync: Some(
-                "relish_storage_max_concurrent_sync_VALUE".to_string(),
+            remote_storage_max_concurrent_sync: Some(
+                "remote_storage_max_concurrent_sync_VALUE".to_string(),
            ),
        };

@@ -685,12 +745,15 @@ checkpoint_distance = 'checkpoint_distance_VALUE'
 checkpoint_period = 'checkpoint_period_VALUE'
 gc_horizon = 'gc_horizon_VALUE'
 gc_period = 'gc_period_VALUE'
+open_mem_limit = 'open_mem_limit_VALUE'
+page_cache_size = 'page_cache_size_VALUE'
+max_file_descriptors = 'max_file_descriptors_VALUE'
 pg_distrib_dir = 'pg_distrib_dir_VALUE'
 auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
 auth_type = 'auth_type_VALUE'
-relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
+remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'

-[relish_storage]
+[remote_storage]
 bucket_name = 'bucket_name_VALUE'
 bucket_region = 'bucket_region_VALUE'
 "#,
@@ -703,7 +766,7 @@ bucket_region = 'bucket_region_VALUE'
            .expect("Failed to deserialize the prettified serialization result of the config");

        let mut expected_params = params;
-        expected_params.relish_storage = Some(RelishStorage::AwsS3 {
+        expected_params.remote_storage = Some(RemoteStorage::AwsS3 {
            bucket_name: "bucket_name_VALUE".to_string(),
            bucket_region: "bucket_region_VALUE".to_string(),
            access_key_id: None,
--- a/pageserver/src/branches.rs
+++ b/pageserver/src/branches.rs
@@ -4,7 +4,7 @@
 // TODO: move all paths construction to conf impl
 //

-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{bail, Context, Result};
 use postgres_ffi::ControlFileData;
 use serde::{Deserialize, Serialize};
 use std::{
@@ -23,6 +23,7 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};

 use crate::tenant_mgr;
 use crate::walredo::WalRedoManager;
+use crate::CheckpointConfig;
 use crate::{repository::Repository, PageServerConf};
 use crate::{restore_local_repo, LOG_FILE_NAME};

@@ -35,7 +36,7 @@ pub struct BranchInfo {
    pub ancestor_id: Option<String>,
    pub ancestor_lsn: Option<String>,
    pub current_logical_size: usize,
-    pub current_logical_size_non_incremental: usize,
+    pub current_logical_size_non_incremental: Option<usize>,
 }

 impl BranchInfo {
@@ -44,6 +45,7 @@ impl BranchInfo {
        conf: &PageServerConf,
        tenantid: &ZTenantId,
        repo: &Arc<dyn Repository>,
+        include_non_incremental_logical_size: bool,
    ) -> Result<Self> {
        let name = path
            .as_ref()
@@ -78,6 +80,14 @@ impl BranchInfo {
            );
        }

+        // non incremental size calculation can be heavy, so let it be optional
+        // needed for tests to check size calculation
+        let current_logical_size_non_incremental = include_non_incremental_logical_size
+            .then(|| {
+                timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
+            })
+            .transpose()?;
+
        Ok(BranchInfo {
            name,
            timeline_id,
@@ -85,8 +95,7 @@ impl BranchInfo {
            ancestor_id,
            ancestor_lsn,
            current_logical_size: timeline.get_current_logical_size(),
-            current_logical_size_non_incremental: timeline
-                .get_current_logical_size_non_incremental(timeline.get_last_record_lsn())?,
+            current_logical_size_non_incremental,
        })
    }
 }
@@ -230,7 +239,7 @@ fn bootstrap_timeline(
        timeline.writer().as_ref(),
        lsn,
    )?;
-    timeline.checkpoint()?;
+    timeline.checkpoint(CheckpointConfig::Forced)?;

    println!(
        "created initial timeline {} timeline.lsn {}",
@@ -248,29 +257,40 @@ fn bootstrap_timeline(
    Ok(())
 }

-pub(crate) fn get_tenants(conf: &PageServerConf) -> Result<Vec<String>> {
-    let tenants_dir = conf.tenants_path();
-
-    std::fs::read_dir(&tenants_dir)?
-        .map(|dir_entry_res| {
-            let dir_entry = dir_entry_res?;
-            ensure!(dir_entry.file_type()?.is_dir());
-            Ok(dir_entry.file_name().to_str().unwrap().to_owned())
-        })
-        .collect()
-}
-
-pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
+pub(crate) fn get_branches(
+    conf: &PageServerConf,
+    tenantid: &ZTenantId,
+    include_non_incremental_logical_size: bool,
+) -> Result<Vec<BranchInfo>> {
    let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;

    // Each branch has a corresponding record (text file) in the refs/branches
    // with timeline_id.
    let branches_dir = conf.branches_path(tenantid);

-    std::fs::read_dir(&branches_dir)?
+    std::fs::read_dir(&branches_dir)
+        .with_context(|| {
+            format!(
+                "Found no branches directory '{}' for tenant {}",
+                branches_dir.display(),
+                tenantid
+            )
+        })?
        .map(|dir_entry_res| {
-            let dir_entry = dir_entry_res?;
-            BranchInfo::from_path(dir_entry.path(), conf, tenantid, &repo)
+            let dir_entry = dir_entry_res.with_context(|| {
+                format!(
+                    "Failed to list branches directory '{}' content for tenant {}",
+                    branches_dir.display(),
+                    tenantid
+                )
+            })?;
+            BranchInfo::from_path(
+                dir_entry.path(),
+                conf,
+                tenantid,
+                &repo,
+                include_non_incremental_logical_size,
+            )
        })
        .collect()
 }
@@ -332,7 +352,7 @@ pub(crate) fn create_branch(
        ancestor_id: None,
        ancestor_lsn: None,
        current_logical_size: 0,
-        current_logical_size_non_incremental: 0,
+        current_logical_size_non_incremental: Some(0),
    })
 }

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -25,6 +25,11 @@ paths:
        schema:
          type: string
          format: hex
+      - name: include-non-incremental-logical-size
+        in: query
+        schema:
+          type: string
+          description: Controls calculation of current_logical_size_non_incremental
    get:
      description: Get branches for tenant
      responses:
@@ -73,6 +78,11 @@ paths:
        required: true
        schema:
          type: string
+      - name: include-non-incremental-logical-size
+        in: query
+        schema:
+          type: string
+          description: Controls calculation of current_logical_size_non_incremental
    get:
      description: Get branches for tenant
      responses:
@@ -164,13 +174,13 @@ paths:
      description: Get tenants list
      responses:
        "200":
-          description: OK
+          description: TenantInfo
          content:
            application/json:
              schema:
                type: array
                items:
-                  type: string
+                  $ref: "#/components/schemas/TenantInfo"
        "401":
          description: Unauthorized Error
          content:
@@ -243,6 +253,16 @@ components:
      scheme: bearer
      bearerFormat: JWT
  schemas:
+    TenantInfo:
+      type: object
+      required:
+        - id
+        - state
+      properties:
+        id:
+          type: string
+        state:
+          type: string
    BranchInfo:
      type: object
      required:
@@ -250,7 +270,6 @@ components:
        - timeline_id
        - latest_valid_lsn
        - current_logical_size
-        - current_logical_size_non_incremental
      properties:
        name:
          type: string
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -86,31 +86,59 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    Ok(json_response(StatusCode::CREATED, response_data)?)
 }

+// Gate the non-incremental logical size calculation behind a flag:
+// after `pgbench -i -s100` the calculation took 28ms, so multiplied by the number of timelines
+// and tenants it can take a noticeable amount of time. Also, the value is currently used only in tests.
+fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
+    request
+        .uri()
+        .query()
+        .map(|v| {
+            url::form_urlencoded::parse(v.as_bytes())
+                .into_owned()
+                .any(|(param, _)| param == "include-non-incremental-logical-size")
+        })
+        .unwrap_or(false)
+}
+
 async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;

+    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
+
    check_permission(&request, Some(tenantid))?;

    let response_data = tokio::task::spawn_blocking(move || {
        let _enter = info_span!("branch_list", tenant = %tenantid).entered();
-        crate::branches::get_branches(get_config(&request), &tenantid)
+        crate::branches::get_branches(
+            get_config(&request),
+            &tenantid,
+            include_non_incremental_logical_size,
+        )
    })
    .await
    .map_err(ApiError::from_err)??;
    Ok(json_response(StatusCode::OK, response_data)?)
 }

-// TODO add to swagger
 async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
    let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
    let conf = get_state(&request).conf;
    let path = conf.branch_path(&branch_name, &tenantid);

+    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
+
    let response_data = tokio::task::spawn_blocking(move || {
        let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
        let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-        BranchInfo::from_path(path, conf, &tenantid, &repo)
+        BranchInfo::from_path(
+            path,
+            conf,
+            &tenantid,
+            &repo,
+            include_non_incremental_logical_size,
+        )
    })
    .await
    .map_err(ApiError::from_err)??;
@@ -124,7 +152,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A

    let response_data = tokio::task::spawn_blocking(move || {
        let _enter = info_span!("tenant_list").entered();
-        crate::branches::get_tenants(get_config(&request))
+        crate::tenant_mgr::list_tenants()
    })
    .await
    .map_err(ApiError::from_err)??;
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -16,30 +16,31 @@ use bookfile::Book;
 use bytes::Bytes;
 use lazy_static::lazy_static;
 use postgres_ffi::pg_constants::BLCKSZ;
-use serde::{Deserialize, Serialize};
 use tracing::*;

+use std::cmp;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::collections::{BTreeSet, HashSet};
-use std::convert::TryInto;
 use std::fs;
 use std::fs::{File, OpenOptions};
 use std::io::Write;
 use std::ops::{Bound::Included, Deref};
 use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::atomic::{self, AtomicUsize};
 use std::sync::{Arc, Mutex, MutexGuard};
-use std::thread::JoinHandle;
 use std::time::{Duration, Instant};

+use self::metadata::{metadata_path, TimelineMetadata};
+use crate::page_cache;
 use crate::relish::*;
-use crate::relish_storage::schedule_timeline_upload;
+use crate::remote_storage::schedule_timeline_upload;
 use crate::repository::{GcResult, Repository, Timeline, TimelineWriter, WALRecord};
 use crate::tenant_mgr;
 use crate::walreceiver;
 use crate::walreceiver::IS_WAL_RECEIVER;
 use crate::walredo::WalRedoManager;
+use crate::CheckpointConfig;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};

@@ -47,30 +48,35 @@ use zenith_metrics::{
    register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec,
 };
 use zenith_metrics::{register_histogram_vec, HistogramVec};
-use zenith_utils::bin_ser::BeSer;
 use zenith_utils::crashsafe_dir;
 use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn};
 use zenith_utils::seqwait::SeqWait;

 mod blob;
 mod delta_layer;
+mod ephemeral_file;
 mod filename;
+mod global_layer_map;
 mod image_layer;
 mod inmemory_layer;
 mod interval_tree;
 mod layer_map;
+pub mod metadata;
 mod page_versions;
 mod storage_layer;

 use delta_layer::DeltaLayer;
 use image_layer::ImageLayer;

-use inmemory_layer::InMemoryLayer;
+use global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
+use inmemory_layer::OpenLayer;
 use layer_map::LayerMap;
 use storage_layer::{
    Layer, PageReconstructData, PageReconstructResult, SegmentTag, RELISH_SEG_SIZE,
 };

+pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file;
+
 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);

 // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
@@ -111,8 +117,9 @@ lazy_static! {
    .expect("failed to define a metric");
 }

-/// The name of the metadata file pageserver creates per timeline.
-pub const METADATA_FILE_NAME: &str = "metadata";
+/// Parts of the `.zenith/tenants/<tenantid>/timelines/<timelineid>` directory prefix.
+pub const TENANTS_SEGMENT_NAME: &str = "tenants";
+pub const TIMELINES_SEGMENT_NAME: &str = "timelines";

 ///
 /// Repository consists of multiple timelines. Keep them in a hash table.
@@ -142,12 +149,7 @@ impl Repository for LayeredRepository {
        // Create the timeline directory, and write initial metadata to file.
        crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;

-        let metadata = TimelineMetadata {
-            disk_consistent_lsn: Lsn(0),
-            prev_record_lsn: None,
-            ancestor_timeline: None,
-            ancestor_lsn: Lsn(0),
-        };
+        let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0));
        Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;

        let timeline = LayeredTimeline::new(
@@ -186,12 +188,7 @@ impl Repository for LayeredRepository {
        // Create the metadata file, noting the ancestor of the new timeline.
        // There is initially no data in it, but all the read-calls know to look
        // into the ancestor.
-        let metadata = TimelineMetadata {
-            disk_consistent_lsn: start_lsn,
-            prev_record_lsn: dst_prev,
-            ancestor_timeline: Some(src),
-            ancestor_lsn: start_lsn,
-        };
+        let metadata = TimelineMetadata::new(start_lsn, dst_prev, Some(src), start_lsn);
        crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
        Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;

@@ -216,6 +213,22 @@ impl Repository for LayeredRepository {
            })
    }

+    fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> {
+        {
+            let timelines = self.timelines.lock().unwrap();
+
+            for (timelineid, timeline) in timelines.iter() {
+                let _entered =
+                    info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid)
+                        .entered();
+
+                timeline.checkpoint(cconf)?;
+            }
+        }
+
+        Ok(())
+    }
+
    // Wait for all threads to complete and persist repository data before pageserver shutdown.
    fn shutdown(&self) -> Result<()> {
        trace!("LayeredRepository shutdown for tenant {}", self.tenantid);
@@ -225,7 +238,7 @@ impl Repository for LayeredRepository {
            walreceiver::stop_wal_receiver(*timelineid);
            // Wait for syncing data to disk
            trace!("repo shutdown. checkpoint timeline {}", timelineid);
-            timeline.checkpoint()?;
+            timeline.checkpoint(CheckpointConfig::Forced)?;

            //TODO Wait for walredo process to shutdown too
        }
@@ -247,14 +260,14 @@ impl LayeredRepository {
            Some(timeline) => Ok(timeline.clone()),
            None => {
                let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)?;
-                let disk_consistent_lsn = metadata.disk_consistent_lsn;
+                let disk_consistent_lsn = metadata.disk_consistent_lsn();

                // Recurse to look up the ancestor timeline.
                //
                // TODO: If you have a very deep timeline history, this could become
                // expensive. Perhaps delay this until we need to look up a page in
                // ancestor.
-                let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline {
+                let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline() {
                    Some(self.get_timeline_locked(ancestor_timelineid, timelines)?)
                } else {
                    None
@@ -266,7 +279,7 @@ impl LayeredRepository {

                let mut timeline = LayeredTimeline::new(
                    self.conf,
-                    metadata,
+                    metadata.clone(),
                    ancestor,
                    timelineid,
                    self.tenantid,
@@ -276,15 +289,9 @@ impl LayeredRepository {
                )?;

                // List the layers on disk, and load them into the layer map
-                let _loaded_layers = timeline.load_layer_map(disk_consistent_lsn)?;
+                let loaded_layers = timeline.load_layer_map(disk_consistent_lsn)?;
                if self.upload_relishes {
-                    schedule_timeline_upload(());
-                    // schedule_timeline_upload(
-                    //     self.tenantid,
-                    //     timelineid,
-                    //     loaded_layers,
-                    //     disk_consistent_lsn,
-                    // );
+                    schedule_timeline_upload(self.tenantid, timelineid, loaded_layers, metadata);
                }

                // needs to be after load_layer_map
@@ -312,90 +319,6 @@ impl LayeredRepository {
        }
    }

-    ///
-    /// Launch the checkpointer thread in given repository.
-    ///
-    pub fn launch_checkpointer_thread(
-        conf: &'static PageServerConf,
-        rc: Arc<LayeredRepository>,
-    ) -> JoinHandle<()> {
-        std::thread::Builder::new()
-            .name("Checkpointer thread".into())
-            .spawn(move || {
-                // FIXME: relaunch it? Panic is not good.
-                rc.checkpoint_loop(conf).expect("Checkpointer thread died");
-            })
-            .unwrap()
-    }
-
-    ///
-    /// Checkpointer thread's main loop
-    ///
-    fn checkpoint_loop(&self, conf: &'static PageServerConf) -> Result<()> {
-        while !tenant_mgr::shutdown_requested() {
-            std::thread::sleep(conf.checkpoint_period);
-            info!("checkpointer thread for tenant {} waking up", self.tenantid);
-
-            // checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
-            // bytes of WAL since last checkpoint.
-            {
-                let timelines = self.timelines.lock().unwrap();
-                for (timelineid, timeline) in timelines.iter() {
-                    let _entered =
-                        info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid)
-                            .entered();
-
-                    STORAGE_TIME
-                        .with_label_values(&["checkpoint_timed"])
-                        .observe_closure_duration(|| {
-                            timeline.checkpoint_internal(conf.checkpoint_distance, false)
-                        })?
-                }
-                // release lock on 'timelines'
-            }
-        }
-        trace!("Checkpointer thread shut down");
-        Ok(())
-    }
-
-    ///
-    /// Launch the GC thread in given repository.
-    ///
-    pub fn launch_gc_thread(
-        conf: &'static PageServerConf,
-        rc: Arc<LayeredRepository>,
-    ) -> JoinHandle<()> {
-        std::thread::Builder::new()
-            .name("GC thread".into())
-            .spawn(move || {
-                // FIXME: relaunch it? Panic is not good.
-                rc.gc_loop(conf).expect("GC thread died");
-            })
-            .unwrap()
-    }
-
-    ///
-    /// GC thread's main loop
-    ///
-    fn gc_loop(&self, conf: &'static PageServerConf) -> Result<()> {
-        while !tenant_mgr::shutdown_requested() {
-            // Garbage collect old files that are not needed for PITR anymore
-            if conf.gc_horizon > 0 {
-                self.gc_iteration(None, conf.gc_horizon, false).unwrap();
-            }
-
-            // TODO Write it in more adequate way using
-            // condvar.wait_timeout() or something
-            let mut sleep_time = conf.gc_period.as_secs();
-            while sleep_time > 0 && !tenant_mgr::shutdown_requested() {
-                sleep_time -= 1;
-                std::thread::sleep(Duration::from_secs(1));
-            }
-            info!("gc thread for tenant {} waking up", self.tenantid);
-        }
-        Ok(())
-    }
-
    /// Save timeline metadata to file
    fn save_metadata(
        conf: &'static PageServerConf,
@@ -412,13 +335,7 @@ impl LayeredRepository {
            .create_new(first_save)
            .open(&path)?;

-        let mut metadata_bytes = TimelineMetadata::ser(data)?;
-
-        assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
-        metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
-
-        let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
-        metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
+        let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;

        if file.write(&metadata_bytes)? != metadata_bytes.len() {
            bail!("Could not write all the metadata bytes in a single call");
@@ -445,20 +362,7 @@ impl LayeredRepository {
    ) -> Result<TimelineMetadata> {
        let path = metadata_path(conf, timelineid, tenantid);
        let metadata_bytes = std::fs::read(&path)?;
-        ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
-
-        let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
-        let calculated_checksum = crc32c::crc32c(data);
-
-        let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
-            metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
-        let expected_checksum = u32::from_le_bytes(*checksum_bytes);
-        ensure!(calculated_checksum == expected_checksum);
-
-        let data = TimelineMetadata::des_prefix(data)?;
-        assert!(data.disk_consistent_lsn.is_aligned());
-
-        Ok(data)
+        TimelineMetadata::from_bytes(&metadata_bytes)
    }

    //
@@ -568,7 +472,7 @@ impl LayeredRepository {
                // so that they too can be garbage collected. That's
                // used in tests, so we want as deterministic results as possible.
                if checkpoint_before_gc {
-                    timeline.checkpoint()?;
+                    timeline.checkpoint(CheckpointConfig::Forced)?;
                    info!("timeline {} checkpoint_before_gc done", timelineid);
                }

@@ -583,29 +487,6 @@ impl LayeredRepository {
    }
 }

-/// Metadata stored on disk for each timeline
-///
-/// The fields correspond to the values we hold in memory, in LayeredTimeline.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TimelineMetadata {
-    disk_consistent_lsn: Lsn,
-
-    // This is only set if we know it. We track it in memory when the page
-    // server is running, but we only track the value corresponding to
-    // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
-    // lot. We only store it in the metadata file when we flush *all* the
-    // in-memory data so that 'last_record_lsn' is the same as
-    // 'disk_consistent_lsn'.  That's OK, because after page server restart, as
-    // soon as we reprocess at least one record, we will have a valid
-    // 'prev_record_lsn' value in memory again. This is only really needed when
-    // doing a clean shutdown, so that there is no more WAL beyond
-    // 'disk_consistent_lsn'
-    prev_record_lsn: Option<Lsn>,
-
-    ancestor_timeline: Option<ZTimelineId>,
-    ancestor_lsn: Lsn,
-}
-
 pub struct LayeredTimeline {
    conf: &'static PageServerConf,

@@ -695,8 +576,8 @@ impl Timeline for LayeredTimeline {
            .wait_for_timeout(lsn, TIMEOUT)
            .with_context(|| {
                format!(
-                    "Timed out while waiting for WAL record at LSN {} to arrive",
-                    lsn
+                    "Timed out while waiting for WAL record at LSN {} to arrive, disk consistent LSN={}",
+                    lsn, self.get_disk_consistent_lsn()
                )
            })?;

@@ -720,7 +601,16 @@ impl Timeline for LayeredTimeline {
            RECONSTRUCT_TIME
                .observe_closure_duration(|| self.materialize_page(seg, blknum, lsn, &*layer))
        } else {
-            bail!("relish {} not found at {}", rel, lsn);
+            // FIXME: This can happen if PostgreSQL extends a relation but never writes
+            // the page. See https://github.com/zenithdb/zenith/issues/841
+            //
+            // Would be nice to detect that situation better.
+            if seg.segno > 0 && self.get_rel_exists(rel, lsn)? {
+                warn!("Page {} blk {} at {} not found", seg.rel, blknum, lsn);
+                return Ok(ZERO_PAGE.clone());
+            }
+
+            bail!("segment {} not found at {}", rel, lsn);
        }
    }

@@ -852,11 +742,15 @@ impl Timeline for LayeredTimeline {
    /// Public entry point for checkpoint(). All the logic is in the private
    /// checkpoint_internal function, this public facade just wraps it for
    /// metrics collection.
-    fn checkpoint(&self) -> Result<()> {
-        STORAGE_TIME
-            .with_label_values(&["checkpoint_force"])
-            //pass checkpoint_distance=0 to force checkpoint
-            .observe_closure_duration(|| self.checkpoint_internal(0, true))
+    fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> {
+        match cconf {
+            CheckpointConfig::Forced => STORAGE_TIME
+                .with_label_values(&["forced checkpoint"])
+                .observe_closure_duration(|| self.checkpoint_internal(0)),
+            CheckpointConfig::Distance(distance) => STORAGE_TIME
+                .with_label_values(&["checkpoint"])
+                .observe_closure_duration(|| self.checkpoint_internal(distance)),
+        }
    }

    fn get_last_record_lsn(&self) -> Lsn {
@@ -880,7 +774,7 @@ impl Timeline for LayeredTimeline {
    }

    fn get_current_logical_size(&self) -> usize {
-        self.current_logical_size.load(Ordering::Acquire) as usize
+        self.current_logical_size.load(atomic::Ordering::Acquire) as usize
    }

    fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize> {
@@ -910,12 +804,20 @@ impl Timeline for LayeredTimeline {
        Ok(total_blocks * BLCKSZ as usize)
    }

+    fn get_disk_consistent_lsn(&self) -> Lsn {
+        self.disk_consistent_lsn.load()
+    }
+
    fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a> {
        Box::new(LayeredTimelineWriter {
            tl: self,
            _write_guard: self.write_lock.lock().unwrap(),
        })
    }
+
+    fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline {
+        self
+    }
 }

 impl LayeredTimeline {
@@ -946,13 +848,13 @@ impl LayeredTimeline {

            // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
            last_record_lsn: SeqWait::new(RecordLsn {
-                last: metadata.disk_consistent_lsn,
-                prev: metadata.prev_record_lsn.unwrap_or(Lsn(0)),
+                last: metadata.disk_consistent_lsn(),
+                prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
            }),
-            disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn.0),
+            disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),

            ancestor_timeline: ancestor,
-            ancestor_lsn: metadata.ancestor_lsn,
+            ancestor_lsn: metadata.ancestor_lsn(),
            current_logical_size: AtomicUsize::new(current_logical_size),
            current_logical_size_gauge,
            upload_relishes,
@@ -1022,7 +924,7 @@ impl LayeredTimeline {
    /// Used to init current logical size on startup
    ///
    fn init_current_logical_size(&mut self) -> Result<()> {
-        if self.current_logical_size.load(Ordering::Relaxed) != 0 {
+        if self.current_logical_size.load(atomic::Ordering::Relaxed) != 0 {
            bail!("cannot init already initialized current logical size")
        };
        let lsn = self.get_last_record_lsn();
@@ -1030,7 +932,7 @@ impl LayeredTimeline {
            AtomicUsize::new(self.get_current_logical_size_non_incremental(lsn)?);
        trace!(
            "current_logical_size initialized to {}",
-            self.current_logical_size.load(Ordering::Relaxed)
+            self.current_logical_size.load(atomic::Ordering::Relaxed)
        );
        Ok(())
    }
@@ -1126,7 +1028,7 @@ impl LayeredTimeline {
    ///
    /// Get a handle to the latest layer for appending.
    ///
-    fn get_layer_for_write(&self, seg: SegmentTag, lsn: Lsn) -> Result<Arc<InMemoryLayer>> {
+    fn get_layer_for_write(&self, seg: SegmentTag, lsn: Lsn) -> Result<Arc<OpenLayer>> {
        let mut layers = self.layers.lock().unwrap();

        assert!(lsn.is_aligned());
@@ -1157,14 +1059,8 @@ impl LayeredTimeline {
                    lsn
                );

-                layer = InMemoryLayer::create(
-                    self.conf,
-                    self.timelineid,
-                    self.tenantid,
-                    seg,
-                    lsn,
-                    lsn,
-                )?;
+                layer =
+                    OpenLayer::create(self.conf, self.timelineid, self.tenantid, seg, lsn, lsn)?;
            } else {
                return Ok(open_layer);
            }
@@ -1200,7 +1096,7 @@ impl LayeredTimeline {
                prev_layer.get_start_lsn(),
                prev_layer.get_end_lsn()
            );
-            layer = InMemoryLayer::create_successor_layer(
+            layer = OpenLayer::create_successor_layer(
                self.conf,
                prev_layer,
                self.timelineid,
@@ -1217,11 +1113,10 @@ impl LayeredTimeline {
                lsn
            );

-            layer =
-                InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, seg, lsn, lsn)?;
+            layer = OpenLayer::create(self.conf, self.timelineid, self.tenantid, seg, lsn, lsn)?;
        }

-        let layer_rc: Arc<InMemoryLayer> = Arc::new(layer);
+        let layer_rc: Arc<OpenLayer> = Arc::new(layer);
        layers.insert_open(Arc::clone(&layer_rc));

        Ok(layer_rc)
@@ -1231,7 +1126,7 @@ impl LayeredTimeline {
    /// Flush to disk all data that was written with the put_* functions
    ///
    /// NOTE: This has nothing to do with checkpoint in PostgreSQL.
-    fn checkpoint_internal(&self, checkpoint_distance: u64, forced: bool) -> Result<()> {
+    fn checkpoint_internal(&self, checkpoint_distance: u64) -> Result<()> {
        let mut write_guard = self.write_lock.lock().unwrap();
        let mut layers = self.layers.lock().unwrap();

@@ -1257,15 +1152,12 @@ impl LayeredTimeline {
        // a lot of memory and/or aren't receiving much updates anymore.
        let mut disk_consistent_lsn = last_record_lsn;

-        let mut created_historics = false;
        let mut layer_uploads = Vec::new();
-        while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() {
+        while let Some((oldest_layer_id, oldest_layer, oldest_generation)) =
+            layers.peek_oldest_open()
+        {
            let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();

-            if tenant_mgr::shutdown_requested() && !forced {
-                return Ok(());
-            }
-
            // Does this layer need freezing?
            //
            // Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE.
@@ -1288,42 +1180,14 @@ impl LayeredTimeline {
                break;
            }

-            // Mark the layer as no longer accepting writes and record the end_lsn.
-            // This happens in-place, no new layers are created now.
-            // We call `get_last_record_lsn` again, which may be different from the
-            // original load, as we may have released the write lock since then.
-            oldest_layer.freeze(self.get_last_record_lsn());
-
-            // The layer is no longer open, update the layer map to reflect this.
-            // We will replace it with on-disk historics below.
-            layers.pop_oldest_open();
-            layers.insert_historic(oldest_layer.clone());
-
-            // Write the now-frozen layer to disk. That could take a while, so release the lock while do it
            drop(layers);
            drop(write_guard);

-            let new_historics = oldest_layer.write_to_disk(self)?;
+            let mut this_layer_uploads = self.evict_layer(oldest_layer_id)?;
+            layer_uploads.append(&mut this_layer_uploads);

            write_guard = self.write_lock.lock().unwrap();
            layers = self.layers.lock().unwrap();
-
-            if !new_historics.is_empty() {
-                created_historics = true;
-            }
-
-            // Finally, replace the frozen in-memory layer with the new on-disk layers
-            layers.remove_historic(oldest_layer);
-
-            // Add the historics to the LayerMap
-            for delta_layer in new_historics.delta_layers {
-                layer_uploads.push(delta_layer.path());
-                layers.insert_historic(Arc::new(delta_layer));
-            }
-            for image_layer in new_historics.image_layers {
-                layer_uploads.push(image_layer.path());
-                layers.insert_historic(Arc::new(image_layer));
-            }
        }

        // Call unload() on all frozen layers, to release memory.
@@ -1336,7 +1200,7 @@ impl LayeredTimeline {
        drop(layers);
        drop(write_guard);

-        if created_historics {
+        if !layer_uploads.is_empty() {
            // We must fsync the timeline dir to ensure the directory entries for
            // new layer files are durable
            let timeline_dir =
@@ -1364,12 +1228,13 @@ impl LayeredTimeline {

            let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid);

-            let metadata = TimelineMetadata {
+            let metadata = TimelineMetadata::new(
                disk_consistent_lsn,
-                prev_record_lsn: ondisk_prev_record_lsn,
-                ancestor_timeline: ancestor_timelineid,
-                ancestor_lsn: self.ancestor_lsn,
-            };
+                ondisk_prev_record_lsn,
+                ancestor_timelineid,
+                self.ancestor_lsn,
+            );
+
            LayeredRepository::save_metadata(
                self.conf,
                self.timelineid,
@@ -1377,24 +1242,66 @@ impl LayeredTimeline {
                &metadata,
                false,
            )?;
+            if self.upload_relishes {
+                schedule_timeline_upload(self.tenantid, self.timelineid, layer_uploads, metadata);
+            }

            // Also update the in-memory copy
            self.disk_consistent_lsn.store(disk_consistent_lsn);
-
-            if self.upload_relishes {
-                schedule_timeline_upload(())
-                // schedule_timeline_upload(
-                //     self.tenantid,
-                //     self.timelineid,
-                //     layer_uploads,
-                //     disk_consistent_lsn,
-                // });
-            }
        }

        Ok(())
    }

+    fn evict_layer(&self, layer_id: LayerId) -> Result<Vec<PathBuf>> {
+        // Mark the layer as no longer accepting writes and record the end_lsn.
+        // This happens in-place, no new layers are created now.
+        // We call `get_last_record_lsn` again, which may be different from the
+        // original load, as we may have released the write lock since then.
+
+        let mut write_guard = self.write_lock.lock().unwrap();
+        let mut layers = self.layers.lock().unwrap();
+
+        let mut layer_uploads = Vec::new();
+
+        let global_layer_map = GLOBAL_LAYER_MAP.read().unwrap();
+        if let Some(oldest_layer) = global_layer_map.get(&layer_id) {
+            drop(global_layer_map);
+            oldest_layer.freeze(self.get_last_record_lsn());
+
+            // The layer is no longer open, update the layer map to reflect this.
+            // We will replace it with on-disk historics below.
+            layers.remove_open(layer_id);
+            layers.insert_historic(oldest_layer.clone());
+
+            // Write the now-frozen layer to disk. That could take a while, so release the locks while we do it
+            drop(layers);
+            drop(write_guard);
+
+            let new_historics = oldest_layer.write_to_disk(self)?;
+
+            write_guard = self.write_lock.lock().unwrap();
+            layers = self.layers.lock().unwrap();
+
+            // Finally, replace the frozen in-memory layer with the new on-disk layers
+            layers.remove_historic(oldest_layer);
+
+            // Add the historics to the LayerMap
+            for delta_layer in new_historics.delta_layers {
+                layer_uploads.push(delta_layer.path());
+                layers.insert_historic(Arc::new(delta_layer));
+            }
+            for image_layer in new_historics.image_layers {
+                layer_uploads.push(image_layer.path());
+                layers.insert_historic(Arc::new(image_layer));
+            }
+        }
+        drop(layers);
+        drop(write_guard);
+
+        Ok(layer_uploads)
+    }
+
    ///
    /// Garbage collect layer files on a timeline that are no longer needed.
    ///
@@ -1605,6 +1512,23 @@ impl LayeredTimeline {
        Ok(result)
    }

+    fn lookup_cached_page(&self, seg: &SegmentTag, blknum: u32, lsn: Lsn) -> Option<(Lsn, Bytes)> {
+        let cache = page_cache::get();
+        if let RelishTag::Relation(rel_tag) = &seg.rel {
+            let (lsn, read_guard) = cache.lookup_materialized_page(
+                self.tenantid,
+                self.timelineid,
+                *rel_tag,
+                blknum,
+                lsn,
+            )?;
+            let img = Bytes::from(read_guard.to_vec());
+            Some((lsn, img))
+        } else {
+            None
+        }
+    }
+
    ///
    /// Reconstruct a page version from given Layer
    ///
@@ -1615,6 +1539,22 @@ impl LayeredTimeline {
        lsn: Lsn,
        layer: &dyn Layer,
    ) -> Result<Bytes> {
+        // Check the page cache. We will get back the most recent page with lsn <= `lsn`.
+        // The cached image can be returned directly if there is no WAL between the cached image
+        // and requested LSN. The cached image can also be used to reduce the amount of WAL needed
+        // for redo.
+        let (cached_lsn_opt, cached_page_opt) = match self.lookup_cached_page(&seg, blknum, lsn) {
+            Some((cached_lsn, cached_img)) => {
+                match cached_lsn.cmp(&lsn) {
+                    cmp::Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
+                    cmp::Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
+                    cmp::Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn
+                }
+                (Some(cached_lsn), Some((cached_lsn, cached_img)))
+            }
+            None => (None, None),
+        };
+
        let mut data = PageReconstructData {
            records: Vec::new(),
            page_img: None,
@@ -1628,12 +1568,35 @@ impl LayeredTimeline {
        // call it again on the predecessor layer until we have all the required data.
        let mut layer_ref = layer;
        let mut curr_lsn = lsn;
+        let mut cacheable_result: Option<Lsn> = None;
        loop {
-            match layer_ref.get_page_reconstruct_data(blknum, curr_lsn, &mut data)? {
-                PageReconstructResult::Complete => break,
+            match layer_ref.get_page_reconstruct_data(
+                blknum,
+                curr_lsn,
+                cached_lsn_opt,
+                &mut data,
+            )? {
+                PageReconstructResult::Complete => {
+                    if curr_lsn == lsn {
+                        // We have an opportunity to cache this page
+                        if let Some((rec_lsn, _rec)) = data.records.first() {
+                            cacheable_result = Some(*rec_lsn);
+                        }
+                    }
+                    break;
+                }
                PageReconstructResult::Continue(cont_lsn) => {
                    // Fetch base image / more WAL from the returned predecessor layer
                    if let Some((cont_layer, cont_lsn)) = self.get_layer_for_read(seg, cont_lsn)? {
+                        if cont_lsn == curr_lsn {
+                            // We landed on the same layer again. Shouldn't happen, but if it does,
+                            // don't get stuck in an infinite loop.
+                            bail!(
+                                "could not find predecessor layer of segment {} at {}",
+                                seg.rel,
+                                cont_lsn
+                            );
+                        }
                        layer_arc = cont_layer;
                        layer_ref = &*layer_arc;
                        curr_lsn = cont_lsn;
@@ -1664,10 +1627,26 @@ impl LayeredTimeline {
                        lsn,
                    );
                }
+                PageReconstructResult::Cached => {
+                    let (cached_lsn, cached_img) = cached_page_opt.unwrap();
+                    assert!(data.page_img.is_none());
+                    if let Some((first_rec_lsn, first_rec)) = data.records.first() {
+                        assert!(&cached_lsn < first_rec_lsn);
+                        assert!(!first_rec.will_init);
+                    }
+                    data.page_img = Some(cached_img);
+                    break;
+                }
            }
        }

-        self.reconstruct_page(seg.rel, blknum, lsn, data)
+        let img = self.reconstruct_page(seg.rel, blknum, lsn, data)?;
+
+        if let Some(cache_lsn) = cacheable_result {
+            layer_ref.cache_page_image(blknum, cache_lsn, &img)?;
+        }
+
+        Ok(img)
    }

    ///
@@ -1719,6 +1698,9 @@ impl LayeredTimeline {
                } else {
                    trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn);
                }
+
+                let last_rec_lsn = data.records.last().unwrap().0;
+
                let img = self.walredo_mgr.request_redo(
                    rel,
                    blknum,
@@ -1727,6 +1709,18 @@ impl LayeredTimeline {
                    data.records,
                )?;

+                if let RelishTag::Relation(rel_tag) = &rel {
+                    let cache = page_cache::get();
+                    cache.memorize_materialized_page(
+                        self.tenantid,
+                        self.timelineid,
+                        *rel_tag,
+                        blknum,
+                        last_rec_lsn,
+                        &img,
+                    );
+                }
+
                Ok(img)
            }
        }
@@ -1738,7 +1732,7 @@ impl LayeredTimeline {
    fn increase_current_logical_size(&self, diff: u32) {
        let val = self
            .current_logical_size
-            .fetch_add(diff as usize, Ordering::SeqCst);
+            .fetch_add(diff as usize, atomic::Ordering::SeqCst);
        trace!(
            "increase_current_logical_size: {} + {} = {}",
            val,
@@ -1755,7 +1749,7 @@ impl LayeredTimeline {
    fn decrease_current_logical_size(&self, diff: u32) {
        let val = self
            .current_logical_size
-            .fetch_sub(diff as usize, Ordering::SeqCst);
+            .fetch_sub(diff as usize, atomic::Ordering::SeqCst);
        trace!(
            "decrease_current_logical_size: {} - {} = {}",
            val,
@@ -1793,7 +1787,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {

        let seg = SegmentTag::from_blknum(rel, blknum);
        let layer = self.tl.get_layer_for_write(seg, lsn)?;
-        let delta_size = layer.put_wal_record(lsn, blknum, rec);
+        let delta_size = layer.put_wal_record(lsn, blknum, rec)?;
        self.tl
            .increase_current_logical_size(delta_size * BLCKSZ as u32);
        Ok(())
@@ -1812,7 +1806,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
        let seg = SegmentTag::from_blknum(rel, blknum);

        let layer = self.tl.get_layer_for_write(seg, lsn)?;
-        let delta_size = layer.put_page_image(blknum, lsn, img);
+        let delta_size = layer.put_page_image(blknum, lsn, img)?;

        self.tl
            .increase_current_logical_size(delta_size * BLCKSZ as u32);
@@ -1943,15 +1937,6 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> {
    Ok(())
 }

-fn metadata_path(
-    conf: &'static PageServerConf,
-    timelineid: ZTimelineId,
-    tenantid: ZTenantId,
-) -> PathBuf {
-    conf.timeline_path(&timelineid, &tenantid)
-        .join(METADATA_FILE_NAME)
-}
-
 /// Add a suffix to a layer file's name: .{num}.old
 /// Uses the first available num (starts at 0)
 fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
--- a/pageserver/src/layered_repository/README.md
+++ b/pageserver/src/layered_repository/README.md
@@ -1,12 +1,56 @@
 # Overview

-The on-disk format is based on immutable files. The page server
-receives a stream of incoming WAL, parses the WAL records to determine
-which pages they apply to, and accumulates the incoming changes in
-memory. Every now and then, the accumulated changes are written out to
-new immutable files. This process is called checkpointing. Old versions
-of on-disk files that are not needed by any timeline are removed by GC
-process.
+The on-disk format is based on immutable files. The page server receives a
+stream of incoming WAL, parses the WAL records to determine which pages they
+apply to, and accumulates the incoming changes in memory. Every now and then,
+the accumulated changes are written out to new immutable files. This process is
+called checkpointing. Old versions of on-disk files that are not needed by any
+timeline are removed by the GC process.
+
+The main responsibility of the Page Server is to process the incoming WAL, and
+reprocess it into a format that allows reasonably quick access to any page
+version.
+
+The incoming WAL contains updates to arbitrary pages in the system. The
+distribution depends on the workload: the updates could be totally random, or
+there could be a long stream of updates to a single relation when data is bulk
+loaded, for example, or something in between. The page server slices the
+incoming WAL per relation and page, and packages the sliced WAL into
+suitably-sized "layer files". The layer files contain all the history of the
+database, back to some reasonable retention period. This system replaces the
+base backups and the WAL archive used in a traditional PostgreSQL
+installation. The layer files are immutable, they are not modified in-place
+after creation. New layer files are created for new incoming WAL, and old layer
+files are removed when they are no longer needed. We could also replace layer
+files with new files that contain the same information, merging small files for
+example, but that hasn't been implemented yet.
+
+
+Cloud Storage                   Page Server                   Safekeeper
+                     Local disk                Memory            WAL
+
+|AAAA|               |AAAA|AAAA|               |AA
+|BBBB|               |BBBB|BBBB|               |
+|CCCC|CCCC|  <----   |CCCC|CCCC|CCCC|   <---   |CC     <----   ADEBAABED
+|DDDD|DDDD|          |DDDD|DDDD|               |DDD
+|EEEE|               |EEEE|EEEE|EEEE|          |E
+
+
+In this illustration, WAL is received as a stream from the Safekeeper, from the
+right.  It is immediately captured by the page server and stored quickly in
+memory. The page server memory can be thought of as a quick "reorder buffer",
+used to hold the incoming WAL and reorder it so that we keep the WAL records for
+the same page and relation close to each other.
+
+From the page server memory, whenever enough WAL has been accumulated for one
+relation segment, it is moved to local disk, as a new layer file, and the memory
+is released.
+
+From the local disk, the layers are further copied to Cloud Storage, for
+long-term archival. After a layer has been copied to Cloud Storage, it can be
+removed from local disk, although we currently keep everything locally for fast
+access. If a layer is needed that isn't found locally, it is fetched from Cloud
+Storage and stored on local disk.

 # Terms used in layered repository

@@ -14,32 +58,9 @@ process.
 - Segment - one slice of a Relish that is stored in a LayeredTimeline.
 - Layer -  specific version of a relish Segment in a range of LSNs.

-Layers can be InMemory or OnDisk:
- InMemory layer is not durably stored and needs to rebuild from WAL on pageserver start.
- OnDisk layer is durably stored.
+# Layer map

-OnDisk layers can be Image or Delta:
- ImageLayer represents an image or a snapshot of a segment at one particular LSN.
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs.
-
-Dropped segments are always represented on disk by DeltaLayer.
-
-LSN range defined by start_lsn and end_lsn:
- start_lsn is inclusive.
- end_lsn is exclusive.
-
-For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen
-in-memory layer or a delta layer, it is a valid end bound. An image
-layer represents snapshot at one LSN, so end_lsn is always the
-snapshot LSN + 1
-
-Layers can be open or historical:
- Open layer is a writeable one. Only InMemory layer can be open.
-FIXME: If open layer is dropped, it is not writeable, so it should be turned into historical, 
-but now it is not implemented - see bug #569.
- Historical layer is the one that cannot be modified anymore. Now only OnDisk layers can be historical.
-
- LayerMap - a map that tracks what layers exist for all the relishes in a timeline.
+The LayerMap tracks what layers exist for all the relishes in a timeline.

 LayerMap consists of two data structures:
 - segs - All the layers keyed by segment tag
@@ -54,8 +75,53 @@ TODO: Are there any exceptions to this?
 For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN,
 including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap.

-TODO:
-Describe GC and checkpoint interval settings.
+
+# Different kinds of layers
+
+A layer can be in different states:
+
+- Open - a layer that new WAL records can be appended to.
+- Closed - a layer that is read-only; no new WAL records can be appended to it.
+- Historical: synonym for closed
+- InMemory: A layer that is kept only in memory, and needs to be rebuilt from WAL
+  on pageserver start
+- OnDisk: A layer that is stored on disk. If its end-LSN is older than
+  disk_consistent_lsn, it is known to be fully flushed and fsync'd to local disk.
+- Frozen layer: an in-memory layer that is Closed.
+
+There are two kinds of OnDisk layers:
+- ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN.
+- DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one
+  relish segment.
+
+Dropped segments are always represented on disk by DeltaLayer.
+
+# Layer life cycle
+
+LSN range defined by start_lsn and end_lsn:
+- start_lsn is inclusive.
+- end_lsn is exclusive.
+
+For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen in-memory
+layer or a delta layer, it is a valid end bound. An image layer represents a
+snapshot at one LSN, so end_lsn is always the snapshot LSN + 1.
+
+Every layer starts its life as an Open In-Memory layer. When the page server
+receives the first WAL record for a segment, it creates a new In-Memory layer
+for it, and puts it in the layer map. Later, when the layer is old enough, its
+contents are written to disk, as On-Disk layers. This process is called
+"evicting" a layer.
+
+Layer eviction is a two-step process: First, the layer is marked as closed, so
+that it no longer accepts new WAL records, and the layer map is updated
+accordingly. If a new WAL record for that segment arrives after this step, a new
+Open layer is created to hold it. After this first step, the layer is in the Closed
+InMemory state. This first step is called "freezing" the layer.
+
+In the second step, new Delta and Image layers are created, containing all the
+data in the Frozen InMemory layer. When the new layers are ready, the original
+frozen layer is replaced with the new layers in the layer map, and the original
+frozen layer is dropped, releasing the memory.

 # Layer files (On-disk layers)

@@ -366,6 +432,8 @@ is a newer layer file there. TODO: This optimization hasn't been
 implemented! The GC algorithm will currently keep the file on the
 'main' branch anyway, for as long as the child branch exists.

+TODO:
+Describe GC and checkpoint interval settings.

 # TODO: On LSN ranges

--- a/pageserver/src/layered_repository/blob.rs
+++ b/pageserver/src/layered_repository/blob.rs
@@ -1,4 +1,5 @@
-use std::{fs::File, io::Write};
+use std::io::{Read, Write};
+use std::os::unix::prelude::FileExt;

 use anyhow::Result;
 use bookfile::{BookWriter, BoundedReader, ChapterId, ChapterWriter};
@@ -10,7 +11,7 @@ pub struct BlobRange {
    size: usize,
 }

-pub fn read_blob(reader: &BoundedReader<&'_ File>, range: &BlobRange) -> Result<Vec<u8>> {
+pub fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
    let mut buf = vec![0u8; range.size];
    reader.read_exact_at(&mut buf, range.offset)?;
    Ok(buf)
@@ -28,14 +29,14 @@ impl<W: Write> BlobWriter<W> {
        Self { writer, offset: 0 }
    }

-    pub fn write_blob(&mut self, blob: &[u8]) -> Result<BlobRange> {
-        self.writer.write_all(blob)?;
+    pub fn write_blob_from_reader(&mut self, r: &mut impl Read) -> Result<BlobRange> {
+        let len = std::io::copy(r, &mut self.writer)?;

        let range = BlobRange {
            offset: self.offset,
-            size: blob.len(),
+            size: len as usize,
        };
-        self.offset += blob.len() as u64;
+        self.offset += len as u64;
        Ok(range)
    }

--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -39,9 +39,11 @@
 //!
 use crate::layered_repository::blob::BlobWriter;
 use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
+use crate::layered_repository::page_versions::PageVersions;
 use crate::layered_repository::storage_layer::{
    Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
 };
+use crate::virtual_file::VirtualFile;
 use crate::waldecoder;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
@@ -53,7 +55,6 @@ use zenith_utils::vec_map::VecMap;
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::fs;
-use std::fs::File;
 use std::io::{BufWriter, Write};
 use std::ops::Bound::Included;
 use std::path::{Path, PathBuf};
@@ -139,6 +140,8 @@ pub struct DeltaLayerInner {
    /// loaded into memory yet.
    loaded: bool,

+    book: Option<Book<VirtualFile>>,
+
    /// All versions of all pages in the file are are kept here.
    /// Indexed by block number and LSN.
    page_version_metas: VecMap<(u32, Lsn), BlobRange>,
@@ -148,6 +151,10 @@ pub struct DeltaLayerInner {
 }

 impl Layer for DeltaLayer {
+    fn get_tenant_id(&self) -> ZTenantId {
+        self.tenantid
+    }
+
    fn get_timeline_id(&self) -> ZTimelineId {
        self.timelineid
    }
@@ -177,18 +184,28 @@ impl Layer for DeltaLayer {
        &self,
        blknum: u32,
        lsn: Lsn,
+        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult> {
        let mut need_image = true;

        assert!(self.seg.blknum_in_seg(blknum));

+        match &cached_img_lsn {
+            Some(cached_lsn) if &self.end_lsn <= cached_lsn => {
+                return Ok(PageReconstructResult::Cached)
+            }
+            _ => {}
+        }
+
        {
            // Open the file and lock the metadata in memory
-            // TODO: avoid opening the file for each read
-            let (_path, book) = self.open_book()?;
-            let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
            let inner = self.load()?;
+            let page_version_reader = inner
+                .book
+                .as_ref()
+                .unwrap()
+                .chapter_reader(PAGE_VERSIONS_CHAPTER)?;

            // Scan the metadata BTreeMap backwards, starting from the given entry.
            let minkey = (blknum, Lsn(0));
@@ -199,24 +216,31 @@ impl Layer for DeltaLayer {
                .iter()
                .rev();
            for ((_blknum, pv_lsn), blob_range) in iter {
+                match &cached_img_lsn {
+                    Some(cached_lsn) if pv_lsn <= cached_lsn => {
+                        return Ok(PageReconstructResult::Cached)
+                    }
+                    _ => {}
+                }
+
                let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;

-                if let Some(img) = pv.page_image {
-                    // Found a page image, return it
-                    reconstruct_data.page_img = Some(img);
-                    need_image = false;
-                    break;
-                } else if let Some(rec) = pv.record {
-                    let will_init = rec.will_init;
-                    reconstruct_data.records.push((*pv_lsn, rec));
-                    if will_init {
-                        // This WAL record initializes the page, so no need to go further back
+                match pv {
+                    PageVersion::Page(img) => {
+                        // Found a page image, return it
+                        reconstruct_data.page_img = Some(img);
                        need_image = false;
                        break;
                    }
-                } else {
-                    // No base image, and no WAL record. Huh?
-                    bail!("no page image or WAL record for requested page");
+                    PageVersion::Wal(rec) => {
+                        let will_init = rec.will_init;
+                        reconstruct_data.records.push((*pv_lsn, rec));
+                        if will_init {
+                            // This WAL record initializes the page, so no need to go further back
+                            need_image = false;
+                            break;
+                        }
+                    }
                }
            }

@@ -226,7 +250,7 @@ impl Layer for DeltaLayer {
        // If an older page image is needed to reconstruct the page, let the
        // caller know.
        if need_image {
-            Ok(PageReconstructResult::Continue(self.start_lsn))
+            Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
        } else {
            Ok(PageReconstructResult::Complete)
        }
@@ -299,7 +323,11 @@ impl Layer for DeltaLayer {
            println!("  {}: {}", k, v);
        }
        println!("--- page versions ---");
-        let (_path, book) = self.open_book()?;
+
+        let path = self.path();
+        let file = std::fs::File::open(&path)?;
+        let book = Book::new(file)?;
+
        let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
        for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
            let mut desc = String::new();
@@ -307,19 +335,22 @@ impl Layer for DeltaLayer {
            let buf = read_blob(&chapter, blob_range)?;
            let pv = PageVersion::des(&buf)?;

-            if let Some(img) = pv.page_image.as_ref() {
-                write!(&mut desc, " img {} bytes", img.len())?;
-            }
-            if let Some(rec) = pv.record.as_ref() {
-                let wal_desc = waldecoder::describe_wal_record(&rec.rec);
-                write!(
-                    &mut desc,
-                    " rec {} bytes will_init: {} {}",
-                    rec.rec.len(),
-                    rec.will_init,
-                    wal_desc
-                )?;
+            match pv {
+                PageVersion::Page(img) => {
+                    write!(&mut desc, " img {} bytes", img.len())?;
+                }
+                PageVersion::Wal(rec) => {
+                    let wal_desc = waldecoder::describe_wal_record(&rec.rec);
+                    write!(
+                        &mut desc,
+                        " rec {} bytes will_init: {} {}",
+                        rec.rec.len(),
+                        rec.will_init,
+                        wal_desc
+                    )?;
+                }
            }
+
            println!("  blk {} at {}: {}", blk, lsn, desc);
        }

@@ -343,14 +374,14 @@ impl DeltaLayer {
    }

    /// Create a new delta file, using the given page versions and relsizes.
-    /// The page versions are passed by an iterator; the iterator must return
-    /// page versions in blknum+lsn order.
+    /// The page versions are passed in a PageVersions struct. If 'cutoff' is
+    /// given, only page versions with LSN < cutoff are included.
    ///
-    /// This is used to write the in-memory layer to disk. The in-memory layer uses the same
-    /// data structure with two btreemaps as we do, so passing the btreemaps is currently
-    /// expedient.
+    /// This is used to write the in-memory layer to disk. The page_versions and
+    /// relsizes are thus passed in the same format as they are in the in-memory
+    /// layer, as that's expedient.
    #[allow(clippy::too_many_arguments)]
-    pub fn create<'a>(
+    pub fn create(
        conf: &'static PageServerConf,
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
@@ -358,7 +389,8 @@ impl DeltaLayer {
        start_lsn: Lsn,
        end_lsn: Lsn,
        dropped: bool,
-        page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
+        page_versions: &PageVersions,
+        cutoff: Option<Lsn>,
        relsizes: VecMap<Lsn, u32>,
    ) -> Result<DeltaLayer> {
        if seg.rel.is_blocky() {
@@ -375,26 +407,32 @@ impl DeltaLayer {
            dropped,
            inner: Mutex::new(DeltaLayerInner {
                loaded: true,
+                book: None,
                page_version_metas: VecMap::default(),
                relsizes,
            }),
        };
        let mut inner = delta_layer.inner.lock().unwrap();

-        // Write the in-memory btreemaps into a file
-        let path = delta_layer.path();
-
+        // Write the data into a file
+        //
+        // Note: Because we open the file in write-only mode, we cannot
+        // reuse the same VirtualFile for reading later. That's why we don't
+        // set inner.book here. The first read will have to re-open it.
+        //
        // Note: This overwrites any existing file. There shouldn't be any.
        // FIXME: throw an error instead?
-        let file = File::create(&path)?;
+        let path = delta_layer.path();
+        let file = VirtualFile::create(&path)?;
        let buf_writer = BufWriter::new(file);
        let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;

        let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);

-        for (blknum, lsn, page_version) in page_versions {
-            let buf = PageVersion::ser(page_version)?;
-            let blob_range = page_version_writer.write_blob(&buf)?;
+        let page_versions_iter = page_versions.ordered_page_version_iter(cutoff);
+        for (blknum, lsn, pos) in page_versions_iter {
+            let blob_range =
+                page_version_writer.write_blob_from_reader(&mut page_versions.reader(pos)?)?;

            inner
                .page_version_metas
@@ -441,20 +479,6 @@ impl DeltaLayer {
        Ok(delta_layer)
    }

-    fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
-        let path = Self::path_for(
-            &self.path_or_conf,
-            self.timelineid,
-            self.tenantid,
-            &self.layer_name(),
-        );
-
-        let file = File::open(&path)?;
-        let book = Book::new(file)?;
-
-        Ok((path, book))
-    }
-
    ///
    /// Load the contents of the file into memory
    ///
@@ -466,7 +490,9 @@ impl DeltaLayer {
            return Ok(inner);
        }

-        let (path, book) = self.open_book()?;
+        let path = self.path();
+        let file = VirtualFile::open(&path)?;
+        let book = Book::new(file)?;

        match &self.path_or_conf {
            PathOrConf::Conf(_) => {
@@ -503,6 +529,7 @@ impl DeltaLayer {

        *inner = DeltaLayerInner {
            loaded: true,
+            book: None,
            page_version_metas,
            relsizes,
        };
@@ -527,6 +554,7 @@ impl DeltaLayer {
            dropped: filename.dropped,
            inner: Mutex::new(DeltaLayerInner {
                loaded: false,
+                book: None,
                page_version_metas: VecMap::default(),
                relsizes: VecMap::default(),
            }),
@@ -536,7 +564,10 @@ impl DeltaLayer {
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
-    pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<Self> {
+    pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
+    where
+        F: std::os::unix::prelude::FileExt,
+    {
        let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
        let summary = Summary::des(&chapter)?;

@@ -550,6 +581,7 @@ impl DeltaLayer {
            dropped: summary.dropped,
            inner: Mutex::new(DeltaLayerInner {
                loaded: false,
+                book: None,
                page_version_metas: VecMap::default(),
                relsizes: VecMap::default(),
            }),
--- a/pageserver/src/layered_repository/ephemeral_file.rs
+++ b/pageserver/src/layered_repository/ephemeral_file.rs
@@ -0,0 +1,295 @@
+use crate::page_cache;
+use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{ReadBufResult, WriteBufResult};
+use crate::virtual_file::VirtualFile;
+use crate::PageServerConf;
+use lazy_static::lazy_static;
+use std::cmp::min;
+use std::collections::HashMap;
+use std::fs::OpenOptions;
+use std::io::{Error, ErrorKind, Seek, SeekFrom, Write};
+use std::ops::DerefMut;
+use std::path::PathBuf;
+use std::sync::{Arc, RwLock};
+use zenith_utils::zid::ZTenantId;
+use zenith_utils::zid::ZTimelineId;
+
+use std::os::unix::fs::FileExt;
+
+lazy_static! {
+    ///
+    /// Global registry of the live ephemeral files, keyed by file ID.
+    ///
+    static ref EPHEMERAL_FILES: RwLock<EphemeralFiles> = RwLock::new(EphemeralFiles {
+        next_file_id: 1,
+        files: HashMap::new(),
+    });
+}
+
+pub struct EphemeralFiles {
+    next_file_id: u64,
+    // Backing file of every live EphemeralFile, keyed by file ID.
+    files: HashMap<u64, Arc<VirtualFile>>,
+}
+
+pub struct EphemeralFile {
+    file_id: u64,
+    _tenantid: ZTenantId,
+    _timelineid: ZTimelineId,
+    file: Arc<VirtualFile>,
+    // Byte offset at which the next Write::write() call stores its data; moved by seek().
+    pos: u64,
+}
+
+impl EphemeralFile {
+    pub fn create(
+        conf: &PageServerConf,
+        tenantid: ZTenantId,
+        timelineid: ZTimelineId,
+    ) -> Result<EphemeralFile, std::io::Error> {
+        let mut l = EPHEMERAL_FILES.write().unwrap();
+        let file_id = l.next_file_id;
+        l.next_file_id += 1;
+
+        let filename = conf
+            .timeline_path(&timelineid, &tenantid)
+            .join(PathBuf::from(format!("ephemeral-{}", file_id)));
+        // NOTE(review): opened without truncate(true); a leftover file of this name keeps its data.
+        let file = VirtualFile::open_with_options(
+            &filename,
+            OpenOptions::new().read(true).write(true).create(true),
+        )?;
+        let file_rc = Arc::new(file);
+        l.files.insert(file_id, file_rc.clone());
+
+        Ok(EphemeralFile {
+            file_id,
+            _tenantid: tenantid,
+            _timelineid: timelineid,
+            file: file_rc,
+            pos: 0,
+        })
+    }
+
+    pub fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
+        let mut off = 0;
+        while off < PAGE_SZ {
+            let n = self
+                .file
+                .read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
+
+            if n == 0 {
+                // Reached EOF. Zero-fill the rest; this panics unless buf.len() == PAGE_SZ.
+                const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
+
+                buf[off..].copy_from_slice(&ZERO_BUF[off..]);
+                break;
+            }
+
+            off += n as usize;
+        }
+        Ok(())
+    }
+}
+
+impl FileExt for EphemeralFile {
+    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, Error> {
+        // Look up the right page. At most one page is read per call, so the count may be short.
+        let blkno = (offset / PAGE_SZ as u64) as u32;
+        let off = offset as usize % PAGE_SZ;
+        let len = min(PAGE_SZ - off, dstbuf.len());
+
+        let read_guard;
+        let mut write_guard;
+
+        let cache = page_cache::get();
+        let buf = match cache.read_ephemeral_buf(self.file_id, blkno) {
+            ReadBufResult::Found(guard) => {
+                read_guard = guard;
+                read_guard.as_ref()
+            }
+            ReadBufResult::NotFound(guard) => {
+                // Read the page from disk into the buffer
+                write_guard = guard;
+                self.fill_buffer(write_guard.deref_mut(), blkno)?;
+                write_guard.mark_valid();
+
+                // And then fall through to read the requested slice from the
+                // buffer.
+                write_guard.as_ref()
+            }
+        };
+
+        dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
+        Ok(len)
+    }
+
+    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, Error> {
+        // Look up the right page. At most one page is written per call, so the count may be short.
+        let blkno = (offset / PAGE_SZ as u64) as u32;
+        let off = offset as usize % PAGE_SZ;
+        let len = min(PAGE_SZ - off, srcbuf.len());
+
+        let mut write_guard;
+        let cache = page_cache::get();
+        let buf = match cache.write_ephemeral_buf(self.file_id, blkno) {
+            WriteBufResult::Found(guard) => {
+                write_guard = guard;
+                write_guard.deref_mut()
+            }
+            WriteBufResult::NotFound(guard) => {
+                // Read the page from disk into the buffer
+                // TODO: if we're overwriting the whole page, no need to read it in first
+                write_guard = guard;
+                self.fill_buffer(write_guard.deref_mut(), blkno)?;
+                write_guard.mark_valid();
+
+                // And then fall through to modify it.
+                write_guard.deref_mut()
+            }
+        };
+
+        buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
+        write_guard.mark_dirty();
+        Ok(len)
+    }
+}
+
+impl Write for EphemeralFile {
+    fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
+        let n = self.write_at(buf, self.pos)?; // short if buf crosses a page boundary
+        self.pos += n as u64;
+        Ok(n)
+    }
+    // Nothing to flush: write() keeps no buffer of its own, it hands the bytes to write_at().
+    fn flush(&mut self) -> Result<(), std::io::Error> {
+        Ok(())
+    }
+}
+
+impl Seek for EphemeralFile {
+    fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
+        match pos {
+            SeekFrom::Start(offset) => {
+                self.pos = offset;
+            }
+            SeekFrom::End(_offset) => {
+                return Err(Error::new(
+                    ErrorKind::Other,
+                    "SeekFrom::End not supported by EphemeralFile",
+                ));
+            }
+            SeekFrom::Current(offset) => {
+                let pos = self.pos as i128 + offset as i128; // i128 holds any u64 + i64 sum
+                if pos < 0 {
+                    return Err(Error::new(
+                        ErrorKind::InvalidInput,
+                        "offset would be negative",
+                    ));
+                }
+                if pos > u64::MAX as i128 {
+                    return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
+                }
+                self.pos = pos as u64;
+            }
+        }
+        Ok(self.pos)
+    }
+}
+
+impl Drop for EphemeralFile {
+    fn drop(&mut self) {
+        // drop all pages from page cache
+        let cache = page_cache::get();
+        cache.drop_buffers_for_ephemeral(self.file_id);
+
+        // remove entry from the hash map, so that writeback() no longer finds this file
+        EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
+
+        // unlink file
+        // FIXME: print error
+        let _ = std::fs::remove_file(&self.file.path);
+    }
+}
+
+/// Write `buf` to page `blkno` of the ephemeral file registered under `file_id`.
+pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
+    let registry = EPHEMERAL_FILES.read().unwrap();
+    match registry.files.get(&file_id) {
+        Some(file) => file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64),
+        None => Err(std::io::Error::new(
+            ErrorKind::Other,
+            "could not write back page, not found in ephemeral files hash",
+        )),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::seq::SliceRandom;
+    use rand::thread_rng;
+    use std::fs;
+    use std::str::FromStr;
+
+    fn repo_harness(
+        test_name: &str,
+    ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> {
+        let repo_dir = PageServerConf::test_repo_dir(test_name);
+        let _ = fs::remove_dir_all(&repo_dir);
+        let conf = PageServerConf::dummy_conf(repo_dir);
+        // Make a static copy of the config. This can never be free'd, but that's
+        // OK in a test.
+        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+
+        let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
+        let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
+        fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
+
+        Ok((conf, tenantid, timelineid))
+    }
+
+    // Helper function to read 'len' bytes of a file, starting at 'offset', into a
+    // string. Trailing NUL bytes are trimmed off.
+    fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, Error> {
+        let mut buf = Vec::new();
+        buf.resize(len, 0u8);
+
+        efile.read_exact_at(&mut buf, offset)?;
+
+        Ok(String::from_utf8_lossy(&buf)
+            .trim_end_matches('\0')
+            .to_string())
+    }
+
+    #[test]
+    fn test_ephemeral_files() -> Result<(), Error> {
+        let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;
+
+        let mut file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
+
+        file_a.write_all(b"foo")?;
+        assert_eq!("foo", read_string(&file_a, 0, 20)?);
+
+        file_a.write_all(b"bar")?;
+        assert_eq!("foobar", read_string(&file_a, 0, 20)?);
+
+        // Open a lot of files, enough to cause some page evictions.
+        let mut efiles = Vec::new();
+        for fileno in 0..100 {
+            let mut efile = EphemeralFile::create(conf, tenantid, timelineid)?;
+            efile.write_all(format!("file {}", fileno).as_bytes())?;
+            assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
+            efiles.push((fileno, efile));
+        }
+
+        // Check that all the files can still be read from. Use them in random order for
+        // good measure.
+        efiles.as_mut_slice().shuffle(&mut thread_rng());
+        for (fileno, efile) in efiles.iter_mut() {
+            assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
+        }
+
+        Ok(())
+    }
+}
--- a/pageserver/src/layered_repository/filename.rs
+++ b/pageserver/src/layered_repository/filename.rs
@@ -13,7 +13,7 @@ use anyhow::Result;
 use log::*;
 use zenith_utils::lsn::Lsn;

-use super::METADATA_FILE_NAME;
+use super::metadata::METADATA_FILE_NAME;

 // Note: LayeredTimeline::load_layer_map() relies on this sort order
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
--- a/pageserver/src/layered_repository/global_layer_map.rs
+++ b/pageserver/src/layered_repository/global_layer_map.rs
@@ -0,0 +1,176 @@
+//!
+//! Global registry of open layers.
+//!
+//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
+//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
+//! in-memory layers in the system, and know when we need to evict some to release
+//! memory.
+//!
+//! Each layer is assigned a unique ID when it's registered in the global registry.
+//! The ID can be used to look up the layer again later, without having to hold locks.
+//!
+
+use std::sync::atomic::{AtomicU8, Ordering};
+use std::sync::{Arc, RwLock};
+
+use super::inmemory_layer::OpenLayer;
+
+use lazy_static::lazy_static;
+
+/// Upper bound for `Slot::usage_count`; `OpenLayers::get` stops incrementing there.
+const MAX_USAGE_COUNT: u8 = 5;
+
+lazy_static! {
+    pub static ref GLOBAL_LAYER_MAP: RwLock<OpenLayers> = RwLock::new(OpenLayers::default());
+}
+
+/// Handle to a layer registered in [`OpenLayers`].
+///
+/// An ID stops resolving once its layer has been removed, even if the slot is
+/// later reused for another layer: `tag` must match the slot's current tag.
+// TODO these types can probably be smaller
+#[derive(PartialEq, Eq, Clone, Copy)]
+pub struct LayerId {
+    /// Position of the slot in `OpenLayers::slots`.
+    index: usize,
+    tag: u64, // to avoid ABA problem
+}
+
+/// Contents of one slot in `OpenLayers::slots`.
+enum SlotData {
+    /// The slot holds a registered layer.
+    Occupied(Arc<OpenLayer>),
+    /// Vacant slots form a linked list, the value is the index
+    /// of the next vacant slot in the list (`None` at the end of the list).
+    Vacant(Option<usize>),
+}
+
+/// One entry of `OpenLayers::slots`.
+struct Slot {
+    /// Bumped every time the slot is vacated, so that `LayerId`s handed out for
+    /// an earlier occupant no longer match.
+    tag: u64,
+    data: SlotData,
+    /// Set to 1 by `insert`, incremented by `get` up to `MAX_USAGE_COUNT`.
+    usage_count: AtomicU8, // for clock algorithm
+}
+
+/// Table of open layers, addressed by [`LayerId`]. Vacated slots are kept on a
+/// free list and reused by later inserts.
+#[derive(Default)]
+pub struct OpenLayers {
+    slots: Vec<Slot>,
+    /// Number of slots currently in the `Occupied` state.
+    num_occupied: usize,
+
+    // Head of free-slot list.
+    next_empty_slot_idx: Option<usize>,
+}
+
+impl OpenLayers {
+    /// Register `layer` and return the ID under which it can be looked up.
+    ///
+    /// Reuses the slot at the head of the free list if there is one, otherwise
+    /// appends a new slot to `slots`.
+    pub fn insert(&mut self, layer: Arc<OpenLayer>) -> LayerId {
+        let slot_idx = match self.next_empty_slot_idx {
+            Some(slot_idx) => slot_idx,
+            None => {
+                let idx = self.slots.len();
+                self.slots.push(Slot {
+                    tag: 0,
+                    data: SlotData::Vacant(None),
+                    usage_count: AtomicU8::new(0),
+                });
+                idx
+            }
+        };
+        let slots_len = self.slots.len();
+
+        let slot = &mut self.slots[slot_idx];
+
+        // Unlink the slot from the free list. (A freshly pushed slot carries
+        // `Vacant(None)`, which leaves the free list empty, as it was.)
+        match slot.data {
+            SlotData::Occupied(_) => {
+                panic!("an occupied slot was in the free list");
+            }
+            SlotData::Vacant(next_empty_slot_idx) => {
+                self.next_empty_slot_idx = next_empty_slot_idx;
+            }
+        }
+
+        slot.data = SlotData::Occupied(layer);
+        slot.usage_count.store(1, Ordering::Relaxed);
+
+        self.num_occupied += 1;
+        assert!(self.num_occupied <= slots_len);
+
+        LayerId {
+            index: slot_idx,
+            tag: slot.tag,
+        }
+    }
+
+    /// Look up a layer by ID, bumping the slot's usage count (capped at
+    /// `MAX_USAGE_COUNT`).
+    ///
+    /// Returns `None` if the index is out of bounds, if the tag does not match
+    /// the slot's current tag (the layer was removed), or if the slot is vacant.
+    pub fn get(&self, layer_id: &LayerId) -> Option<Arc<OpenLayer>> {
+        let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
+        if slot.tag != layer_id.tag {
+            return None;
+        }
+
+        if let SlotData::Occupied(layer) = &slot.data {
+            // fetch_update returns Err when the closure returns None, i.e. when the
+            // count is already at the cap; that is not an error, so ignore it.
+            let _ = slot.usage_count.fetch_update(
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+                |old_usage_count| {
+                    if old_usage_count < MAX_USAGE_COUNT {
+                        Some(old_usage_count + 1)
+                    } else {
+                        None
+                    }
+                },
+            );
+            Some(Arc::clone(layer))
+        } else {
+            None
+        }
+    }
+
+    /// Remove the layer with the given ID and put its slot on the free list.
+    ///
+    /// Does nothing if the ID's tag no longer matches the slot (already removed).
+    /// Panics if `layer_id.index` is out of bounds.
+    // TODO this won't be a public API in the future
+    pub fn remove(&mut self, layer_id: &LayerId) {
+        let slot = &mut self.slots[layer_id.index];
+
+        if slot.tag != layer_id.tag {
+            return;
+        }
+
+        match &slot.data {
+            SlotData::Occupied(_layer) => {
+                // TODO evict the layer
+            }
+            // remove() bumps the tag whenever it vacates a slot, so an ID whose tag
+            // still matches must refer to an occupied slot.
+            SlotData::Vacant(_) => unreachable!("vacant slot has a live LayerId"),
+        }
+
+        slot.data = SlotData::Vacant(self.next_empty_slot_idx);
+        self.next_empty_slot_idx = Some(layer_id.index);
+
+        assert!(self.num_occupied > 0);
+        self.num_occupied -= 1;
+
+        // Invalidate all outstanding IDs for this slot.
+        slot.tag = slot.tag.wrapping_add(1);
+    }
+}
--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -27,15 +27,15 @@ use crate::layered_repository::storage_layer::{
 };
 use crate::layered_repository::LayeredTimeline;
 use crate::layered_repository::RELISH_SEG_SIZE;
+use crate::virtual_file::VirtualFile;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{anyhow, bail, ensure, Result};
+use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::Bytes;
 use log::*;
 use serde::{Deserialize, Serialize};
 use std::convert::TryInto;
 use std::fs;
-use std::fs::File;
 use std::io::{BufWriter, Write};
 use std::path::{Path, PathBuf};
 use std::sync::{Mutex, MutexGuard};
@@ -104,9 +104,8 @@ enum ImageType {
 }

 pub struct ImageLayerInner {
-    /// If false, the 'image_type' has not been
-    /// loaded into memory yet.
-    loaded: bool,
+    /// If None, the 'image_type' has not been loaded into memory yet.
+    book: Option<Book<VirtualFile>>,

    /// Derived from filename and bookfile chapter metadata
    image_type: ImageType,
@@ -117,6 +116,10 @@ impl Layer for ImageLayer {
        PathBuf::from(self.layer_name().to_string())
    }

+    fn get_tenant_id(&self) -> ZTenantId {
+        self.tenantid
+    }
+
    fn get_timeline_id(&self) -> ZTimelineId {
        self.timelineid
    }
@@ -143,16 +146,20 @@ impl Layer for ImageLayer {
        &self,
        blknum: u32,
        lsn: Lsn,
+        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult> {
        assert!(lsn >= self.lsn);

+        match cached_img_lsn {
+            Some(cached_lsn) if self.lsn <= cached_lsn => return Ok(PageReconstructResult::Cached),
+            _ => {}
+        }
+
        let inner = self.load()?;

        let base_blknum = blknum % RELISH_SEG_SIZE;

-        let (_path, book) = self.open_book()?;
-
        let buf = match &inner.image_type {
            ImageType::Blocky { num_blocks } => {
                if base_blknum >= *num_blocks {
@@ -162,14 +169,23 @@ impl Layer for ImageLayer {
                let mut buf = vec![0u8; BLOCK_SIZE];
                let offset = BLOCK_SIZE as u64 * base_blknum as u64;

-                let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
+                let chapter = inner
+                    .book
+                    .as_ref()
+                    .unwrap()
+                    .chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
                chapter.read_exact_at(&mut buf, offset)?;

                buf
            }
            ImageType::NonBlocky => {
                ensure!(base_blknum == 0);
-                book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?.into_vec()
+                inner
+                    .book
+                    .as_ref()
+                    .unwrap()
+                    .read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
+                    .into_vec()
            }
        };

@@ -191,14 +207,7 @@ impl Layer for ImageLayer {
        Ok(true)
    }

-    ///
-    /// Release most of the memory used by this layer. If it's accessed again later,
-    /// it will need to be loaded back.
-    ///
    fn unload(&self) -> Result<()> {
-        let mut inner = self.inner.lock().unwrap();
-        inner.image_type = ImageType::Blocky { num_blocks: 0 };
-        inner.loaded = false;
        Ok(())
    }

@@ -224,8 +233,11 @@ impl Layer for ImageLayer {
        match inner.image_type {
            ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
            ImageType::NonBlocky => {
-                let (_path, book) = self.open_book()?;
-                let chapter = book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
+                let chapter = inner
+                    .book
+                    .as_ref()
+                    .unwrap()
+                    .read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
                println!("non-blocky ({} bytes)", chapter.len());
            }
        }
@@ -273,17 +285,22 @@ impl ImageLayer {
            seg,
            lsn,
            inner: Mutex::new(ImageLayerInner {
-                loaded: true,
+                book: None,
                image_type: image_type.clone(),
            }),
        };
        let inner = layer.inner.lock().unwrap();

        // Write the images into a file
-        let path = layer.path();
+        //
+        // Note: Because we open the file in write-only mode, we cannot
+        // reuse the same VirtualFile for reading later. That's why we don't
+        // set inner.book here. The first read will have to re-open it.
+        //
        // Note: This overwrites any existing file. There shouldn't be any.
        // FIXME: throw an error instead?
-        let file = File::create(&path)?;
+        let path = layer.path();
+        let file = VirtualFile::create(&path)?;
        let buf_writer = BufWriter::new(file);
        let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;

@@ -370,11 +387,19 @@ impl ImageLayer {
        // quick exit if already loaded
        let mut inner = self.inner.lock().unwrap();

-        if inner.loaded {
+        if inner.book.is_some() {
            return Ok(inner);
        }

-        let (path, book) = self.open_book()?;
+        let path = self.path();
+        let file = VirtualFile::open(&path)
+            .with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
+        let book = Book::new(file).with_context(|| {
+            format!(
+                "Failed to open virtual file '{}' as a bookfile",
+                path.display()
+            )
+        })?;

        match &self.path_or_conf {
            PathOrConf::Conf(_) => {
@@ -415,22 +440,13 @@ impl ImageLayer {
        debug!("loaded from {}", &path.display());

        *inner = ImageLayerInner {
-            loaded: true,
+            book: Some(book),
            image_type,
        };

        Ok(inner)
    }

-    fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
-        let path = self.path();
-
-        let file = File::open(&path)?;
-        let book = Book::new(file)?;
-
-        Ok((path, book))
-    }
-
    /// Create an ImageLayer struct representing an existing file on disk
    pub fn new(
        conf: &'static PageServerConf,
@@ -445,7 +461,7 @@ impl ImageLayer {
            seg: filename.seg,
            lsn: filename.lsn,
            inner: Mutex::new(ImageLayerInner {
-                loaded: false,
+                book: None,
                image_type: ImageType::Blocky { num_blocks: 0 },
            }),
        }
@@ -454,7 +470,10 @@ impl ImageLayer {
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
-    pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<ImageLayer> {
+    pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<ImageLayer>
+    where
+        F: std::os::unix::prelude::FileExt,
+    {
        let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
        let summary = Summary::des(&chapter)?;

@@ -465,7 +484,7 @@ impl ImageLayer {
            seg: summary.seg,
            lsn: summary.lsn,
            inner: Mutex::new(ImageLayerInner {
-                loaded: false,
+                book: None,
                image_type: ImageType::Blocky { num_blocks: 0 },
            }),
        })
--- a/pageserver/src/layered_repository/inmemory_layer.rs
+++ b/pageserver/src/layered_repository/inmemory_layer.rs
@@ -1,7 +1,8 @@
-//!
+//! FIXME
 //! An in-memory layer stores recently received page versions in memory. The page versions
 //! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
 //!
+use crate::layered_repository::ephemeral_file::EphemeralFile;
 use crate::layered_repository::filename::DeltaFileName;
 use crate::layered_repository::storage_layer::{
    Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
@@ -12,18 +13,29 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
 use crate::repository::WALRecord;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{bail, ensure, Result};
+use anyhow::{ensure, Result};
 use bytes::Bytes;
+use lazy_static::lazy_static;
 use log::*;
 use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
-use zenith_utils::vec_map::VecMap;
-
 use zenith_utils::lsn::Lsn;
+use zenith_utils::vec_map::VecMap;

 use super::page_versions::PageVersions;

-pub struct InMemoryLayer {
+use zenith_metrics::{register_int_counter, IntCounter};
+
+lazy_static! {
+    static ref LATEST_IMG_UPDATE_COUNTER: IntCounter =
+        register_int_counter!("latest_img_updates", "Number of updates of latest img").unwrap();
+    static ref LATEST_IMG_MISS_COUNTER: IntCounter =
+        register_int_counter!("latest_img_misses", "Number of cache misses of latest img").unwrap();
+    static ref LATEST_IMG_HIT_COUNTER: IntCounter =
+        register_int_counter!("latest_img_hits", "Number of cache hits of latest img").unwrap();
+}
+
+pub struct OpenLayer {
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
    timelineid: ZTimelineId,
@@ -40,14 +52,14 @@ pub struct InMemoryLayer {

    /// The above fields never change. The parts that do change are in 'inner',
    /// and protected by mutex.
-    inner: RwLock<InMemoryLayerInner>,
+    inner: RwLock<OpenLayerInner>,

    /// Predecessor layer might be needed?
    incremental: bool,
 }

-pub struct InMemoryLayerInner {
-    /// Frozen in-memory layers have an exclusive end LSN.
+pub struct OpenLayerInner {
+    /// Frozen layers have an exclusive end LSN.
    /// Writes are only allowed when this is None
    end_lsn: Option<Lsn>,

@@ -71,7 +83,7 @@ pub struct InMemoryLayerInner {
    segsizes: VecMap<Lsn, u32>,
 }

-impl InMemoryLayerInner {
+impl OpenLayerInner {
    fn assert_writeable(&self) {
        assert!(self.end_lsn.is_none());
    }
@@ -89,8 +101,9 @@ impl InMemoryLayerInner {
    }
 }

-impl Layer for InMemoryLayer {
-    // An in-memory layer doesn't really have a filename as it's not stored on disk,
+impl Layer for OpenLayer {
+    // FIXME
+    // An open layer doesn't really have a filename as it's not stored on disk,
    // but we construct a filename as if it was a delta layer
    fn filename(&self) -> PathBuf {
        let inner = self.inner.read().unwrap();
@@ -113,6 +126,10 @@ impl Layer for InMemoryLayer {
        PathBuf::from(format!("inmem-{}", delta_filename))
    }

+    fn get_tenant_id(&self) -> ZTenantId {
+        self.tenantid
+    }
+
    fn get_timeline_id(&self) -> ZTimelineId {
        self.timelineid
    }
@@ -145,6 +162,7 @@ impl Layer for InMemoryLayer {
        &self,
        blknum: u32,
        lsn: Lsn,
+        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult> {
        let mut need_image = true;
@@ -154,27 +172,49 @@ impl Layer for InMemoryLayer {
        {
            let inner = self.inner.read().unwrap();

+            let latest = inner.page_versions.get_latest(blknum);
+
            // Scan the page versions backwards, starting from `lsn`.
            let iter = inner
                .page_versions
                .get_block_lsn_range(blknum, ..=lsn)
                .iter()
                .rev();
-            for (entry_lsn, entry) in iter {
-                if let Some(img) = &entry.page_image {
-                    reconstruct_data.page_img = Some(img.clone());
-                    need_image = false;
-                    break;
-                } else if let Some(rec) = &entry.record {
-                    reconstruct_data.records.push((*entry_lsn, rec.clone()));
-                    if rec.will_init {
-                        // This WAL record initializes the page, so no need to go further back
+            for (entry_lsn, pos) in iter {
+                match &cached_img_lsn {
+                    Some(cached_lsn) if entry_lsn <= cached_lsn => {
+                        return Ok(PageReconstructResult::Cached)
+                    }
+                    _ => {}
+                }
+
+                let pv = inner.page_versions.get_page_version(*pos)?;
+                match pv {
+                    PageVersion::Page(img) => {
+                        reconstruct_data.page_img = Some(img);
                        need_image = false;
                        break;
                    }
-                } else {
-                    // No base image, and no WAL record. Huh?
-                    bail!("no page image or WAL record for requested page");
+                    PageVersion::Wal(rec) => {
+                        if let Some((latest_lsn, latest_pos)) = latest {
+                            if latest_lsn == entry_lsn {
+                                // we had this cached, nice!
+                                let img = inner.page_versions.fetch_cached_latest(*latest_pos)?;
+                                reconstruct_data.page_img = Some(img);
+                                need_image = false;
+                                LATEST_IMG_HIT_COUNTER.inc();
+                                break;
+                            }
+                        }
+                        LATEST_IMG_MISS_COUNTER.inc();
+
+                        reconstruct_data.records.push((*entry_lsn, rec.clone()));
+                        if rec.will_init {
+                            // This WAL record initializes the page, so no need to go further back
+                            need_image = false;
+                            break;
+                        }
+                    }
                }
            }
            // release lock on 'inner'
@@ -193,6 +233,14 @@ impl Layer for InMemoryLayer {
        }
    }

+    fn cache_page_image(&self, blknum: u32, lsn: Lsn, img: &[u8]) -> Result<()> {
+        let mut inner = self.inner.write().unwrap();
+
+        LATEST_IMG_UPDATE_COUNTER.inc();
+
+        inner.page_versions.cache_latest(blknum, lsn, img)
+    }
+
    /// Get size of the relation at given LSN
    fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
        assert!(lsn >= self.start_lsn);
@@ -261,14 +309,14 @@ impl Layer for InMemoryLayer {
            println!("segsizes {}: {}", k, v);
        }

-        for (blknum, lsn, pv) in inner.page_versions.ordered_page_version_iter(None) {
-            println!(
-                "blk {} at {}: {}/{}\n",
-                blknum,
-                lsn,
-                pv.page_image.is_some(),
-                pv.record.is_some()
-            );
+        for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) {
+            let pv = inner.page_versions.get_page_version(pos)?;
+            let pv_description = match pv {
+                PageVersion::Page(_img) => "page",
+                PageVersion::Wal(_rec) => "wal",
+            };
+
+            println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
        }

        Ok(())
@@ -281,13 +329,7 @@ pub struct LayersOnDisk {
    pub image_layers: Vec<ImageLayer>,
 }

-impl LayersOnDisk {
-    pub fn is_empty(&self) -> bool {
-        self.delta_layers.is_empty() && self.image_layers.is_empty()
-    }
-}
-
-impl InMemoryLayer {
+impl OpenLayer {
    /// Return the oldest page version that's stored in this layer
    pub fn get_oldest_pending_lsn(&self) -> Lsn {
        self.oldest_pending_lsn
@@ -303,7 +345,7 @@ impl InMemoryLayer {
        seg: SegmentTag,
        start_lsn: Lsn,
        oldest_pending_lsn: Lsn,
-    ) -> Result<InMemoryLayer> {
+    ) -> Result<OpenLayer> {
        trace!(
            "initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
            seg,
@@ -317,7 +359,9 @@ impl InMemoryLayer {
            segsizes.append(start_lsn, 0).unwrap();
        }

-        Ok(InMemoryLayer {
+        let file = EphemeralFile::create(conf, tenantid, timelineid)?;
+
+        Ok(OpenLayer {
            conf,
            timelineid,
            tenantid,
@@ -325,10 +369,10 @@ impl InMemoryLayer {
            start_lsn,
            oldest_pending_lsn,
            incremental: false,
-            inner: RwLock::new(InMemoryLayerInner {
+            inner: RwLock::new(OpenLayerInner {
                end_lsn: None,
                dropped: false,
-                page_versions: PageVersions::default(),
+                page_versions: PageVersions::new(file),
                segsizes,
            }),
        })
@@ -337,32 +381,18 @@ impl InMemoryLayer {
    // Write operations

    /// Remember new page version, as a WAL record over previous version
-    pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> u32 {
-        self.put_page_version(
-            blknum,
-            lsn,
-            PageVersion {
-                page_image: None,
-                record: Some(rec),
-            },
-        )
+    pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> Result<u32> {
+        self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
    }

    /// Remember new page version, as a full page image
-    pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> u32 {
-        self.put_page_version(
-            blknum,
-            lsn,
-            PageVersion {
-                page_image: Some(img),
-                record: None,
-            },
-        )
+    pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<u32> {
+        self.put_page_version(blknum, lsn, PageVersion::Page(img))
    }

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> u32 {
+    pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<u32> {
        assert!(self.seg.blknum_in_seg(blknum));

        trace!(
@@ -376,7 +406,7 @@ impl InMemoryLayer {

        inner.assert_writeable();

-        let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
+        let old = inner.page_versions.append_or_update_last(blknum, lsn, pv)?;

        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
@@ -412,10 +442,7 @@ impl InMemoryLayer {
                // subsequent call to initialize the gap page.
                let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize;
                for gapblknum in gapstart..blknum {
-                    let zeropv = PageVersion {
-                        page_image: Some(ZERO_PAGE.clone()),
-                        record: None,
-                    };
+                    let zeropv = PageVersion::Page(ZERO_PAGE.clone());
                    trace!(
                        "filling gap blk {} with zeros for write of {}",
                        gapblknum,
@@ -423,7 +450,7 @@ impl InMemoryLayer {
                    );
                    let old = inner
                        .page_versions
-                        .append_or_update_last(gapblknum, lsn, zeropv);
+                        .append_or_update_last(gapblknum, lsn, zeropv)?;
                    // We already had an entry for this LSN. That's odd..

                    if old.is_some() {
@@ -435,11 +462,11 @@ impl InMemoryLayer {
                }

                inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
-                return newsize - oldsize;
+                return Ok(newsize - oldsize);
            }
        }

-        0
+        Ok(0)
    }

    /// Remember that the relation was truncated at given LSN
@@ -456,7 +483,7 @@ impl InMemoryLayer {
        let oldsize = inner.get_seg_size(lsn);
        assert!(segsize < oldsize);

-        let old = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
+        let (old, _delta_size) = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();

        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
@@ -478,7 +505,7 @@ impl InMemoryLayer {
    }

    ///
-    /// Initialize a new InMemoryLayer for, by copying the state at the given
+    /// Initialize a new OpenLayer, by copying the state at the given
    /// point in time from given existing layer.
    ///
    pub fn create_successor_layer(
@@ -488,14 +515,14 @@ impl InMemoryLayer {
        tenantid: ZTenantId,
        start_lsn: Lsn,
        oldest_pending_lsn: Lsn,
-    ) -> Result<InMemoryLayer> {
+    ) -> Result<OpenLayer> {
        let seg = src.get_seg_tag();

        assert!(oldest_pending_lsn.is_aligned());
        assert!(oldest_pending_lsn >= start_lsn);

        trace!(
-            "initializing new InMemoryLayer for writing {} on timeline {} at {}",
+            "initializing new OpenLayer for writing {} on timeline {} at {}",
            seg,
            timelineid,
            start_lsn,
@@ -508,7 +535,9 @@ impl InMemoryLayer {
            segsizes.append(start_lsn, size).unwrap();
        }

-        Ok(InMemoryLayer {
+        let file = EphemeralFile::create(conf, tenantid, timelineid)?;
+
+        Ok(OpenLayer {
            conf,
            timelineid,
            tenantid,
@@ -516,10 +545,10 @@ impl InMemoryLayer {
            start_lsn,
            oldest_pending_lsn,
            incremental: true,
-            inner: RwLock::new(InMemoryLayerInner {
+            inner: RwLock::new(OpenLayerInner {
                end_lsn: None,
                dropped: false,
-                page_versions: PageVersions::default(),
+                page_versions: PageVersions::new(file),
                segsizes,
            }),
        })
@@ -589,7 +618,8 @@ impl InMemoryLayer {
                self.start_lsn,
                end_lsn_exclusive,
                true,
-                inner.page_versions.ordered_page_version_iter(None),
+                &inner.page_versions,
+                None,
                inner.segsizes.clone(),
            )?;
            trace!(
@@ -606,13 +636,9 @@ impl InMemoryLayer {

        // Since `end_lsn` is inclusive, subtract 1.
        // We want to make an ImageLayer for the last included LSN,
-        // so the DeltaLayer should exlcude that LSN.
+        // so the DeltaLayer should exclude that LSN.
        let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);

-        let mut page_versions = inner
-            .page_versions
-            .ordered_page_version_iter(Some(end_lsn_inclusive));
-
        let mut delta_layers = Vec::new();

        if self.start_lsn != end_lsn_inclusive {
@@ -626,7 +652,8 @@ impl InMemoryLayer {
                self.start_lsn,
                end_lsn_inclusive,
                false,
-                page_versions,
+                &inner.page_versions,
+                Some(end_lsn_inclusive),
                segsizes,
            )?;
            delta_layers.push(delta_layer);
@@ -637,7 +664,11 @@ impl InMemoryLayer {
                end_lsn_inclusive
            );
        } else {
-            assert!(page_versions.next().is_none());
+            assert!(inner
+                .page_versions
+                .ordered_page_version_iter(None)
+                .next()
+                .is_none());
        }

        drop(inner);
--- a/pageserver/src/layered_repository/layer_map.rs
+++ b/pageserver/src/layered_repository/layer_map.rs
@@ -11,7 +11,7 @@

 use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
 use crate::layered_repository::storage_layer::{Layer, SegmentTag};
-use crate::layered_repository::InMemoryLayer;
+use crate::layered_repository::OpenLayer;
 use crate::relish::*;
 use anyhow::Result;
 use lazy_static::lazy_static;
@@ -21,6 +21,8 @@ use std::sync::Arc;
 use zenith_metrics::{register_int_gauge, IntGauge};
 use zenith_utils::lsn::Lsn;

+use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
+
 lazy_static! {
    static ref NUM_INMEMORY_LAYERS: IntGauge =
        register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
@@ -65,19 +67,21 @@ impl LayerMap {
    /// Get the open layer for given segment for writing. Or None if no open
    /// layer exists.
    ///
-    pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
+    pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<OpenLayer>> {
        let segentry = self.segs.get(tag)?;

-        segentry.open.as_ref().map(Arc::clone)
+        segentry
+            .open_layer_id
+            .and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
    }

    ///
    /// Insert an open in-memory layer
    ///
-    pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
+    pub fn insert_open(&mut self, layer: Arc<OpenLayer>) {
        let segentry = self.segs.entry(layer.get_seg_tag()).or_default();

-        segentry.update_open(Arc::clone(&layer));
+        let layer_id = segentry.update_open(Arc::clone(&layer));

        let oldest_pending_lsn = layer.get_oldest_pending_lsn();

@@ -89,7 +93,7 @@ impl LayerMap {
        // Also add it to the binary heap
        let open_layer_entry = OpenLayerEntry {
            oldest_pending_lsn: layer.get_oldest_pending_lsn(),
-            layer,
+            layer_id,
            generation: self.current_generation,
        };
        self.open_layers.push(open_layer_entry);
@@ -97,24 +101,35 @@ impl LayerMap {
        NUM_INMEMORY_LAYERS.inc();
    }

-    /// Remove the oldest in-memory layer
-    pub fn pop_oldest_open(&mut self) {
-        // Pop it from the binary heap
-        let oldest_entry = self.open_layers.pop().unwrap();
-        let segtag = oldest_entry.layer.get_seg_tag();
+    /// Remove an open in-memory layer
+    pub fn remove_open(&mut self, layer_id: LayerId) {
+        // Note: we don't try to remove the entry from the binary heap.
+        // It will be removed lazily by peek_oldest_open() when it's made it to
+        // the top of the heap.

-        // Also remove it from the SegEntry of this segment
-        let mut segentry = self.segs.get_mut(&segtag).unwrap();
-        if Arc::ptr_eq(segentry.open.as_ref().unwrap(), &oldest_entry.layer) {
-            segentry.open = None;
-        } else {
-            // We could have already updated segentry.open for
-            // dropped (non-writeable) layer. This is fine.
-            assert!(!oldest_entry.layer.is_writeable());
-            assert!(oldest_entry.layer.is_dropped());
+        let layer_opt = {
+            let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
+            let layer_opt = global_map.get(&layer_id);
+            global_map.remove(&layer_id);
+            // TODO it's bad that a ref can still exist after being evicted from cache
+            layer_opt
+        };
+
+        if let Some(layer) = layer_opt {
+            let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
+
+            if segentry.open_layer_id == Some(layer_id) {
+                // Also remove it from the SegEntry of this segment
+                segentry.open_layer_id = None;
+            } else {
+                // We could have already updated segentry.open_layer_id for a
+                // dropped (non-writeable) layer. This is fine.
+                assert!(!layer.is_writeable());
+                assert!(layer.is_dropped());
+            }
+
+            NUM_INMEMORY_LAYERS.dec();
        }
-
-        NUM_INMEMORY_LAYERS.dec();
    }

    ///
@@ -199,10 +214,17 @@ impl LayerMap {
    }

    /// Return the oldest in-memory layer, along with its generation number.
-    pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
-        self.open_layers
-            .peek()
-            .map(|oldest_entry| (Arc::clone(&oldest_entry.layer), oldest_entry.generation))
+    pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<OpenLayer>, u64)> {
+        let global_map = GLOBAL_LAYER_MAP.read().unwrap();
+
+        while let Some(oldest_entry) = self.open_layers.peek() {
+            if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
+                return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
+            } else {
+                self.open_layers.pop();
+            }
+        }
+        None
    }

    /// Increment the generation number used to stamp open in-memory layers. Layers
@@ -225,8 +247,12 @@ impl LayerMap {
    pub fn dump(&self) -> Result<()> {
        println!("Begin dump LayerMap");
        for (seg, segentry) in self.segs.iter() {
-            if let Some(open) = &segentry.open {
-                open.dump()?;
+            if let Some(open) = &segentry.open_layer_id {
+                if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
+                    layer.dump()?;
+                } else {
+                    println!("layer not found in global map");
+                }
            }

            for layer in segentry.historic.iter() {
@@ -253,13 +279,13 @@ impl IntervalItem for dyn Layer {
 /// Per-segment entry in the LayerMap::segs hash map. Holds all the layers
 /// associated with the segment.
 ///
-/// The last layer that is open for writes is always an InMemoryLayer,
+/// The last layer that is open for writes is always an OpenLayer,
 /// and is kept in a separate field, because there can be only one for
 /// each segment. The older layers, stored on disk, are kept in an
 /// IntervalTree.
 #[derive(Default)]
 struct SegEntry {
-    open: Option<Arc<InMemoryLayer>>,
+    open_layer_id: Option<LayerId>,
    historic: IntervalTree<dyn Layer>,
 }

@@ -275,10 +301,10 @@ impl SegEntry {
    }

    pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
-        if let Some(open) = &self.open {
-            if open.get_start_lsn() <= lsn {
-                let x: Arc<dyn Layer> = Arc::clone(open) as _;
-                return Some(x);
+        let open_id = self.open_layer_id.as_ref(); // id may be stale: fall through if so
+        if let Some(open_layer) = open_id.and_then(|id| GLOBAL_LAYER_MAP.read().unwrap().get(id)) {
+            if open_layer.get_start_lsn() <= lsn {
+                return Some(open_layer);
            }
        }

@@ -297,11 +323,16 @@ impl SegEntry {
    // Set new open layer for a SegEntry.
    // It's ok to rewrite previous open layer,
    // but only if it is not writeable anymore.
-    pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) {
-        if let Some(prev_open) = &self.open {
-            assert!(!prev_open.is_writeable());
+    pub fn update_open(&mut self, layer: Arc<OpenLayer>) -> LayerId {
+        if let Some(prev_open_layer_id) = &self.open_layer_id {
+            if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
+            {
+                assert!(!prev_open_layer.is_writeable());
+            }
        }
-        self.open = Some(layer);
+        let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
+        self.open_layer_id = Some(open_layer_id);
+        open_layer_id
    }

    pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
@@ -316,9 +347,9 @@ impl SegEntry {
 /// recently-added entries (i.e after last call to increment_generation()) from older
 /// entries with the same 'oldest_pending_lsn'.
 struct OpenLayerEntry {
-    pub oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
-    pub generation: u64,
-    pub layer: Arc<InMemoryLayer>,
+    oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
+    generation: u64,
+    layer_id: LayerId,
 }
 impl Ord for OpenLayerEntry {
    fn cmp(&self, other: &Self) -> Ordering {
@@ -383,18 +414,25 @@ mod tests {
        forknum: 0,
    });

-    /// Construct a dummy InMemoryLayer for testing
-    fn dummy_inmem_layer(
+    lazy_static! {
+        static ref DUMMY_TIMELINEID: ZTimelineId =
+            ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
+        static ref DUMMY_TENANTID: ZTenantId =
+            ZTenantId::from_str("00000000000000000000000000000000").unwrap();
+    }
+
+    /// Construct a dummy OpenLayer for testing
+    fn dummy_open_layer(
        conf: &'static PageServerConf,
        segno: u32,
        start_lsn: Lsn,
        oldest_pending_lsn: Lsn,
-    ) -> Arc<InMemoryLayer> {
+    ) -> Arc<OpenLayer> {
        Arc::new(
-            InMemoryLayer::create(
+            OpenLayer::create(
                conf,
-                ZTimelineId::from_str("00000000000000000000000000000000").unwrap(),
-                ZTenantId::from_str("00000000000000000000000000000000").unwrap(),
+                *DUMMY_TIMELINEID,
+                *DUMMY_TENANTID,
                SegmentTag {
                    rel: TESTREL_A,
                    segno,
@@ -408,28 +446,29 @@ mod tests {

    #[test]
    fn test_open_layers() -> Result<()> {
-        let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
+        let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_open_layer"));
        let conf = Box::leak(Box::new(conf));
+        std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;

        let mut layers = LayerMap::default();

        let gen1 = layers.increment_generation();
-        layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
-        layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
-        layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
-        layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110)));
+        layers.insert_open(dummy_open_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
+        layers.insert_open(dummy_open_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
+        layers.insert_open(dummy_open_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
+        layers.insert_open(dummy_open_layer(conf, 3, Lsn(0x100), Lsn(0x110)));

        let gen2 = layers.increment_generation();
-        layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
-        layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100)));
+        layers.insert_open(dummy_open_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
+        layers.insert_open(dummy_open_layer(conf, 5, Lsn(0x100), Lsn(0x100)));

        // A helper function (closure) to pop the next oldest open entry from the layer map,
        // and assert that it is what we'd expect
        let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
-            let (l, generation) = layers.peek_oldest_open().unwrap();
+            let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
            assert!(l.get_seg_tag().segno == expected_segno);
            assert!(generation == expected_generation);
-            layers.pop_oldest_open();
+            layers.remove_open(layer_id);
        };

        assert_pop_layer(0, gen1); // 0x100
--- a/pageserver/src/layered_repository/metadata.rs
+++ b/pageserver/src/layered_repository/metadata.rs
@@ -0,0 +1,202 @@
+//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
+//! has a metadata that needs to be stored persistently.
+//!
+//! Later, the file is used in [`crate::remote_storage::storage_sync`] as a part of
+//! external storage import and export operations.
+//!
+//! The module contains all structs and related helper methods related to timeline metadata.
+
+use std::{convert::TryInto, path::PathBuf};
+
+use anyhow::ensure;
+use zenith_utils::{
+    bin_ser::BeSer,
+    lsn::Lsn,
+    zid::{ZTenantId, ZTimelineId},
+};
+
+use crate::{
+    layered_repository::{METADATA_CHECKSUM_SIZE, METADATA_MAX_DATA_SIZE, METADATA_MAX_SAFE_SIZE},
+    PageServerConf,
+};
+
+/// The name of the metadata file pageserver creates per timeline.
+pub const METADATA_FILE_NAME: &str = "metadata";
+
+/// Metadata stored on disk for each timeline
+///
+/// The fields correspond to the values we hold in memory, in LayeredTimeline.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub struct TimelineMetadata {
+    disk_consistent_lsn: Lsn,
+    // This is only set if we know it. We track it in memory when the page
+    // server is running, but we only track the value corresponding to
+    // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
+    // lot. We only store it in the metadata file when we flush *all* the
+    // in-memory data so that 'last_record_lsn' is the same as
+    // 'disk_consistent_lsn'.  That's OK, because after page server restart, as
+    // soon as we reprocess at least one record, we will have a valid
+    // 'prev_record_lsn' value in memory again. This is only really needed when
+    // doing a clean shutdown, so that there is no more WAL beyond
+    // 'disk_consistent_lsn'
+    prev_record_lsn: Option<Lsn>,
+    ancestor_timeline: Option<ZTimelineId>,
+    ancestor_lsn: Lsn,
+}
+
+/// Points to a place in pageserver's local directory,
+/// where certain timeline's metadata file should be located.
+pub fn metadata_path(
+    conf: &'static PageServerConf,
+    timelineid: ZTimelineId,
+    tenantid: ZTenantId,
+) -> PathBuf {
+    conf.timeline_path(&timelineid, &tenantid)
+        .join(METADATA_FILE_NAME)
+}
+
+impl TimelineMetadata {
+    pub fn new(
+        disk_consistent_lsn: Lsn,
+        prev_record_lsn: Option<Lsn>,
+        ancestor_timeline: Option<ZTimelineId>,
+        ancestor_lsn: Lsn,
+    ) -> Self {
+        Self {
+            disk_consistent_lsn,
+            prev_record_lsn,
+            ancestor_timeline,
+            ancestor_lsn,
+        }
+    }
+
+    pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
+        ensure!(
+            metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
+            "metadata bytes size is wrong"
+        );
+        // The checksum covers only the data area; the trailing bytes hold the checksum itself.
+        let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
+        let calculated_checksum = crc32c::crc32c(data);
+
+        let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
+            metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
+        let expected_checksum = u32::from_le_bytes(*checksum_bytes);
+        ensure!(
+            calculated_checksum == expected_checksum,
+            "metadata checksum mismatch"
+        );
+        // Reject a misaligned LSN with an error rather than panicking on caller-supplied bytes.
+        let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
+        ensure!(data.disk_consistent_lsn.is_aligned(), "disk_consistent_lsn is not aligned");
+
+        Ok(data)
+    }
+
+    pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
+        let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
+        let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
+        assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
+        metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
+        // The checksum covers the zero-padded data area and is stored in the trailing bytes.
+        let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
+        metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
+        Ok(metadata_bytes)
+    }
+
+    /// [`Lsn`] that corresponds to the timeline directory contents
+    /// stored locally in the pageserver workdir.
+    pub fn disk_consistent_lsn(&self) -> Lsn {
+        self.disk_consistent_lsn
+    }
+
+    pub fn prev_record_lsn(&self) -> Option<Lsn> {
+        self.prev_record_lsn
+    }
+
+    pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
+        self.ancestor_timeline
+    }
+
+    pub fn ancestor_lsn(&self) -> Lsn {
+        self.ancestor_lsn
+    }
+}
+
+/// This module is for direct conversion of metadata to bytes and back.
+/// For a certain metadata, besides the conversion a few verification steps have to
+/// be done, so all serde derives are hidden from the user, to avoid accidental
+/// verification-less metadata creation.
+mod serialize {
+    use serde::{Deserialize, Serialize};
+    use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
+
+    use super::TimelineMetadata;
+
+    #[derive(Serialize)]
+    pub(super) struct SeTimelineMetadata<'a> {
+        disk_consistent_lsn: &'a Lsn,
+        prev_record_lsn: &'a Option<Lsn>,
+        ancestor_timeline: &'a Option<ZTimelineId>,
+        ancestor_lsn: &'a Lsn,
+    }
+
+    impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
+        fn from(other: &'a TimelineMetadata) -> Self {
+            Self {
+                disk_consistent_lsn: &other.disk_consistent_lsn,
+                prev_record_lsn: &other.prev_record_lsn,
+                ancestor_timeline: &other.ancestor_timeline,
+                ancestor_lsn: &other.ancestor_lsn,
+            }
+        }
+    }
+
+    #[derive(Deserialize)]
+    pub(super) struct DeTimelineMetadata {
+        disk_consistent_lsn: Lsn,
+        prev_record_lsn: Option<Lsn>,
+        ancestor_timeline: Option<ZTimelineId>,
+        ancestor_lsn: Lsn,
+    }
+
+    impl From<DeTimelineMetadata> for TimelineMetadata {
+        fn from(other: DeTimelineMetadata) -> Self {
+            Self {
+                disk_consistent_lsn: other.disk_consistent_lsn,
+                prev_record_lsn: other.prev_record_lsn,
+                ancestor_timeline: other.ancestor_timeline,
+                ancestor_lsn: other.ancestor_lsn,
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::repository::repo_harness::TIMELINE_ID;
+
+    use super::*;
+
+    #[test]
+    fn metadata_serializes_correctly() {
+        let original_metadata = TimelineMetadata {
+            disk_consistent_lsn: Lsn(0x200),
+            prev_record_lsn: Some(Lsn(0x100)),
+            ancestor_timeline: Some(TIMELINE_ID),
+            ancestor_lsn: Lsn(0),
+        };
+
+        let metadata_bytes = original_metadata
+            .to_bytes()
+            .expect("Should serialize correct metadata to bytes");
+
+        let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
+            .expect("Should deserialize its own bytes");
+
+        assert_eq!(
+            deserialized_metadata, original_metadata,
+            "Metadata that was serialized to bytes and deserialized back should not change"
+        );
+    }
+}
--- a/pageserver/src/layered_repository/page_versions.rs
+++ b/pageserver/src/layered_repository/page_versions.rs
@@ -1,40 +1,121 @@
+//!
+//! Data structure to ingest incoming WAL into an append-only file.
+//!
+//! - The file is considered temporary, and will be discarded on crash
+//! - based on a B-tree
+//!
+
+use std::os::unix::fs::FileExt;
 use std::{collections::HashMap, ops::RangeBounds, slice};

+use anyhow::Result;
+use bytes::{Bytes, BytesMut};
+
+use std::cmp::min;
+use std::io::{Seek, SeekFrom};
+
 use zenith_utils::{lsn::Lsn, vec_map::VecMap};

 use super::storage_layer::PageVersion;
+use crate::layered_repository::ephemeral_file::EphemeralFile;

-const EMPTY_SLICE: &[(Lsn, PageVersion)] = &[];
+use postgres_ffi::pg_constants::BLCKSZ;

-#[derive(Debug, Default)]
-pub struct PageVersions(HashMap<u32, VecMap<Lsn, PageVersion>>);
+use zenith_utils::bin_ser::LeSer;
+
+const EMPTY_SLICE: &[(Lsn, u64)] = &[];
+
+pub struct PageVersions {
+    map: HashMap<u32, VecMap<Lsn, u64>>,
+
+    latest_map: HashMap<u32, (Lsn, u64)>,
+
+    /// The PageVersion structs are stored in a serialized format in this file.
+    /// Each serialized PageVersion is preceded by a 'u32' length field.
+    /// The 'map' stores offsets into this file.
+    file: EphemeralFile,
+}

 impl PageVersions {
+    pub fn new(file: EphemeralFile) -> PageVersions {
+        PageVersions {
+            map: HashMap::new(),
+            latest_map: HashMap::new(),
+            file,
+        }
+    }
+
+    pub fn cache_latest(&mut self, blknum: u32, lsn: Lsn, img: &[u8]) -> Result<()> {
+        if img.len() != BLCKSZ as usize {
+            return Ok(()); // only full-size page images are cached; others are ignored
+        }
+        // Reuse the slot already recorded for this block, or reserve a new BLCKSZ-sized one.
+        let pos = if let Some((_lsn, pos)) = self.latest_map.get(&blknum) {
+            *pos
+        } else {
+            let pos = self.file.stream_position()?;
+            // round up to nearest page boundary for performance (currently disabled)
+            //let pos = (pos + BLCKSZ as u64 - 1) & !(BLCKSZ as u64 - 1);
+
+            self.file.seek(SeekFrom::Start(pos + BLCKSZ as u64))?;
+
+            pos
+        };
+        // Write the image into the slot at `pos` (positional write via write_all_at).
+        self.file.write_all_at(img, pos)?;
+        // Record the LSN and file offset of the newest cached image for this block.
+        self.latest_map.insert(blknum, (lsn, pos));
+
+        Ok(())
+    }
+
+    pub fn get_latest(&self, blknum: u32) -> Option<&(Lsn, u64)> {
+        self.latest_map.get(&blknum)
+    }
+
+    pub fn fetch_cached_latest(&self, pos: u64) -> Result<Bytes, std::io::Error> {
+        let mut buf = BytesMut::from(&[0u8; BLCKSZ as usize][..]);
+        if let Err(err) = self.file.read_exact_at(buf.as_mut(), pos) {
+            tracing::error!("read_exact_at {} failed: {:?}", pos, err);
+            return Err(err); // never hand back the zero-filled buffer as a page image
+        }
+        Ok(buf.freeze())
+    }
+
    pub fn append_or_update_last(
        &mut self,
        blknum: u32,
        lsn: Lsn,
        page_version: PageVersion,
-    ) -> Option<PageVersion> {
-        let map = self.0.entry(blknum).or_insert_with(VecMap::default);
-        map.append_or_update_last(lsn, page_version).unwrap()
+    ) -> Result<Option<u64>> {
+        // remember starting position
+        let pos = self.file.stream_position()?;
+
+        // skip over the 4-byte 'length' field for now; it is filled in below.
+        self.file.seek(SeekFrom::Start(pos + 4))?;
+
+        page_version.ser_into(&mut self.file)?;
+
+        // write the 'length' field.
+        let len = self.file.stream_position()? - pos - 4;
+        let lenbuf = u32::to_ne_bytes(len as u32);
+        self.file.write_all_at(&lenbuf, pos)?;
+
+        let map = self.map.entry(blknum).or_insert_with(VecMap::default);
+        Ok(map.append_or_update_last(lsn, pos as u64).unwrap().0)
    }

    /// Get all [`PageVersion`]s in a block
-    pub fn get_block_slice(&self, blknum: u32) -> &[(Lsn, PageVersion)] {
-        self.0
+    fn get_block_slice(&self, blknum: u32) -> &[(Lsn, u64)] {
+        self.map
            .get(&blknum)
            .map(VecMap::as_slice)
            .unwrap_or(EMPTY_SLICE)
    }

    /// Get a range of [`PageVersions`] in a block
-    pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(
-        &self,
-        blknum: u32,
-        range: R,
-    ) -> &[(Lsn, PageVersion)] {
-        self.0
+    pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(&self, blknum: u32, range: R) -> &[(Lsn, u64)] {
+        self.map
            .get(&blknum)
            .map(|vec_map| vec_map.slice_range(range))
            .unwrap_or(EMPTY_SLICE)
@@ -43,7 +124,7 @@ impl PageVersions {
    /// Iterate through [`PageVersion`]s in (block, lsn) order.
    /// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
    pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
-        let mut ordered_blocks: Vec<u32> = self.0.keys().cloned().collect();
+        let mut ordered_blocks: Vec<u32> = self.map.keys().cloned().collect();
        ordered_blocks.sort_unstable();

        let slice = ordered_blocks
@@ -59,6 +140,40 @@ impl PageVersions {
            cur_slice_iter: slice.iter(),
        }
    }
+
+    /// Returns a 'Read' that reads the page version at given offset.
+    pub fn reader(&self, pos: u64) -> Result<PageVersionReader, std::io::Error> {
+        // read length
+        let mut lenbuf = [0u8; 4];
+        self.file.read_exact_at(&mut lenbuf, pos)?;
+        let len = u32::from_ne_bytes(lenbuf);
+
+        Ok(PageVersionReader {
+            file: &self.file,
+            pos: pos + 4,
+            end_pos: pos + 4 + len as u64,
+        })
+    }
+
+    pub fn get_page_version(&self, pos: u64) -> Result<PageVersion> {
+        let mut reader = self.reader(pos)?;
+        Ok(PageVersion::des_from(&mut reader)?)
+    }
+}
+
+pub struct PageVersionReader<'a> {
+    file: &'a EphemeralFile,
+    pos: u64,
+    end_pos: u64,
+}
+
+impl<'a> std::io::Read for PageVersionReader<'a> {
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
+        let len = min(buf.len(), (self.end_pos - self.pos) as usize);
+        let n = self.file.read_at(&mut buf[..len], self.pos)?;
+        self.pos += n as u64;
+        Ok(n)
+    }
 }

 pub struct OrderedPageVersionIter<'a> {
@@ -69,7 +184,7 @@ pub struct OrderedPageVersionIter<'a> {

    cutoff_lsn: Option<Lsn>,

-    cur_slice_iter: slice::Iter<'a, (Lsn, PageVersion)>,
+    cur_slice_iter: slice::Iter<'a, (Lsn, u64)>,
 }

 impl OrderedPageVersionIter<'_> {
@@ -83,14 +198,14 @@ impl OrderedPageVersionIter<'_> {
 }

 impl<'a> Iterator for OrderedPageVersionIter<'a> {
-    type Item = (u32, Lsn, &'a PageVersion);
+    type Item = (u32, Lsn, u64);

    fn next(&mut self) -> Option<Self::Item> {
        loop {
-            if let Some((lsn, page_version)) = self.cur_slice_iter.next() {
+            if let Some((lsn, pos)) = self.cur_slice_iter.next() {
                if self.is_lsn_before_cutoff(lsn) {
                    let blknum = self.ordered_blocks[self.cur_block_idx];
-                    return Some((blknum, *lsn, page_version));
+                    return Some((blknum, *lsn, *pos));
                }
            }

@@ -104,22 +219,50 @@ impl<'a> Iterator for OrderedPageVersionIter<'a> {

 #[cfg(test)]
 mod tests {
-    use super::*;
+    use bytes::Bytes;

-    const EMPTY_PAGE_VERSION: PageVersion = PageVersion {
-        page_image: None,
-        record: None,
-    };
+    use super::*;
+    use crate::PageServerConf;
+    use std::fs;
+    use std::str::FromStr;
+    use zenith_utils::zid::{ZTenantId, ZTimelineId};
+
+    fn repo_harness(test_name: &str) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId)> {
+        let repo_dir = PageServerConf::test_repo_dir(test_name);
+        let _ = fs::remove_dir_all(&repo_dir);
+        let conf = PageServerConf::dummy_conf(repo_dir);
+        // Make a static copy of the config. This can never be free'd, but that's
+        // OK in a test.
+        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+
+        let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
+        let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
+        fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
+
+        Ok((conf, tenantid, timelineid))
+    }

    #[test]
-    fn test_ordered_iter() {
-        let mut page_versions = PageVersions::default();
+    fn test_ordered_iter() -> Result<()> {
+        let (conf, tenantid, timelineid) = repo_harness("test_ordered_iter")?;
+
+        let file = EphemeralFile::create(conf, tenantid, timelineid)?;
+
+        let mut page_versions = PageVersions::new(file);
+
        const BLOCKS: u32 = 1000;
        const LSNS: u64 = 50;

+        let empty_page = Bytes::from_static(&[0u8; 8192]);
+        let empty_page_version = PageVersion::Page(empty_page);
+
        for blknum in 0..BLOCKS {
            for lsn in 0..LSNS {
-                let old = page_versions.append_or_update_last(blknum, Lsn(lsn), EMPTY_PAGE_VERSION);
+                let old = page_versions.append_or_update_last(
+                    blknum,
+                    Lsn(lsn),
+                    empty_page_version.clone(),
+                )?;
                assert!(old.is_none());
            }
        }
@@ -146,5 +289,7 @@ mod tests {
        }
        assert!(iter.next().is_none());
        assert!(iter.next().is_none()); // should be robust against excessive next() calls
+
+        Ok(())
    }
 }
--- a/pageserver/src/layered_repository/storage_layer.rs
+++ b/pageserver/src/layered_repository/storage_layer.rs
@@ -4,7 +4,7 @@

 use crate::relish::RelishTag;
 use crate::repository::WALRecord;
-use crate::ZTimelineId;
+use crate::{ZTenantId, ZTimelineId};
 use anyhow::Result;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
@@ -51,23 +51,10 @@ impl SegmentTag {
 ///
 /// A page version can be stored as a full page image, or as WAL record that needs
 /// to be applied over the previous page version to reconstruct this version.
-///
-/// It's also possible to have both a WAL record and a page image in the same
-/// PageVersion. That happens if page version is originally stored as a WAL record
-/// but it is later reconstructed by a GetPage@LSN request by performing WAL
-/// redo. The get_page_at_lsn() code will store the reconstructed pag image next to
-/// the WAL record in that case. TODO: That's pretty accidental, not the result
-/// of any grand design. If we want to keep reconstructed page versions around, we
-/// probably should have a separate buffer cache so that we could control the
-/// replacement policy globally. Or if we keep a reconstructed page image, we
-/// could throw away the WAL record.
-///
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageVersion {
-    /// an 8kb page image
-    pub page_image: Option<Bytes>,
-    /// WAL record to get from previous page version to this one.
-    pub record: Option<WALRecord>,
+pub enum PageVersion {
+    Page(Bytes),
+    Wal(WALRecord),
 }

 ///
@@ -93,6 +80,8 @@ pub enum PageReconstructResult {
    /// the returned LSN. This is usually considered an error, but might be OK
    /// in some circumstances.
    Missing(Lsn),
+    /// Use the cached image at `cached_img_lsn` as the base image
+    Cached,
 }

 ///
@@ -104,6 +93,8 @@ pub enum PageReconstructResult {
 /// in-memory and on-disk layers.
 ///
 pub trait Layer: Send + Sync {
+    fn get_tenant_id(&self) -> ZTenantId;
+
    /// Identify the timeline this relish belongs to
    fn get_timeline_id(&self) -> ZTimelineId;

@@ -138,6 +129,9 @@ pub trait Layer: Send + Sync {
    /// of the *relish*, not the beginning of the segment. The requested
    /// 'blknum' must be covered by this segment.
    ///
+    /// `cached_img_lsn` should be set to a cached page image's lsn < `lsn`.
+    /// This function will only return data after `cached_img_lsn`.
+    ///
    /// See PageReconstructResult for possible return values. The collected data
    /// is appended to reconstruct_data; the caller should pass an empty struct
    /// on first call. If this returns PageReconstructResult::Continue, look up
@@ -147,9 +141,14 @@ pub trait Layer: Send + Sync {
        &self,
        blknum: u32,
        lsn: Lsn,
+        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult>;

+    fn cache_page_image(&self, _blknum: u32, _lsn: Lsn, _img: &[u8]) -> Result<()> {
+        Ok(())
+    }
+
    /// Return size of the segment at given LSN. (Only for blocky relations.)
    fn get_seg_size(&self, lsn: Lsn) -> Result<u32>;

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,4 @@
+use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use zenith_utils::postgres_backend::AuthType;
 use zenith_utils::zid::{ZTenantId, ZTimelineId};

@@ -11,12 +12,15 @@ pub mod basebackup;
 pub mod branches;
 pub mod http;
 pub mod layered_repository;
+pub mod page_cache;
 pub mod page_service;
 pub mod relish;
-pub mod relish_storage;
+pub mod remote_storage;
 pub mod repository;
 pub mod restore_local_repo;
 pub mod tenant_mgr;
+pub mod tenant_threads;
+pub mod virtual_file;
 pub mod waldecoder;
 pub mod walreceiver;
 pub mod walredo;
@@ -40,7 +44,11 @@ pub mod defaults {
    pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);

    pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
-    pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
+    pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
+
+    pub const DEFAULT_OPEN_MEM_LIMIT: usize = 128 * 1024 * 1024;
+    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
+    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
 }

 lazy_static! {
@@ -69,6 +77,10 @@ pub struct PageServerConf {
    pub gc_period: Duration,
    pub superuser: String,

+    pub open_mem_limit: usize,
+    pub page_cache_size: usize,
+    pub max_file_descriptors: usize,
+
    // Repository directory, relative to current working directory.
    // Normally, the page server changes the current working directory
    // to the repository, and 'workdir' is always '.'. But we don't do
@@ -82,7 +94,7 @@ pub struct PageServerConf {
    pub auth_type: AuthType,

    pub auth_validation_public_key_path: Option<PathBuf>,
-    pub relish_storage_config: Option<RelishStorageConfig>,
+    pub remote_storage_config: Option<RemoteStorageConfig>,
 }

 impl PageServerConf {
@@ -91,7 +103,7 @@ impl PageServerConf {
    //

    fn tenants_path(&self) -> PathBuf {
-        self.workdir.join("tenants")
+        self.workdir.join(TENANTS_SEGMENT_NAME)
    }

    fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
@@ -115,7 +127,7 @@ impl PageServerConf {
    }

    fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenant_path(tenantid).join("timelines")
+        self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
    }

    fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
@@ -151,6 +163,9 @@ impl PageServerConf {
            checkpoint_period: Duration::from_secs(10),
            gc_horizon: defaults::DEFAULT_GC_HORIZON,
            gc_period: Duration::from_secs(10),
+            open_mem_limit: defaults::DEFAULT_OPEN_MEM_LIMIT,
+            page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
+            max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
            superuser: "zenith_admin".to_string(),
@@ -158,23 +173,32 @@ impl PageServerConf {
            pg_distrib_dir: "".into(),
            auth_type: AuthType::Trust,
            auth_validation_public_key_path: None,
-            relish_storage_config: None,
+            remote_storage_config: None,
        }
    }
 }

-/// External relish storage configuration, enough for creating a client for that storage.
-#[derive(Debug, Clone)]
-pub struct RelishStorageConfig {
-    /// Limits the number of concurrent sync operations between pageserver and relish storage.
-    pub max_concurrent_sync: usize,
-    /// The storage connection configuration.
-    pub storage: RelishStorageKind,
+/// Config for the Repository checkpointer
+#[derive(Debug, Clone, Copy)]
+pub enum CheckpointConfig {
+    // Flush in-memory data that is older than this
+    Distance(u64),
+    // Flush all in-memory data
+    Forced,
 }

-/// A kind of a relish storage to connect to, with its connection configuration.
+/// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone)]
-pub enum RelishStorageKind {
+pub struct RemoteStorageConfig {
+    /// Limits the number of concurrent sync operations between pageserver and the remote storage.
+    pub max_concurrent_sync: usize,
+    /// The storage connection configuration.
+    pub storage: RemoteStorageKind,
+}
+
+/// A kind of a remote storage to connect to, with its connection configuration.
+#[derive(Debug, Clone)]
+pub enum RemoteStorageKind {
    /// Storage based on local file system.
    /// Specify a root folder to place all stored relish data into.
    LocalFs(PathBuf),
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -0,0 +1,766 @@
+//!
+//! Global page cache
+//!
+//! The page cache uses up most of the memory in the page server. It is shared
+//! by all tenants, and it is used to store different kinds of pages. Sharing
+//! the cache allows memory to be dynamically allocated where it's needed the
+//! most.
+//!
+//! The page cache consists of fixed-size buffers, 8 kB each to match the
+//! PostgreSQL buffer size, and a Slot struct for each buffer to contain
+//! information about what's stored in the buffer.
+//!
+//! # Locking
+//!
+//! There are two levels of locking involved: There's one lock for the "mapping"
+//! from page identifier (tenant ID, timeline ID, rel, block, LSN) to the buffer
+//! slot, and a separate lock on each slot. To read or write the contents of a
+//! slot, you must hold the lock on the slot in read or write mode,
+//! respectively. To change the mapping of a slot, i.e. to evict a page or to
+//! assign a buffer for a page, you must hold the mapping lock and the lock on
+//! the slot at the same time.
+//!
+//! Whenever you need to hold both locks simultaneously, the slot lock must be
+//! acquired first. This consistent ordering avoids deadlocks. To look up a page
+//! in the cache, you would first look up the mapping, while holding the mapping
+//! lock, and then lock the slot. You must release the mapping lock in between,
+//! to obey the lock ordering and avoid deadlock.
+//!
+//! A slot can momentarily have invalid contents, even if it's already been
+//! inserted to the mapping, but you must hold the write-lock on the slot until
+//! the contents are valid. If you need to release the lock without initializing
+//! the contents, you must remove the mapping first. We make that easy for the
+//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
+//! page, the caller must explicitly call guard.mark_valid() after it has
+//! initialized it. If the guard is dropped without calling mark_valid(), the
+//! mapping is automatically removed and the slot is marked free.
+//!
+
+use std::{
+    collections::{hash_map::Entry, HashMap},
+    convert::TryInto,
+    sync::{
+        atomic::{AtomicU8, AtomicUsize, Ordering},
+        RwLock, RwLockReadGuard, RwLockWriteGuard,
+    },
+};
+
+use once_cell::sync::OnceCell;
+use tracing::error;
+use zenith_utils::{
+    lsn::Lsn,
+    zid::{ZTenantId, ZTimelineId},
+};
+
+use crate::layered_repository::writeback_ephemeral_file;
+use crate::{relish::RelTag, PageServerConf};
+
+static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
+const TEST_PAGE_CACHE_SIZE: usize = 10;
+
+///
+/// Initialize the page cache. This must be called once at page server startup.
+///
+pub fn init(conf: &'static PageServerConf) {
+    if PAGE_CACHE
+        .set(PageCache::new(conf.page_cache_size))
+        .is_err()
+    {
+        panic!("page cache already initialized");
+    }
+}
+
+///
+/// Get a handle to the page cache.
+///
+pub fn get() -> &'static PageCache {
+    //
+    // In unit tests, page server startup doesn't happen and no one calls
+    // page_cache::init(). Initialize it here with a tiny cache, so that the
+    // page cache is usable in unit tests.
+    //
+    if cfg!(test) {
+        PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
+    } else {
+        PAGE_CACHE.get().expect("page cache not initialized")
+    }
+}
+
+pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize;
+const MAX_USAGE_COUNT: u8 = 5;
+
+///
+/// CacheKey uniquely identifies a "thing" to cache in the page cache.
+///
+#[derive(Debug, PartialEq, Eq, Clone)]
+enum CacheKey {
+    MaterializedPage {
+        hash_key: MaterializedPageHashKey,
+        lsn: Lsn,
+    },
+    EphemeralPage {
+        file_id: u64,
+        blkno: u32,
+    },
+}
+
+#[derive(Debug, PartialEq, Eq, Hash, Clone)]
+struct MaterializedPageHashKey {
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,
+    rel_tag: RelTag,
+    blknum: u32,
+}
+
+#[derive(Clone)]
+struct Version {
+    lsn: Lsn,
+    slot_idx: usize,
+}
+
+struct Slot {
+    inner: RwLock<SlotInner>,
+    usage_count: AtomicU8,
+}
+
+struct SlotInner {
+    key: Option<CacheKey>,
+    buf: &'static mut [u8; PAGE_SZ],
+    dirty: bool,
+}
+
+impl Slot {
+    /// Bump the usage count by one, stopping at MAX_USAGE_COUNT.
+    fn inc_usage_count(&self) {
+        // A rejected update just means the counter is already at its ceiling,
+        // so the result is deliberately ignored.
+        let bump = |current: u8| {
+            if current != MAX_USAGE_COUNT {
+                Some(current + 1)
+            } else {
+                None
+            }
+        };
+        let _ = self
+            .usage_count
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, bump);
+    }
+
+    /// Lower the usage count by one, stopping at zero. Returns the count
+    /// as it was before the decrement.
+    fn dec_usage_count(&self) -> u8 {
+        // `checked_sub` yields None at zero, which leaves the counter untouched.
+        let outcome = self.usage_count.fetch_update(
+            Ordering::Relaxed,
+            Ordering::Relaxed,
+            |current| current.checked_sub(1),
+        );
+
+        // Both arms carry the previous value, whether or not it was updated.
+        match outcome {
+            Ok(previous) | Err(previous) => previous,
+        }
+    }
+}
+
+pub struct PageCache {
+    /// This contains the mapping from the cache key to buffer slot that currently
+    /// contains the page, if any.
+    ///
+    /// TODO: This is protected by a single lock. If that becomes a bottleneck,
+    /// this HashMap can be replaced with a more concurrent version, there are
+    /// plenty of such crates around.
+    ///
+    /// If you add support for caching different kinds of objects, each object kind
+    /// can have a separate mapping map, next to this field.
+    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
+
+    ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
+
+    /// The actual buffers with their metadata.
+    slots: Box<[Slot]>,
+
+    /// Index of the next candidate to evict, for the Clock replacement algorithm.
+    /// This is interpreted modulo the page cache size.
+    next_evict_slot: AtomicUsize,
+}
+
+///
+/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
+/// until the guard is dropped.
+///
+pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
+
+impl std::ops::Deref for PageReadGuard<'_> {
+    type Target = [u8; PAGE_SZ];
+
+    fn deref(&self) -> &Self::Target {
+        self.0.buf
+    }
+}
+
+///
+/// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked
+/// until the guard is dropped.
+///
+/// Counterintuitively, this is used even for a read, if the requested page is not
+/// currently found in the page cache. In that case, the caller of lock_for_read()
+/// is expected to fill in the page contents and call mark_valid(). Similarly,
+/// lock_for_write() can return an invalid buffer that the caller is expected to
+/// initialize.
+///
+pub struct PageWriteGuard<'i> {
+    inner: RwLockWriteGuard<'i, SlotInner>,
+
+    // Are the page contents currently valid? If false on drop, the mapping is removed.
+    valid: bool,
+}
+
+impl std::ops::DerefMut for PageWriteGuard<'_> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.inner.buf
+    }
+}
+
+impl std::ops::Deref for PageWriteGuard<'_> {
+    type Target = [u8; PAGE_SZ];
+
+    fn deref(&self) -> &Self::Target {
+        self.inner.buf
+    }
+}
+
+impl PageWriteGuard<'_> {
+    /// Mark that the buffer contents are now valid.
+    pub fn mark_valid(&mut self) {
+        assert!(
+            !self.valid,
+            "mark_valid called on a buffer that was already valid"
+        );
+        self.valid = true;
+    }
+    pub fn mark_dirty(&mut self) {
+        self.inner.dirty = true;
+    }
+}
+
+impl Drop for PageWriteGuard<'_> {
+    /// If the buffer was allocated for a page that was not already in the
+    /// cache, but the lock_for_read/write() caller dropped the buffer without
+    /// initializing it, remove the mapping and leave the slot clean and unassigned.
+    fn drop(&mut self) {
+        if !self.valid {
+            let self_key = self.inner.key.as_ref().unwrap();
+            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
+            self.inner.key = None;
+            // Never leave a stale dirty flag behind for the slot's next owner.
+            self.inner.dirty = false;
+        }
+    }
+}
+
+/// lock_for_read() return value
+pub enum ReadBufResult<'a> {
+    Found(PageReadGuard<'a>),
+    NotFound(PageWriteGuard<'a>),
+}
+
+/// lock_for_write() return value
+pub enum WriteBufResult<'a> {
+    Found(PageWriteGuard<'a>),
+    NotFound(PageWriteGuard<'a>),
+}
+
+impl PageCache {
+    //
+    // Section 1.1: Public interface functions for looking up and memorizing materialized page
+    // versions in the page cache
+    //
+
+    /// Fetch a cached materialized version of a page.
+    ///
+    /// 'lsn' acts as an upper bound: the newest cached version of the block
+    /// that is not newer than 'lsn' is returned, together with the LSN that
+    /// version actually has.
+    ///
+    /// Returns None if no suitable version is currently cached.
+    pub fn lookup_materialized_page(
+        &self,
+        tenant_id: ZTenantId,
+        timeline_id: ZTimelineId,
+        rel_tag: RelTag,
+        blknum: u32,
+        lsn: Lsn,
+    ) -> Option<(Lsn, PageReadGuard)> {
+        // Build the search key. Its LSN is only an upper bound at this point.
+        let hash_key = MaterializedPageHashKey {
+            tenant_id,
+            timeline_id,
+            rel_tag,
+            blknum,
+        };
+        let mut cache_key = CacheKey::MaterializedPage { hash_key, lsn };
+
+        // On a hit, try_lock_for_read() rewrites the LSN in 'cache_key' to the
+        // LSN of the version it found.
+        let guard = self.try_lock_for_read(&mut cache_key)?;
+        // Only a materialized-page key is expected back from this lookup.
+        match cache_key {
+            CacheKey::MaterializedPage { hash_key: _, lsn } => Some((lsn, guard)),
+            _ => panic!("unexpected key type in slot"),
+        }
+    }
+
+    ///
+    /// Store an image of the given page in the cache.
+    ///
+    pub fn memorize_materialized_page(
+        &self,
+        tenant_id: ZTenantId,
+        timeline_id: ZTimelineId,
+        rel_tag: RelTag,
+        blknum: u32,
+        lsn: Lsn,
+        img: &[u8],
+    ) {
+        let cache_key = CacheKey::MaterializedPage {
+            hash_key: MaterializedPageHashKey {
+                tenant_id,
+                timeline_id,
+                rel_tag,
+                blknum,
+            },
+            lsn,
+        };
+
+        match self.lock_for_write(&cache_key) {
+            WriteBufResult::Found(write_guard) => {
+                // We already had it in cache. Another thread must've put it there
+                // concurrently. Check that it had the same contents that we
+                // replayed.
+                assert!(*write_guard == img);
+            }
+            WriteBufResult::NotFound(mut write_guard) => {
+                write_guard.copy_from_slice(img);
+                write_guard.mark_valid();
+            }
+        }
+    }
+
+    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
+        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
+
+        self.lock_for_read(&mut cache_key)
+    }
+
+    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult {
+        let cache_key = CacheKey::EphemeralPage { file_id, blkno };
+
+        self.lock_for_write(&cache_key)
+    }
+
+    /// Immediately drop all buffers belonging to given file, without writeback
+    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
+        for slot in self.slots.iter() {
+            let mut inner = slot.inner.write().unwrap();
+            if let Some(key) = &inner.key {
+                match key {
+                    CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
+                        // Remove the mapping and reset the slot, including the dirty flag:
+                        // a stale flag would trigger a bogus writeback on later eviction.
+                        self.remove_mapping(key);
+                        inner.key = None;
+                        inner.dirty = false;
+                    }
+                    _ => {}
+                }
+            }
+        }
+    }
+
+    //
+    // Section 2: Internal interface functions for lookup/update.
+    //
+    // The page cache currently stores materialized page images and pages of
+    // ephemeral files. To add support for a new kind of "thing" to cache, you
+    // will need to add public interface routines above, and code to deal with
+    // the "mappings" after this section. But the routines in this section
+    // should not require changes.
+
+    /// Look up a page in the cache.
+    ///
+    /// If the search criteria is not exact, *cache_key is updated with the key
+    /// for exact key of the returned page. (For materialized pages, that means
+    /// that the LSN in 'cache_key' is updated with the LSN of the returned page
+    /// version.)
+    ///
+    /// If no page is found, returns None and *cache_key is left unmodified.
+    ///
+    fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
+        let cache_key_orig = cache_key.clone();
+        if let Some(slot_idx) = self.search_mapping(cache_key) {
+            // The page was found in the mapping. Lock the slot, and re-check
+            // that it's still what we expected (because we released the mapping
+            // lock already, another thread could have evicted the page)
+            let slot = &self.slots[slot_idx];
+            let inner = slot.inner.read().unwrap();
+            if inner.key.as_ref() == Some(cache_key) {
+                slot.inc_usage_count();
+                return Some(PageReadGuard(inner));
+            } else {
+                // search_mapping might have modified the search key; restore it.
+                *cache_key = cache_key_orig;
+            }
+        }
+        None
+    }
+
+    /// Return a locked buffer for given block.
+    ///
+    /// Like try_lock_for_read(), if the search criteria is not exact and the
+    /// page is already found in the cache, *cache_key is updated.
+    ///
+    /// If the page is not found in the cache, this allocates a new buffer for
+    /// it. The caller may then initialize the buffer with the contents, and
+    /// call mark_valid().
+    ///
+    /// Example usage:
+    ///
+    /// ```ignore
+    /// let cache = page_cache::get();
+    ///
+    /// match cache.lock_for_read(&mut key) {
+    ///     ReadBufResult::Found(read_guard) => {
+    ///         // The page was found in cache. Use it
+    ///     },
+    ///     ReadBufResult::NotFound(mut write_guard) => {
+    ///         // The page was not found in cache. Read it from disk into the
+    ///         // buffer.
+    ///         //read_my_page_from_disk(&mut write_guard);
+    ///
+    ///         // The buffer contents are now valid. Tell the page cache.
+    ///         write_guard.mark_valid();
+    ///     },
+    /// }
+    /// ```
+    ///
+    fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult {
+        loop {
+            // First check if the key already exists in the cache.
+            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
+                return ReadBufResult::Found(read_guard);
+            }
+
+            // Not found. Find a victim buffer
+            let (slot_idx, mut inner) = self.find_victim();
+
+            // Insert mapping for this. At this point, we may find that another
+            // thread did the same thing concurrently. In that case, we evicted
+            // our victim buffer unnecessarily. Release it unassigned and retry,
+            // which normally finds the slot that the other thread chose.
+            if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
+                // TODO: put to free list
+
+                // We now just loop back to start from beginning. This is not
+                // optimal, we'll perform the lookup in the mapping again, which
+                // is not really necessary because we already got
+                // 'existing_slot_idx'.  But this shouldn't happen often enough
+                // to matter much.
+                continue;
+            }
+
+            // Make the slot ready
+            let slot = &self.slots[slot_idx];
+            inner.key = Some(cache_key.clone());
+            slot.usage_count.store(1, Ordering::Relaxed);
+
+            return ReadBufResult::NotFound(PageWriteGuard {
+                inner,
+                valid: false,
+            });
+        }
+    }
+
+    /// Look up a page in the cache and lock it in write mode. If it's not
+    /// found, returns None.
+    ///
+    /// When locking a page for writing, the search criteria is always "exact".
+    fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
+        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
+            // The page was found in the mapping. Lock the slot, and re-check
+            // that it's still what we expected (because we released the mapping
+            // lock already, another thread could have evicted the page)
+            let slot = &self.slots[slot_idx];
+            let inner = slot.inner.write().unwrap();
+            if inner.key.as_ref() == Some(cache_key) {
+                slot.inc_usage_count();
+                return Some(PageWriteGuard { inner, valid: true });
+            }
+        }
+        None
+    }
+
+    /// Return a write-locked buffer for given block.
+    ///
+    /// Similar to lock_for_read(), but the returned buffer is write-locked and
+    /// may be modified by the caller even if it's already found in the cache.
+    fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
+        loop {
+            // First check if the key already exists in the cache.
+            if let Some(write_guard) = self.try_lock_for_write(cache_key) {
+                return WriteBufResult::Found(write_guard);
+            }
+
+            // Not found. Find a victim buffer
+            let (slot_idx, mut inner) = self.find_victim();
+
+            // Insert mapping for this. At this point, we may find that another
+            // thread did the same thing concurrently. In that case, we evicted
+            // our victim buffer unnecessarily. Release it unassigned and retry,
+            // which normally finds the slot that the other thread chose.
+            if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
+                // TODO: put to free list
+
+                // We now just loop back to start from beginning. This is not
+                // optimal, we'll perform the lookup in the mapping again, which
+                // is not really necessary because we already got
+                // 'existing_slot_idx'.  But this shouldn't happen often enough
+                // to matter much.
+                continue;
+            }
+
+            // Make the slot ready
+            let slot = &self.slots[slot_idx];
+            inner.key = Some(cache_key.clone());
+            slot.usage_count.store(1, Ordering::Relaxed);
+
+            return WriteBufResult::NotFound(PageWriteGuard {
+                inner,
+                valid: false,
+            });
+        }
+    }
+
+    //
+    // Section 3: Mapping functions
+    //
+
+    /// Search for a page in the cache using the given search key.
+    ///
+    /// Returns the slot index, if any. If the search criteria is not exact,
+    /// *cache_key is updated with the actual key of the found page.
+    ///
+    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
+    /// get recycled for an unrelated page immediately after this function
+    /// returns.  The caller is responsible for re-checking that the slot still
+    /// contains the page with the same key before using it.
+    ///
+    fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
+        match cache_key {
+            CacheKey::MaterializedPage { hash_key, lsn } => {
+                let map = self.materialized_page_map.read().unwrap();
+                let versions = map.get(hash_key)?;
+
+                let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
+                    Ok(version_idx) => version_idx,
+                    Err(0) => return None,
+                    Err(version_idx) => version_idx - 1,
+                };
+                let version = &versions[version_idx];
+                *lsn = version.lsn;
+                Some(version.slot_idx)
+            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let map = self.ephemeral_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
+        }
+    }
+
+    /// Find the slot holding exactly the page identified by 'key'.
+    ///
+    /// Unlike search_mapping(), a materialized page only matches if a version
+    /// with precisely the requested LSN is cached. Used for allocating a new buffer.
+    fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
+        match key {
+            CacheKey::MaterializedPage { hash_key, lsn } => {
+                let page_map = self.materialized_page_map.read().unwrap();
+                let cached_versions = page_map.get(hash_key)?;
+
+                // Versions are kept sorted by LSN; only an exact hit counts here.
+                cached_versions
+                    .binary_search_by_key(lsn, |version| version.lsn)
+                    .ok()
+                    .map(|found| cached_versions[found].slot_idx)
+            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let page_map = self.ephemeral_page_map.read().unwrap();
+                page_map.get(&(*file_id, *blkno)).copied()
+            }
+        }
+    }
+
+    ///
+    /// Remove mapping for given key.
+    ///
+    fn remove_mapping(&self, old_key: &CacheKey) {
+        match old_key {
+            CacheKey::MaterializedPage {
+                hash_key: old_hash_key,
+                lsn: old_lsn,
+            } => {
+                let mut map = self.materialized_page_map.write().unwrap();
+                if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
+                    let versions = old_entry.get_mut();
+
+                    if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
+                        versions.remove(version_idx);
+                        if versions.is_empty() {
+                            old_entry.remove_entry();
+                        }
+                    }
+                } else {
+                    panic!("could not find old key in mapping")
+                }
+            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let mut map = self.ephemeral_page_map.write().unwrap();
+                map.remove(&(*file_id, *blkno))
+                    .expect("could not find old key in mapping");
+            }
+        }
+    }
+
+    ///
+    /// Insert mapping for given key.
+    ///
+    /// If a mapping already existed for the given key, returns the slot index
+    /// of the existing mapping and leaves it untouched.
+    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
+        match new_key {
+            CacheKey::MaterializedPage {
+                hash_key: new_key,
+                lsn: new_lsn,
+            } => {
+                let mut map = self.materialized_page_map.write().unwrap();
+                let versions = map.entry(new_key.clone()).or_default();
+                match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
+                    Ok(version_idx) => Some(versions[version_idx].slot_idx),
+                    Err(version_idx) => {
+                        versions.insert(
+                            version_idx,
+                            Version {
+                                lsn: *new_lsn,
+                                slot_idx,
+                            },
+                        );
+                        None
+                    }
+                }
+            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let mut map = self.ephemeral_page_map.write().unwrap();
+                match map.entry((*file_id, *blkno)) {
+                    Entry::Occupied(entry) => Some(*entry.get()),
+                    Entry::Vacant(entry) => {
+                        entry.insert(slot_idx);
+                        None
+                    }
+                }
+            }
+        }
+    }
+
+    //
+    // Section 5: Misc internal helpers
+    //
+
+    /// Find a slot to evict.
+    ///
+    /// On return, the slot is empty and write-locked.
+    fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
+        let iter_limit = self.slots.len() * 2;
+        let mut iters = 0;
+        loop {
+            let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();
+
+            let slot = &self.slots[slot_idx];
+
+            if slot.dec_usage_count() == 0 || iters >= iter_limit {
+                let mut inner = slot.inner.write().unwrap();
+
+                if let Some(old_key) = &inner.key {
+                    if inner.dirty {
+                        if let Err(err) = Self::writeback(old_key, inner.buf) {
+                            // Writing the page to disk failed.
+                            //
+                            // FIXME: What to do here, when? We could propagate the error to the
+                            // caller, but victim buffer is generally unrelated to the original
+                            // call. It can even belong to a different tenant. Currently, we
+                            // report the error to the log and continue the clock sweep to find
+                            // a different victim. But if the problem persists, the page cache
+                            // could fill up with dirty pages that we cannot evict, and we will
+                            // loop retrying the writebacks indefinitely.
+                            error!("writeback of buffer {:?} failed: {}", old_key, err);
+                            continue;
+                        }
+                    }
+
+                    // remove mapping for old buffer
+                    self.remove_mapping(old_key);
+                    inner.dirty = false;
+                    inner.key = None;
+                }
+                return (slot_idx, inner);
+            }
+
+            iters += 1;
+        }
+    }
+
+    fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
+        match cache_key {
+            CacheKey::MaterializedPage {
+                hash_key: _,
+                lsn: _,
+            } => {
+                panic!("unexpected dirty materialize page");
+            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                writeback_ephemeral_file(*file_id, *blkno, buf)
+            }
+        }
+    }
+
+    /// Initialize a new page cache
+    ///
+    /// This should be called only once at page server startup.
+    fn new(num_pages: usize) -> Self {
+        assert!(num_pages > 0, "page cache size must be > 0");
+
+        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
+
+        let slots = page_buffer
+            .chunks_exact_mut(PAGE_SZ)
+            .map(|chunk| {
+                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
+
+                Slot {
+                    inner: RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        dirty: false,
+                    }),
+                    usage_count: AtomicU8::new(0),
+                }
+            })
+            .collect();
+
+        Self {
+            materialized_page_map: Default::default(),
+            ephemeral_page_map: Default::default(),
+            slots,
+            next_evict_slot: AtomicUsize::new(0),
+        }
+    }
+}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -630,14 +630,16 @@ impl postgres_backend::Handler for PageServerHandler {

            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;

-            let branches = crate::branches::get_branches(self.conf, &tenantid)?;
+            // since these handlers for tenant/branch commands are deprecated (in favor of http based ones)
+            // just use false in place of include non incremental logical size
+            let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
            let branches_buf = serde_json::to_vec(&branches)?;

            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
                .write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("tenant_list") {
-            let tenants = crate::branches::get_tenants(self.conf)?;
+            let tenants = crate::tenant_mgr::list_tenants()?;
            let tenants_buf = serde_json::to_vec(&tenants)?;

            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
--- a/pageserver/src/relish_storage.rs
+++ b/pageserver/src/relish_storage.rs
@@ -1,87 +0,0 @@
-//! Abstractions for the page server to store its relish layer data in the external storage.
-//!
-//! Main purpose of this module subtree is to provide a set of abstractions to manage the storage state
-//! in a way, optimal for page server.
-//!
-//! The abstractions hide multiple custom external storage API implementations,
-//! such as AWS S3, local filesystem, etc., located in the submodules.
-
-mod local_fs;
-mod rust_s3;
-/// A queue-based storage with the background machinery behind it to synchronize
-/// local page server layer files with external storage.
-mod synced_storage;
-
-use std::{path::Path, thread};
-
-use anyhow::Context;
-
-pub use self::synced_storage::schedule_timeline_upload;
-use self::{local_fs::LocalFs, rust_s3::RustS3};
-use crate::{PageServerConf, RelishStorageKind};
-
-pub fn run_storage_sync_thread(
-    config: &'static PageServerConf,
-) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
-    match &config.relish_storage_config {
-        Some(relish_storage_config) => {
-            let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
-            match &relish_storage_config.storage {
-                RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread(
-                    config,
-                    LocalFs::new(root.clone())?,
-                    max_concurrent_sync,
-                ),
-                RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread(
-                    config,
-                    RustS3::new(s3_config)?,
-                    max_concurrent_sync,
-                ),
-            }
-        }
-        None => Ok(None),
-    }
-}
-
-/// Storage (potentially remote) API to manage its state.
-#[async_trait::async_trait]
-pub trait RelishStorage: Send + Sync {
-    type RelishStoragePath;
-
-    fn derive_destination(
-        page_server_workdir: &Path,
-        relish_local_path: &Path,
-    ) -> anyhow::Result<Self::RelishStoragePath>;
-
-    async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
-
-    async fn download_relish<W: 'static + std::io::Write + Send>(
-        &self,
-        from: &Self::RelishStoragePath,
-        // rust_s3 `get_object_stream` method requires `std::io::BufWriter` for some reason, not the async counterpart
-        // that forces us to consume and return the writer to satisfy the blocking operation async wrapper requirements
-        to: std::io::BufWriter<W>,
-    ) -> anyhow::Result<std::io::BufWriter<W>>;
-
-    async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
-
-    async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
-        &self,
-        from: &mut tokio::io::BufReader<R>,
-        to: &Self::RelishStoragePath,
-    ) -> anyhow::Result<()>;
-}
-
-fn strip_workspace_prefix<'a>(
-    page_server_workdir: &'a Path,
-    relish_local_path: &'a Path,
-) -> anyhow::Result<&'a Path> {
-    relish_local_path
-        .strip_prefix(page_server_workdir)
-        .with_context(|| {
-            format!(
-                "Unexpected: relish local path '{}' is not relevant to server workdir",
-                relish_local_path.display(),
-            )
-        })
-}
--- a/pageserver/src/relish_storage/local_fs.rs
+++ b/pageserver/src/relish_storage/local_fs.rs
@@ -1,189 +0,0 @@
-//! Local filesystem relish storage.
-//!
-//! Page server already stores layer data on the server, when freezing it.
-//! This storage serves a way to
-//!
-//! * test things locally simply
-//! * allow to compabre both binary sets
-//! * help validating the relish storage API
-
-use std::{
-    future::Future,
-    io::Write,
-    path::{Path, PathBuf},
-    pin::Pin,
-};
-
-use anyhow::{bail, Context};
-use tokio::{fs, io};
-
-use super::{strip_workspace_prefix, RelishStorage};
-
-pub struct LocalFs {
-    root: PathBuf,
-}
-
-impl LocalFs {
-    /// Atetmpts to create local FS relish storage, also creates the directory provided, if not exists.
-    pub fn new(root: PathBuf) -> anyhow::Result<Self> {
-        if !root.exists() {
-            std::fs::create_dir_all(&root).with_context(|| {
-                format!(
-                    "Failed to create all directories in the given root path {}",
-                    root.display(),
-                )
-            })?;
-        }
-        Ok(Self { root })
-    }
-
-    fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
-        if path.is_relative() {
-            Ok(self.root.join(path))
-        } else if path.starts_with(&self.root) {
-            Ok(path.to_path_buf())
-        } else {
-            bail!(
-                "Path '{}' does not belong to the current storage",
-                path.display()
-            )
-        }
-    }
-}
-
-#[async_trait::async_trait]
-impl RelishStorage for LocalFs {
-    type RelishStoragePath = PathBuf;
-
-    fn derive_destination(
-        page_server_workdir: &Path,
-        relish_local_path: &Path,
-    ) -> anyhow::Result<Self::RelishStoragePath> {
-        Ok(strip_workspace_prefix(page_server_workdir, relish_local_path)?.to_path_buf())
-    }
-
-    async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
-        Ok(get_all_files(&self.root).await?.into_iter().collect())
-    }
-
-    async fn download_relish<W: 'static + std::io::Write + Send>(
-        &self,
-        from: &Self::RelishStoragePath,
-        mut to: std::io::BufWriter<W>,
-    ) -> anyhow::Result<std::io::BufWriter<W>> {
-        let file_path = self.resolve_in_storage(from)?;
-        if file_path.exists() && file_path.is_file() {
-            let updated_buffer = tokio::task::spawn_blocking(move || {
-                let mut source = std::io::BufReader::new(
-                    std::fs::OpenOptions::new()
-                        .read(true)
-                        .open(&file_path)
-                        .with_context(|| {
-                            format!(
-                                "Failed to open source file '{}' to use in the download",
-                                file_path.display()
-                            )
-                        })?,
-                );
-                std::io::copy(&mut source, &mut to)
-                    .context("Failed to download the relish file")?;
-                to.flush().context("Failed to flush the download buffer")?;
-                Ok::<_, anyhow::Error>(to)
-            })
-            .await
-            .context("Failed to spawn a blocking task")??;
-            Ok(updated_buffer)
-        } else {
-            bail!(
-                "File '{}' either does not exist or is not a file",
-                file_path.display()
-            )
-        }
-    }
-
-    async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
-        let file_path = self.resolve_in_storage(path)?;
-        if file_path.exists() && file_path.is_file() {
-            Ok(tokio::fs::remove_file(file_path).await?)
-        } else {
-            bail!(
-                "File '{}' either does not exist or is not a file",
-                file_path.display()
-            )
-        }
-    }
-
-    async fn upload_relish<R: io::AsyncRead + std::marker::Unpin + Send>(
-        &self,
-        from: &mut io::BufReader<R>,
-        to: &Self::RelishStoragePath,
-    ) -> anyhow::Result<()> {
-        let target_file_path = self.resolve_in_storage(to)?;
-        create_target_directory(&target_file_path).await?;
-        let mut destination = io::BufWriter::new(
-            fs::OpenOptions::new()
-                .write(true)
-                .create(true)
-                .open(&target_file_path)
-                .await
-                .with_context(|| {
-                    format!(
-                        "Failed to open target fs destination at '{}'",
-                        target_file_path.display()
-                    )
-                })?,
-        );
-
-        io::copy_buf(from, &mut destination)
-            .await
-            .context("Failed to upload relish to local storage")?;
-        Ok(())
-    }
-}
-
-fn get_all_files<'a, P>(
-    directory_path: P,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
-where
-    P: AsRef<Path> + Send + Sync + 'a,
-{
-    Box::pin(async move {
-        let directory_path = directory_path.as_ref();
-        if directory_path.exists() {
-            if directory_path.is_dir() {
-                let mut paths = Vec::new();
-                let mut dir_contents = tokio::fs::read_dir(directory_path).await?;
-                while let Some(dir_entry) = dir_contents.next_entry().await? {
-                    let file_type = dir_entry.file_type().await?;
-                    let entry_path = dir_entry.path();
-                    if file_type.is_symlink() {
-                        log::debug!("{:?} us a symlink, skipping", entry_path)
-                    } else if file_type.is_dir() {
-                        paths.extend(get_all_files(entry_path).await?.into_iter())
-                    } else {
-                        paths.push(dir_entry.path());
-                    }
-                }
-                Ok(paths)
-            } else {
-                bail!("Path '{}' is not a directory", directory_path.display())
-            }
-        } else {
-            Ok(Vec::new())
-        }
-    })
-}
-
-async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
-    let target_dir = match target_file_path.parent() {
-        Some(parent_dir) => parent_dir,
-        None => bail!(
-            "Relish path '{}' has no parent directory",
-            target_file_path.display()
-        ),
-    };
-    if !target_dir.exists() {
-        tokio::fs::create_dir_all(target_dir).await?;
-    }
-    Ok(())
-}
--- a/pageserver/src/relish_storage/rust_s3.rs
+++ b/pageserver/src/relish_storage/rust_s3.rs
@@ -1,149 +0,0 @@
-//! A wrapper around AWS S3 client library `rust_s3` to be used a relish storage.
-
-use std::io::Write;
-use std::path::Path;
-
-use anyhow::Context;
-use s3::{bucket::Bucket, creds::Credentials, region::Region};
-
-use crate::{
-    relish_storage::{strip_workspace_prefix, RelishStorage},
-    S3Config,
-};
-
-const S3_FILE_SEPARATOR: char = '/';
-
-#[derive(Debug)]
-pub struct S3ObjectKey(String);
-
-impl S3ObjectKey {
-    fn key(&self) -> &str {
-        &self.0
-    }
-}
-
-/// AWS S3 relish storage.
-pub struct RustS3 {
-    bucket: Bucket,
-}
-
-impl RustS3 {
-    /// Creates the relish storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
-        let region = aws_config
-            .bucket_region
-            .parse::<Region>()
-            .context("Failed to parse the s3 region from config")?;
-        let credentials = Credentials::new(
-            aws_config.access_key_id.as_deref(),
-            aws_config.secret_access_key.as_deref(),
-            None,
-            None,
-            None,
-        )
-        .context("Failed to create the s3 credentials")?;
-        Ok(Self {
-            bucket: Bucket::new_with_path_style(
-                aws_config.bucket_name.as_str(),
-                region,
-                credentials,
-            )
-            .context("Failed to create the s3 bucket")?,
-        })
-    }
-}
-
-#[async_trait::async_trait]
-impl RelishStorage for RustS3 {
-    type RelishStoragePath = S3ObjectKey;
-
-    fn derive_destination(
-        page_server_workdir: &Path,
-        relish_local_path: &Path,
-    ) -> anyhow::Result<Self::RelishStoragePath> {
-        let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
-        let mut key = String::new();
-        for segment in relative_path {
-            key.push(S3_FILE_SEPARATOR);
-            key.push_str(&segment.to_string_lossy());
-        }
-        Ok(S3ObjectKey(key))
-    }
-
-    async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
-        let list_response = self
-            .bucket
-            .list(String::new(), None)
-            .await
-            .context("Failed to list s3 objects")?;
-
-        Ok(list_response
-            .into_iter()
-            .flat_map(|response| response.contents)
-            .map(|s3_object| S3ObjectKey(s3_object.key))
-            .collect())
-    }
-
-    async fn download_relish<W: 'static + std::io::Write + Send>(
-        &self,
-        from: &Self::RelishStoragePath,
-        mut to: std::io::BufWriter<W>,
-    ) -> anyhow::Result<std::io::BufWriter<W>> {
-        let code = self
-            .bucket
-            .get_object_stream(from.key(), &mut to)
-            .await
-            .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
-        if code != 200 {
-            Err(anyhow::format_err!(
-                "Received non-200 exit code during downloading object from directory, code: {}",
-                code
-            ))
-        } else {
-            tokio::task::spawn_blocking(move || {
-                to.flush().context("Failed to fluch the downoad buffer")?;
-                Ok::<_, anyhow::Error>(to)
-            })
-            .await
-            .context("Failed to joim the download buffer flush task")?
-        }
-    }
-
-    async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
-        let (_, code) = self
-            .bucket
-            .delete_object(path.key())
-            .await
-            .with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
-        if code != 204 {
-            Err(anyhow::format_err!(
-                "Received non-204 exit code during deleting object with key '{}', code: {}",
-                path.key(),
-                code
-            ))
-        } else {
-            Ok(())
-        }
-    }
-
-    async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
-        &self,
-        from: &mut tokio::io::BufReader<R>,
-        to: &Self::RelishStoragePath,
-    ) -> anyhow::Result<()> {
-        let code = self
-            .bucket
-            .put_object_stream(from, to.key())
-            .await
-            .with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
-        if code != 200 {
-            Err(anyhow::format_err!(
-                "Received non-200 exit code during creating object with key '{}', code: {}",
-                to.key(),
-                code
-            ))
-        } else {
-            Ok(())
-        }
-    }
-}
--- a/pageserver/src/relish_storage/synced_storage.rs
+++ b/pageserver/src/relish_storage/synced_storage.rs
@@ -1,57 +0,0 @@
-use std::time::Duration;
-use std::{collections::BinaryHeap, sync::Mutex, thread};
-
-use crate::tenant_mgr;
-use crate::{relish_storage::RelishStorage, PageServerConf};
-
-lazy_static::lazy_static! {
-    static ref UPLOAD_QUEUE: Mutex<BinaryHeap<SyncTask>> = Mutex::new(BinaryHeap::new());
-}
-
-pub fn schedule_timeline_upload(_local_timeline: ()) {
-    // UPLOAD_QUEUE
-    //     .lock()
-    //     .unwrap()
-    //     .push(SyncTask::Upload(local_timeline))
-}
-
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
-enum SyncTask {}
-
-pub fn run_storage_sync_thread<
-    P: std::fmt::Debug,
-    S: 'static + RelishStorage<RelishStoragePath = P>,
->(
-    config: &'static PageServerConf,
-    relish_storage: S,
-    max_concurrent_sync: usize,
-) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()?;
-
-    let handle = thread::Builder::new()
-        .name("Queue based relish storage sync".to_string())
-        .spawn(move || {
-            while !tenant_mgr::shutdown_requested() {
-                let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
-                log::debug!("Upload queue length: {}", queue_accessor.len());
-                let next_task = queue_accessor.pop();
-                drop(queue_accessor);
-                match next_task {
-                    Some(task) => runtime.block_on(async {
-                        // suppress warnings
-                        let _ = (config, task, &relish_storage, max_concurrent_sync);
-                        todo!("omitted for brevity")
-                    }),
-                    None => {
-                        thread::sleep(Duration::from_secs(1));
-                        continue;
-                    }
-                }
-            }
-            log::debug!("Queue based relish storage sync thread shut down");
-            Ok(())
-        })?;
-    Ok(Some(handle))
-}
--- a/pageserver/src/remote_storage.rs
+++ b/pageserver/src/remote_storage.rs
@@ -0,0 +1,331 @@
+//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
+//! This particular module serves as a public API border between pageserver and the internal storage machinery.
+//! No other modules from this tree are supposed to be used directly by the external code.
+//!
+//! There are a few components the storage machinery consists of:
+//! * [`RemoteStorage`] trait: a CRUD-like generic abstraction to use for adapting external storages, with a few implementations:
+//!     * [`local_fs`] allows to use local file system as an external storage
+//!     * [`rust_s3`] uses AWS S3 bucket entirely as an external storage
+//!
+//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
+//!
+//! * public API to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_upload`]
+//!
+//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform:
+//!
+//! +------------------------+                                    +--------->-------+
+//! |                        |  - - - (init async loop) - - - ->  |                 |
+//! |                        |                                    |                 |
+//! |                        |  ------------------------------->  |      async      |
+//! |       pageserver       |   (schedule frozen layer upload)   | upload/download |
+//! |                        |                                    |      loop       |
+//! |                        |  <-------------------------------  |                 |
+//! |                        |    (register downloaded layers)    |                 |
+//! +------------------------+                                    +---------<-------+
+//!                                                                         |
+//!                                                                         |
+//!                                          CRUD layer file operations     |
+//!                                     (upload/download/delete/list, etc.) |
+//!                                                                         V
+//!                                                            +------------------------+
+//!                                                            |                        |
+//!                                                            | [`RemoteStorage`] impl |
+//!                                                            |                        |
+//!                                                            | pageserver assumes it  |
+//!                                                            | owns exclusive write   |
+//!                                                            | access to this storage |
+//!                                                            +------------------------+
+//!
+//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
+//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
+//! If the storage sync loop was successfully started before, pageserver schedules the new image uploads after every checkpoint.
+//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
+//!
+//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`).
+//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
+//! by the storage upload, if enabled.
+//! When a certain image gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same image state.
+//! No files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
+//! when the newer timeline is downloaded.
+//!
+//! Meanwhile, the loop inits the storage connection and checks the remote files stored.
+//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
+//! Based on the remote image data, the storage sync logic queues image downloads, while accepting any potential upload tasks from pageserver and managing the tasks by their priority.
+//! On the image download, a [`crate::tenant_mgr::register_relish_download`] function is called to register the new image in pageserver, initializing all related threads and internal state.
+//!
+//! When the pageserver terminates, the upload loop finishes a current image sync task (if any) and exits.
+//!
+//! NOTES:
+//! * pageserver assumes it has exclusive write access to the remote storage. Whether and how multiple pageservers can be separated in the same storage
+//! (e.g. using different directories in the local filesystem external storage) is totally up to the storage implementation and is not covered by the trait API.
+//!
+//! * the uploads do not happen right after pageserver startup, they are registered when
+//!     1. pageserver does the checkpoint, which happens further in the future after the server start
+//!     2. pageserver loads the timeline from disk for the first time
+//!
+//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with bigger priority could be waiting already
+//!
+//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happen on an image scale: a big set of remote files,
+//! enough to represent (and recover, if needed) a certain timeline state. On the contrary, all internal storage CRUD calls are made per relish file from those images.
+//! This way, the synchronization is able to download the image partially, if some state was synced before, but exposes correctly synced images only.
+
+mod local_fs;
+mod rust_s3;
+mod storage_sync;
+
+use std::{
+    path::{Path, PathBuf},
+    thread,
+};
+
+use anyhow::{anyhow, ensure, Context};
+use tokio::io;
+use zenith_utils::zid::{ZTenantId, ZTimelineId};
+
+pub use self::storage_sync::schedule_timeline_upload;
+use self::{local_fs::LocalFs, rust_s3::S3};
+use crate::{
+    layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
+    PageServerConf, RemoteStorageKind,
+};
+
+/// Based on the config, initiates the remote storage connection and starts a separate thread
+/// that ensures that pageserver and the remote storage are in sync with each other.
+/// If no remote storage configuration is given, no thread or storage initialization is done.
+pub fn run_storage_sync_thread(
+    config: &'static PageServerConf,
+) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
+    match &config.remote_storage_config {
+        Some(storage_config) => {
+            let max_concurrent_sync = storage_config.max_concurrent_sync;
+            let handle = match &storage_config.storage {
+                RemoteStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
+                    config,
+                    LocalFs::new(root.clone(), &config.workdir)?,
+                    max_concurrent_sync,
+                ),
+                RemoteStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
+                    config,
+                    S3::new(s3_config, &config.workdir)?,
+                    max_concurrent_sync,
+                ),
+            };
+            handle.map(Some)
+        }
+        None => Ok(None),
+    }
+}
+
+/// Storage (potentially remote) API to manage its state.
+/// This storage tries to be unaware of any layered repository context,
+/// providing basic CRUD operations with storage files.
+#[async_trait::async_trait]
+trait RemoteStorage: Send + Sync {
+    /// A way to uniquely reference a file in the remote storage.
+    type StoragePath;
+
+    /// Attempts to derive the storage path out of the local path, if the latter is correct.
+    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;
+
+    /// Gets the layered storage information about the given entry.
+    fn info(&self, storage_path: &Self::StoragePath) -> anyhow::Result<RemoteFileInfo>;
+
+    /// Lists all items the storage has right now.
+    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;
+
+    /// Streams the local file contents into the remote storage entry.
+    async fn upload(
+        &self,
+        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        to: &Self::StoragePath,
+    ) -> anyhow::Result<()>;
+
+    /// Streams the remote storage entry contents into the given writer.
+    async fn download(
+        &self,
+        from: &Self::StoragePath,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<()>;
+
+    /// Streams a given byte range of the remote storage entry contents into the given writer.
+    async fn download_range(
+        &self,
+        from: &Self::StoragePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<()>;
+
+    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
+}
+
+/// Information about a certain remote storage entry.
+#[derive(Debug, PartialEq, Eq)]
+struct RemoteFileInfo {
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,
+    /// Path in the pageserver workdir where the file should go to.
+    download_destination: PathBuf,
+    is_metadata: bool,
+}
+
+fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
+    if prefix == path {
+        anyhow::bail!(
+            "Prefix and the path are equal, cannot strip: '{}'",
+            prefix.display()
+        )
+    } else {
+        path.strip_prefix(prefix).with_context(|| {
+            format!(
+                "Path '{}' is not prefixed with '{}'",
+                path.display(),
+                prefix.display(),
+            )
+        })
+    }
+}
+
+fn parse_ids_from_path<'a, R: std::fmt::Display>(
+    path_segments: impl Iterator<Item = &'a str>,
+    path_log_representation: &R,
+) -> anyhow::Result<(ZTenantId, ZTimelineId)> {
+    let mut segments = path_segments.skip_while(|&segment| segment != TENANTS_SEGMENT_NAME);
+    let tenants_segment = segments.next().ok_or_else(|| {
+        anyhow!(
+            "Found no '{}' segment in the storage path '{}'",
+            TENANTS_SEGMENT_NAME,
+            path_log_representation
+        )
+    })?;
+    ensure!(
+        tenants_segment == TENANTS_SEGMENT_NAME,
+        "Failed to extract '{}' segment from storage path '{}'",
+        TENANTS_SEGMENT_NAME,
+        path_log_representation
+    );
+    let tenant_id = segments
+        .next()
+        .ok_or_else(|| {
+            anyhow!(
+                "Found no tenant id in the storage path '{}'",
+                path_log_representation
+            )
+        })?
+        .parse::<ZTenantId>()
+        .with_context(|| {
+            format!(
+                "Failed to parse tenant id from storage path '{}'",
+                path_log_representation
+            )
+        })?;
+
+    let timelines_segment = segments.next().ok_or_else(|| {
+        anyhow!(
+            "Found no '{}' segment in the storage path '{}'",
+            TIMELINES_SEGMENT_NAME,
+            path_log_representation
+        )
+    })?;
+    ensure!(
+        timelines_segment == TIMELINES_SEGMENT_NAME,
+        "Failed to extract '{}' segment from storage path '{}'",
+        TIMELINES_SEGMENT_NAME,
+        path_log_representation
+    );
+    let timeline_id = segments
+        .next()
+        .ok_or_else(|| {
+            anyhow!(
+                "Found no timeline id in the storage path '{}'",
+                path_log_representation
+            )
+        })?
+        .parse::<ZTimelineId>()
+        .with_context(|| {
+            format!(
+                "Failed to parse timeline id from storage path '{}'",
+                path_log_representation
+            )
+        })?;
+
+    Ok((tenant_id, timeline_id))
+}
+
+/// A set of common test utils to share in unit tests inside the module tree.
+#[cfg(test)]
+mod test_utils {
+    use std::path::{Path, PathBuf};
+
+    use anyhow::ensure;
+
+    use crate::{
+        layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
+        repository::repo_harness::{RepoHarness, TIMELINE_ID},
+    };
+
+    /// Gives a timeline path with pageserver workdir stripped off.
+    pub fn relative_timeline_path(harness: &RepoHarness) -> anyhow::Result<PathBuf> {
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+        Ok(timeline_path
+            .strip_prefix(&harness.conf.workdir)?
+            .to_path_buf())
+    }
+
+    /// Creates a path with custom tenant id in one of its segments.
+    /// Useful for emulating paths with wrong ids.
+    pub fn custom_tenant_id_path(
+        path_with_tenant_id: &Path,
+        new_tenant_id: &str,
+    ) -> anyhow::Result<PathBuf> {
+        let mut new_path = PathBuf::new();
+        let mut is_tenant_id = false;
+        let mut tenant_id_replaced = false;
+        for segment in path_with_tenant_id {
+            match segment.to_str() {
+                Some(TENANTS_SEGMENT_NAME) => is_tenant_id = true,
+                Some(_tenant_id_str) if is_tenant_id => {
+                    is_tenant_id = false;
+                    new_path.push(new_tenant_id);
+                    tenant_id_replaced = true;
+                    continue;
+                }
+                _ => {}
+            }
+            new_path.push(segment)
+        }
+
+        ensure!(tenant_id_replaced, "Found no tenant id segment to replace");
+        Ok(new_path)
+    }
+
+    /// Creates a path with custom timeline id in one of its segments.
+    /// Useful for emulating paths with wrong ids.
+    pub fn custom_timeline_id_path(
+        path_with_timeline_id: &Path,
+        new_timeline_id: &str,
+    ) -> anyhow::Result<PathBuf> {
+        let mut new_path = PathBuf::new();
+        let mut is_timeline_id = false;
+        let mut timeline_id_replaced = false;
+        for segment in path_with_timeline_id {
+            match segment.to_str() {
+                Some(TIMELINES_SEGMENT_NAME) => is_timeline_id = true,
+                Some(_timeline_id_str) if is_timeline_id => {
+                    is_timeline_id = false;
+                    new_path.push(new_timeline_id);
+                    timeline_id_replaced = true;
+                    continue;
+                }
+                _ => {}
+            }
+            new_path.push(segment)
+        }
+
+        ensure!(
+            timeline_id_replaced,
+            "Found no timeline id segment to replace"
+        );
+        Ok(new_path)
+    }
+}
--- a/pageserver/src/remote_storage/README.md
+++ b/pageserver/src/remote_storage/README.md
@@ -0,0 +1,82 @@
+# Non-implementation details
+
+This document describes the current state of the backup system in pageserver, existing limitations and concerns, why some things are done the way they are, and the future development plans.
+Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../remote_storage.rs) and its submodules.
+Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
+
+## Approach
+
+Backup functionality is a new component that appeared well after the core DB functionality was implemented.
+Pageserver layer functionality is also quite volatile at the moment, there's a risk its local file management changes over time.
+
+To avoid adding more chaos into that, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop.
+This way, the backups are managed in the background without directly affecting other pageserver parts: the backup and restoration process may lag behind, but eventually catches up with reality. To track that, a set of prometheus metrics is exposed from pageserver.
+
+## What's done
+
+Current implementation
+* provides remote storage wrappers for AWS S3 and local FS
+* uploads layers, frozen by pageserver checkpoint thread
+* downloads and registers layers, found on the remote storage, but missing locally
+
+No good optimisations or performance testing is done, the feature is disabled by default and gets polished over time.
+It's planned to deal with all questions that are currently open and prepare the feature to be enabled by default in cloud environments.
+
+### Peculiarities
+
+As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
+Here's the list of known compromises with comments:
+
+* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage.
+This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
+AWS charges both per API call and for traffic, and layers are expected to be updated frequently, so this model is most probably inefficient.
+Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
+
+Storage sync API operates on images when backing up or restoring a backup, so we're free to repack the layer contents the way we want to, which most probably will be done later.
+
+* no proper file comparison
+
+Currently, every layer contains `Lsn` in their name, to map the data it holds against a certain DB state.
+Then the images with same ids and different `Lsn`'s are compared, files are considered equal if their local file paths are equal (for remote files, "local file path" is their download destination).
+No file contents assertion is done currently, but should be.
+AWS S3 returns file checksums during the `list` operation, so that can be used to ensure the backup consistency, but that needs further research, since the current pageserver impl also needs to deal with layer file checksums.
+
+For now, due to this, we consider local workdir files as source of truth, not removing them ever and adjusting remote files instead, if image files mismatch.
+
+* no proper retry management
+
+Now, the storage sync attempts to redo the upload/download operation for the image files that failed.
+No proper task eviction or backpressure is implemented currently: the tasks will stay in the queue forever, reattempting the downloads.
+
+This will be fixed when more details on the file consistency model will be agreed on.
+
+* sad rust-s3 api
+
+rust-s3 is not very pleasant to use:
+1. it returns `anyhow::Result` and it's hard to distinguish "missing file" cases from "no connection" one, for instance
+2. at least one function in its API that we need (`get_object_stream`) has `async` keyword and blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091)
+3. it's a prerelease library with unclear maintenance status
+4. noisy on debug level
+
+But it's already used in the project, so for now it's reused to avoid bloating the dependency tree.
+Based on previous evaluation, even `rusoto-s3` could be a better choice over this library, but needs further benchmarking.
+
+
+* gc and branches are ignored
+
+So far, we don't consider non-main images and don't adjust the remote storage based on GC thread loop results.
+Only checkpointer loop affects the remote storage.
+
+* more layers should be downloaded on demand
+
+Since we download and load remote layers into pageserver, there's a possibility that a need for those layers' ancestors arises.
+Most probably, every downloaded image's ancestor is not present locally either, but currently there's no logic for downloading such ancestors and their metadata,
+so the pageserver is unable to respond properly to requests for such ancestors.
+
+To implement the downloading, more `tenant_mgr` refactoring is needed to properly handle web requests for layers and handle the state changes.
+[Here](https://github.com/zenithdb/zenith/pull/689#issuecomment-931216193) are the details about initial state management updates needed.
+
+* no IT tests
+
+Automated S3 testing is lacking currently, due to no convenient way to enable backups during the tests.
+After it's fixed, benchmark runs should also be carried out to find bottlenecks.
--- a/pageserver/src/remote_storage/local_fs.rs
+++ b/pageserver/src/remote_storage/local_fs.rs
@@ -0,0 +1,729 @@
+//! Local filesystem acting as a remote storage.
+//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
+//!
+//! This storage is used in pageserver tests, but can also be used in cases when a certain persistent
+//! volume is mounted to the local FS.
+
+use std::{
+    ffi::OsStr,
+    future::Future,
+    path::{Path, PathBuf},
+    pin::Pin,
+};
+
+use anyhow::{bail, ensure, Context};
+use tokio::{
+    fs,
+    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
+};
+use tracing::*;
+
+use super::{parse_ids_from_path, strip_path_prefix, RemoteFileInfo, RemoteStorage};
+use crate::layered_repository::metadata::METADATA_FILE_NAME;
+
+/// Remote storage implementation that keeps the "remote" files in a local directory.
+pub struct LocalFs {
+    /// Prefix that is stripped from local paths to compute storage paths,
+    /// and prepended to storage-relative paths to compute download destinations.
+    pageserver_workdir: &'static Path,
+    /// Directory under which all paths of this storage are resolved.
+    root: PathBuf,
+}
+
+impl LocalFs {
+    /// Builds a local FS storage rooted at `root`, creating the root directory
+    /// (with all of its missing parents) when it does not exist yet.
+    pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
+        let root_is_missing = !root.exists();
+        if root_is_missing {
+            std::fs::create_dir_all(&root).with_context(|| {
+                format!(
+                    "Failed to create all directories in the given root path '{}'",
+                    root.display(),
+                )
+            })?;
+        }
+        let storage = Self {
+            pageserver_workdir,
+            root,
+        };
+        Ok(storage)
+    }
+
+    /// Turns `path` into a path inside the storage root:
+    /// relative paths are joined onto the root, absolute paths are accepted
+    /// only when they already start with the root.
+    fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
+        if path.is_relative() {
+            return Ok(self.root.join(path));
+        }
+        if !path.starts_with(&self.root) {
+            bail!(
+                "Path '{}' does not belong to the current storage",
+                path.display()
+            )
+        }
+        Ok(path.to_path_buf())
+    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for LocalFs {
+    type StoragePath = PathBuf;
+
+    /// Maps a path inside the pageserver workdir onto the same relative path under the storage root.
+    /// Errors when `local_path` cannot be stripped of the workdir prefix.
+    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
+        let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)
+            .context("local path does not belong to this storage")?;
+        Ok(self.root.join(relative_path))
+    }
+
+    /// Derives the file info (tenant id, timeline id, download destination, metadata flag)
+    /// from a storage path alone.
+    /// Errors when the path cannot be stripped of the storage root prefix or the ids cannot be parsed from it.
+    fn info(&self, storage_path: &Self::StoragePath) -> anyhow::Result<RemoteFileInfo> {
+        // A file counts as metadata purely by its file name.
+        let is_metadata =
+            storage_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME);
+        let relative_path = strip_path_prefix(&self.root, storage_path)
+            .context("local path does not belong to this storage")?;
+        // The download destination mirrors the storage-relative path inside the pageserver workdir.
+        let download_destination = self.pageserver_workdir.join(relative_path);
+        // Path segments that are not valid UTF-8 are skipped before the ids are parsed.
+        let (tenant_id, timeline_id) = parse_ids_from_path(
+            relative_path.iter().filter_map(|segment| segment.to_str()),
+            &relative_path.display(),
+        )?;
+        Ok(RemoteFileInfo {
+            tenant_id,
+            timeline_id,
+            download_destination,
+            is_metadata,
+        })
+    }
+
+    /// Lists the files found by recursively walking the storage root.
+    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
+        let stored_files = get_all_files(&self.root).await?;
+        Ok(stored_files)
+    }
+
+    /// Copies everything readable from `from` into the storage file at `to`,
+    /// creating the file and its parent directories when they are missing.
+    ///
+    /// `to` is resolved with `resolve_in_storage`, so paths outside of the storage root are rejected.
+    /// An already existing target file is overwritten completely.
+    async fn upload(
+        &self,
+        mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        to: &Self::StoragePath,
+    ) -> anyhow::Result<()> {
+        let target_file_path = self.resolve_in_storage(to)?;
+        create_target_directory(&target_file_path).await?;
+        let mut destination = io::BufWriter::new(
+            fs::OpenOptions::new()
+                .write(true)
+                .create(true)
+                // Without truncation, re-uploading shorter contents over an existing file
+                // would leave the tail of the previous upload in place.
+                .truncate(true)
+                .open(&target_file_path)
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to open target fs destination at '{}'",
+                        target_file_path.display()
+                    )
+                })?,
+        );
+
+        io::copy(&mut from, &mut destination)
+            .await
+            .context("Failed to upload a file to the local storage")?;
+        // The writer is buffered: flush explicitly so that write errors surface here.
+        destination
+            .flush()
+            .await
+            .context("Failed to upload a file to the local storage")?;
+        Ok(())
+    }
+
+    /// Copies the whole contents of the storage file `from` into `to`.
+    /// Errors when the resolved path does not exist or is not a file.
+    async fn download(
+        &self,
+        from: &Self::StoragePath,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<()> {
+        let file_path = self.resolve_in_storage(from)?;
+
+        if !(file_path.exists() && file_path.is_file()) {
+            bail!(
+                "File '{}' either does not exist or is not a file",
+                file_path.display()
+            )
+        }
+
+        let source_file = fs::OpenOptions::new()
+            .read(true)
+            .open(&file_path)
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to open source file '{}' to use in the download",
+                    file_path.display()
+                )
+            })?;
+        let mut source = io::BufReader::new(source_file);
+        io::copy(&mut source, to)
+            .await
+            .context("Failed to download a file from the local storage")?;
+        Ok(())
+    }
+
+    /// Copies the bytes of the storage file `from` into `to`, starting at offset `start_inclusive`
+    /// and stopping before `end_exclusive` (or at the end of the file when it is `None`).
+    ///
+    /// Errors when `end_exclusive` is not greater than `start_inclusive`,
+    /// or when the resolved path does not exist or is not a file.
+    async fn download_range(
+        &self,
+        from: &Self::StoragePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<()> {
+        if let Some(end_exclusive) = end_exclusive {
+            // Both empty (`end == start`) and inverted ranges are rejected.
+            ensure!(
+                end_exclusive > start_inclusive,
+                "Invalid range, start ({}) is bigger then end ({:?})",
+                start_inclusive,
+                end_exclusive
+            );
+            // NOTE(review): a range of exactly one byte (`end_exclusive == start_inclusive + 1`)
+            // returns here without writing anything and before the file is even resolved —
+            // confirm that this is the intended contract.
+            if start_inclusive == end_exclusive.saturating_sub(1) {
+                return Ok(());
+            }
+        }
+        let file_path = self.resolve_in_storage(from)?;
+
+        if file_path.exists() && file_path.is_file() {
+            let mut source = io::BufReader::new(
+                fs::OpenOptions::new()
+                    .read(true)
+                    .open(&file_path)
+                    .await
+                    .with_context(|| {
+                        format!(
+                            "Failed to open source file '{}' to use in the download",
+                            file_path.display()
+                        )
+                    })?,
+            );
+            // Position the reader at the first requested byte.
+            source
+                .seek(io::SeekFrom::Start(start_inclusive))
+                .await
+                .context("Failed to seek to the range start in a local storage file")?;
+            match end_exclusive {
+                Some(end_exclusive) => {
+                    // Cap the copy at the length of the requested range.
+                    io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
+                }
+                // Open-ended range: copy everything up to the end of the file.
+                None => io::copy(&mut source, to).await,
+            }
+            .with_context(|| {
+                format!(
+                    "Failed to download file '{}' range from the local storage",
+                    file_path.display()
+                )
+            })?;
+            Ok(())
+        } else {
+            bail!(
+                "File '{}' either does not exist or is not a file",
+                file_path.display()
+            )
+        }
+    }
+
+    /// Removes the storage file at `path`.
+    /// Errors when the resolved path does not exist or is not a file.
+    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
+        let file_path = self.resolve_in_storage(path)?;
+        if !(file_path.exists() && file_path.is_file()) {
+            bail!(
+                "File '{}' either does not exist or is not a file",
+                file_path.display()
+            )
+        }
+        fs::remove_file(file_path).await?;
+        Ok(())
+    }
+}
+
+/// Recursively collects the paths of all entries under `directory_path`
+/// that are neither directories nor symlinks (symlinks are skipped, not followed).
+///
+/// Returns an empty list when `directory_path` does not exist,
+/// and an error when it exists but is not a directory.
+/// The future is boxed so that the function can await itself for nested directories.
+fn get_all_files<'a, P>(
+    directory_path: P,
+) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
+where
+    P: AsRef<Path> + Send + Sync + 'a,
+{
+    Box::pin(async move {
+        let directory_path = directory_path.as_ref();
+        if directory_path.exists() {
+            if directory_path.is_dir() {
+                let mut paths = Vec::new();
+                let mut dir_contents = fs::read_dir(directory_path).await?;
+                while let Some(dir_entry) = dir_contents.next_entry().await? {
+                    let file_type = dir_entry.file_type().await?;
+                    let entry_path = dir_entry.path();
+                    if file_type.is_symlink() {
+                        debug!("{:?} is a symlink, skipping", entry_path)
+                    } else if file_type.is_dir() {
+                        // Descend into the nested directory and take over all files found there.
+                        paths.extend(get_all_files(entry_path).await?.into_iter())
+                    } else {
+                        paths.push(entry_path);
+                    }
+                }
+                Ok(paths)
+            } else {
+                bail!("Path '{}' is not a directory", directory_path.display())
+            }
+        } else {
+            Ok(Vec::new())
+        }
+    })
+}
+
+/// Makes sure the parent directory of `target_file_path` exists, creating it
+/// (with all missing ancestors) if needed. Errors when the path has no parent.
+async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
+    if let Some(parent_dir) = target_file_path.parent() {
+        if !parent_dir.exists() {
+            fs::create_dir_all(parent_dir).await?;
+        }
+        Ok(())
+    } else {
+        bail!(
+            "File path '{}' has no parent directory",
+            target_file_path.display()
+        )
+    }
+}
+
+#[cfg(test)]
+mod pure_tests {
+    use crate::{
+        layered_repository::metadata::METADATA_FILE_NAME,
+        remote_storage::test_utils::{
+            custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
+        },
+        repository::repo_harness::{RepoHarness, TIMELINE_ID},
+    };
+
+    use super::*;
+
+    #[test]
+    fn storage_path_positive() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("storage_path_positive")?;
+        let storage_root = PathBuf::from("somewhere").join("else");
+        let storage = LocalFs {
+            pageserver_workdir: &repo_harness.conf.workdir,
+            root: storage_root.clone(),
+        };
+
+        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name");
+        let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
+
+        assert_eq!(
+            expected_path,
+            storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
+            "File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn storage_path_negatives() -> anyhow::Result<()> {
+        #[track_caller]
+        fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
+            match storage.storage_path(mismatching_path) {
+                Ok(wrong_path) => panic!(
+                    "Expected path '{}' to error, but got storage path: {:?}",
+                    mismatching_path.display(),
+                    wrong_path,
+                ),
+                Err(e) => format!("{:?}", e),
+            }
+        }
+
+        let repo_harness = RepoHarness::create("storage_path_negatives")?;
+        let storage_root = PathBuf::from("somewhere").join("else");
+        let storage = LocalFs {
+            pageserver_workdir: &repo_harness.conf.workdir,
+            root: storage_root,
+        };
+
+        let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
+        assert!(error_string.contains("does not belong to this storage"));
+        assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
+
+        let mismatching_path_str = "/something/else";
+        let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
+        assert!(
+            error_message.contains(mismatching_path_str),
+            "Error should mention wrong path"
+        );
+        assert!(
+            error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
+            "Error should mention server workdir"
+        );
+        assert!(error_message.contains("does not belong to this storage"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn info_positive() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("info_positive")?;
+        let storage_root = PathBuf::from("somewhere").join("else");
+        let storage = LocalFs {
+            pageserver_workdir: &repo_harness.conf.workdir,
+            root: storage_root.clone(),
+        };
+
+        let name = "not a metadata";
+        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
+        assert_eq!(
+            RemoteFileInfo {
+                tenant_id: repo_harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+                download_destination: local_path.clone(),
+                is_metadata: false,
+            },
+            storage
+                .info(&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?))
+                .expect("For a valid input, valid S3 info should be parsed"),
+            "Should be able to parse metadata out of the correctly named remote delta file"
+        );
+
+        let local_metadata_path = repo_harness
+            .timeline_path(&TIMELINE_ID)
+            .join(METADATA_FILE_NAME);
+        let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
+        assert_eq!(
+            RemoteFileInfo {
+                tenant_id: repo_harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+                download_destination: local_metadata_path,
+                is_metadata: true,
+            },
+            storage
+                .info(&remote_metadata_path)
+                .expect("For a valid input, valid S3 info should be parsed"),
+            "Should be able to parse metadata out of the correctly named remote metadata file"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn info_negatives() -> anyhow::Result<()> {
+        #[track_caller]
+        #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.info` parameter requirements
+        fn storage_info_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
+            match storage.info(storage_path) {
+                Ok(wrong_info) => panic!(
+                    "Expected storage path input {:?} to cause an error, but got file info: {:?}",
+                    storage_path, wrong_info,
+                ),
+                Err(e) => format!("{:?}", e),
+            }
+        }
+
+        let repo_harness = RepoHarness::create("info_negatives")?;
+        let storage_root = PathBuf::from("somewhere").join("else");
+        let storage = LocalFs {
+            pageserver_workdir: &repo_harness.conf.workdir,
+            root: storage_root.clone(),
+        };
+
+        let totally_wrong_path = "wrong_wrong_wrong";
+        let error_message = storage_info_error(&storage, &PathBuf::from(totally_wrong_path));
+        assert!(error_message.contains(totally_wrong_path));
+
+        let relative_timeline_path = relative_timeline_path(&repo_harness)?;
+
+        let relative_file_path = custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?
+            .join("wrong_tenant_id_name");
+        let wrong_tenant_id_path = storage_root.join(&relative_file_path);
+        let error_message = storage_info_error(&storage, &wrong_tenant_id_path);
+        assert!(
+            error_message.contains(relative_file_path.to_str().unwrap()),
+            "Error message '{}' does not contain the expected substring",
+            error_message
+        );
+
+        let relative_file_path =
+            custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?
+                .join("wrong_timeline_id_name");
+        let wrong_timeline_id_path = storage_root.join(&relative_file_path);
+        let error_message = storage_info_error(&storage, &wrong_timeline_id_path);
+        assert!(
+            error_message.contains(relative_file_path.to_str().unwrap()),
+            "Error message '{}' does not contain the expected substring",
+            error_message
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn download_destination_matches_original_path() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
+        let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
+
+        let storage_root = PathBuf::from("somewhere").join("else");
+        let dummy_storage = LocalFs {
+            pageserver_workdir: &repo_harness.conf.workdir,
+            root: storage_root,
+        };
+
+        let storage_path = dummy_storage.storage_path(&original_path)?;
+        let download_destination = dummy_storage.info(&storage_path)?.download_destination;
+
+        assert_eq!(
+            original_path, download_destination,
+            "'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
+        );
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod fs_tests {
+    use super::*;
+    use crate::{
+        remote_storage::test_utils::relative_timeline_path, repository::repo_harness::RepoHarness,
+    };
+
+    use std::io::Write;
+    use tempfile::tempdir;
+
+    #[tokio::test]
+    async fn upload_file() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("upload_file")?;
+        let storage = create_storage()?;
+
+        let source = create_file_for_upload(
+            &storage.pageserver_workdir.join("whatever"),
+            "whatever_contents",
+        )
+        .await?;
+        let target_path = PathBuf::from("/").join("somewhere").join("else");
+        match storage.upload(source, &target_path).await {
+            Ok(()) => panic!("Should not allow storing files with wrong target path"),
+            Err(e) => {
+                let message = format!("{:?}", e);
+                assert!(message.contains(&target_path.display().to_string()));
+                assert!(message.contains("does not belong to the current storage"));
+            }
+        }
+        assert!(storage.list().await?.is_empty());
+
+        let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
+        assert_eq!(
+            storage.list().await?,
+            vec![target_path_1.clone()],
+            "Should list a single file after first upload"
+        );
+
+        let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
+        assert_eq!(
+            list_files_sorted(&storage).await?,
+            vec![target_path_1.clone(), target_path_2.clone()],
+            "Should list a two different files after second upload"
+        );
+
+        Ok(())
+    }
+
+    /// Creates a `LocalFs` whose workdir and root are paths taken from two separate `tempdir()` calls.
+    fn create_storage() -> anyhow::Result<LocalFs> {
+        // The workdir path is leaked to satisfy the `&'static Path` field of `LocalFs`.
+        // NOTE(review): the value returned by `tempdir()` is a temporary in both statements, so
+        // presumably the directories are removed as soon as their paths are copied out and only
+        // the (unique) paths are reused afterwards — confirm nothing relies on automatic cleanup.
+        let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
+        let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
+        Ok(storage)
+    }
+
+    #[tokio::test]
+    async fn download_file() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("download_file")?;
+        let storage = create_storage()?;
+        let upload_name = "upload_1";
+        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
+
+        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        storage.download(&upload_target, &mut content_bytes).await?;
+        content_bytes.flush().await?;
+
+        let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
+        assert_eq!(
+            dummy_contents(upload_name),
+            contents,
+            "We should upload and download the same contents"
+        );
+
+        let non_existing_path = PathBuf::from("somewhere").join("else");
+        match storage.download(&non_existing_path, &mut io::sink()).await {
+            Ok(_) => panic!("Should not allow downloading non-existing storage files"),
+            Err(e) => {
+                let error_string = e.to_string();
+                assert!(error_string.contains("does not exist"));
+                assert!(error_string.contains(&non_existing_path.display().to_string()));
+            }
+        }
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn download_file_range_positive() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("download_file_range_positive")?;
+        let storage = create_storage()?;
+        let upload_name = "upload_1";
+        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
+
+        let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        storage
+            .download_range(&upload_target, 0, None, &mut full_range_bytes)
+            .await?;
+        full_range_bytes.flush().await?;
+        assert_eq!(
+            dummy_contents(upload_name),
+            String::from_utf8(full_range_bytes.into_inner().into_inner())?,
+            "Download full range should return the whole upload"
+        );
+
+        let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        let same_byte = 1_000_000_000;
+        storage
+            .download_range(
+                &upload_target,
+                same_byte,
+                Some(same_byte + 1), // exclusive end
+                &mut zero_range_bytes,
+            )
+            .await?;
+        zero_range_bytes.flush().await?;
+        assert!(
+            zero_range_bytes.into_inner().into_inner().is_empty(),
+            "Zero byte range should not download any part of the file"
+        );
+
+        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
+        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
+
+        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        storage
+            .download_range(
+                &upload_target,
+                0,
+                Some(first_part_local.len() as u64),
+                &mut first_part_remote,
+            )
+            .await?;
+        first_part_remote.flush().await?;
+        let first_part_remote = first_part_remote.into_inner().into_inner();
+        assert_eq!(
+            first_part_local,
+            first_part_remote.as_slice(),
+            "First part bytes should be returned when requrested"
+        );
+
+        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        storage
+            .download_range(
+                &upload_target,
+                first_part_local.len() as u64,
+                Some((first_part_local.len() + second_part_local.len()) as u64),
+                &mut second_part_remote,
+            )
+            .await?;
+        second_part_remote.flush().await?;
+        let second_part_remote = second_part_remote.into_inner().into_inner();
+        assert_eq!(
+            second_part_local,
+            second_part_remote.as_slice(),
+            "Second part bytes should be returned when requrested"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn download_file_range_negative() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("download_file_range_negative")?;
+        let storage = create_storage()?;
+        let upload_name = "upload_1";
+        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
+
+        let start = 10000;
+        let end = 234;
+        assert!(start > end, "Should test an incorrect range");
+        match storage
+            .download_range(&upload_target, start, Some(end), &mut io::sink())
+            .await
+        {
+            Ok(_) => panic!("Should not allow downloading wrong ranges"),
+            Err(e) => {
+                let error_string = e.to_string();
+                assert!(error_string.contains("Invalid range"));
+                assert!(error_string.contains(&start.to_string()));
+                assert!(error_string.contains(&end.to_string()));
+            }
+        }
+
+        let non_existing_path = PathBuf::from("somewhere").join("else");
+        match storage
+            .download_range(&non_existing_path, 1, Some(3), &mut io::sink())
+            .await
+        {
+            Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
+            Err(e) => {
+                let error_string = e.to_string();
+                assert!(error_string.contains("does not exist"));
+                assert!(error_string.contains(&non_existing_path.display().to_string()));
+            }
+        }
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn delete_file() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("delete_file")?;
+        let storage = create_storage()?;
+        let upload_name = "upload_1";
+        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
+
+        storage.delete(&upload_target).await?;
+        assert!(storage.list().await?.is_empty());
+
+        match storage.delete(&upload_target).await {
+            Ok(()) => panic!("Should not allow deleting non-existing storage files"),
+            Err(e) => {
+                let error_string = e.to_string();
+                assert!(error_string.contains("does not exist"));
+                assert!(error_string.contains(&upload_target.display().to_string()));
+            }
+        }
+        Ok(())
+    }
+
+    async fn upload_dummy_file(
+        harness: &RepoHarness,
+        storage: &LocalFs,
+        name: &str,
+    ) -> anyhow::Result<PathBuf> {
+        let storage_path = storage
+            .root
+            .join(relative_timeline_path(harness)?)
+            .join(name);
+        storage
+            .upload(
+                create_file_for_upload(
+                    &storage.pageserver_workdir.join(name),
+                    &dummy_contents(name),
+                )
+                .await?,
+                &storage_path,
+            )
+            .await?;
+        Ok(storage_path)
+    }
+
+    async fn create_file_for_upload(
+        path: &Path,
+        contents: &str,
+    ) -> anyhow::Result<io::BufReader<fs::File>> {
+        std::fs::create_dir_all(path.parent().unwrap())?;
+        let mut file_for_writing = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .open(path)?;
+        write!(file_for_writing, "{}", contents)?;
+        drop(file_for_writing);
+        Ok(io::BufReader::new(
+            fs::OpenOptions::new().read(true).open(&path).await?,
+        ))
+    }
+
+    fn dummy_contents(name: &str) -> String {
+        format!("contents for {}", name)
+    }
+
+    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
+        let mut files = storage.list().await?;
+        files.sort();
+        Ok(files)
+    }
+}
--- a/pageserver/src/remote_storage/rust_s3.rs
+++ b/pageserver/src/remote_storage/rust_s3.rs
@@ -0,0 +1,433 @@
+//! AWS S3 storage wrapper around `rust_s3` library.
+//! Currently does not allow multiple pageservers to use the same bucket concurrently: objects are
+//! placed in the root of the bucket.
+
+use std::path::{Path, PathBuf};
+
+use anyhow::Context;
+use s3::{bucket::Bucket, creds::Credentials, region::Region};
+use tokio::io::{self, AsyncWriteExt};
+
+use crate::{
+    layered_repository::metadata::METADATA_FILE_NAME,
+    remote_storage::{parse_ids_from_path, strip_path_prefix, RemoteFileInfo, RemoteStorage},
+    S3Config,
+};
+
+const S3_FILE_SEPARATOR: char = '/';
+
+#[derive(Debug, Eq, PartialEq)]
+pub struct S3ObjectKey(String);
+
+impl S3ObjectKey {
+    fn key(&self) -> &str {
+        &self.0
+    }
+
+    fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf {
+        pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>())
+    }
+}
+
+/// AWS S3 storage.
+pub struct S3 {
+    pageserver_workdir: &'static Path,
+    bucket: Bucket,
+}
+
+impl S3 {
+    /// Creates the storage, errors if incorrect AWS S3 configuration provided.
+    pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
+        let region = aws_config
+            .bucket_region
+            .parse::<Region>()
+            .context("Failed to parse the s3 region from config")?;
+        let credentials = Credentials::new(
+            aws_config.access_key_id.as_deref(),
+            aws_config.secret_access_key.as_deref(),
+            None,
+            None,
+            None,
+        )
+        .context("Failed to create the s3 credentials")?;
+        Ok(Self {
+            bucket: Bucket::new_with_path_style(
+                aws_config.bucket_name.as_str(),
+                region,
+                credentials,
+            )
+            .context("Failed to create the s3 bucket")?,
+            pageserver_workdir,
+        })
+    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for S3 {
+    type StoragePath = S3ObjectKey;
+
+    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
+        let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
+        let mut key = String::new();
+        for segment in relative_path {
+            key.push(S3_FILE_SEPARATOR);
+            key.push_str(&segment.to_string_lossy());
+        }
+        Ok(S3ObjectKey(key))
+    }
+
+    fn info(&self, storage_path: &Self::StoragePath) -> anyhow::Result<RemoteFileInfo> {
+        let storage_path_key = &storage_path.0;
+        let is_metadata =
+            storage_path_key.ends_with(&format!("{}{}", S3_FILE_SEPARATOR, METADATA_FILE_NAME));
+        let download_destination = storage_path.download_destination(self.pageserver_workdir);
+        let (tenant_id, timeline_id) =
+            parse_ids_from_path(storage_path_key.split(S3_FILE_SEPARATOR), storage_path_key)?;
+        Ok(RemoteFileInfo {
+            tenant_id,
+            timeline_id,
+            download_destination,
+            is_metadata,
+        })
+    }
+
+    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
+        let list_response = self
+            .bucket
+            .list(String::new(), None)
+            .await
+            .context("Failed to list s3 objects")?;
+
+        Ok(list_response
+            .into_iter()
+            .flat_map(|response| response.contents)
+            .map(|s3_object| S3ObjectKey(s3_object.key))
+            .collect())
+    }
+
+    /// Buffers the whole `from` stream in memory and puts it into the bucket under the `to` key.
+    /// Errors if the contents cannot be read or S3 answers with a status other than 200.
+    async fn upload(
+        &self,
+        mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        to: &Self::StoragePath,
+    ) -> anyhow::Result<()> {
+        let mut upload_contents = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        io::copy(&mut from, &mut upload_contents)
+            .await
+            .context("Failed to read the upload contents")?;
+        upload_contents
+            .flush()
+            .await
+            .context("Failed to flush the upload contents into the buffer")?;
+        let upload_contents = upload_contents.into_inner().into_inner();
+
+        let (_, code) = self
+            .bucket
+            .put_object(to.key(), &upload_contents)
+            .await
+            .with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
+        anyhow::ensure!(
+            code == 200,
+            "Received non-200 exit code during creating object with key '{}', code: {}",
+            to.key(),
+            code
+        );
+        Ok(())
+    }
+
+    /// Downloads the whole S3 object stored under the `from` key and writes its contents into `to`.
+    async fn download(
+        &self,
+        from: &Self::StoragePath,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<()> {
+        let (data, code) = self
+            .bucket
+            .get_object(from.key())
+            .await
+            .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
+        anyhow::ensure!(
+            code == 200,
+            "Received non-200 exit code during downloading object with key '{}', code: {}",
+            from.key(),
+            code
+        );
+        // we don't have to write vector into the destination this way, `to.write_all` would be enough.
+        // but we want to prepare for migration to `rusoto`, that has a streaming HTTP body instead here, with
+        // which it makes more sense to use `io::copy`.
+        io::copy(&mut data.as_slice(), to)
+            .await
+            .context("Failed to write downloaded data into the destination buffer")?;
+        Ok(())
+    }
+
+    /// Downloads the `[start_inclusive, end_exclusive)` byte range of the `from` object and writes it into `to`.
+    async fn download_range(
+        &self,
+        from: &Self::StoragePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<()> {
+        // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
+        // and needs both ends to be inclusive, hence the exclusive end is shifted back by one byte
+        let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
+        let (data, code) = self
+            .bucket
+            .get_object_range(from.key(), start_inclusive, end_inclusive)
+            .await
+            .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
+        anyhow::ensure!(
+            code == 206,
+            "Received non-206 exit code during downloading object range with key '{}', code: {}",
+            from.key(),
+            code
+        );
+        // see `download` function above for the comment on why `Vec<u8>` buffer is copied this way
+        io::copy(&mut data.as_slice(), to)
+            .await
+            .context("Failed to write downloaded range into the destination buffer")?;
+        Ok(())
+    }
+
+    /// Removes the object stored under the `path` key from the bucket.
+    /// Any response status other than 204 is reported as an error.
+    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
+        let object_key = path.key();
+        let (_, status) = self
+            .bucket
+            .delete_object(object_key)
+            .await
+            .with_context(|| format!("Failed to delete s3 object with key {}", object_key))?;
+        anyhow::ensure!(
+            status == 204,
+            "Received non-204 exit code during deleting object with key '{}', code: {}",
+            object_key,
+            status
+        );
+        Ok(())
+    }
+
+#[cfg(test)]
+mod tests {
+    use crate::{
+        remote_storage::test_utils::{
+            custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
+        },
+        repository::repo_harness::{RepoHarness, TIMELINE_ID},
+    };
+
+    use super::*;
+
+    #[test]
+    fn download_destination() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("download_destination")?;
+
+        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
+        let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
+
+        let key = S3ObjectKey(format!(
+            "{}{}",
+            S3_FILE_SEPARATOR,
+            relative_path
+                .iter()
+                .map(|segment| segment.to_str().unwrap())
+                .collect::<Vec<_>>()
+                .join(&S3_FILE_SEPARATOR.to_string()),
+        ));
+
+        assert_eq!(
+            local_path,
+            key.download_destination(&repo_harness.conf.workdir),
+            "Download destination should consist of s3 path joined with the pageserver workdir prefix"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn storage_path_positive() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("storage_path_positive")?;
+
+        let segment_1 = "matching";
+        let segment_2 = "file";
+        let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
+        let expected_key = S3ObjectKey(format!(
+            "{SEPARATOR}{}{SEPARATOR}{}",
+            segment_1,
+            segment_2,
+            SEPARATOR = S3_FILE_SEPARATOR,
+        ));
+
+        let actual_key = dummy_storage(&repo_harness.conf.workdir)
+            .storage_path(local_path)
+            .expect("Matching path should map to S3 path normally");
+        assert_eq!(
+            expected_key,
+            actual_key,
+            "S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn storage_path_negatives() -> anyhow::Result<()> {
+        #[track_caller]
+        fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String {
+            match storage.storage_path(mismatching_path) {
+                Ok(wrong_key) => panic!(
+                    "Expected path '{}' to error, but got S3 key: {:?}",
+                    mismatching_path.display(),
+                    wrong_key,
+                ),
+                Err(e) => e.to_string(),
+            }
+        }
+
+        let repo_harness = RepoHarness::create("storage_path_negatives")?;
+        let storage = dummy_storage(&repo_harness.conf.workdir);
+
+        let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
+        assert!(
+            error_message.contains("Prefix and the path are equal"),
+            "Message '{}' does not contain the required string",
+            error_message
+        );
+
+        let mismatching_path = PathBuf::from("somewhere").join("else");
+        let error_message = storage_path_error(&storage, &mismatching_path);
+        assert!(
+            error_message.contains(mismatching_path.to_str().unwrap()),
+            "Error should mention wrong path"
+        );
+        assert!(
+            error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
+            "Error should mention server workdir"
+        );
+        assert!(
+            error_message.contains("is not prefixed with"),
+            "Message '{}' does not contain a required string",
+            error_message
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn info_positive() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("info_positive")?;
+        let storage = dummy_storage(&repo_harness.conf.workdir);
+        let relative_timeline_path = relative_timeline_path(&repo_harness)?;
+
+        let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata"));
+        assert_eq!(
+            RemoteFileInfo {
+                tenant_id: repo_harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+                download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
+                is_metadata: false,
+            },
+            storage
+                .info(&s3_key)
+                .expect("For a valid input, valid S3 info should be parsed"),
+            "Should be able to parse metadata out of the correctly named remote delta file"
+        );
+
+        let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME));
+        assert_eq!(
+            RemoteFileInfo {
+                tenant_id: repo_harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+                download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
+                is_metadata: true,
+            },
+            storage
+                .info(&s3_key)
+                .expect("For a valid input, valid S3 info should be parsed"),
+            "Should be able to parse metadata out of the correctly named remote metadata file"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn info_negatives() -> anyhow::Result<()> {
+        #[track_caller]
+        fn storage_info_error(storage: &S3, s3_key: &S3ObjectKey) -> String {
+            match storage.info(s3_key) {
+                Ok(wrong_info) => panic!(
+                    "Expected key {:?} to error, but got file info: {:?}",
+                    s3_key, wrong_info,
+                ),
+                Err(e) => e.to_string(),
+            }
+        }
+
+        let repo_harness = RepoHarness::create("info_negatives")?;
+        let storage = dummy_storage(&repo_harness.conf.workdir);
+        let relative_timeline_path = relative_timeline_path(&repo_harness)?;
+
+        let totally_wrong_path = "wrong_wrong_wrong";
+        let error_message =
+            storage_info_error(&storage, &S3ObjectKey(totally_wrong_path.to_string()));
+        assert!(error_message.contains(totally_wrong_path));
+
+        let wrong_tenant_id = create_s3_key(
+            &custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?.join("name"),
+        );
+        let error_message = storage_info_error(&storage, &wrong_tenant_id);
+        assert!(error_message.contains(&wrong_tenant_id.0));
+
+        let wrong_timeline_id = create_s3_key(
+            &custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?.join("name"),
+        );
+        let error_message = storage_info_error(&storage, &wrong_timeline_id);
+        assert!(error_message.contains(&wrong_timeline_id.0));
+
+        Ok(())
+    }
+
+    #[test]
+    fn download_destination_matches_original_path() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
+        let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
+
+        let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
+
+        let key = dummy_storage.storage_path(&original_path)?;
+        let download_destination = dummy_storage.info(&key)?.download_destination;
+
+        assert_eq!(
+            original_path, download_destination,
+            "'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
+        );
+
+        Ok(())
+    }
+
+    fn dummy_storage(pageserver_workdir: &'static Path) -> S3 {
+        S3 {
+            pageserver_workdir,
+            bucket: Bucket::new(
+                "dummy-bucket",
+                "us-east-1".parse().unwrap(),
+                Credentials::anonymous().unwrap(),
+            )
+            .unwrap(),
+        }
+    }
+
+    /// Turns the relative file path into an S3 key the same way `storage_path` builds its keys:
+    /// every path segment is prefixed with the S3 separator.
+    /// Panics if any of the segments is not a valid UTF-8 string.
+    fn create_s3_key(relative_file_path: &Path) -> S3ObjectKey {
+        let mut key = String::new();
+        for segment in relative_file_path {
+            key.push(S3_FILE_SEPARATOR);
+            key.push_str(segment.to_str().unwrap());
+        }
+        S3ObjectKey(key)
+    }
+}
--- a/pageserver/src/remote_storage/storage_sync.rs
+++ b/pageserver/src/remote_storage/storage_sync.rs
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -1,6 +1,7 @@
 use crate::relish::*;
+use crate::CheckpointConfig;
 use anyhow::Result;
-use bytes::{Buf, BufMut, Bytes, BytesMut};
+use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
 use std::ops::{AddAssign, Deref};
@@ -24,9 +25,9 @@ pub trait Repository: Send + Sync {
    /// Branch a timeline
    fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;

-    /// perform one garbage collection iteration.
-    /// garbage collection is periodically performed by gc thread,
-    /// but it can be explicitly requested through page server api.
+    /// perform one garbage collection iteration, removing old data files from disk.
+    /// this function is periodically called by gc thread.
+    /// also it can be explicitly requested through page server api 'do_gc' command.
    ///
    /// 'timelineid' specifies the timeline to GC, or None for all.
    /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
@@ -39,6 +40,10 @@ pub trait Repository: Send + Sync {
        horizon: u64,
        checkpoint_before_gc: bool,
    ) -> Result<GcResult>;
+
+    /// perform one checkpoint iteration, flushing in-memory data on disk.
+    /// this function is periodically called by checkpointer thread.
+    fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
 }

 ///
@@ -134,6 +139,7 @@ pub trait Timeline: Send + Sync {
    fn get_last_record_lsn(&self) -> Lsn;
    fn get_prev_record_lsn(&self) -> Lsn;
    fn get_start_lsn(&self) -> Lsn;
+    fn get_disk_consistent_lsn(&self) -> Lsn;

    /// Mutate the timeline with a [`TimelineWriter`].
    fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
@@ -143,7 +149,7 @@ pub trait Timeline: Send + Sync {
    ///
    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
    /// know anything about them here in the repository.
-    fn checkpoint(&self) -> Result<()>;
+    fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;

    /// Retrieve current logical size of the timeline
    ///
@@ -154,6 +160,9 @@ pub trait Timeline: Send + Sync {
    /// Does the same as get_current_logical_size but counted on demand.
    /// Used in tests to ensure thet incremental and non incremental variants match.
    fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
+
+    /// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline.
+    fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline;
 }

 /// Various functions to mutate the timeline.
@@ -193,22 +202,101 @@ pub struct WALRecord {
    pub main_data_offset: u32,
 }

-impl WALRecord {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        buf.put_u8(self.will_init as u8);
-        buf.put_u32(self.main_data_offset);
-        buf.put_u32(self.rec.len() as u32);
-        buf.put_slice(&self.rec[..]);
+#[cfg(test)]
+pub mod repo_harness {
+    use bytes::BytesMut;
+    use std::{fs, path::PathBuf};
+
+    use crate::{
+        layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
+        walredo::{WalRedoError, WalRedoManager},
+        PageServerConf,
+    };
+
+    use super::*;
+    use hex_literal::hex;
+    use zenith_utils::zid::ZTenantId;
+
+    pub const TIMELINE_ID: ZTimelineId =
+        ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
+    pub const NEW_TIMELINE_ID: ZTimelineId =
+        ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
+
+    /// Convenience function to create a page image with given string as the only content
+    #[allow(non_snake_case)]
+    pub fn TEST_IMG(s: &str) -> Bytes {
+        let mut buf = BytesMut::new();
+        buf.extend_from_slice(s.as_bytes());
+        buf.resize(8192, 0);
+
+        buf.freeze()
    }
-    pub fn unpack(buf: &mut Bytes) -> WALRecord {
-        let will_init = buf.get_u8() != 0;
-        let main_data_offset = buf.get_u32();
-        let rec_len = buf.get_u32() as usize;
-        let rec = buf.split_to(rec_len);
-        WALRecord {
-            will_init,
-            rec,
-            main_data_offset,
+
+    pub struct RepoHarness {
+        pub conf: &'static PageServerConf,
+        pub tenant_id: ZTenantId,
+    }
+
+    impl RepoHarness {
+        pub fn create(test_name: &'static str) -> Result<Self> {
+            let repo_dir = PageServerConf::test_repo_dir(test_name);
+            let _ = fs::remove_dir_all(&repo_dir);
+            fs::create_dir_all(&repo_dir)?;
+            fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;
+
+            let conf = PageServerConf::dummy_conf(repo_dir);
+            // Make a static copy of the config. This can never be free'd, but that's
+            // OK in a test.
+            let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+
+            let tenant_id = ZTenantId::generate();
+            fs::create_dir_all(conf.tenant_path(&tenant_id))?;
+
+            Ok(Self { conf, tenant_id })
+        }
+
+        pub fn load(&self) -> Box<dyn Repository> {
+            let walredo_mgr = Arc::new(TestRedoManager);
+
+            Box::new(LayeredRepository::new(
+                self.conf,
+                walredo_mgr,
+                self.tenant_id,
+                false,
+            ))
+        }
+
+        pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
+            self.conf.timeline_path(timeline_id, &self.tenant_id)
+        }
+    }
+
+    // Mock WAL redo manager that doesn't do much
+    struct TestRedoManager;
+
+    impl WalRedoManager for TestRedoManager {
+        fn request_redo(
+            &self,
+            rel: RelishTag,
+            blknum: u32,
+            lsn: Lsn,
+            base_img: Option<Bytes>,
+            records: Vec<(Lsn, WALRecord)>,
+        ) -> Result<Bytes, WalRedoError> {
+            let s = format!(
+                "redo for {} blk {} to get to {}, with {} and {} records",
+                rel,
+                blknum,
+                lsn,
+                if base_img.is_some() {
+                    "base image"
+                } else {
+                    "no base image"
+                },
+                records.len()
+            );
+            println!("{}", s);
+            Ok(TEST_IMG(&s))
        }
    }
 }
@@ -219,21 +307,11 @@ impl WALRecord {
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
-    use super::*;
-    use crate::layered_repository::{LayeredRepository, METADATA_FILE_NAME};
-    use crate::walredo::{WalRedoError, WalRedoManager};
-    use crate::PageServerConf;
-    use hex_literal::hex;
-    use postgres_ffi::pg_constants;
-    use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
-    use std::fs;
-    use std::path::PathBuf;
-    use zenith_utils::zid::ZTenantId;
+    use crate::layered_repository::metadata::METADATA_FILE_NAME;

-    const TIMELINE_ID: ZTimelineId =
-        ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
-    const NEW_TIMELINE_ID: ZTimelineId =
-        ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
+    use super::repo_harness::*;
+    use super::*;
+    use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};

    /// Arbitrary relation tag, for testing.
    const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
@@ -249,16 +327,6 @@ mod tests {
        forknum: 0,
    });

-    /// Convenience function to create a page image with given string as the only content
-    #[allow(non_snake_case)]
-    fn TEST_IMG(s: &str) -> Bytes {
-        let mut buf = BytesMut::new();
-        buf.extend_from_slice(s.as_bytes());
-        buf.resize(8192, 0);
-
-        buf.freeze()
-    }
-
    fn assert_current_logical_size(timeline: &Arc<dyn Timeline>, lsn: Lsn) {
        let incremental = timeline.get_current_logical_size();
        let non_incremental = timeline
@@ -270,45 +338,6 @@ mod tests {
    static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
    static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);

-    struct RepoHarness {
-        conf: &'static PageServerConf,
-        tenant_id: ZTenantId,
-    }
-
-    impl RepoHarness {
-        fn create(test_name: &'static str) -> Result<Self> {
-            let repo_dir = PageServerConf::test_repo_dir(test_name);
-            let _ = fs::remove_dir_all(&repo_dir);
-            fs::create_dir_all(&repo_dir)?;
-            fs::create_dir_all(&repo_dir.join("timelines"))?;
-
-            let conf = PageServerConf::dummy_conf(repo_dir);
-            // Make a static copy of the config. This can never be free'd, but that's
-            // OK in a test.
-            let conf: &'static PageServerConf = Box::leak(Box::new(conf));
-
-            let tenant_id = ZTenantId::generate();
-            fs::create_dir_all(conf.tenant_path(&tenant_id))?;
-
-            Ok(Self { conf, tenant_id })
-        }
-
-        fn load(&self) -> Box<dyn Repository> {
-            let walredo_mgr = Arc::new(TestRedoManager);
-
-            Box::new(LayeredRepository::new(
-                self.conf,
-                walredo_mgr,
-                self.tenant_id,
-                false,
-            ))
-        }
-
-        fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
-            self.conf.timeline_path(timeline_id, &self.tenant_id)
-        }
-    }
-
    #[test]
    fn test_relsize() -> Result<()> {
        let repo = RepoHarness::create("test_relsize")?.load();
@@ -675,7 +704,7 @@ mod tests {
            .contains(&TESTREL_A));

        // Run checkpoint and garbage collection and check that it's still not visible
-        newtline.checkpoint()?;
+        newtline.checkpoint(CheckpointConfig::Forced)?;
        repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;

        assert!(!newtline
@@ -820,33 +849,4 @@ mod tests {

        Ok(())
    }
-
-    // Mock WAL redo manager that doesn't do much
-    struct TestRedoManager;
-
-    impl WalRedoManager for TestRedoManager {
-        fn request_redo(
-            &self,
-            rel: RelishTag,
-            blknum: u32,
-            lsn: Lsn,
-            base_img: Option<Bytes>,
-            records: Vec<(Lsn, WALRecord)>,
-        ) -> Result<Bytes, WalRedoError> {
-            let s = format!(
-                "redo for {} blk {} to get to {}, with {} and {} records",
-                rel,
-                blknum,
-                lsn,
-                if base_img.is_some() {
-                    "base image"
-                } else {
-                    "no base image"
-                },
-                records.len()
-            );
-            println!("{}", s);
-            Ok(TEST_IMG(&s))
-        }
-    }
 }
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -4,18 +4,19 @@
 use crate::branches;
 use crate::layered_repository::LayeredRepository;
 use crate::repository::{Repository, Timeline};
+use crate::tenant_threads;
 use crate::walredo::PostgresRedoManager;
 use crate::PageServerConf;
 use anyhow::{anyhow, bail, Context, Result};
 use lazy_static::lazy_static;
 use log::{debug, info};
+use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::fmt;
 use std::fs;
 use std::str::FromStr;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::{Arc, Mutex, MutexGuard};
-use std::thread::JoinHandle;
 use zenith_utils::zid::{ZTenantId, ZTimelineId};

 lazy_static! {
@@ -27,8 +28,8 @@ struct Tenant {
    repo: Option<Arc<dyn Repository>>,
 }

-#[derive(Debug)]
-enum TenantState {
+#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
+pub enum TenantState {
    // This tenant only exists in cloud storage. It cannot be accessed.
    CloudOnly,
    // This tenant exists in cloud storage, and we are currently downloading it to local disk.
@@ -40,10 +41,12 @@ enum TenantState {
    // This tenant exists on local disk, and the layer map has been loaded into memory.
    // The local disk might have some newer files that don't exist in cloud storage yet.
    Active,
+    // Tenant is active, but there is no walreceiver connection.
+    Idle,
    // This tenant exists on local disk, and the layer map has been loaded into memory.
    // The local disk might have some newer files that don't exist in cloud storage yet.
    // The tenant cannot be accessed anymore for any reason, but graceful shutdown.
-    //Stopping,
+    Stopping,
 }

 impl fmt::Display for TenantState {
@@ -52,6 +55,8 @@ impl fmt::Display for TenantState {
            TenantState::CloudOnly => f.write_str("CloudOnly"),
            TenantState::Downloading => f.write_str("Downloading"),
            TenantState::Active => f.write_str("Active"),
+            TenantState::Idle => f.write_str("Idle"),
+            TenantState::Stopping => f.write_str("Stopping"),
        }
    }
 }
@@ -60,18 +65,6 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
    TENANTS.lock().unwrap()
 }

-struct TenantHandleEntry {
-    checkpointer_handle: Option<JoinHandle<()>>,
-    gc_handle: Option<JoinHandle<()>>,
-}
-
-// Logically these handles belong to Repository,
-// but it's just simpler to store them separately
-lazy_static! {
-    static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
-        Mutex::new(HashMap::new());
-}
-
 static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);

 pub fn init(conf: &'static PageServerConf) {
@@ -105,26 +98,13 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
        true,
    ));

-    let checkpointer_handle = LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
-    let gc_handle = LayeredRepository::launch_gc_thread(conf, repo.clone());
-
-    let mut handles = TENANT_HANDLES.lock().unwrap();
-    let h = TenantHandleEntry {
-        checkpointer_handle: Some(checkpointer_handle),
-        gc_handle: Some(gc_handle),
-    };
-
-    handles.insert(tenant_id, h);
-
    let mut m = access_tenants();
    let tenant = m.get_mut(&tenant_id).unwrap();
    tenant.repo = Some(repo);
-    tenant.state = TenantState::Active;
+    tenant.state = TenantState::Idle;
 }

-// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer.
-// Relevant PR: https://github.com/zenithdb/zenith/pull/686
-pub fn register_relish_download(
+pub fn register_timeline_download(
    conf: &'static PageServerConf,
    tenant_id: ZTenantId,
    timeline_id: ZTimelineId,
@@ -137,14 +117,20 @@ pub fn register_relish_download(

    {
        let mut m = access_tenants();
-        let mut tenant = m.get_mut(&tenant_id).unwrap();
+        let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
+            state: TenantState::Downloading,
+            repo: None,
+        });
        tenant.state = TenantState::Downloading;
        match &tenant.repo {
-            Some(repo) => init_timeline(repo.as_ref(), timeline_id),
-            None => {
-                log::info!("Initialize new repo");
+            Some(repo) => {
+                init_timeline(repo.as_ref(), timeline_id);
+                tenant.state = TenantState::Idle;
+                return;
            }
+            None => log::warn!("Initialize new repo"),
        }
+        tenant.state = TenantState::Idle;
    }

    // init repo updates Tenant state
@@ -165,27 +151,23 @@ pub fn shutdown_requested() -> bool {
    SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
 }

-pub fn stop_tenant_threads(tenantid: ZTenantId) {
-    let mut handles = TENANT_HANDLES.lock().unwrap();
-    if let Some(h) = handles.get_mut(&tenantid) {
-        h.checkpointer_handle.take().map(JoinHandle::join);
-        debug!("checkpointer for tenant {} has stopped", tenantid);
-        h.gc_handle.take().map(JoinHandle::join);
-        debug!("gc for tenant {} has stopped", tenantid);
-    }
-}
-
 pub fn shutdown_all_tenants() -> Result<()> {
    SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);

    let tenantids = list_tenantids()?;
+
+    for tenantid in &tenantids {
+        set_tenant_state(*tenantid, TenantState::Stopping)?;
+    }
+
    for tenantid in tenantids {
-        stop_tenant_threads(tenantid);
+        // Wait for checkpointer and GC to finish their job
+        tenant_threads::wait_for_tenant_threads_to_stop(tenantid);
+
        let repo = get_repository_for_tenant(tenantid)?;
        debug!("shutdown tenant {}", tenantid);
        repo.shutdown()?;
    }
-
    Ok(())
 }

@@ -212,18 +194,45 @@ pub fn create_repository_for_tenant(
    let mut m = access_tenants();
    let tenant = m.get_mut(&tenantid).unwrap();
    tenant.repo = Some(repo);
-    tenant.state = TenantState::Active;
+    tenant.state = TenantState::Idle;

    Ok(())
 }

+/// Look up the current state of a tenant in the tenants map.
+///
+/// A tenant without an entry in the map is reported as `CloudOnly`.
+pub fn get_tenant_state(tenantid: ZTenantId) -> TenantState {
+    let tenants = access_tenants();
+    let known_state = tenants.get(&tenantid).map(|tenant| tenant.state);
+    known_state.unwrap_or(TenantState::CloudOnly)
+}
+
+/// Move the tenant to `newstate` and report the state it ends up in.
+/// A request for Idle is ignored unless the tenant is currently Active.
+/// Fails if the tenant has no entry in the tenants map.
+pub fn set_tenant_state(tenantid: ZTenantId, newstate: TenantState) -> Result<TenantState> {
+    let mut tenants = access_tenants();
+    let tenant = match tenants.get_mut(&tenantid) {
+        Some(tenant) => tenant,
+        None => bail!("Tenant not found for tenant {}", tenantid),
+    };
+    // Only Active tenant can become Idle
+    let may_change = newstate != TenantState::Idle || tenant.state == TenantState::Active;
+    if may_change {
+        info!("set_tenant_state: {} -> {}", tenant.state, newstate);
+        tenant.state = newstate;
+    }
+    Ok(tenant.state)
+}
+
 pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
    let m = access_tenants();
    let tenant = m
        .get(&tenantid)
-        .ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid));
+        .ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid))?;

-    match &tenant.unwrap().repo {
+    match &tenant.repo {
        Some(repo) => Ok(Arc::clone(repo)),
        None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid),
    }
@@ -247,3 +256,23 @@ fn list_tenantids() -> Result<Vec<ZTenantId>> {
        })
        .collect()
 }
+
+#[derive(Serialize, Deserialize, Clone)]
+pub struct TenantInfo {
+    #[serde(with = "hex")]
+    pub id: ZTenantId,
+    pub state: TenantState,
+}
+
+/// Snapshot the id and state of every tenant currently in the tenants map.
+pub fn list_tenants() -> Result<Vec<TenantInfo>> {
+    let tenants = access_tenants();
+    let mut infos = Vec::with_capacity(tenants.len());
+    for (tenant_id, tenant) in tenants.iter() {
+        infos.push(TenantInfo {
+            id: *tenant_id,
+            state: tenant.state,
+        });
+    }
+    Ok(infos)
+}
--- a/pageserver/src/tenant_threads.rs
+++ b/pageserver/src/tenant_threads.rs
@@ -0,0 +1,149 @@
+//! This module contains functions to serve per-tenant background processes,
+//! such as checkpointer and GC
+use crate::tenant_mgr;
+use crate::tenant_mgr::TenantState;
+use crate::CheckpointConfig;
+use crate::PageServerConf;
+use anyhow::Result;
+use lazy_static::lazy_static;
+use std::collections::HashMap;
+use std::sync::Mutex;
+use std::thread::JoinHandle;
+use std::time::Duration;
+use tracing::*;
+use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
+use zenith_utils::zid::ZTenantId;
+
+struct TenantHandleEntry {
+    checkpointer_handle: Option<JoinHandle<()>>,
+    gc_handle: Option<JoinHandle<()>>,
+}
+
+// Preserve handles to wait for thread completion
+// at shutdown
+lazy_static! {
+    static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
+        Mutex::new(HashMap::new());
+}
+
+lazy_static! {
+    static ref TENANT_THREADS_COUNT: IntGaugeVec = register_int_gauge_vec!(
+        "tenant_threads_count",
+        "Number of live tenant threads",
+        &["tenant_thread_type"]
+    )
+    .expect("failed to define a metric");
+}
+
+// Launch checkpointer and GC threads for the tenant, unless a JoinHandle for them is
+// already stored in TENANT_HANDLES. NOTE(review): a stored handle is taken to mean
+// "still running"; a thread that exited on its own keeps its handle — confirm this is OK.
+pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
+    let mut handles = TENANT_HANDLES.lock().unwrap();
+    let h = handles
+        .entry(tenantid)
+        .or_insert_with(|| TenantHandleEntry {
+            checkpointer_handle: None,
+            gc_handle: None,
+        });
+
+    if h.checkpointer_handle.is_none() {
+        h.checkpointer_handle = std::thread::Builder::new()
+            .name("Checkpointer thread".into())
+            .spawn(move || {
+                checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
+            })
+            .ok(); // a spawn error is dropped here: the handle just stays None
+    }
+
+    if h.gc_handle.is_none() {
+        h.gc_handle = std::thread::Builder::new()
+            .name("GC thread".into())
+            .spawn(move || {
+                gc_loop(tenantid, conf).expect("GC thread died");
+            })
+            .ok(); // a spawn error is dropped here: the handle just stays None
+    }
+}
+/// Join the tenant's checkpointer and GC threads, if handles are stored, then forget the entry.
+pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
+    let mut handles = TENANT_HANDLES.lock().unwrap(); // guard is held across both joins below
+    if let Some(h) = handles.get_mut(&tenantid) {
+        h.checkpointer_handle.take().map(JoinHandle::join); // join result is discarded
+        trace!("checkpointer for tenant {} has stopped", tenantid);
+        h.gc_handle.take().map(JoinHandle::join); // join result is discarded
+        trace!("gc for tenant {} has stopped", tenantid);
+    }
+    handles.remove(&tenantid); // with the entry gone, start_tenant_threads() will spawn afresh
+}
+
+/// Checkpointer thread's main loop: while the tenant is Active, sleep for
+/// `conf.checkpoint_period` and then run a distance-based checkpoint iteration.
+/// The state is checked only before each sleep; any error ends the loop with `Err`.
+fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
+    let gauge = TENANT_THREADS_COUNT.with_label_values(&["checkpointer"]);
+    gauge.inc();
+    scopeguard::defer! {
+        gauge.dec();
+    }
+
+    loop {
+        if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
+            break;
+        }
+
+        std::thread::sleep(conf.checkpoint_period); // one uninterrupted sleep per iteration
+        trace!("checkpointer thread for tenant {} waking up", tenantid);
+
+        // checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
+        // bytes of WAL since last checkpoint.
+        let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
+        repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
+    }
+
+    trace!(
+        "checkpointer thread stopped for tenant {} state is {}",
+        tenantid,
+        tenant_mgr::get_tenant_state(tenantid)
+    );
+    Ok(())
+}
+
+/// GC thread's main loop. Runs a GC iteration (when `conf.gc_horizon > 0`), then
+/// naps for `conf.gc_period` in steps of at most one second, re-checking the tenant
+/// state between steps. Returns once the tenant is no longer Active, or on error.
+fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
+    let gauge = TENANT_THREADS_COUNT.with_label_values(&["gc"]);
+    gauge.inc();
+    scopeguard::defer! {
+        gauge.dec();
+    }
+
+    loop {
+        if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
+            break;
+        }
+
+        trace!("gc thread for tenant {} waking up", tenantid);
+
+        // Garbage collect old files that are not needed for PITR anymore
+        if conf.gc_horizon > 0 {
+            let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
+            repo.gc_iteration(None, conf.gc_horizon, false)?;
+        }
+
+        // TODO: use condvar.wait_timeout() or similar instead of polling the state.
+        let mut pending = conf.gc_period;
+        while !pending.is_zero() && tenant_mgr::get_tenant_state(tenantid) == TenantState::Active {
+            let nap = pending.min(Duration::from_secs(1));
+            std::thread::sleep(nap);
+            pending -= nap;
+        }
+    }
+    trace!(
+        "GC thread stopped for tenant {} state is {}",
+        tenantid,
+        tenant_mgr::get_tenant_state(tenantid)
+    );
+    Ok(())
+}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -0,0 +1,623 @@
+//!
+//! VirtualFile is like a normal File, but it's not bound directly to
+//! a file descriptor. Instead, the file is opened when it's read from,
+//! and if too many files are open globally in the system, least-recently
+//! used ones are closed.
+//!
+//! To track which files have been recently used, we use the clock algorithm
+//! with a 'recently_used' flag on each slot.
+//!
+//! This is similar to PostgreSQL's virtual file descriptor facility in
+//! src/backend/storage/file/fd.c
+//!
+use std::fs::{File, OpenOptions};
+use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
+use std::os::unix::fs::FileExt;
+use std::path::{Path, PathBuf};
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::{RwLock, RwLockWriteGuard};
+
+use once_cell::sync::OnceCell;
+
+///
+/// A virtual file descriptor. You can use this just like std::fs::File, but internally
+/// the underlying file is closed if the system is low on file descriptors,
+/// and re-opened when it's accessed again.
+///
+/// Like with std::fs::File, multiple threads can read/write the file concurrently,
+/// holding just a shared reference to the same VirtualFile, using the read_at() / write_at()
+/// functions from the FileExt trait. But the functions from the Read/Write/Seek traits
+/// require a mutable reference, because they modify the "current position".
+///
+/// Each VirtualFile has a physical file descriptor in the global OPEN_FILES array, at the
+/// slot that 'handle' points to, if the underlying file is currently open. If it's not
+/// currently open, the 'handle' can still point to the slot where it was last kept. The
+/// 'tag' field is used to detect whether the handle still is valid or not.
+///
+pub struct VirtualFile {
+    /// Lazy handle to the global file descriptor cache. The slot that this points to
+    /// might contain our File, or it may be empty, or it may contain a File that
+    /// belongs to a different VirtualFile.
+    handle: RwLock<SlotHandle>,
+
+    /// Current file position
+    pos: u64,
+
+    /// File path and options to use to open it.
+    ///
+    /// Note: this only contains the options needed to re-open it. For example,
+    /// if a new file is created, we only pass the create flag when it's initially
+    /// opened, in the VirtualFile::create() function, and strip the flag before
+    /// storing it here.
+    pub path: PathBuf,
+    open_options: OpenOptions,
+}
+
+#[derive(PartialEq, Clone, Copy)]
+struct SlotHandle {
+    /// Index into OPEN_FILES.slots
+    index: usize,
+
+    /// Value of 'tag' in the slot. If slot's tag doesn't match, then the slot has
+    /// been recycled and no longer contains the FD for this virtual file.
+    tag: u64,
+}
+
+/// OPEN_FILES is the global array that holds the physical file descriptors that
+/// are currently open. Each slot in the array is protected by a separate lock,
+/// so that different files can be accessed independently. The lock must be held
+/// in write mode to replace the slot with a different file, but a read mode
+/// is enough to operate on the file, whether you're reading or writing to it.
+///
+/// OPEN_FILES starts in uninitialized state, and it's initialized by
+/// the virtual_file::init() function. It must be called exactly once at page
+/// server startup.
+static OPEN_FILES: OnceCell<OpenFiles> = OnceCell::new();
+
+struct OpenFiles {
+    slots: &'static [Slot],
+
+    /// clock arm for the clock algorithm
+    next: AtomicUsize,
+}
+
+struct Slot {
+    inner: RwLock<SlotInner>,
+
+    /// has this file been used since last clock sweep?
+    recently_used: AtomicBool,
+}
+
+struct SlotInner {
+    /// Counter that's incremented every time a different file is stored here.
+    /// To avoid the ABA problem.
+    tag: u64,
+
+    /// the underlying file
+    file: Option<File>,
+}
+
+impl OpenFiles {
+    /// Find a slot to use, evicting an existing file descriptor if needed.
+    ///
+    /// On return, we hold a lock on the slot, its 'tag' has been updated, and
+    /// 'recently_used' has been set. It's all ready for reuse.
+    fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
+        //
+        // Run the clock algorithm to find a slot to replace.
+        //
+        let num_slots = self.slots.len();
+        let mut retries = 0;
+        let mut slot;
+        let mut slot_guard;
+        let index;
+        loop {
+            let next = self.next.fetch_add(1, Ordering::AcqRel) % num_slots;
+            slot = &self.slots[next];
+
+            // If the recently_used flag on this slot is set, continue the clock
+            // sweep. Otherwise try to use this slot. If we cannot acquire the
+            // lock, also continue the clock sweep.
+            //
+            // We only continue in this manner for a while, though. If we loop
+            // through the array twice without finding a victim, just pick the
+            // next slot and wait until we can reuse it. This way, we avoid
+            // spinning in the extreme case that all the slots are busy with an
+            // I/O operation.
+            if retries < num_slots * 2 {
+                if !slot.recently_used.swap(false, Ordering::Release) {
+                    if let Ok(guard) = slot.inner.try_write() {
+                        slot_guard = guard;
+                        index = next;
+                        break;
+                    }
+                }
+                retries += 1;
+            } else {
+                slot_guard = slot.inner.write().unwrap();
+                index = next;
+                break;
+            }
+        }
+
+        //
+        // We now have the victim slot locked. If it was in use previously, close the
+        // old file.
+        //
+        if let Some(old_file) = slot_guard.file.take() {
+            drop(old_file);
+        }
+
+        // Prepare the slot for reuse and return it
+        slot_guard.tag += 1;
+        slot.recently_used.store(true, Ordering::Relaxed);
+        (
+            SlotHandle {
+                index,
+                tag: slot_guard.tag,
+            },
+            slot_guard,
+        )
+    }
+}
+
+impl VirtualFile {
+    /// Open a file in read-only mode. Like File::open.
+    pub fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
+        Self::open_with_options(path, OpenOptions::new().read(true))
+    }
+
+    /// Create a new file for writing. If the file exists, it will be truncated.
+    /// Like File::create.
+    pub fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
+        Self::open_with_options(
+            path,
+            OpenOptions::new().write(true).create(true).truncate(true),
+        )
+    }
+
+    /// Open a file with given options.
+    ///
+    /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
+    /// they will be applied also when the file is subsequently re-opened, not only
+    /// on the first time. Make sure that's sane!
+    pub fn open_with_options(
+        path: &Path,
+        open_options: &OpenOptions,
+    ) -> Result<VirtualFile, std::io::Error> {
+        let (handle, mut slot_guard) = get_open_files().find_victim_slot();
+
+        let file = open_options.open(path)?;
+
+        // Strip all options other than read and write.
+        //
+        // It would perhaps be nicer to check just for the read and write flags
+        // explicitly, but OpenOptions doesn't contain any functions to read flags,
+        // only to set them.
+        let mut reopen_options = open_options.clone();
+        reopen_options.create(false);
+        reopen_options.create_new(false);
+        reopen_options.truncate(false);
+
+        let vfile = VirtualFile {
+            handle: RwLock::new(handle),
+            pos: 0,
+            path: path.to_path_buf(),
+            open_options: reopen_options,
+        };
+
+        slot_guard.file.replace(file);
+
+        Ok(vfile)
+    }
+
+    /// Call File::sync_all() on the underlying File.
+    pub fn sync_all(&self) -> Result<(), Error> {
+        self.with_file(|file| file.sync_all())?
+    }
+
+    /// Helper function that looks up the underlying File for this VirtualFile,
+    /// opening it and evicting some other File if necessary. It calls 'func'
+    /// with the physical File.
+    fn with_file<F, R>(&self, mut func: F) -> Result<R, Error>
+    where
+        F: FnMut(&File) -> R,
+    {
+        let open_files = get_open_files();
+
+        let mut handle_guard = {
+            // Read the cached slot handle, and see if the slot that it points to still
+            // contains our File.
+            //
+            // We only need to hold the handle lock while we read the current handle. If
+            // another thread closes the file and recycles the slot for a different file,
+            // we will notice that the handle we read is no longer valid and retry.
+            let mut handle = *self.handle.read().unwrap();
+            loop {
+                // Check if the slot contains our File
+                {
+                    let slot = &open_files.slots[handle.index];
+                    let slot_guard = slot.inner.read().unwrap();
+                    if slot_guard.tag == handle.tag {
+                        if let Some(file) = &slot_guard.file {
+                            // Found a cached file descriptor.
+                            slot.recently_used.store(true, Ordering::Relaxed);
+                            return Ok(func(file));
+                        }
+                    }
+                }
+
+                // The slot didn't contain our File. We will have to open it ourselves,
+                // but before that, grab a write lock on handle in the VirtualFile, so
+                // that no other thread will try to concurrently open the same file.
+                let handle_guard = self.handle.write().unwrap();
+
+                // If another thread changed the handle while we were not holding the lock,
+                // then the handle might now be valid again. Loop back to retry.
+                if *handle_guard != handle {
+                    handle = *handle_guard;
+                    continue;
+                }
+                break handle_guard;
+            }
+        };
+
+        // We need to open the file ourselves. The handle in the VirtualFile is
+        // now locked in write-mode. Find a free slot to put it in.
+        let (handle, mut slot_guard) = open_files.find_victim_slot();
+
+        // Open the physical file
+        let file = self.open_options.open(&self.path)?;
+
+        // Perform the requested operation on it
+        //
+        // TODO: We could downgrade the locks to read mode before calling
+        // 'func', to allow a little bit more concurrency, but the standard
+        // library RwLock doesn't allow downgrading without releasing the lock,
+        // and that doesn't seem worth the trouble. (parking_lot RwLock would
+        // allow it)
+        let result = func(&file);
+
+        // Store the File in the slot and update the handle in the VirtualFile
+        // to point to it.
+        slot_guard.file.replace(file);
+
+        *handle_guard = handle;
+
+        Ok(result)
+    }
+}
+
+impl Drop for VirtualFile {
+    /// If a VirtualFile is dropped, close the underlying file if it was open.
+    fn drop(&mut self) {
+        let handle = self.handle.get_mut().unwrap();
+
+        // We could check with a read-lock first, to avoid waiting on an
+        // unrelated I/O.
+        let slot = &get_open_files().slots[handle.index];
+        let mut slot_guard = slot.inner.write().unwrap();
+        if slot_guard.tag == handle.tag {
+            slot.recently_used.store(false, Ordering::Relaxed);
+            slot_guard.file.take();
+        }
+    }
+}
+
+impl Read for VirtualFile {
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
+        let pos = self.pos;
+        let n = self.read_at(buf, pos)?;
+        self.pos += n as u64;
+        Ok(n)
+    }
+}
+
+impl Write for VirtualFile {
+    fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
+        let pos = self.pos;
+        let n = self.write_at(buf, pos)?;
+        self.pos += n as u64;
+        Ok(n)
+    }
+
+    fn flush(&mut self) -> Result<(), std::io::Error> {
+        // flush is no-op for File (at least on unix), so we don't need to do
+        // anything here either.
+        Ok(())
+    }
+}
+
+impl Seek for VirtualFile {
+    fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
+        match pos {
+            SeekFrom::Start(offset) => {
+                self.pos = offset;
+            }
+            SeekFrom::End(offset) => {
+                self.pos = self.with_file(|mut file| file.seek(SeekFrom::End(offset)))??
+            }
+            SeekFrom::Current(offset) => {
+                let pos = self.pos as i128 + offset as i128;
+                if pos < 0 {
+                    return Err(Error::new(
+                        ErrorKind::InvalidInput,
+                        "offset would be negative",
+                    ));
+                }
+                if pos > u64::MAX as i128 {
+                    return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
+                }
+                self.pos = pos as u64;
+            }
+        }
+        Ok(self.pos)
+    }
+}
+
+impl FileExt for VirtualFile {
+    fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
+        let result = self.with_file(|file| file.read_at(buf, offset))?;
+        if let Err(err) = &result {
+            tracing::error!("read_at error: {:?}", err);
+        }
+        result
+    }
+
+    fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
+        self.with_file(|file| file.write_at(buf, offset))?
+    }
+}
+
+impl OpenFiles {
+    /// Create the descriptor cache with `num_slots` slots, all of them empty.
+    fn new(num_slots: usize) -> OpenFiles {
+        let empty_slot = |_| Slot {
+            recently_used: AtomicBool::new(false),
+            inner: RwLock::new(SlotInner { tag: 0, file: None }),
+        };
+        let slots: Vec<Slot> = (0..num_slots).map(empty_slot).collect();
+        // The slot array is intentionally leaked; that is what gives the
+        // `slots` field its 'static lifetime.
+        let slots: &'static [Slot] = Box::leak(slots.into_boxed_slice());
+        OpenFiles {
+            next: AtomicUsize::new(0),
+            slots,
+        }
+    }
+}
+
+///
+/// Initialize the virtual file module. This must be called once at page
+/// server startup.
+///
+pub fn init(num_slots: usize) {
+    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
+        panic!("virtual_file::init called twice");
+    }
+}
+
+const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
+
+// Get a handle to the global slots array.
+fn get_open_files() -> &'static OpenFiles {
+    //
+    // In unit tests, page server startup doesn't happen and no one calls
+    // virtual_file::init(). Initialize it here, with a small array.
+    //
+    // This applies to the virtual file tests below, but all other unit
+    // tests too, so the virtual file facility is always usable in
+    // unit tests.
+    //
+    if cfg!(test) {
+        OPEN_FILES.get_or_init(|| OpenFiles::new(TEST_MAX_FILE_DESCRIPTORS))
+    } else {
+        OPEN_FILES.get().expect("virtual_file::init not called yet")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::seq::SliceRandom;
+    use rand::thread_rng;
+    use rand::Rng;
+    use std::sync::Arc;
+    use std::thread;
+
+    // Helper function to slurp contents of a file, starting at the current position,
+    // into a string
+    fn read_string<FD>(vfile: &mut FD) -> Result<String, Error>
+    where
+        FD: Read,
+    {
+        let mut buf = String::new();
+        vfile.read_to_string(&mut buf)?;
+        Ok(buf)
+    }
+
+    // Helper function to slurp a portion of a file into a string
+    fn read_string_at<FD>(vfile: &mut FD, pos: u64, len: usize) -> Result<String, Error>
+    where
+        FD: FileExt,
+    {
+        let mut buf = Vec::new();
+        buf.resize(len, 0);
+        vfile.read_exact_at(&mut buf, pos)?;
+        Ok(String::from_utf8(buf).unwrap())
+    }
+
+    #[test]
+    fn test_virtual_files() -> Result<(), Error> {
+        // The real work is done in the test_files() helper function. This
+        // allows us to run the same set of tests against a native File, and
+        // VirtualFile. We trust the native Files and wouldn't need to test them,
+        // but this allows us to verify that the operations return the same
+        // results with VirtualFiles as with native Files. (Except that with
+        // native files, you will run out of file descriptors if the ulimit
+        // is low enough.)
+        test_files("virtual_files", |path, open_options| {
+            VirtualFile::open_with_options(path, open_options)
+        })
+    }
+
+    #[test]
+    fn test_physical_files() -> Result<(), Error> {
+        test_files("physical_files", |path, open_options| {
+            open_options.open(path)
+        })
+    }
+
+    fn test_files<OF, FD>(testname: &str, openfunc: OF) -> Result<(), Error>
+    where
+        FD: Read + Write + Seek + FileExt,
+        OF: Fn(&Path, &OpenOptions) -> Result<FD, std::io::Error>,
+    {
+        let testdir = crate::PageServerConf::test_repo_dir(testname);
+        std::fs::create_dir_all(&testdir)?;
+
+        let path_a = testdir.join("file_a");
+        let mut file_a = openfunc(
+            &path_a,
+            OpenOptions::new().write(true).create(true).truncate(true),
+        )?;
+        file_a.write_all(b"foobar")?;
+
+        // cannot read from a file opened in write-only mode
+        assert!(read_string(&mut file_a).is_err());
+
+        // Close the file and re-open for reading
+        let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?;
+
+        // cannot write to a file opened in read-only mode
+        assert!(file_a.write(b"bar").is_err());
+
+        // Try simple read
+        assert_eq!("foobar", read_string(&mut file_a)?);
+
+        // It's positioned at the EOF now.
+        assert_eq!("", read_string(&mut file_a)?);
+
+        // Test seeks.
+        assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
+        assert_eq!("oobar", read_string(&mut file_a)?);
+
+        assert_eq!(file_a.seek(SeekFrom::End(-2))?, 4);
+        assert_eq!("ar", read_string(&mut file_a)?);
+
+        assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
+        assert_eq!(file_a.seek(SeekFrom::Current(2))?, 3);
+        assert_eq!("bar", read_string(&mut file_a)?);
+
+        assert_eq!(file_a.seek(SeekFrom::Current(-5))?, 1);
+        assert_eq!("oobar", read_string(&mut file_a)?);
+
+        // Test erroneous seeks to before byte 0
+        assert!(file_a.seek(SeekFrom::End(-7)).is_err());
+        assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
+        assert!(file_a.seek(SeekFrom::Current(-2)).is_err());
+
+        // the erroneous seek should have left the position unchanged
+        assert_eq!("oobar", read_string(&mut file_a)?);
+
+        // Create another test file, and try FileExt functions on it.
+        let path_b = testdir.join("file_b");
+        let mut file_b = openfunc(
+            &path_b,
+            OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create(true)
+                .truncate(true),
+        )?;
+        file_b.write_all_at(b"BAR", 3)?;
+        file_b.write_all_at(b"FOO", 0)?;
+
+        assert_eq!(read_string_at(&mut file_b, 2, 3)?, "OBA");
+
+        // Open a lot of files, enough to cause some evictions. (Or to be precise,
+        // open the same file many times. The effect is the same.)
+        //
+        // leave file_a positioned at offset 1 before we start
+        assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
+
+        let mut vfiles = Vec::new();
+        for _ in 0..100 {
+            let mut vfile = openfunc(&path_b, OpenOptions::new().read(true))?;
+            assert_eq!("FOOBAR", read_string(&mut vfile)?);
+            vfiles.push(vfile);
+        }
+
+        // make sure we opened enough files to definitely cause evictions.
+        assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
+
+        // The underlying file descriptor for 'file_a' should be closed now. Try to read
+        // from it again. We left the file positioned at offset 1 above.
+        assert_eq!("oobar", read_string(&mut file_a)?);
+
+        // Check that all the other FDs still work too. Use them in random order for
+        // good measure.
+        vfiles.as_mut_slice().shuffle(&mut thread_rng());
+        for vfile in vfiles.iter_mut() {
+            assert_eq!("OOBAR", read_string_at(vfile, 1, 5)?);
+        }
+
+        Ok(())
+    }
+
+    /// Test using VirtualFiles from many threads concurrently. This tests both using
+    /// a lot of VirtualFiles concurrently, causing evictions, and also using the same
+    /// VirtualFile from multiple threads concurrently.
+    #[test]
+    fn test_vfile_concurrency() -> Result<(), Error> {
+        const SIZE: usize = 8 * 1024;
+        const VIRTUAL_FILES: usize = 100;
+        const THREADS: usize = 100;
+        const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
+
+        let testdir = crate::PageServerConf::test_repo_dir("vfile_concurrency");
+        std::fs::create_dir_all(&testdir)?;
+
+        // Create a test file.
+        let test_file_path = testdir.join("concurrency_test_file");
+        {
+            let file = File::create(&test_file_path)?;
+            file.write_all_at(&SAMPLE, 0)?;
+        }
+
+        // Open the file many times.
+        let mut files = Vec::new();
+        for _ in 0..VIRTUAL_FILES {
+            let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))?;
+            files.push(f);
+        }
+        let files = Arc::new(files);
+
+        // Launch many threads, and use the virtual files concurrently in random order.
+        let mut threads = Vec::new();
+        for threadno in 0..THREADS {
+            let builder =
+                thread::Builder::new().name(format!("test_vfile_concurrency thread {}", threadno));
+
+            let files = files.clone();
+            let thread = builder
+                .spawn(move || {
+                    let mut buf = [0u8; SIZE];
+                    let mut rng = rand::thread_rng();
+                    for _ in 1..1000 {
+                        let f = &files[rng.gen_range(0..files.len())];
+                        f.read_exact_at(&mut buf, 0).unwrap();
+                        assert!(buf == SAMPLE);
+                    }
+                })
+                .unwrap();
+            threads.push(thread);
+        }
+
+        for thread in threads {
+            thread.join().unwrap();
+        }
+
+        Ok(())
+    }
+}
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -8,6 +8,8 @@
 use crate::relish::*;
 use crate::restore_local_repo;
 use crate::tenant_mgr;
+use crate::tenant_mgr::TenantState;
+use crate::tenant_threads;
 use crate::waldecoder::*;
 use crate::PageServerConf;
 use anyhow::{bail, Error, Result};
@@ -38,6 +40,7 @@ use zenith_utils::zid::ZTimelineId;
 struct WalReceiverEntry {
    wal_producer_connstr: String,
    wal_receiver_handle: Option<JoinHandle<()>>,
+    tenantid: ZTenantId,
 }

 lazy_static! {
@@ -65,6 +68,23 @@ pub fn stop_wal_receiver(timelineid: ZTimelineId) {
    }
 }

+/// Forget the WAL receiver of `timelineid`; if the tenant has none left, mark it Idle.
+pub fn drop_wal_receiver(timelineid: ZTimelineId, tenantid: ZTenantId) {
+    let mut registry = WAL_RECEIVERS.lock().unwrap();
+    registry.remove(&timelineid);
+
+    // Was that the last walreceiver of the tenant?
+    // TODO: entries are stored one per timeline, so answering this
+    // means looking at every remaining entry.
+    let tenant_has_receivers = registry
+        .iter()
+        .any(|(_timelineid, entry)| entry.tenantid == tenantid);
+    if !tenant_has_receivers {
+        // The tenant's last walreceiver is gone: change its state to Idle.
+        tenant_mgr::set_tenant_state(tenantid, TenantState::Idle).unwrap();
+    }
+}
+
 // Launch a new WAL receiver, or tell one that's running about change in connection string
 pub fn launch_wal_receiver(
    conf: &'static PageServerConf,
@@ -90,8 +110,13 @@ pub fn launch_wal_receiver(
            let receiver = WalReceiverEntry {
                wal_producer_connstr: wal_producer_connstr.into(),
                wal_receiver_handle: Some(wal_receiver_handle),
+                tenantid,
            };
            receivers.insert(timelineid, receiver);
+
+            // Update tenant state and start tenant threads, if they are not running yet.
+            tenant_mgr::set_tenant_state(tenantid, TenantState::Active).unwrap();
+            tenant_threads::start_tenant_threads(conf, tenantid);
        }
    };
 }
@@ -114,11 +139,15 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
    let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
    info!("WAL receiver thread started");

+    let mut retry_count = 10;
+
    //
    // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
    // and start streaming WAL from it. If the connection is lost, keep retrying.
+    // TODO: How long should we keep retrying after losing the connection?
+    // Should we retry at all, or can we just wait for the next callmemaybe request?
    //
-    while !tenant_mgr::shutdown_requested() {
+    while !tenant_mgr::shutdown_requested() && retry_count > 0 {
        // Look up the current WAL producer address
        let wal_producer_connstr = get_wal_producer_connstr(timelineid);

@@ -129,10 +158,20 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
                "WAL streaming connection failed ({}), retrying in 1 second",
                e
            );
+            retry_count -= 1;
            sleep(Duration::from_secs(1));
+        } else {
+            info!(
+                "walreceiver disconnected tenant {}, timelineid {}",
+                tenantid, timelineid
+            );
+            break;
        }
    }
-    debug!("WAL streaming shut down");
+    info!("WAL streaming shut down");
+    // Drop it from the list of active WAL_RECEIVERS
+    // so that the next callmemaybe request launches a new thread
+    drop_wal_receiver(timelineid, tenantid);
 }

 fn walreceiver_main(
@@ -284,12 +323,14 @@ fn walreceiver_main(
        if let Some(last_lsn) = status_update {
            // TODO: More thought should go into what values are sent here.
            let last_lsn = PgLsn::from(u64::from(last_lsn));
-            let write_lsn = last_lsn;
+            // We are using disk consistent LSN as `write_lsn`, i.e. LSN at which page server
+            // may guarantee persistence of all received data. Safekeeper is now free to remove
+            // WAL preceding `write_lsn`: it should not be requested by this page server.
+            let write_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
            let flush_lsn = last_lsn;
            let apply_lsn = PgLsn::from(0);
            let ts = SystemTime::now();
            const NO_REPLY: u8 = 0;
-
            physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
        }

@@ -298,6 +339,7 @@ fn walreceiver_main(
            break;
        }
    }
+
    Ok(())
 }

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,23 +22,23 @@ use byteorder::{ByteOrder, LittleEndian};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use lazy_static::lazy_static;
 use log::*;
+use nix::poll::*;
 use serde::Serialize;
 use std::fs;
 use std::fs::OpenOptions;
 use std::io::prelude::*;
-use std::io::Error;
+use std::io::{Error, ErrorKind};
+use std::os::unix::io::AsRawFd;
 use std::path::PathBuf;
 use std::process::Stdio;
+use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
 use std::sync::Mutex;
 use std::time::Duration;
 use std::time::Instant;
-use tokio::io::AsyncBufReadExt;
-use tokio::io::{AsyncReadExt, AsyncWriteExt};
-use tokio::process::{ChildStdin, ChildStdout, Command};
-use tokio::time::timeout;
 use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
 use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;
+use zenith_utils::nonblock::set_nonblock;
 use zenith_utils::zid::ZTenantId;

 use crate::relish::*;
@@ -139,7 +139,6 @@ pub struct PostgresRedoManager {
    tenantid: ZTenantId,
    conf: &'static PageServerConf,

-    runtime: tokio::runtime::Runtime,
    process: Mutex<Option<PostgresRedoProcess>>,
 }

@@ -153,6 +152,13 @@ struct WalRedoRequest {
    records: Vec<(Lsn, WALRecord)>,
 }

+impl WalRedoRequest {
+    // Can this request be served by zenith redo functions,
+    // or do we need to pass it to the wal-redo postgres process?
+    fn can_apply_in_zenith(&self) -> bool {
+        !matches!(self.rel, RelishTag::Relation(_))
+    }
+}
 /// An error happened in WAL redo
 #[derive(Debug, thiserror::Error)]
 pub enum WalRedoError {
@@ -161,6 +167,8 @@ pub enum WalRedoError {

    #[error("cannot perform WAL redo now")]
    InvalidState,
+    #[error("cannot perform WAL redo for this request")]
+    InvalidRequest,
 }

 ///
@@ -182,7 +190,6 @@ impl WalRedoManager for PostgresRedoManager {
        records: Vec<(Lsn, WALRecord)>,
    ) -> Result<Bytes, WalRedoError> {
        let start_time;
-        let lock_time;
        let end_time;

        let request = WalRedoRequest {
@@ -194,26 +201,37 @@ impl WalRedoManager for PostgresRedoManager {
        };

        start_time = Instant::now();
-        let result = {
+        let result;
+
+        if request.can_apply_in_zenith() {
+            result = self.handle_apply_request_zenith(&request);
+
+            end_time = Instant::now();
+            WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
+        } else {
            let mut process_guard = self.process.lock().unwrap();
-            lock_time = Instant::now();
+            let lock_time = Instant::now();

            // launch the WAL redo process on first use
            if process_guard.is_none() {
-                let p = self
-                    .runtime
-                    .block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
+                let p = PostgresRedoProcess::launch(self.conf, &self.tenantid)?;
                *process_guard = Some(p);
            }
            let process = process_guard.as_mut().unwrap();

-            self.runtime
-                .block_on(self.handle_apply_request(process, &request))
-        };
-        end_time = Instant::now();
+            result = self.handle_apply_request_postgres(process, &request);

-        WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
-        WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
+            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
+            end_time = Instant::now();
+            WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
+
+            // If something went wrong, don't try to reuse the process. Kill it, and
+            // next request will launch a new one.
+            if result.is_err() {
+                let process = process_guard.take().unwrap();
+                process.kill();
+            }
+        }

        result
    }
@@ -224,17 +242,8 @@ impl PostgresRedoManager {
    /// Create a new PostgresRedoManager.
    ///
    pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
-        // We block on waiting for requests on the walredo request channel, but
-        // use async I/O to communicate with the child process. Initialize the
-        // runtime for the async part.
-        let runtime = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .unwrap();
-
        // The actual process is launched lazily, on first request.
        PostgresRedoManager {
-            runtime,
            tenantid,
            conf,
            process: Mutex::new(None),
@@ -242,13 +251,47 @@ impl PostgresRedoManager {
    }

    ///
-    /// Process one request for WAL redo.
+    /// Process one request for WAL redo using wal-redo postgres
    ///
-    async fn handle_apply_request(
+    fn handle_apply_request_postgres(
        &self,
        process: &mut PostgresRedoProcess,
        request: &WalRedoRequest,
    ) -> Result<Bytes, WalRedoError> {
+        // wal-redo-postgres only replays relation pages; refuse anything else.
+        let rel = match request.rel {
+            RelishTag::Relation(rel) => rel,
+            _ => return Err(WalRedoError::InvalidRequest),
+        };
+
+        let page_tag = BufferTag {
+            rel,
+            blknum: request.blknum,
+        };
+        let old_image = request.base_img.clone();
+        let wal_records = &request.records;
+
+        // Time the round trip to the WAL redo process for the debug log below.
+        let timer = Instant::now();
+        let redo_outcome: Result<Bytes, Error> =
+            process.apply_wal_records(page_tag, old_image, wal_records);
+        let elapsed = timer.elapsed();
+
+        debug!(
+            "postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
+            wal_records.len(),
+            elapsed.as_micros(),
+            request.lsn
+        );
+
+        // An I/O failure while talking to the process becomes WalRedoError::IoError.
+        redo_outcome.map_err(WalRedoError::IoError)
+    }
+
+    ///
+    /// Process one request for WAL redo using custom zenith code
+    ///
+    fn handle_apply_request_zenith(&self, request: &WalRedoRequest) -> Result<Bytes, WalRedoError> {
        let rel = request.rel;
        let blknum = request.blknum;
        let lsn = request.lsn;
@@ -260,178 +303,158 @@ impl PostgresRedoManager {
        let start = Instant::now();

        let apply_result: Result<Bytes, Error>;
-        if let RelishTag::Relation(rel) = rel {
-            // Relational WAL records are applied using wal-redo-postgres
-            let buf_tag = BufferTag { rel, blknum };
-            apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
+
+        // Non-relational WAL records are handled here, with custom code that has the
+        // same effects as the corresponding Postgres WAL redo function.
+        const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
+        let mut page = BytesMut::new();
+        if let Some(fpi) = base_img {
+            // If full-page image is provided, then use it...
+            page.extend_from_slice(&fpi[..]);
        } else {
-            // Non-relational WAL records are handled here, with custom code that has the
-            // same effects as the corresponding Postgres WAL redo function.
-            const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
-            let mut page = BytesMut::new();
-            if let Some(fpi) = base_img {
-                // If full-page image is provided, then use it...
-                page.extend_from_slice(&fpi[..]);
-            } else {
-                // otherwise initialize page with zeros
-                page.extend_from_slice(&ZERO_PAGE);
+            // otherwise initialize page with zeros
+            page.extend_from_slice(&ZERO_PAGE);
+        }
+        // Apply all collected WAL records
+        for (_lsn, record) in records {
+            let mut buf = record.rec.clone();
+
+            WAL_REDO_RECORD_COUNTER.inc();
+
+            // 1. Parse XLogRecord struct
+            // FIXME: refactor to avoid code duplication.
+            let xlogrec = XLogRecord::from_bytes(&mut buf);
+
+            //move to main data
+            // TODO probably, we should store some records in our special format
+            // to avoid this weird parsing on replay
+            let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
+            if buf.remaining() > skip {
+                buf.advance(skip);
            }
-            // Apply all collected WAL records
-            for (_lsn, record) in records {
-                let mut buf = record.rec.clone();

-                WAL_REDO_RECORD_COUNTER.inc();
-
-                // 1. Parse XLogRecord struct
-                // FIXME: refactor to avoid code duplication.
-                let xlogrec = XLogRecord::from_bytes(&mut buf);
-
-                //move to main data
-                // TODO probably, we should store some records in our special format
-                // to avoid this weird parsing on replay
-                let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
-                if buf.remaining() > skip {
-                    buf.advance(skip);
-                }
-
-                if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
-                    // Transaction manager stuff
-                    let rec_segno = match rel {
-                        RelishTag::Slru { slru, segno } => {
-                            assert!(
-                                slru == SlruKind::Clog,
-                                "Not valid XACT relish tag {:?}",
-                                rel
+            if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
+                // Transaction manager stuff
+                let rec_segno = match rel {
+                    RelishTag::Slru { slru, segno } => {
+                        assert!(
+                            slru == SlruKind::Clog,
+                            "Not valid XACT relish tag {:?}",
+                            rel
+                        );
+                        segno
+                    }
+                    _ => panic!("Not valid XACT relish tag {:?}", rel),
+                };
+                let parsed_xact =
+                    XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
+                if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
+                    || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
+                {
+                    transaction_id_set_status(
+                        parsed_xact.xid,
+                        pg_constants::TRANSACTION_STATUS_COMMITTED,
+                        &mut page,
+                    );
+                    for subxact in &parsed_xact.subxacts {
+                        let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                        let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                        // only update xids on the requested page
+                        if rec_segno == segno && blknum == rpageno {
+                            transaction_id_set_status(
+                                *subxact,
+                                pg_constants::TRANSACTION_STATUS_COMMITTED,
+                                &mut page,
                            );
-                            segno
-                        }
-                        _ => panic!("Not valid XACT relish tag {:?}", rel),
-                    };
-                    let parsed_xact =
-                        XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
-                    if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
-                        || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
-                    {
-                        transaction_id_set_status(
-                            parsed_xact.xid,
-                            pg_constants::TRANSACTION_STATUS_COMMITTED,
-                            &mut page,
-                        );
-                        for subxact in &parsed_xact.subxacts {
-                            let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                            // only update xids on the requested page
-                            if rec_segno == segno && blknum == rpageno {
-                                transaction_id_set_status(
-                                    *subxact,
-                                    pg_constants::TRANSACTION_STATUS_COMMITTED,
-                                    &mut page,
-                                );
-                            }
-                        }
-                    } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
-                        || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
-                    {
-                        transaction_id_set_status(
-                            parsed_xact.xid,
-                            pg_constants::TRANSACTION_STATUS_ABORTED,
-                            &mut page,
-                        );
-                        for subxact in &parsed_xact.subxacts {
-                            let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                            // only update xids on the requested page
-                            if rec_segno == segno && blknum == rpageno {
-                                transaction_id_set_status(
-                                    *subxact,
-                                    pg_constants::TRANSACTION_STATUS_ABORTED,
-                                    &mut page,
-                                );
-                            }
                        }
                    }
-                } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
-                    // Multixact operations
-                    let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                    if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
-                        let xlrec = XlMultiXactCreate::decode(&mut buf);
-                        if let RelishTag::Slru {
-                            slru,
-                            segno: rec_segno,
-                        } = rel
-                        {
-                            if slru == SlruKind::MultiXactMembers {
-                                for i in 0..xlrec.nmembers {
-                                    let pageno =
-                                        i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
-                                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                                    if segno == rec_segno && rpageno == blknum {
-                                        // update only target block
-                                        let offset = xlrec.moff + i;
-                                        let memberoff = mx_offset_to_member_offset(offset);
-                                        let flagsoff = mx_offset_to_flags_offset(offset);
-                                        let bshift = mx_offset_to_flags_bitshift(offset);
-                                        let mut flagsval =
-                                            LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
-                                        flagsval &= !(((1
-                                            << pg_constants::MXACT_MEMBER_BITS_PER_XACT)
-                                            - 1)
+                } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
+                    || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
+                {
+                    transaction_id_set_status(
+                        parsed_xact.xid,
+                        pg_constants::TRANSACTION_STATUS_ABORTED,
+                        &mut page,
+                    );
+                    for subxact in &parsed_xact.subxacts {
+                        let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                        let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                        // only update xids on the requested page
+                        if rec_segno == segno && blknum == rpageno {
+                            transaction_id_set_status(
+                                *subxact,
+                                pg_constants::TRANSACTION_STATUS_ABORTED,
+                                &mut page,
+                            );
+                        }
+                    }
+                }
+            } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
+                // Multixact operations
+                let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+                if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
+                    let xlrec = XlMultiXactCreate::decode(&mut buf);
+                    if let RelishTag::Slru {
+                        slru,
+                        segno: rec_segno,
+                    } = rel
+                    {
+                        if slru == SlruKind::MultiXactMembers {
+                            for i in 0..xlrec.nmembers {
+                                let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
+                                let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                                let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                                if segno == rec_segno && rpageno == blknum {
+                                    // update only target block
+                                    let offset = xlrec.moff + i;
+                                    let memberoff = mx_offset_to_member_offset(offset);
+                                    let flagsoff = mx_offset_to_flags_offset(offset);
+                                    let bshift = mx_offset_to_flags_bitshift(offset);
+                                    let mut flagsval =
+                                        LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
+                                    flagsval &=
+                                        !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
                                            << bshift);
-                                        flagsval |= xlrec.members[i as usize].status << bshift;
-                                        LittleEndian::write_u32(
-                                            &mut page[flagsoff..flagsoff + 4],
-                                            flagsval,
-                                        );
-                                        LittleEndian::write_u32(
-                                            &mut page[memberoff..memberoff + 4],
-                                            xlrec.members[i as usize].xid,
-                                        );
-                                    }
+                                    flagsval |= xlrec.members[i as usize].status << bshift;
+                                    LittleEndian::write_u32(
+                                        &mut page[flagsoff..flagsoff + 4],
+                                        flagsval,
+                                    );
+                                    LittleEndian::write_u32(
+                                        &mut page[memberoff..memberoff + 4],
+                                        xlrec.members[i as usize].xid,
+                                    );
                                }
-                            } else {
-                                // Multixact offsets SLRU
-                                let offs = (xlrec.mid
-                                    % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
-                                    * 4) as usize;
-                                LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
                            }
                        } else {
-                            panic!();
+                            // Multixact offsets SLRU
+                            let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
+                                * 4) as usize;
+                            LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
                        }
                    } else {
                        panic!();
                    }
+                } else {
+                    panic!();
                }
            }
-
-            apply_result = Ok::<Bytes, Error>(page.freeze());
        }

+        apply_result = Ok::<Bytes, Error>(page.freeze());
+
        let duration = start.elapsed();

-        let result: Result<Bytes, WalRedoError>;
-
        debug!(
-            "applied {} WAL records in {} ms to reconstruct page image at LSN {}",
+            "zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}",
            nrecords,
            duration.as_millis(),
            lsn
        );

-        if let Err(e) = apply_result {
-            error!("could not apply WAL records: {:#}", e);
-            result = Err(WalRedoError::IoError(e));
-        } else {
-            let img = apply_result.unwrap();
-
-            result = Ok(img);
-        }
-
-        // The caller is responsible for sending the response
-        result
+        apply_result.map_err(WalRedoError::IoError)
    }
 }

@@ -439,18 +462,17 @@ impl PostgresRedoManager {
 /// Handle to the Postgres WAL redo process
 ///
 struct PostgresRedoProcess {
+    child: Child, // process handle; `kill()` terminates and reaps it
    stdin: ChildStdin,
    stdout: ChildStdout,
+    stderr: ChildStderr, // child's stderr pipe, taken from `child` in `launch()`
 }

 impl PostgresRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
-    async fn launch(
-        conf: &PageServerConf,
-        tenantid: &ZTenantId,
-    ) -> Result<PostgresRedoProcess, Error> {
+    fn launch(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<PostgresRedoProcess, Error> {
        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
        // just create one with constant name. That fails if you try to launch more than
        // one WAL redo manager concurrently.
@@ -471,7 +493,6 @@ impl PostgresRedoProcess {
            .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
            .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
            .output()
-            .await
            .expect("failed to execute initdb");

        if !initdb.status.success() {
@@ -508,102 +529,139 @@ impl PostgresRedoProcess {
            datadir.display()
        );

-        let stdin = child.stdin.take().expect("failed to open child's stdin");
-        let stderr = child.stderr.take().expect("failed to open child's stderr");
-        let stdout = child.stdout.take().expect("failed to open child's stdout");
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();

-        // This async block reads the child's stderr, and forwards it to the logger
-        let f_stderr = async {
-            let mut stderr_buffered = tokio::io::BufReader::new(stderr);
+        set_nonblock(stdin.as_raw_fd())?;
+        set_nonblock(stdout.as_raw_fd())?;
+        set_nonblock(stderr.as_raw_fd())?;

-            let mut line = String::new();
-            loop {
-                let res = stderr_buffered.read_line(&mut line).await;
-                if res.is_err() {
-                    debug!("could not convert line to utf-8");
-                    continue;
-                }
-                if res.unwrap() == 0 {
-                    break;
-                }
-                error!("wal-redo-postgres: {}", line.trim());
-                line.clear();
-            }
-            Ok::<(), Error>(())
-        };
-        tokio::spawn(f_stderr);
+        Ok(PostgresRedoProcess {
+            child,
+            stdin,
+            stdout,
+            stderr,
+        })
+    }

-        Ok(PostgresRedoProcess { stdin, stdout })
+    // Kill the child (best effort), reap it and log its exit status; consumes `self`.
+    fn kill(mut self) {
+        self.child.kill().ok();
+        if let Ok(status) = self.child.wait() {
+            error!("wal-redo-postgres exited with code {}", status);
+        }
+    }

    //
    // Apply given WAL records ('records') over an old page image. Returns
    // new page image.
    //
-    async fn apply_wal_records(
+    fn apply_wal_records(
        &mut self,
        tag: BufferTag,
        base_img: Option<Bytes>,
        records: &[(Lsn, WALRecord)],
    ) -> Result<Bytes, std::io::Error> {
-        let stdout = &mut self.stdout;
-        // Buffer the writes to avoid a lot of small syscalls.
-        let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        let mut writebuf: Vec<u8> = Vec::new();
+        build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            build_push_page_msg(tag, &img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            build_apply_record_msg(*lsn, &rec.rec, &mut writebuf);
+        }
+        build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        // The input is now in 'writebuf'. Do a blind write first, writing as much as
+        // we can, before calling poll(). That skips one call to poll() if the stdin is
+        // already available for writing, which it almost certainly is because the
+        // process is idle.
+        let mut nwrite = self.stdin.write(&writebuf)?;
+
+        // We expect the WAL redo process to respond with an 8k page image. We read it
+        // into this buffer.
+        let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()];
+        let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
+
+        // Prepare for calling poll()
+        let mut pollfds = [
+            PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN),
+            PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN),
+            PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT),
+        ];

        // We do three things simultaneously: send the old base image and WAL records to
        // the child process's stdin, read the result from child's stdout, and forward any logging
        // information that the child writes to its stderr to the page server's log.
-        //
-        // 'f_stdin' handles writing the base image and WAL records to the child process.
-        // 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
-        // tokio runtime in the 'launch' function already, forwards the logging.
-        let f_stdin = async {
-            // Send base image, if any. (If the record initializes the page, previous page
-            // version is not needed.)
-            timeout(
-                TIMEOUT,
-                stdin.write_all(&build_begin_redo_for_block_msg(tag)),
-            )
-            .await??;
-            if let Some(img) = base_img {
-                timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
+        while nresult < pg_constants::BLCKSZ.into() {
+            // If we have more data to write, wake up if 'stdin' becomes writeable or
+            // we have data to read. Otherwise only wake up if there's data to read.
+            let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
+            let n = nix::poll::poll(&mut pollfds[0..nfds], TIMEOUT.as_millis() as i32)?;
+
+            if n == 0 {
+                return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
            }

-            // Send WAL records.
-            for (lsn, rec) in records.iter() {
-                WAL_REDO_RECORD_COUNTER.inc();
+            // If we have some messages in stderr, forward them to the log.
+            let err_revents = pollfds[1].revents().unwrap();
+            if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
+                let mut errbuf: [u8; 16384] = [0; 16384];
+                let n = self.stderr.read(&mut errbuf)?;

-                stdin
-                    .write_all(&build_apply_record_msg(*lsn, &rec.rec))
-                    .await?;
+                // The message might not be split correctly into lines here. But this is
+                // good enough, the important thing is to get the message to the log.
+                if n > 0 {
+                    error!(
+                        "wal-redo-postgres: {}",
+                        String::from_utf8_lossy(&errbuf[0..n])
+                    );

-                //debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
-                //       r.lsn >> 32, r.lsn & 0xffff_ffff);
+                    // To make sure we capture all log from the process if it fails, keep
+                    // reading from the stderr, before checking the stdout.
+                    continue;
+                }
+            } else if err_revents.contains(PollFlags::POLLHUP) {
+                return Err(Error::new(
+                    ErrorKind::BrokenPipe,
+                    "WAL redo process closed its stderr unexpectedly",
+                ));
            }
-            //debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
-            //       records.len(), lsn >> 32, lsn & 0xffff_ffff);

-            // Send GetPage command to get the result back
-            timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
-            timeout(TIMEOUT, stdin.flush()).await??;
-            //debug!("sent GetPage for {}", tag.blknum);
-            Ok::<(), Error>(())
-        };
+            // If we have more data to write and 'stdin' is writeable, do write.
+            if nwrite < writebuf.len() {
+                let in_revents = pollfds[2].revents().unwrap();
+                if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
+                    nwrite += self.stdin.write(&writebuf[nwrite..])?;
+                } else if in_revents.contains(PollFlags::POLLHUP) {
+                    // We still have more data to write, but the process closed the pipe.
+                    return Err(Error::new(
+                        ErrorKind::BrokenPipe,
+                        "WAL redo process closed its stdin unexpectedly",
+                    ));
+                }
+            }

-        // Read back new page image
-        let f_stdout = async {
-            let mut buf = [0u8; 8192];
+            // If we have some data in stdout, read it to the result buffer.
+            let out_revents = pollfds[0].revents().unwrap();
+            if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
+                nresult += self.stdout.read(&mut resultbuf[nresult..])?;
+            } else if out_revents.contains(PollFlags::POLLHUP) {
+                return Err(Error::new(
+                    ErrorKind::BrokenPipe,
+                    "WAL redo process closed its stdout unexpectedly",
+                ));
+            }
+        }

-            timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
-            //debug!("got response for {}", tag.blknum);
-            Ok::<[u8; 8192], Error>(buf)
-        };
-
-        let res = tokio::try_join!(f_stdout, f_stdin)?;
-
-        let buf = res.0;
-
-        Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
+        Ok(Bytes::from(resultbuf))
    }
 }

@@ -611,62 +669,42 @@ impl PostgresRedoProcess {
 // process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
 // explanation of the protocol.

-fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
+fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
    let len = 4 + 1 + 4 * 4;
-    let mut buf = Vec::with_capacity(1 + len);

    buf.put_u8(b'B');
    buf.put_u32(len as u32);

-    tag.ser_into(&mut buf)
+    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
-
-    debug_assert!(buf.len() == 1 + len);
-
-    buf
 }

-fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
+fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
    assert!(base_img.len() == 8192);

    let len = 4 + 1 + 4 * 4 + base_img.len();
-    let mut buf = Vec::with_capacity(1 + len);

    buf.put_u8(b'P');
    buf.put_u32(len as u32);
-    tag.ser_into(&mut buf)
+    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
    buf.put(base_img);
-
-    debug_assert!(buf.len() == 1 + len);
-
-    buf
 }

-fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
+fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
    let len = 4 + 8 + rec.len();
-    let mut buf: Vec<u8> = Vec::with_capacity(1 + len);

    buf.put_u8(b'A');
    buf.put_u32(len as u32);
    buf.put_u64(endlsn.0);
    buf.put(rec);
-
-    debug_assert!(buf.len() == 1 + len);
-
-    buf
 }

-fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
+fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
    let len = 4 + 1 + 4 * 4;
-    let mut buf = Vec::with_capacity(1 + len);

    buf.put_u8(b'G');
    buf.put_u32(len as u32);
-    tag.ser_into(&mut buf)
+    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
-
-    debug_assert!(buf.len() == 1 + len);
-
-    buf
 }
--- a/postgres_ffi/src/xlog_utils.rs
+++ b/postgres_ffi/src/xlog_utils.rs
@@ -43,6 +43,9 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
 #[allow(clippy::identity_op)]
 pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;

+// PG timeline is always 1, changing it doesn't have useful meaning in Zenith.
+pub const PG_TLI: u32 = 1;
+
 pub type XLogRecPtr = u64;
 pub type TimeLineID = u32;
 pub type TimestampTz = i64;
@@ -421,7 +424,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
            XLogPageHeaderData {
                xlp_magic: XLOG_PAGE_MAGIC as u16,
                xlp_info: pg_constants::XLP_LONG_HEADER,
-                xlp_tli: 1, // FIXME: always use Postgres timeline 1
+                xlp_tli: PG_TLI,
                xlp_pageaddr: pageaddr,
                xlp_rem_len: 0,
                ..Default::default() // Put 0 in padding fields.
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -37,20 +37,27 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
    return cmd


+def yapf(fix_inplace: bool) -> str:
+    """Build the yapf command line.
+
+    With fix_inplace the flag is --in-place, otherwise --diff.
+    """
+    mode_flag = "--in-place" if fix_inplace else "--diff"
+    return f"pipenv run yapf --recursive {mode_flag}"
+
+
+def mypy() -> str:
+    return "pipenv run mypy"
+
+
 def get_commit_files() -> List[str]:
-    files = subprocess.check_output(
-        "git diff --cached --name-only --diff-filter=ACM".split()
-    )
+    files = subprocess.check_output("git diff --cached --name-only --diff-filter=ACM".split())
    return files.decode().splitlines()


-def check(
-    name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False
-):
+def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False):
    print(f"Checking: {name} ", end="")
-    applicable_files = list(
-        filter(lambda fname: fname.strip().endswith(suffix), changed_files)
-    )
+    applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files))
    if not applicable_files:
        print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color))
        return
@@ -59,7 +66,14 @@ def check(
    res = subprocess.run(cmd.split(), capture_output=True)
    if res.returncode != 0:
        print(colorify("[FAILED]", Color.RED, no_color))
-        print("Please inspect the output below and run make fmt to fix automatically\n")
+        if name == "mypy":
+            print("Please inspect the output below and fix type mismatches.")
+        else:
+            print("Please inspect the output below and run make fmt to fix automatically.")
+        if suffix == ".py":
+            print("If the output is empty, ensure that you've installed Python tooling by\n"
+                  "running 'pipenv install --dev' in the current directory (no root needed)")
+        print()
        print(res.stdout.decode())
        exit(1)

@@ -68,12 +82,11 @@ def check(

 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--fix-inplace", action="store_true", help="apply fixes inplace"
-    )
-    parser.add_argument(
-        "--no-color", action="store_true", help="disable colored output", default=not sys.stdout.isatty()
-    )
+    parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace")
+    parser.add_argument("--no-color",
+                        action="store_true",
+                        help="disable colored output",
+                        default=not sys.stdout.isatty())
    args = parser.parse_args()

    files = get_commit_files()
@@ -87,3 +100,17 @@ if __name__ == "__main__":
        changed_files=files,
        no_color=args.no_color,
    )
+    check(
+        name="yapf",
+        suffix=".py",
+        cmd=yapf(fix_inplace=args.fix_inplace),
+        changed_files=files,
+        no_color=args.no_color,
+    )
+    check(
+        name="mypy",
+        suffix=".py",
+        cmd=mypy(),
+        changed_files=files,
+        no_color=args.no_color,
+    )
--- a/proxy/src/cplane_api.rs
+++ b/proxy/src/cplane_api.rs
@@ -12,7 +12,14 @@ pub struct DatabaseInfo {
    pub port: u16,
    pub dbname: String,
    pub user: String,
-    pub password: String,
+    pub password: Option<String>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct ProxyAuthResult {
+    pub ready: bool,
+    pub error: Option<String>,
+    pub conn_info: Option<DatabaseInfo>,
 }

 impl DatabaseInfo {
@@ -24,12 +31,23 @@ impl DatabaseInfo {
            .next()
            .ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
    }
+}

-    pub fn conn_string(&self) -> String {
-        format!(
-            "dbname={} user={} password={}",
-            self.dbname, self.user, self.password
-        )
+impl From<DatabaseInfo> for tokio_postgres::Config {
+    fn from(db_info: DatabaseInfo) -> Self {
+        let mut config = tokio_postgres::Config::new();
+
+        config
+            .host(&db_info.host)
+            .port(db_info.port)
+            .dbname(&db_info.dbname)
+            .user(&db_info.user);
+
+        if let Some(password) = db_info.password {
+            config.password(password);
+        }
+
+        config
    }
 }

@@ -44,22 +62,25 @@ impl CPlaneApi {
        database: &str,
        md5_response: &[u8],
        salt: &[u8; 4],
-    ) -> Result<DatabaseInfo> {
+        psql_session_id: &str,
+    ) -> Result<ProxyAuthResult> {
        let mut url = reqwest::Url::parse(self.auth_endpoint)?;
        url.query_pairs_mut()
            .append_pair("login", user)
            .append_pair("database", database)
            .append_pair("md5response", std::str::from_utf8(md5_response)?)
-            .append_pair("salt", &hex::encode(salt));
+            .append_pair("salt", &hex::encode(salt))
+            .append_pair("psql_session_id", psql_session_id);

        println!("cplane request: {}", url.as_str());

        let resp = reqwest::blocking::get(url)?;

        if resp.status().is_success() {
-            let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
-            println!("got conn info: #{:?}", conn_info);
-            Ok(conn_info)
+            let auth_info: ProxyAuthResult = serde_json::from_str(resp.text()?.as_str())?;
+            println!("got auth info: #{:?}", auth_info);
+
+            Ok(auth_info)
        } else {
            bail!("Auth failed")
        }
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -7,7 +7,7 @@
 ///
 use std::{
    collections::HashMap,
-    net::{SocketAddr, TcpListener},
+    net::SocketAddr,
    sync::{mpsc, Arc, Mutex},
    thread,
 };
@@ -17,6 +17,7 @@ use clap::{App, Arg, ArgMatches};

 use cplane_api::DatabaseInfo;
 use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
+use zenith_utils::{tcp_listener, GIT_VERSION};

 mod cplane_api;
 mod mgmt;
@@ -77,6 +78,7 @@ fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result<Option<Arc<ServerCo

 fn main() -> anyhow::Result<()> {
    let arg_matches = App::new("Zenith proxy/router")
+        .version(GIT_VERSION)
        .arg(
            Arg::with_name("proxy")
                .short("p")
@@ -138,25 +140,27 @@ fn main() -> anyhow::Result<()> {
    };
    let state: &'static ProxyState = Box::leak(Box::new(state));

+    println!("Version: {}", GIT_VERSION);
+
    // Check that we can bind to address before further initialization
    println!("Starting proxy on {}", state.conf.proxy_address);
-    let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;
+    let pageserver_listener = tcp_listener::bind(state.conf.proxy_address)?;

    println!("Starting mgmt on {}", state.conf.mgmt_address);
-    let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
+    let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?;

-    let threads = vec![
+    let threads = [
        // Spawn a thread to listen for connections. It will spawn further threads
        // for each connection.
        thread::Builder::new()
-            .name("Proxy thread".into())
+            .name("Listener thread".into())
            .spawn(move || proxy::thread_main(state, pageserver_listener))?,
        thread::Builder::new()
            .name("Mgmt thread".into())
            .spawn(move || mgmt::thread_main(state, mgmt_listener))?,
    ];

-    for t in threads.into_iter() {
+    for t in threads {
        t.join().unwrap()?;
    }

--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -6,7 +6,6 @@ use anyhow::bail;
 use tokio_postgres::NoTls;

 use rand::Rng;
-use std::io::Write;
 use std::{io, sync::mpsc::channel, thread};
 use zenith_utils::postgres_backend::Stream;
 use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
@@ -28,11 +27,13 @@ pub fn thread_main(
        println!("accepted connection from {}", peer_addr);
        socket.set_nodelay(true).unwrap();

-        thread::spawn(move || {
-            if let Err(err) = proxy_conn_main(state, socket) {
-                println!("error: {}", err);
-            }
-        });
+        thread::Builder::new()
+            .name("Proxy thread".into())
+            .spawn(move || {
+                if let Err(err) = proxy_conn_main(state, socket) {
+                    println!("error: {}", err);
+                }
+            })?;
    }
 }

@@ -74,8 +75,12 @@ pub fn proxy_conn_main(
    // This will set conn.existing_user and we can decide on next actions
    conn.handle_startup()?;

+    let mut psql_session_id_buf = [0u8; 8];
+    rand::thread_rng().fill(&mut psql_session_id_buf);
+    conn.psql_session_id = hex::encode(psql_session_id_buf);
+
    // both scenarious here should end up producing database connection string
-    let db_info = if conn.is_existing_user() {
+    let conn_info = if conn.is_existing_user() {
        conn.handle_existing_user()?
    } else {
        conn.handle_new_user()?
@@ -83,7 +88,7 @@ pub fn proxy_conn_main(

    // XXX: move that inside handle_new_user/handle_existing_user to be able to
    // report wrong connection error.
-    proxy_pass(conn.pgb, db_info)
+    proxy_pass(conn.pgb, conn_info)
 }

 impl ProxyConnection {
@@ -155,9 +160,25 @@ impl ProxyConnection {
        Ok(())
    }

+    /// Wait for the proxy kick from the console that carries the conninfo.
+    fn wait_for_conninfo(&mut self) -> anyhow::Result<DatabaseInfo> {
+        let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
+        let _ = self
+            .state
+            .waiters
+            .lock()
+            .unwrap()
+            .insert(self.psql_session_id.clone(), tx);
+
+        // Block until a result arrives on the channel registered above.
+        // TODO: respond with error to client
+        rx.recv()?
+    }
+
    fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
        // ask password
        rand::thread_rng().fill(&mut self.md5_salt);
+
        self.pgb
            .write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
        self.pgb.state = ProtoState::Authentication; // XXX
@@ -180,14 +201,41 @@ impl ProxyConnection {
                self.database.as_str(),
                md5_response,
                &self.md5_salt,
+                &self.psql_session_id,
            ) {
                Err(e) => {
-                    self.pgb
-                        .write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
+                    self.pgb.write_message(&BeMessage::ErrorResponse(format!(
+                        "cannot authenticate proxy: {}",
+                        e
+                    )))?;

                    bail!("auth failed: {}", e);
                }
-                Ok(conn_info) => {
+
+                Ok(auth_info) => {
+                    let conn_info = if auth_info.ready {
+                        // Cluster is ready, so just take `conn_info` and respond to the client.
+                        auth_info
+                            .conn_info
+                            .expect("conn_info should be provided with ready cluster")
+                    } else {
+                        match auth_info.error {
+                            Some(e) => {
+                                self.pgb.write_message(&BeMessage::ErrorResponse(format!(
+                                    "cannot authenticate proxy: {}",
+                                    e
+                                )))?;
+
+                                bail!("auth failed: {}", e);
+                            }
+                            None => {
+                                // Cluster exists, but isn't active, await its start and proxy kick
+                                // with `conn_info`.
+                                self.wait_for_conninfo()?
+                            }
+                        }
+                    };
+
                    self.pgb
                        .write_message_noflush(&BeMessage::AuthenticationOk)?;
                    self.pgb
@@ -203,10 +251,6 @@ impl ProxyConnection {
    }

    fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
-        let mut psql_session_id_buf = [0u8; 8];
-        rand::thread_rng().fill(&mut psql_session_id_buf);
-        self.psql_session_id = hex::encode(psql_session_id_buf);
-
        let hello_message = format!("☀️  Welcome to Zenith!

 To proceed with database creation, open the following link:
@@ -225,76 +269,83 @@ databases without opening the browser.
        self.pgb
            .write_message(&BeMessage::NoticeResponse(hello_message))?;

-        // await for database creation
-        let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
-        let _ = self
-            .state
-            .waiters
-            .lock()
-            .unwrap()
-            .insert(self.psql_session_id.clone(), tx);
-
-        // Wait for web console response
-        // XXX: respond with error to client
-        let dbinfo = rx.recv()??;
+        // We requested the DB creation from the console. Now wait for conninfo
+        let conn_info = self.wait_for_conninfo()?;

        self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
            "Connecting to database.".to_string(),
        ))?;
        self.pgb.write_message(&BeMessage::ReadyForQuery)?;

-        Ok(dbinfo)
+        Ok(conn_info)
    }
 }

 /// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
 async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
    let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
-    let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
+    let config = tokio_postgres::Config::from(db_info);
    let _ = config.connect_raw(&mut socket, NoTls).await?;
    Ok(socket)
 }

 /// Concurrently proxy both directions of the client and server connections
 fn proxy(
-    client_read: ReadStream,
-    client_write: WriteStream,
-    server_read: ReadStream,
-    server_write: WriteStream,
+    (client_read, client_write): (ReadStream, WriteStream),
+    (server_read, server_write): (ReadStream, WriteStream),
 ) -> anyhow::Result<()> {
-    fn do_proxy(mut reader: ReadStream, mut writer: WriteStream) -> io::Result<()> {
-        std::io::copy(&mut reader, &mut writer)?;
-        writer.flush()?;
-        writer.shutdown(std::net::Shutdown::Both)
+    fn do_proxy(mut reader: impl io::Read, mut writer: WriteStream) -> io::Result<u64> {
+        /// FlushWriter will make sure that every message is sent as soon as possible
+        struct FlushWriter<W>(W);
+
+        impl<W: io::Write> io::Write for FlushWriter<W> {
+            fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+                // `std::io::copy` is guaranteed to exit if we return an error,
+                // so we can afford to lose `res` in case `flush` fails
+                let res = self.0.write(buf);
+                if res.is_ok() {
+                    self.0.flush()?;
+                }
+                res
+            }
+
+            fn flush(&mut self) -> io::Result<()> {
+                self.0.flush()
+            }
+        }
+
+        let res = std::io::copy(&mut reader, &mut FlushWriter(&mut writer));
+        writer.shutdown(std::net::Shutdown::Both)?;
+        res
    }

    let client_to_server_jh = thread::spawn(move || do_proxy(client_read, server_write));

-    let res1 = do_proxy(server_read, client_write);
-    let res2 = client_to_server_jh.join().unwrap();
-    res1?;
-    res2?;
+    do_proxy(server_read, client_write)?;
+    client_to_server_jh.join().unwrap()?;

    Ok(())
 }

 /// Proxy a client connection to a postgres database
 fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()?;
-    let db_stream = runtime.block_on(connect_to_db(db_info))?;
-    let db_stream = db_stream.into_std()?;
-    db_stream.set_nonblocking(false)?;
+    let db_stream = {
+        // We'll get rid of this once migration to async is complete
+        let runtime = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()?;

-    let db_stream = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
-    let (db_read, db_write) = db_stream.split();
+        let stream = runtime.block_on(connect_to_db(db_info))?.into_std()?;
+        stream.set_nonblocking(false)?;
+        stream
+    };

-    let stream = match pgb.into_stream() {
+    let db = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
+
+    let client = match pgb.into_stream() {
        Stream::Bidirectional(bidi_stream) => bidi_stream,
        _ => bail!("invalid stream"),
    };

-    let (client_read, client_write) = stream.split();
-    proxy(client_read, client_write, db_read, db_write)
+    proxy(client.split(), db.split())
 }
--- a/test_runner/pytest.ini
+++ b/test_runner/pytest.ini
@@ -1,4 +1,9 @@
 [pytest]
+addopts =
+    -m 'not remote_cluster'
+markers =
+    remote_cluster
 minversion = 6.0
 log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
 log_date_format = %Y-%m-%d %H:%M:%S
+log_cli = true
--- a/scripts/generate_perf_report_page.py
+++ b/scripts/generate_perf_report_page.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+import argparse
+from dataclasses import dataclass
+from pathlib import Path
+import json
+from typing import Any, Dict, List, Optional, Tuple, cast
+from jinja2 import Template
+
+# skip 'input' columns. They are already shown in the header and would just bloat the table
+EXCLUDE_COLUMNS = frozenset({
+    'scale',
+    'duration',
+    'number_of_clients',
+    'number_of_threads',
+    'init_start_timestamp',
+    'init_end_timestamp',
+    'run_start_timestamp',
+    'run_end_timestamp',
+})
+
+KEY_EXCLUDE_FIELDS = frozenset({
+    'init_start_timestamp',
+    'init_end_timestamp',
+    'run_start_timestamp',
+    'run_end_timestamp',
+})
+NEGATIVE_COLOR = 'negative'
+POSITIVE_COLOR = 'positive'
+
+
+@dataclass
+class SuitRun:
+    revision: str
+    values: Dict[str, Any]
+
+
+@dataclass
+class SuitRuns:
+    platform: str
+    suit: str
+    common_columns: List[Tuple[str, str]]
+    value_columns: List[str]
+    runs: List[SuitRun]
+
+
+@dataclass
+class RowValue:
+    value: str
+    color: str
+    ratio: str
+
+
+def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]:
+    """Split result items into sorted (name, value) test params and sorted metric names."""
+    # items named in KEY_EXCLUDE_FIELDS (the timestamps) never become columns
+    shown = [item for item in values if item['name'] not in KEY_EXCLUDE_FIELDS]
+    common_columns = sorted(
+        ((cast(str, item['name']), cast(str, item['value']))
+         for item in shown
+         if item['report'] == 'test_param'),
+        key=lambda pair: pair[0],  # sort by name
+    )
+    value_columns = sorted(
+        cast(str, item['name']) for item in shown if item['report'] != 'test_param')
+    return common_columns, value_columns
+
+
+def format_ratio(ratio: float, report: str) -> Tuple[str, str]:
+    """Format a relative change for the HTML report.
+
+    Returns (text, css_class); css_class is '' when the cell is not highlighted.
+    Raises ValueError for an unknown report type once the change reaches 5%.
+    """
+    color = ''
+    sign = '+' if ratio > 0 else ''
+    # changes below 5% are shown without highlighting
+    if abs(ratio) < 0.05:
+        # '&nbsp;' needs its semicolon to be a well-formed character reference
+        return f'&nbsp;({sign}{ratio:.2f})', color
+
+    if report not in {'test_param', 'higher_is_better', 'lower_is_better'}:
+        raise ValueError(f'Unknown report type: {report}')
+    if report == 'test_param':
+        return f'{ratio:.2f}', color
+
+    if ratio > 0:
+        color = POSITIVE_COLOR if report == 'higher_is_better' else NEGATIVE_COLOR
+    elif ratio < 0:
+        color = NEGATIVE_COLOR if report == 'higher_is_better' else POSITIVE_COLOR
+
+    return f'&nbsp;({sign}{ratio:.2f})', color
+
+
+def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]:
+    """Return the first data item of suit_run called name, or None if there is none."""
+    matches = (item for item in suit_run.values['data'] if item['name'] == name)
+    found = next(matches, None)
+    return None if found is None else cast(Dict[str, Any], found)
+
+
+def get_row_values(columns: List[str], run_result: SuitRun,
+                   prev_result: Optional[SuitRun]) -> List[RowValue]:
+    """Build the cells of one table row, each compared with prev_result when given."""
+    row_values = []
+    for column in columns:
+        current_value = extract_value(column, run_result)
+        if current_value is None:
+            # should never happen
+            raise ValueError(f'{column} not found in {run_result.values}')
+        raw = current_value["value"]
+        # round for display only; the ratio below must use the unrounded number
+        value = f'{raw:.2f}' if isinstance(raw, float) else raw
+        if prev_result is None:
+            row_values.append(RowValue(value, '', ''))
+            continue
+        prev_value = extract_value(column, prev_result)
+        if prev_value is None:
+            # this might happen when new metric is added and there is no value for it in previous run
+            # let this be here, TODO add proper handling when this actually happens
+            raise ValueError(f'{column} not found in previous result')
+        base = float(prev_value['value'])
+        # no relative change can be computed against a zero baseline: show the value alone
+        ratio_display, color = (format_ratio(float(raw) / base - 1, current_value['report'])
+                                if base else ('', ''))
+        row_values.append(RowValue(value, color, ratio_display))
+    return row_values
+
+
+@dataclass
+class SuiteRunTableRow:
+    revision: str
+    values: List[RowValue]
+
+
+def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]:
+    """Turn runs into table rows, comparing every run with the one right before it."""
+    # the first run has no predecessor, hence the leading None
+    predecessors: List[Optional[SuitRun]] = [None, *runs[:-1]]
+    return [
+        SuiteRunTableRow(
+            revision=run.revision,
+            values=get_row_values(value_columns, run, prev_run),
+        ) for run, prev_run in zip(runs, predecessors)
+    ]
+
+
+def main(args: argparse.Namespace) -> None:
+    """Render every <ctr>_<rev>.json report found in args.input_dir into args.out."""
+    input_dir = Path(args.input_dir)
+    # Runs are grouped per (platform, suit); a tuple key cannot collide the way
+    # concatenated names can. Files are visited in counter order, oldest first.
+    grouped_runs: Dict[Tuple[str, str], SuitRuns] = {}
+    for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split('_')[0])):
+        run_data = json.loads(item.read_text())
+        revision = run_data['revision']
+        for suit_result in run_data['result']:
+            key = (run_data['platform'], suit_result['suit'])
+            # pack total duration as a synthetic value
+            suit_result['data'].append({
+                'name': 'total_duration',
+                'value': suit_result['total_duration'],
+                'unit': 's',
+                'report': 'lower_is_better',
+            })
+            common_columns, value_columns = get_columns(suit_result['data'])
+
+            grouped_runs.setdefault(
+                key,
+                SuitRuns(
+                    platform=run_data['platform'],
+                    suit=suit_result['suit'],
+                    common_columns=common_columns,
+                    value_columns=value_columns,
+                    runs=[],
+                ),
+            ).runs.append(SuitRun(revision=revision, values=suit_result))
+
+    context: Dict[str, Dict[str, Any]] = {}
+    for result in grouped_runs.values():
+        # The template titles each table by its key. Keep the plain suit name when
+        # it is free; add the platform only to avoid overwriting another table.
+        title = result.suit
+        if title in context:
+            title = f'{result.suit} ({result.platform})'
+        context[title] = {
+            'common_columns': result.common_columns,
+            'value_columns': result.value_columns,
+            'platform': result.platform,
+            # reverse the order so newest results are on top of the table
+            'rows': reversed(prepare_rows_from_runs(result.value_columns, result.runs)),
+        }
+
+    template = Template((Path(__file__).parent / 'perf_report_template.html').read_text())
+    Path(args.out).write_text(template.render(context=context))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--input-dir',
+        dest='input_dir',
+        required=True,
+        help='Directory with jsons generated by the test suite',
+    )
+    parser.add_argument('--out', required=True, help='Output html file path')
+    args = parser.parse_args()
+    main(args)
--- a/scripts/perf_report_template.html
+++ b/scripts/perf_report_template.html
@@ -0,0 +1,52 @@
+<!DOCTYPE html>
+<html>
+
+<body>
+    <style>
+        table,
+        th,
+        td {
+            border: 1px solid black;
+            border-collapse: collapse;
+        }
+
+        .positive {
+            background-color: rgba(0, 255, 0, 0.8)
+        }
+
+        .negative {
+            background-color: rgba(255, 0, 0, 0.65)
+        }
+    </style>
+
+    <h2>Zenith Performance Tests</h2>
+
+    {% for suit_name, suit_data in context.items() %}
+    <h3>Runs for {{ suit_name }} </h3>
+    <b>platform:</b> {{ suit_data.platform }}<br>
+    {% for common_column_name, common_column_value in suit_data.common_columns %}
+    <b>{{ common_column_name }}</b>: {{ common_column_value }}<br>
+    {% endfor %}
+    <br>
+
+    <table>
+        <tr>
+            <th>revision</th>
+            {% for column_name in suit_data.value_columns %}
+            <th>{{ column_name }}</th>
+            {% endfor %}
+        </tr>
+        {% for row in suit_data.rows %}
+        <tr>
+            <td><a href=https://github.com/zenithdb/zenith/commit/{{ row.revision }}>{{ row.revision[:6] }}</a></td>
+            {% for column_value in row.values %}
+            <td class="{{ column_value.color }}">{{ column_value.value }}{{column_value.ratio}}</td>
+            {% endfor %}
+        </tr>
+        {% endfor %}
+    </table>
+    {% endfor %}
+
+</body>
+
+</html>
--- a/test_runner/setup.cfg
+++ b/test_runner/setup.cfg
@@ -13,6 +13,8 @@ column_limit = 100
 split_all_top_level_comma_separated_values = true

 [mypy]
+# mypy uses regex
+exclude = ^vendor/
 # some tests don't typecheck when this flag is set
 check_untyped_defs = false

@@ -22,7 +24,11 @@ disallow_untyped_decorators = false
 disallow_untyped_defs = false
 strict = true

-[mypy-psycopg2.*]
+[mypy-asyncpg.*]
+# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
+ignore_missing_imports = true
+
+[mypy-cached_property.*]
 ignore_missing_imports = true

 [mypy-pytest.*]
--- a/test_runner/Pipfile
+++ b/test_runner/Pipfile
@@ -1,25 +0,0 @@
-[[source]]
-url = "https://pypi.python.org/simple"
-verify_ssl = true
-name = "pypi"
-
-[packages]
-pytest = ">=6.0.0"
-psycopg2 = "*"
-typing-extensions = "*"
-pyjwt = {extras = ["crypto"], version = "*"}
-requests = "*"
-pytest-xdist = "*"
-asyncpg = "*"
-cached-property = "*"
-
-[dev-packages]
-flake8 = "*"
-mypy = "*"
-# Behavior may change slightly between versions. These are run continuously,
-# so we pin exact versions to avoid suprising breaks. Update if comfortable.
-yapf = "==0.31.0"
-
-[requires]
-# we need at least 3.6, but pipenv doesn't allow to say this directly
-python_version = "3"
--- a/test_runner/Pipfile.lock
+++ b/test_runner/Pipfile.lock
@@ -1,390 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "3645ae8d2dcf55bd2a54963c44cfeedf577f3b289d1077365214a80a7f36e643"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.python.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "asyncpg": {
-            "hashes": [
-                "sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
-                "sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
-                "sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
-                "sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
-                "sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
-                "sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
-                "sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
-                "sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
-                "sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
-                "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
-                "sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
-                "sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
-                "sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
-            ],
-            "index": "pypi",
-            "version": "==0.24.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
-                "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==21.2.0"
-        },
-        "cached-property": {
-            "hashes": [
-                "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
-                "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
-            ],
-            "index": "pypi",
-            "version": "==1.5.2"
-        },
-        "certifi": {
-            "hashes": [
-                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
-                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
-            ],
-            "version": "==2021.10.8"
-        },
-        "cffi": {
-            "hashes": [
-                "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
-                "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
-                "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
-                "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
-                "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
-                "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
-                "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
-                "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
-                "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
-                "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
-                "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
-                "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
-                "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
-                "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
-                "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
-                "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
-                "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
-                "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
-                "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
-                "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
-                "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
-                "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
-                "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
-                "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
-                "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
-                "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
-                "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
-                "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
-                "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
-                "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
-                "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
-                "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
-                "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
-                "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
-                "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
-                "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
-                "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
-                "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
-                "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
-                "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
-                "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
-                "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
-                "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
-                "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
-                "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
-                "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
-                "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
-                "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
-                "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
-                "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
-            ],
-            "version": "==1.15.0"
-        },
-        "charset-normalizer": {
-            "hashes": [
-                "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
-                "sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==2.0.7"
-        },
-        "cryptography": {
-            "hashes": [
-                "sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
-                "sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
-                "sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
-                "sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
-                "sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
-                "sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
-                "sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
-                "sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
-                "sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
-                "sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
-                "sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
-                "sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
-                "sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
-                "sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
-                "sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
-                "sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
-                "sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
-                "sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
-                "sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
-                "sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
-            ],
-            "version": "==35.0.0"
-        },
-        "execnet": {
-            "hashes": [
-                "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
-                "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==1.9.0"
-        },
-        "idna": {
-            "hashes": [
-                "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
-                "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==3.3"
-        },
-        "iniconfig": {
-            "hashes": [
-                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
-                "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
-            ],
-            "version": "==1.1.1"
-        },
-        "packaging": {
-            "hashes": [
-                "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7",
-                "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==21.0"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
-                "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==1.0.0"
-        },
-        "psycopg2": {
-            "hashes": [
-                "sha256:079d97fc22de90da1d370c90583659a9f9a6ee4007355f5825e5f1c70dffc1fa",
-                "sha256:2087013c159a73e09713294a44d0c8008204d06326006b7f652bef5ace66eebb",
-                "sha256:2c992196719fadda59f72d44603ee1a2fdcc67de097eea38d41c7ad9ad246e62",
-                "sha256:7640e1e4d72444ef012e275e7b53204d7fab341fb22bc76057ede22fe6860b25",
-                "sha256:7f91312f065df517187134cce8e395ab37f5b601a42446bdc0f0d51773621854",
-                "sha256:830c8e8dddab6b6716a4bf73a09910c7954a92f40cf1d1e702fb93c8a919cc56",
-                "sha256:89409d369f4882c47f7ea20c42c5046879ce22c1e4ea20ef3b00a4dfc0a7f188",
-                "sha256:bf35a25f1aaa8a3781195595577fcbb59934856ee46b4f252f56ad12b8043bcf",
-                "sha256:de5303a6f1d0a7a34b9d40e4d3bef684ccc44a49bbe3eb85e3c0bffb4a131b7c"
-            ],
-            "index": "pypi",
-            "version": "==2.9.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
-                "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.10.0"
-        },
-        "pycparser": {
-            "hashes": [
-                "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
-                "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.20"
-        },
-        "pyjwt": {
-            "extras": [
-                "crypto"
-            ],
-            "hashes": [
-                "sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
-                "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
-            ],
-            "index": "pypi",
-            "version": "==2.3.0"
-        },
-        "pyparsing": {
-            "hashes": [
-                "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
-                "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.4.7"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
-                "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
-            ],
-            "index": "pypi",
-            "version": "==6.2.5"
-        },
-        "pytest-forked": {
-            "hashes": [
-                "sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
-                "sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==1.3.0"
-        },
-        "pytest-xdist": {
-            "hashes": [
-                "sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
-                "sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
-            ],
-            "index": "pypi",
-            "version": "==2.4.0"
-        },
-        "requests": {
-            "hashes": [
-                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
-                "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
-            ],
-            "index": "pypi",
-            "version": "==2.26.0"
-        },
-        "toml": {
-            "hashes": [
-                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
-                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.10.2"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
-                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
-                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
-            ],
-            "index": "pypi",
-            "version": "==3.10.0.2"
-        },
-        "urllib3": {
-            "hashes": [
-                "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
-                "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
-            "version": "==1.26.7"
-        }
-    },
-    "develop": {
-        "flake8": {
-            "hashes": [
-                "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
-                "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
-            ],
-            "index": "pypi",
-            "version": "==4.0.1"
-        },
-        "mccabe": {
-            "hashes": [
-                "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
-                "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
-            ],
-            "version": "==0.6.1"
-        },
-        "mypy": {
-            "hashes": [
-                "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
-                "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
-                "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
-                "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
-                "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
-                "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
-                "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
-                "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
-                "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
-                "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
-                "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
-                "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
-                "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
-                "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
-                "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
-                "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
-                "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
-                "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
-                "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
-                "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
-                "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
-                "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
-                "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
-            ],
-            "index": "pypi",
-            "version": "==0.910"
-        },
-        "mypy-extensions": {
-            "hashes": [
-                "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
-                "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
-            ],
-            "version": "==0.4.3"
-        },
-        "pycodestyle": {
-            "hashes": [
-                "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
-                "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==2.8.0"
-        },
-        "pyflakes": {
-            "hashes": [
-                "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
-                "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.4.0"
-        },
-        "toml": {
-            "hashes": [
-                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
-                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.10.2"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
-                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
-                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
-            ],
-            "index": "pypi",
-            "version": "==3.10.0.2"
-        },
-        "yapf": {
-            "hashes": [
-                "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
-                "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
-            ],
-            "index": "pypi",
-            "version": "==0.31.0"
-        }
-    }
-}
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -3,18 +3,13 @@
 This directory contains integration tests.

 Prerequisites:
- Python 3.6 or later
- Dependencies: install them via `pipenv install`. Note that Debian/Ubuntu
-  packages are stale, as it commonly happens, so manual installation is not
-  recommended.
-  Run `pipenv shell` to activate the venv or use `pipenv run` to run a single
-  command in the venv, e.g. `pipenv run pytest`.
+- Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
 - Zenith and Postgres binaries
-    - See the root README.md for build directions
+    - See the root [README.md](/README.md) for build directions
    - Tests can be run from the git tree; or see the environment variables
      below to run from other directories.
 - The zenith git repo, including the postgres submodule
-  (for some tests, e.g. pg_regress)
+  (for some tests, e.g. `pg_regress`)

 ### Test Organization

@@ -35,15 +30,15 @@ be stored under a directory `test_output`.

 You can run all the tests with:

-`pytest`
+`pipenv run pytest`

 If you want to run all the tests in a particular file:

-`pytest test_pgbench.py`
+`pipenv run pytest test_pgbench.py`

 If you want to run all tests that have the string "bench" in their names:

-`pytest -k bench`
+`pipenv run pytest -k bench`

 Useful environment variables:

@@ -62,46 +57,51 @@ Exit after the first test failure:
 `pytest -x ...`
 (there are many more pytest options; run `pytest -h` to see them.)

+### Writing a test

-### Building new tests
+Every test needs a Zenith Environment, or ZenithEnv, to operate in. A Zenith Environment
+is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and
+compute Postgres nodes. The connections between them can be configured to use JWT
+authentication tokens, and some other configuration options can be tweaked too.

-The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
+The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env`
+fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
+or make other destructive changes in that environment. Also don't assume that
+there are no tenants or branches or data in the cluster. For convenience, there is a
+branch called `empty`, though. The convention is to create a test-specific branch of
+that and load any test data there, instead of the 'main' branch.

-Essentially, this means that each time you see a fixture named as an input parameter, the function with that name will be run and passed as a parameter to the function.
-
-So this code:
+For more complicated cases, you can build a custom Zenith Environment with the
+`zenith_env_builder` fixture:

 ```python
-def test_something(zenith_cli, pg_bin):
-    pass
+def test_foobar(zenith_env_builder: ZenithEnvBuilder):
+    # Prescribe the environment.
+    # We want to have 3 safekeeper nodes, and use JWT authentication in the
+    # connections to the page server
+    zenith_env_builder.num_safekeepers = 3
+    zenith_env_builder.set_pageserver_auth(True)
+
+    # Now create the environment. This initializes the repository, and starts
+    # up the page server and the safekeepers
+    env = zenith_env_builder.init()
+
+    # Run the test
+    ...
 ```

-... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
+For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html

-Fixtures can't be imported using the normal python syntax. Instead, use this:
+At the end of a test, all the nodes in the environment are automatically stopped, so you
+don't need to worry about cleaning up. Logs and test data are preserved for analysis
+in a directory under `../test_output/<testname>`.

-```python
-pytest_plugins = ("fixtures.something")
-```
+### Before submitting a patch
+Ensure that you pass all [obligatory checks](/docs/sourcetree.md#obligatory-checks).

-That will make all the fixtures in the `fixtures/something.py` file available.
-
-Anything that's likely to be used in multiple tests should be built into a fixture.
-
-Note that fixtures can clean up after themselves if they use the `yield` syntax.
-Cleanup will happen even if the test fails (raises an unhandled exception).
-Python destructors, e.g. `__del__()` aren't recommended for cleanup.
-
-
-### Code quality
-
-We force code formatting via yapf:
-
-1. Install `yapf` and other tools (`flake8`, `mypy`) with `pipenv install --dev`.
-1. Reformat all your code by running `pipenv run yapf -ri .` in the `test_runner/` directory.
-
-Before submitting a patch, please consider:
+Also consider:

 * Writing a couple of docstrings to clarify the reasoning behind a new test.
-* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
-* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
+* Adding more type hints to your code to avoid `Any`, especially:
+  * For fixture parameters, since they are not automatically deduced.
+  * For function arguments and return values.
--- a/test_runner/batch_others/test_auth.py
+++ b/test_runner/batch_others/test_auth.py
@@ -2,18 +2,21 @@ from contextlib import closing
 from typing import Iterator
 from uuid import uuid4
 import psycopg2
-from fixtures.zenith_fixtures import PortDistributor, Postgres, ZenithCli, ZenithPageserver, PgBin
+from fixtures.zenith_fixtures import ZenithEnvBuilder
 import pytest

 pytest_plugins = ("fixtures.zenith_fixtures")


-def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
-    ps = pageserver_auth_enabled
+def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
+    zenith_env_builder.pageserver_auth_enabled = True
+    env = zenith_env_builder.init()

-    tenant_token = ps.auth_keys.generate_tenant_token(ps.initial_tenant)
-    invalid_tenant_token = ps.auth_keys.generate_tenant_token(uuid4().hex)
-    management_token = ps.auth_keys.generate_management_token()
+    ps = env.pageserver
+
+    tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant)
+    invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex)
+    management_token = env.auth_keys.generate_management_token()

    # this does not invoke auth check and only decodes jwt and checks it for validity
    # check both tokens
@@ -21,13 +24,13 @@ def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
    ps.safe_psql("status", password=management_token)

    # tenant can create branches
-    ps.safe_psql(f"branch_create {ps.initial_tenant} new1 main", password=tenant_token)
+    ps.safe_psql(f"branch_create {env.initial_tenant} new1 main", password=tenant_token)
    # console can create branches for tenant
-    ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=management_token)
+    ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=management_token)

    # fail to create branch using token with different tenantid
    with pytest.raises(psycopg2.DatabaseError, match='Tenant id mismatch. Permission denied'):
-        ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=invalid_tenant_token)
+        ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=invalid_tenant_token)

    # create tenant using management token
    ps.safe_psql(f"tenant_create {uuid4().hex}", password=management_token)
@@ -40,40 +43,22 @@ def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):


@pytest.mark.parametrize('with_wal_acceptors', [False, True])
-def test_compute_auth_to_pageserver(
-    zenith_cli: ZenithCli,
-    wa_factory,
-    pageserver_auth_enabled: ZenithPageserver,
-    repo_dir: str,
-    with_wal_acceptors: bool,
-    pg_bin: PgBin,
-    port_distributor: PortDistributor,
-):
-    ps = pageserver_auth_enabled
-    # since we are in progress of refactoring protocols between compute safekeeper and page server
-    # use hardcoded management token in safekeeper
-    management_token = ps.auth_keys.generate_management_token()
+def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
+    zenith_env_builder.pageserver_auth_enabled = True
+    if with_wal_acceptors:
+        zenith_env_builder.num_safekeepers = 3
+    env = zenith_env_builder.init()

    branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}"
-    zenith_cli.run(["branch", branch, "empty"])
-    if with_wal_acceptors:
-        wa_factory.start_n_new(3, management_token)
+    env.zenith_cli(["branch", branch, "main"])

-    with Postgres(
-            zenith_cli=zenith_cli,
-            repo_dir=repo_dir,
-            pg_bin=pg_bin,
-            tenant_id=ps.initial_tenant,
-            port=port_distributor.get_port(),
-    ).create_start(
-            branch,
-            wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
-    ) as pg:
-        with closing(pg.connect()) as conn:
-            with conn.cursor() as cur:
-                # we rely upon autocommit after each statement
-                # as waiting for acceptors happens there
-                cur.execute('CREATE TABLE t(key int primary key, value text)')
-                cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
-                cur.execute('SELECT sum(key) FROM t')
-                assert cur.fetchone() == (5000050000, )
+    pg = env.postgres.create_start(branch)
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            # we rely upon autocommit after each statement
+            # as waiting for acceptors happens there
+            cur.execute('CREATE TABLE t(key int primary key, value text)')
+            cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
+            cur.execute('SELECT sum(key) FROM t')
+            assert cur.fetchone() == (5000050000, )
--- a/test_runner/batch_others/test_branch_behind.py
+++ b/test_runner/batch_others/test_branch_behind.py
@@ -1,5 +1,5 @@
 import subprocess
-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -8,11 +8,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Create a couple of branches off the main branch, at a historical point in time.
 #
-def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
+def test_branch_behind(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
    # Branch at the point where only 100 rows were inserted
-    zenith_cli.run(["branch", "test_branch_behind", "empty"])
+    env.zenith_cli(["branch", "test_branch_behind", "empty"])

-    pgmain = postgres.create_start('test_branch_behind')
+    pgmain = env.postgres.create_start('test_branch_behind')
    log.info("postgres is running on 'test_branch_behind' branch")

    main_pg_conn = pgmain.connect()
@@ -40,7 +41,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
    log.info(f'LSN after 200100 rows: {lsn_b}')

    # Branch at the point where only 100 rows were inserted
-    zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
+    env.zenith_cli(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])

    # Insert many more rows. This generates enough WAL to fill a few segments.
    main_cur.execute('''
@@ -55,10 +56,10 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
    log.info(f'LSN after 400100 rows: {lsn_c}')

    # Branch at the point where only 200100 rows were inserted
-    zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
+    env.zenith_cli(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])

-    pg_hundred = postgres.create_start("test_branch_behind_hundred")
-    pg_more = postgres.create_start("test_branch_behind_more")
+    pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
+    pg_more = env.postgres.create_start("test_branch_behind_more")

    # On the 'hundred' branch, we should see only 100 rows
    hundred_pg_conn = pg_hundred.connect()
@@ -79,8 +80,8 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
    # Check bad lsn's for branching

    # branch at segment boundary
-    zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
-    pg = postgres.create_start("test_branch_segment_boundary")
+    env.zenith_cli(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
+    pg = env.postgres.create_start("test_branch_segment_boundary")
    cur = pg.connect().cursor()
    cur.execute('SELECT 1')
    assert cur.fetchone() == (1, )
@@ -89,7 +90,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
    #
    # FIXME: This works currently, but probably shouldn't be allowed
    try:
-        zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
+        env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
        # FIXME: assert false, "branch with invalid LSN should have failed"
    except subprocess.CalledProcessError:
        log.info("Branch creation with pre-initdb LSN failed (as expected)")
--- a/test_runner/batch_others/test_clog_truncate.py
+++ b/test_runner/batch_others/test_clog_truncate.py
@@ -3,7 +3,7 @@ import os

 from contextlib import closing

-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -12,9 +12,10 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Test compute node start after clog truncation
 #
-def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
+def test_clog_truncate(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
    # Create a branch for us
-    zenith_cli.run(["branch", "test_clog_truncate", "empty"])
+    env.zenith_cli(["branch", "test_clog_truncate", "empty"])

    # set agressive autovacuum to make sure that truncation will happen
    config = [
@@ -27,7 +28,7 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
        'autovacuum_freeze_max_age=100000'
    ]

-    pg = postgres.create_start('test_clog_truncate', config_lines=config)
+    pg = env.postgres.create_start('test_clog_truncate', config_lines=config)
    log.info('postgres is running on test_clog_truncate branch')

    # Install extension containing function needed for test
@@ -64,10 +65,10 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg

    # create new branch after clog truncation and start a compute node on it
    log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
-    zenith_cli.run(
+    env.zenith_cli(
        ["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])

-    pg2 = postgres.create_start('test_clog_truncate_new')
+    pg2 = env.postgres.create_start('test_clog_truncate_new')
    log.info('postgres is running on test_clog_truncate_new branch')

    # check that new node doesn't contain truncated segment
--- a/test_runner/batch_others/test_config.py
+++ b/test_runner/batch_others/test_config.py
@@ -1,6 +1,6 @@
 from contextlib import closing

-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,12 +9,13 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Test starting Postgres with custom options
 #
-def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
+def test_config(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
    # Create a branch for us
-    zenith_cli.run(["branch", "test_config", "empty"])
+    env.zenith_cli(["branch", "test_config", "empty"])

    # change config
-    pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
+    pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
    log.info('postgres is running on test_config branch')

    with closing(pg.connect()) as conn:
--- a/test_runner/batch_others/test_createdropdb.py
+++ b/test_runner/batch_others/test_createdropdb.py
@@ -2,7 +2,7 @@ import os
 import pathlib

 from contextlib import closing
-from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
+from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -11,15 +11,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Test CREATE DATABASE when there have been relmapper changes
 #
-def test_createdb(
-    zenith_cli: ZenithCli,
-    pageserver: ZenithPageserver,
-    postgres: PostgresFactory,
-    pg_bin,
-):
-    zenith_cli.run(["branch", "test_createdb", "empty"])
+def test_createdb(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    env.zenith_cli(["branch", "test_createdb", "empty"])

-    pg = postgres.create_start('test_createdb')
+    pg = env.postgres.create_start('test_createdb')
    log.info("postgres is running on 'test_createdb' branch")

    with closing(pg.connect()) as conn:
@@ -33,9 +29,9 @@ def test_createdb(
            lsn = cur.fetchone()[0]

    # Create a branch
-    zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])
+    env.zenith_cli(["branch", "test_createdb2", "test_createdb@" + lsn])

-    pg2 = postgres.create_start('test_createdb2')
+    pg2 = env.postgres.create_start('test_createdb2')

    # Test that you can connect to the new database on both branches
    for db in (pg, pg2):
@@ -45,16 +41,11 @@ def test_createdb(
 #
 # Test DROP DATABASE
 #
-def test_dropdb(
-    zenith_cli: ZenithCli,
-    pageserver: ZenithPageserver,
-    postgres: PostgresFactory,
-    pg_bin,
-    test_output_dir,
-):
-    zenith_cli.run(["branch", "test_dropdb", "empty"])
+def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
+    env = zenith_simple_env
+    env.zenith_cli(["branch", "test_dropdb", "empty"])

-    pg = postgres.create_start('test_dropdb')
+    pg = env.postgres.create_start('test_dropdb')
    log.info("postgres is running on 'test_dropdb' branch")

    with closing(pg.connect()) as conn:
@@ -77,26 +68,28 @@ def test_dropdb(
            lsn_after_drop = cur.fetchone()[0]

    # Create two branches before and after database drop.
-    zenith_cli.run(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
-    pg_before = postgres.create_start('test_before_dropdb')
+    env.zenith_cli(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
+    pg_before = env.postgres.create_start('test_before_dropdb')

-    zenith_cli.run(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
-    pg_after = postgres.create_start('test_after_dropdb')
+    env.zenith_cli(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
+    pg_after = env.postgres.create_start('test_after_dropdb')

    # Test that database exists on the branch before drop
    pg_before.connect(dbname='foodb').close()

    # Test that database subdir exists on the branch before drop
+    assert pg_before.pgdata_dir
    dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
    log.info(dbpath)

    assert os.path.isdir(dbpath) == True

    # Test that database subdir doesn't exist on the branch after drop
+    assert pg_after.pgdata_dir
    dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
    log.info(dbpath)

    assert os.path.isdir(dbpath) == False

    # Check that we restore the content of the datadir correctly
-    check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
+    check_restored_datadir_content(test_output_dir, env, pg)
--- a/test_runner/batch_others/test_createuser.py
+++ b/test_runner/batch_others/test_createuser.py
@@ -1,6 +1,6 @@
 from contextlib import closing

-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,10 +9,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Test CREATE USER to check shared catalog restore
 #
-def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
-    zenith_cli.run(["branch", "test_createuser", "empty"])
+def test_createuser(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    env.zenith_cli(["branch", "test_createuser", "empty"])

-    pg = postgres.create_start('test_createuser')
+    pg = env.postgres.create_start('test_createuser')
    log.info("postgres is running on 'test_createuser' branch")

    with closing(pg.connect()) as conn:
@@ -26,9 +27,9 @@ def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: Postgres
            lsn = cur.fetchone()[0]

    # Create a branch
-    zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])
+    env.zenith_cli(["branch", "test_createuser2", "test_createuser@" + lsn])

-    pg2 = postgres.create_start('test_createuser2')
+    pg2 = env.postgres.create_start('test_createuser2')

    # Test that you can connect to new branch as a new user
    assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]
--- a/test_runner/batch_others/test_multixact.py
+++ b/test_runner/batch_others/test_multixact.py
@@ -1,4 +1,4 @@
-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
+from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -10,15 +10,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 # it only checks next_multixact_id field in restored pg_control,
 # since we don't have functions to check multixact internals.
 #
-def test_multixact(pageserver: ZenithPageserver,
-                   postgres: PostgresFactory,
-                   pg_bin,
-                   zenith_cli,
-                   base_dir,
-                   test_output_dir):
+def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
+    env = zenith_simple_env
    # Create a branch for us
-    zenith_cli.run(["branch", "test_multixact", "empty"])
-    pg = postgres.create_start('test_multixact')
+    env.zenith_cli(["branch", "test_multixact", "empty"])
+    pg = env.postgres.create_start('test_multixact')

    log.info("postgres is running on 'test_multixact' branch")
    pg_conn = pg.connect()
@@ -57,8 +53,8 @@ def test_multixact(pageserver: ZenithPageserver,
    assert int(next_multixact_id) > int(next_multixact_id_old)

    # Branch at this point
-    zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
-    pg_new = postgres.create_start('test_multixact_new')
+    env.zenith_cli(["branch", "test_multixact_new", "test_multixact@" + lsn])
+    pg_new = env.postgres.create_start('test_multixact_new')

    log.info("postgres is running on 'test_multixact_new' branch")
    pg_new_conn = pg_new.connect()
@@ -71,4 +67,4 @@ def test_multixact(pageserver: ZenithPageserver,
    assert next_multixact_id_new == next_multixact_id

    # Check that we restore the content of the datadir correctly
-    check_restored_datadir_content(zenith_cli, test_output_dir, pg_new, pageserver.service_port.pg)
+    check_restored_datadir_content(test_output_dir, env, pg_new)
--- a/test_runner/batch_others/test_old_request_lsn.py
+++ b/test_runner/batch_others/test_old_request_lsn.py
@@ -1,6 +1,6 @@
 from contextlib import closing

-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -16,13 +16,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 # just a hint that the page hasn't been modified since that LSN, and the page
 # server should return the latest page version regardless of the LSN.
 #
-def test_old_request_lsn(zenith_cli,
-                         pageserver: ZenithPageserver,
-                         postgres: PostgresFactory,
-                         pg_bin):
+def test_old_request_lsn(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
    # Create a branch for us
-    zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
-    pg = postgres.create_start('test_old_request_lsn')
+    env.zenith_cli(["branch", "test_old_request_lsn", "empty"])
+    pg = env.postgres.create_start('test_old_request_lsn')
    log.info('postgres is running on test_old_request_lsn branch')

    pg_conn = pg.connect()
@@ -32,7 +30,7 @@ def test_old_request_lsn(zenith_cli,
    cur.execute("SHOW zenith.zenith_timeline")
    timeline = cur.fetchone()[0]

-    psconn = pageserver.connect()
+    psconn = env.pageserver.connect()
    pscur = psconn.cursor()

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
@@ -59,7 +57,7 @@ def test_old_request_lsn(zenith_cli,
    # Make a lot of updates on a single row, generating a lot of WAL. Trigger
    # garbage collections so that the page server will remove old page versions.
    for i in range(10):
-        pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
+        pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
        for j in range(100):
            cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')

--- a/test_runner/batch_others/test_pageserver_api.py
+++ b/test_runner/batch_others/test_pageserver_api.py
@@ -3,25 +3,28 @@ from uuid import uuid4
 import pytest
 import psycopg2
 import requests
-from fixtures.zenith_fixtures import ZenithPageserver, ZenithPageserverHttpClient
+from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
+from typing import cast

 pytest_plugins = ("fixtures.zenith_fixtures")


-def test_status_psql(pageserver):
-    assert pageserver.safe_psql('status') == [
+def test_status_psql(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    assert env.pageserver.safe_psql('status') == [
        ('hello world', ),
    ]


-def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
+def test_branch_list_psql(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
    # Create a branch for us
-    zenith_cli.run(["branch", "test_branch_list_main", "empty"])
+    env.zenith_cli(["branch", "test_branch_list_main", "empty"])

-    conn = pageserver.connect()
+    conn = env.pageserver.connect()
    cur = conn.cursor()

-    cur.execute(f'branch_list {pageserver.initial_tenant}')
+    cur.execute(f'branch_list {env.initial_tenant}')
    branches = json.loads(cur.fetchone()[0])
    # Filter out branches created by other tests
    branches = [x for x in branches if x['name'].startswith('test_branch_list')]
@@ -34,10 +37,10 @@ def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
    assert 'ancestor_lsn' in branches[0]

    # Create another branch, and start Postgres on it
-    zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
-    zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
+    env.zenith_cli(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
+    env.zenith_cli(['pg', 'create', 'test_branch_list_experimental'])

-    cur.execute(f'branch_list {pageserver.initial_tenant}')
+    cur.execute(f'branch_list {env.initial_tenant}')
    new_branches = json.loads(cur.fetchone()[0])
    # Filter out branches created by other tests
    new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
@@ -53,19 +56,22 @@ def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
    conn.close()


-def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
-    res = zenith_cli.run(["tenant", "list"])
-    res.check_returncode()
-    tenants = res.stdout.splitlines()
-    assert tenants == [pageserver.initial_tenant]
+def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
+    # don't use zenith_simple_env, because there might be other tenants there,
+    # left over from other tests.
+    env = zenith_env_builder.init()

-    conn = pageserver.connect()
+    res = env.zenith_cli(["tenant", "list"])
+    res.check_returncode()
+    tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
+    assert tenants == [env.initial_tenant]
+
+    conn = env.pageserver.connect()
    cur = conn.cursor()

    # check same tenant cannot be created twice
-    with pytest.raises(psycopg2.DatabaseError,
-                       match=f'tenant {pageserver.initial_tenant} already exists'):
-        cur.execute(f'tenant_create {pageserver.initial_tenant}')
+    with pytest.raises(psycopg2.DatabaseError, match=f'tenant {env.initial_tenant} already exists'):
+        cur.execute(f'tenant_create {env.initial_tenant}')

    # create one more tenant
    tenant1 = uuid4().hex
@@ -74,20 +80,20 @@ def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
    cur.execute('tenant_list')

    # compare tenants list
-    new_tenants = sorted(json.loads(cur.fetchone()[0]))
-    assert sorted([pageserver.initial_tenant, tenant1]) == new_tenants
+    new_tenants = sorted(map(lambda t: cast(str, t['id']), json.loads(cur.fetchone()[0])))
+    assert sorted([env.initial_tenant, tenant1]) == new_tenants


 def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
    client.check_status()

    # check initial tenant is there
-    assert initial_tenant in set(client.tenant_list())
+    assert initial_tenant in {t['id'] for t in client.tenant_list()}

    # create new tenant and check it is also there
    tenant_id = uuid4()
    client.tenant_create(tenant_id)
-    assert tenant_id.hex in set(client.tenant_list())
+    assert tenant_id.hex in {t['id'] for t in client.tenant_list()}

    # create branch
    branch_name = uuid4().hex
@@ -97,12 +103,17 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
    assert branch_name in {b['name'] for b in client.branch_list(tenant_id)}


-def test_pageserver_http_api_client(pageserver: ZenithPageserver):
-    client = pageserver.http_client()
-    check_client(client, pageserver.initial_tenant)
+def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    client = env.pageserver.http_client()
+    check_client(client, env.initial_tenant)


-def test_pageserver_http_api_client_auth_enabled(pageserver_auth_enabled: ZenithPageserver):
-    client = pageserver_auth_enabled.http_client(
-        auth_token=pageserver_auth_enabled.auth_keys.generate_management_token())
-    check_client(client, pageserver_auth_enabled.initial_tenant)
+def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder):
+    zenith_env_builder.pageserver_auth_enabled = True
+    env = zenith_env_builder.init()
+
+    management_token = env.auth_keys.generate_management_token()
+
+    client = env.pageserver.http_client(auth_token=management_token)
+    check_client(client, env.initial_tenant)
--- a/test_runner/batch_others/test_pageserver_restart.py
+++ b/test_runner/batch_others/test_pageserver_restart.py
@@ -4,7 +4,7 @@ import time

 from contextlib import closing
 from multiprocessing import Process, Value
-from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
+from fixtures.zenith_fixtures import ZenithEnvBuilder
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -13,16 +13,13 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 # Check that dead minority doesn't prevent the commits: execute insert n_inserts
 # times, with fault_probability chance of getting a wal acceptor down or up
 # along the way. 2 of 3 are always alive, so the work keeps going.
-def test_pageserver_restart(zenith_cli,
-                            pageserver: ZenithPageserver,
-                            postgres: PostgresFactory,
-                            wa_factory: WalAcceptorFactory):
-
+def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
    # One safekeeper is enough for this test.
-    wa_factory.start_n_new(1)
+    zenith_env_builder.num_safekeepers = 1
+    env = zenith_env_builder.init()

-    zenith_cli.run(["branch", "test_pageserver_restart", "empty"])
-    pg = postgres.create_start('test_pageserver_restart', wal_acceptors=wa_factory.get_connstrs())
+    env.zenith_cli(["branch", "test_pageserver_restart", "main"])
+    pg = env.postgres.create_start('test_pageserver_restart')

    pg_conn = pg.connect()
    cur = pg_conn.cursor()
@@ -50,8 +47,8 @@ def test_pageserver_restart(zenith_cli,
    # Stop and restart pageserver. This is a more or less graceful shutdown, although
    # the page server doesn't currently have a shutdown routine so there's no difference
    # between stopping and crashing.
-    pageserver.stop()
-    pageserver.start()
+    env.pageserver.stop()
+    env.pageserver.start()

    # Stopping the pageserver breaks the connection from the postgres backend to
    # the page server, and causes the next query on the connection to fail. Start a new
@@ -65,5 +62,5 @@ def test_pageserver_restart(zenith_cli,
    assert cur.fetchone() == (100000, )

    # Stop the page server by force, and restart it
-    pageserver.stop()
-    pageserver.start()
+    env.pageserver.stop()
+    env.pageserver.start()
--- a/test_runner/batch_others/test_parallel_copy.py
+++ b/test_runner/batch_others/test_parallel_copy.py
@@ -0,0 +1,54 @@
+from io import BytesIO
+import asyncio
+import asyncpg
+import subprocess
+from fixtures.zenith_fixtures import ZenithEnv, Postgres
+from fixtures.log_helper import log
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+async def repeat_bytes(buf, repetitions: int):
+    for i in range(repetitions):
+        yield buf
+
+
+async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str):
+    buf = BytesIO()
+    for i in range(1000):
+        buf.write(
+            f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode())
+    buf.seek(0)
+
+    copy_input = repeat_bytes(buf.read(), 5000)
+
+    pg_conn = await pg.connect_async()
+    await pg_conn.copy_to_table(table_name, source=copy_input)
+
+
+async def parallel_load_same_table(pg: Postgres, n_parallel: int):
+    workers = []
+    for worker_id in range(n_parallel):
+        worker = copy_test_data_to_table(pg, worker_id, f'copytest')
+        workers.append(asyncio.create_task(worker))
+
+    # await all workers
+    await asyncio.gather(*workers)
+
+
+# Load data into one table with COPY FROM, using 5 parallel connections
+def test_parallel_copy(zenith_simple_env: ZenithEnv, n_parallel=5):
+    env = zenith_simple_env
+    # Create a branch for us
+    env.zenith_cli(["branch", "test_parallel_copy", "empty"])
+
+    pg = env.postgres.create_start('test_parallel_copy')
+    log.info("postgres is running on 'test_parallel_copy' branch")
+
+    # Create test table
+    conn = pg.connect()
+    cur = conn.cursor()
+    cur.execute(f'CREATE TABLE copytest (i int, t text)')
+
+    # Run COPY FROM to load the table over parallel connections.
+    asyncio.run(parallel_load_same_table(pg, n_parallel))
--- a/test_runner/batch_others/test_pgbench.py
+++ b/test_runner/batch_others/test_pgbench.py
@@ -1,14 +1,15 @@
-from fixtures.zenith_fixtures import PostgresFactory
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")


-def test_pgbench(postgres: PostgresFactory, pg_bin, zenith_cli):
+def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin):
+    env = zenith_simple_env
    # Create a branch for us
-    zenith_cli.run(["branch", "test_pgbench", "empty"])
+    env.zenith_cli(["branch", "test_pgbench", "empty"])

-    pg = postgres.create_start('test_pgbench')
+    pg = env.postgres.create_start('test_pgbench')
    log.info("postgres is running on 'test_pgbench' branch")

    connstr = pg.connstr()
--- a/test_runner/batch_others/test_readonly_node.py
+++ b/test_runner/batch_others/test_readonly_node.py
@@ -1,5 +1,5 @@
 import subprocess
-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+from fixtures.zenith_fixtures import ZenithEnv

 pytest_plugins = ("fixtures.zenith_fixtures")

@@ -10,10 +10,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 # This is very similar to the 'test_branch_behind' test, but instead of
 # creating branches, creates read-only nodes.
 #
-def test_readonly_node(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
-    zenith_cli.run(["branch", "test_readonly_node", "empty"])
+def test_readonly_node(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    env.zenith_cli(["branch", "test_readonly_node", "empty"])

-    pgmain = postgres.create_start('test_readonly_node')
+    pgmain = env.postgres.create_start('test_readonly_node')
    print("postgres is running on 'test_readonly_node' branch")

    main_pg_conn = pgmain.connect()
@@ -52,11 +53,12 @@ def test_readonly_node(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
    print('LSN after 400100 rows: ' + lsn_c)

    # Create first read-only node at the point where only 100 rows were inserted
-    pg_hundred = postgres.create_start("test_readonly_node_hundred",
-                                       branch=f'test_readonly_node@{lsn_a}')
+    pg_hundred = env.postgres.create_start("test_readonly_node_hundred",
+                                           branch=f'test_readonly_node@{lsn_a}')

    # And another at the point where 200100 rows were inserted
-    pg_more = postgres.create_start("test_readonly_node_more", branch=f'test_readonly_node@{lsn_b}')
+    pg_more = env.postgres.create_start("test_readonly_node_more",
+                                        branch=f'test_readonly_node@{lsn_b}')

    # On the 'hundred' node, we should see only 100 rows
    hundred_pg_conn = pg_hundred.connect()
@@ -75,15 +77,15 @@ def test_readonly_node(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
    assert main_cur.fetchone() == (400100, )

    # Check creating a node at segment boundary
-    pg = postgres.create_start("test_branch_segment_boundary",
-                               branch="test_readonly_node@0/3000000")
+    pg = env.postgres.create_start("test_branch_segment_boundary",
+                                   branch="test_readonly_node@0/3000000")
    cur = pg.connect().cursor()
    cur.execute('SELECT 1')
    assert cur.fetchone() == (1, )

    # Create node at pre-initdb lsn
    try:
-        zenith_cli.run(["pg", "start", "test_branch_preinitdb", "test_readonly_node@0/42"])
-        assert false, "compute node startup with invalid LSN should have failed"
+        env.zenith_cli(["pg", "start", "test_branch_preinitdb", "test_readonly_node@0/42"])
+        assert False, "compute node startup with invalid LSN should have failed"
    except Exception:
        print("Node creation with pre-initdb LSN failed (as expected)")
--- a/test_runner/batch_others/test_restart_compute.py
+++ b/test_runner/batch_others/test_restart_compute.py
@@ -1,7 +1,7 @@
 import pytest

 from contextlib import closing
-from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
+from fixtures.zenith_fixtures import ZenithEnvBuilder
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -11,22 +11,15 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 # Test restarting and recreating a postgres instance
 #
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
-def test_restart_compute(
-    zenith_cli,
-    pageserver: ZenithPageserver,
-    postgres: PostgresFactory,
-    pg_bin,
-    wa_factory,
-    with_wal_acceptors: bool,
-):
-    wal_acceptor_connstrs = None
-    zenith_cli.run(["branch", "test_restart_compute", "empty"])
-
+def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
+    zenith_env_builder.pageserver_auth_enabled = True
    if with_wal_acceptors:
-        wa_factory.start_n_new(3)
-        wal_acceptor_connstrs = wa_factory.get_connstrs()
+        zenith_env_builder.num_safekeepers = 3
+    env = zenith_env_builder.init()

-    pg = postgres.create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
+    env.zenith_cli(["branch", "test_restart_compute", "main"])
+
+    pg = env.postgres.create_start('test_restart_compute')
    log.info("postgres is running on 'test_restart_compute' branch")

    with closing(pg.connect()) as conn:
@@ -39,7 +32,7 @@ def test_restart_compute(
            log.info(f"res = {r}")

    # Remove data directory and restart
-    pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
+    pg.stop_and_destroy().create_start('test_restart_compute')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
@@ -58,7 +51,7 @@ def test_restart_compute(
            log.info(f"res = {r}")

    # Again remove data directory and restart
-    pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
+    pg.stop_and_destroy().create_start('test_restart_compute')

    # That select causes lots of FPI's and increases probability of wakeepers
    # lagging behind after query completion
@@ -72,7 +65,7 @@ def test_restart_compute(
            log.info(f"res = {r}")

    # And again remove data directory and restart
-    pg.stop_and_destroy().create_start('test_restart_compute', wal_acceptors=wal_acceptor_connstrs)
+    pg.stop_and_destroy().create_start('test_restart_compute')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
--- a/test_runner/batch_others/test_snapfiles_gc.py
+++ b/test_runner/batch_others/test_snapfiles_gc.py
@@ -1,6 +1,7 @@
 from contextlib import closing
 import psycopg2.extras
 import time
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -22,13 +23,14 @@ def print_gc_result(row):
 # This test is pretty tightly coupled with the current implementation of layered
 # storage, in layered_repository.rs.
 #
-def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
-    zenith_cli.run(["branch", "test_layerfiles_gc", "empty"])
-    pg = postgres.create_start('test_layerfiles_gc')
+def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    env.zenith_cli(["branch", "test_layerfiles_gc", "empty"])
+    pg = env.postgres.create_start('test_layerfiles_gc')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
-            with closing(pageserver.connect()) as psconn:
+            with closing(env.pageserver.connect()) as psconn:
                with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:

                    # Get the timeline ID of our branch. We need it for the 'do_gc' command
@@ -57,7 +59,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
                    cur.execute("DELETE FROM foo")

                    log.info("Running GC before test")
-                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
+                    pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    # remember the number of files
@@ -70,7 +72,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
                    # removing the old image and delta layer.
                    log.info("Inserting one row and running GC")
                    cur.execute("INSERT INTO foo VALUES (1)")
-                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
+                    pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
@@ -84,7 +86,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
                    cur.execute("INSERT INTO foo VALUES (2)")
                    cur.execute("INSERT INTO foo VALUES (3)")

-                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
+                    pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
@@ -96,7 +98,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
                    cur.execute("INSERT INTO foo VALUES (2)")
                    cur.execute("INSERT INTO foo VALUES (3)")

-                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
+                    pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
@@ -105,7 +107,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):

                    # Run GC again, with no changes in the database. Should not remove anything.
                    log.info("Run GC again, with nothing to do")
-                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
+                    pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    assert row['layer_relfiles_total'] == layer_relfiles_remain
@@ -118,7 +120,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
                    log.info("Drop table and run GC again")
                    cur.execute("DROP TABLE foo")

-                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
+                    pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)

--- a/test_runner/batch_others/test_tenants.py
+++ b/test_runner/batch_others/test_tenants.py
@@ -2,51 +2,41 @@ from contextlib import closing

 import pytest

-from fixtures.zenith_fixtures import (
-    TenantFactory,
-    ZenithCli,
-    PostgresFactory,
-)
+from fixtures.zenith_fixtures import ZenithEnvBuilder


@pytest.mark.parametrize('with_wal_acceptors', [False, True])
-def test_tenants_normal_work(
-    zenith_cli: ZenithCli,
-    tenant_factory: TenantFactory,
-    postgres: PostgresFactory,
-    wa_factory,
-    with_wal_acceptors: bool,
-):
-    """Tests tenants with and without wal acceptors"""
-    tenant_1 = tenant_factory.create()
-    tenant_2 = tenant_factory.create()
+def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
+    if with_wal_acceptors:
+        zenith_env_builder.num_safekeepers = 3

-    zenith_cli.run([
+    env = zenith_env_builder.init()
+    """Tests tenants with and without wal acceptors"""
+    tenant_1 = env.create_tenant()
+    tenant_2 = env.create_tenant()
+
+    env.zenith_cli([
        "branch",
        f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
        "main",
        f"--tenantid={tenant_1}"
    ])
-    zenith_cli.run([
+    env.zenith_cli([
        "branch",
        f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
        "main",
        f"--tenantid={tenant_2}"
    ])
-    if with_wal_acceptors:
-        wa_factory.start_n_new(3)

-    pg_tenant1 = postgres.create_start(
+    pg_tenant1 = env.postgres.create_start(
        f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
        None,  # branch name, None means same as node name
        tenant_1,
-        wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
    )
-    pg_tenant2 = postgres.create_start(
+    pg_tenant2 = env.postgres.create_start(
        f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}",
        None,  # branch name, None means same as node name
        tenant_2,
-        wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
    )

    for pg in [pg_tenant1, pg_tenant2]:
--- a/test_runner/batch_others/test_timeline_size.py
+++ b/test_runner/batch_others/test_timeline_size.py
@@ -1,19 +1,20 @@
 from contextlib import closing
 from uuid import UUID
 import psycopg2.extras
-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log


-def test_timeline_size(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
+def test_timeline_size(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
    # Branch at the point where only 100 rows were inserted
-    zenith_cli.run(["branch", "test_timeline_size", "empty"])
+    env.zenith_cli(["branch", "test_timeline_size", "empty"])

-    client = pageserver.http_client()
-    res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
+    client = env.pageserver.http_client()
+    res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
    assert res["current_logical_size"] == res["current_logical_size_non_incremental"]

-    pgmain = postgres.create_start("test_timeline_size")
+    pgmain = env.postgres.create_start("test_timeline_size")
    log.info("postgres is running on 'test_timeline_size' branch")

    with closing(pgmain.connect()) as conn:
@@ -28,9 +29,9 @@ def test_timeline_size(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
                    FROM generate_series(1, 10) g
            """)

-            res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
+            res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
            assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
            cur.execute("TRUNCATE foo")

-            res = client.branch_detail(UUID(pageserver.initial_tenant), "test_timeline_size")
+            res = client.branch_detail(UUID(env.initial_tenant), "test_timeline_size")
            assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
--- a/test_runner/batch_others/test_twophase.py
+++ b/test_runner/batch_others/test_twophase.py
@@ -1,6 +1,6 @@
 import os

-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, PgBin
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,13 +9,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Test branching, when a transaction is in prepared state
 #
-def test_twophase(zenith_cli,
-                  pageserver: ZenithPageserver,
-                  postgres: PostgresFactory,
-                  pg_bin: PgBin):
-    zenith_cli.run(["branch", "test_twophase", "empty"])
+def test_twophase(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    env.zenith_cli(["branch", "test_twophase", "empty"])

-    pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
+    pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
    log.info("postgres is running on 'test_twophase' branch")

    conn = pg.connect()
@@ -60,10 +58,10 @@ def test_twophase(zenith_cli,
    assert len(twophase_files) == 2

    # Create a branch with the transaction in prepared state
-    zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
+    env.zenith_cli(["branch", "test_twophase_prepared", "test_twophase"])

    # Start compute on the new branch
-    pg2 = postgres.create_start(
+    pg2 = env.postgres.create_start(
        'test_twophase_prepared',
        config_lines=['max_prepared_transactions=5'],
    )
--- a/test_runner/batch_others/test_vm_bits.py
+++ b/test_runner/batch_others/test_vm_bits.py
@@ -1,4 +1,4 @@
-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+from fixtures.zenith_fixtures import ZenithEnv
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")
@@ -8,14 +8,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 # Test that the VM bit is cleared correctly at a HEAP_DELETE and
 # HEAP_UPDATE record.
 #
-def test_vm_bit_clear(pageserver: ZenithPageserver,
-                      postgres: PostgresFactory,
-                      pg_bin,
-                      zenith_cli,
-                      base_dir):
+def test_vm_bit_clear(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+
    # Create a branch for us
-    zenith_cli.run(["branch", "test_vm_bit_clear", "empty"])
-    pg = postgres.create_start('test_vm_bit_clear')
+    env.zenith_cli(["branch", "test_vm_bit_clear", "empty"])
+    pg = env.postgres.create_start('test_vm_bit_clear')

    log.info("postgres is running on 'test_vm_bit_clear' branch")
    pg_conn = pg.connect()
@@ -38,7 +36,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver,
    cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1')

    # Branch at this point, to test that later
-    zenith_cli.run(["branch", "test_vm_bit_clear_new", "test_vm_bit_clear"])
+    env.zenith_cli(["branch", "test_vm_bit_clear_new", "test_vm_bit_clear"])

    # Clear the buffer cache, to force the VM page to be re-fetched from
    # the page server
@@ -66,7 +64,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver,
    # a dirty VM page is evicted. If the VM bit was not correctly cleared by the
    # earlier WAL record, the full-page image hides the problem. Starting a new
    # server at the right point-in-time avoids that full-page image.
-    pg_new = postgres.create_start('test_vm_bit_clear_new')
+    pg_new = env.postgres.create_start('test_vm_bit_clear_new')

    log.info("postgres is running on 'test_vm_bit_clear_new' branch")
    pg_new_conn = pg_new.connect()
--- a/test_runner/batch_others/test_wal_acceptor.py
+++ b/test_runner/batch_others/test_wal_acceptor.py
@@ -3,27 +3,30 @@ import random
 import time
 import os
 import subprocess
+import sys
+import threading
 import uuid

 from contextlib import closing
+from dataclasses import dataclass, field
 from multiprocessing import Process, Value
-from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory, PgBin
+from fixtures.zenith_fixtures import PgBin, ZenithEnv, ZenithEnvBuilder
 from fixtures.utils import lsn_to_hex, mkdir_if_needed
 from fixtures.log_helper import log
+from typing import List, Optional

 pytest_plugins = ("fixtures.zenith_fixtures")


 # basic test, write something in setup with wal acceptors, ensure that commits
 # succeed and data is written
-def test_normal_work(zenith_cli,
-                     pageserver: ZenithPageserver,
-                     postgres: PostgresFactory,
-                     wa_factory):
-    zenith_cli.run(["branch", "test_wal_acceptors_normal_work", "empty"])
-    wa_factory.start_n_new(3)
-    pg = postgres.create_start('test_wal_acceptors_normal_work',
-                               wal_acceptors=wa_factory.get_connstrs())
+def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
+    zenith_env_builder.num_safekeepers = 3
+    env = zenith_env_builder.init()
+
+    env.zenith_cli(["branch", "test_wal_acceptors_normal_work", "main"])
+
+    pg = env.postgres.create_start('test_wal_acceptors_normal_work')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
@@ -35,56 +38,153 @@ def test_normal_work(zenith_cli,
            assert cur.fetchone() == (5000050000, )


+@dataclass
+class BranchMetrics:
+    name: str
+    latest_valid_lsn: int
+    # One entry per safekeeper; both lists use the same safekeeper order
+    flush_lsns: List[int] = field(default_factory=list)
+    commit_lsns: List[int] = field(default_factory=list)
+
+
 # Run page server and multiple acceptors, and multiple compute nodes running
 # against different timelines.
-def test_many_timelines(zenith_cli,
-                        pageserver: ZenithPageserver,
-                        postgres: PostgresFactory,
-                        wa_factory):
-    n_timelines = 2
+def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
+    zenith_env_builder.num_safekeepers = 3
+    env = zenith_env_builder.init()

-    wa_factory.start_n_new(3)
+    n_timelines = 3

    branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]

    # start postgres on each timeline
    pgs = []
    for branch in branches:
-        zenith_cli.run(["branch", branch, "empty"])
-        pgs.append(postgres.create_start(branch, wal_acceptors=wa_factory.get_connstrs()))
+        env.zenith_cli(["branch", branch, "main"])
+        pgs.append(env.postgres.create_start(branch))
+
+    tenant_id = uuid.UUID(env.initial_tenant)
+
+    def collect_metrics(message: str) -> List[BranchMetrics]:
+        with env.pageserver.http_client() as pageserver_http:
+            branch_details = [
+                pageserver_http.branch_detail(tenant_id=tenant_id, name=branch)
+                for branch in branches
+            ]
+        # All changes visible to pageserver (latest_valid_lsn) should be
+        # confirmed by safekeepers first. As we cannot atomically get
+        # state of both pageserver and safekeepers, we should start with
+        # pageserver. Looking at outdated data from pageserver is ok.
+        # Asking safekeepers first is not ok because new commits may arrive
+        # to both safekeepers and pageserver after we've already obtained
+        # safekeepers' state, it will look contradictory.
+        sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers]
+
+        branch_metrics = []
+        with env.pageserver.http_client() as pageserver_http:
+            for branch_detail in branch_details:
+                timeline_id: str = branch_detail["timeline_id"]
+
+                m = BranchMetrics(
+                    name=branch_detail["name"],
+                    latest_valid_lsn=branch_detail["latest_valid_lsn"],
+                )
+                for sk_m in sk_metrics:
+                    m.flush_lsns.append(sk_m.flush_lsn_inexact[timeline_id])
+                    m.commit_lsns.append(sk_m.commit_lsn_inexact[timeline_id])
+
+                for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
+                    # Invariant. May be < when transaction is in progress.
+                    assert commit_lsn <= flush_lsn
+                # We only call collect_metrics() after a transaction is confirmed by
+                # the compute node, which only happens after a consensus of safekeepers
+                # has confirmed the transaction. We assume majority consensus here.
+                assert (2 * sum(m.latest_valid_lsn <= lsn
+                                for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers)
+                assert (2 * sum(m.latest_valid_lsn <= lsn
+                                for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers)
+                branch_metrics.append(m)
+        log.info(f"{message}: {branch_metrics}")
+        return branch_metrics
+
+    # TODO: https://github.com/zenithdb/zenith/issues/809
+    # collect_metrics("before CREATE TABLE")

    # Do everything in different loops to have actions on different timelines
    # interleaved.
    # create schema
    for pg in pgs:
        pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
+    init_m = collect_metrics("after CREATE TABLE")

-    # Populate data
-    for pg in pgs:
+    # Populate data for 2/3 branches
+    class MetricsChecker(threading.Thread):
+        def __init__(self) -> None:
+            super().__init__(daemon=True)
+            self.should_stop = threading.Event()
+            self.exception: Optional[BaseException] = None
+
+        def run(self) -> None:
+            try:
+                while not self.should_stop.is_set():
+                    collect_metrics("during INSERT INTO")
+                    time.sleep(1)
+            except:
+                log.error("MetricsChecker's thread failed, the test will be failed on .stop() call",
+                          exc_info=True)
+                # We want to preserve traceback as well as the exception
+                exc_type, exc_value, exc_tb = sys.exc_info()
+                assert exc_type
+                e = exc_type(exc_value)
+                e.__traceback__ = exc_tb
+                self.exception = e
+
+        def stop(self) -> None:
+            self.should_stop.set()
+            self.join()
+            if self.exception:
+                raise self.exception
+
+    metrics_checker = MetricsChecker()
+    metrics_checker.start()
+
+    for pg in pgs[:-1]:
        pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'")

-    # Check data
-    for pg in pgs:
+    metrics_checker.stop()
+
+    collect_metrics("after INSERT INTO")
+
+    # Check data for 2/3 branches
+    for pg in pgs[:-1]:
        res = pg.safe_psql("SELECT sum(key) FROM t")
        assert res[0] == (5000050000, )

+    final_m = collect_metrics("after SELECT")
+    # Assume that LSNs (a) behave similarly in all branches; and (b) INSERT INTO alters LSN significantly.
+    # Also assume that safekeepers will not be significantly out of sync in this test.
+    middle_lsn = (init_m[0].latest_valid_lsn + final_m[0].latest_valid_lsn) // 2
+    assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns)
+    assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns)
+    assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns)
+    assert max(init_m[1].commit_lsns) < middle_lsn < min(final_m[1].commit_lsns)
+    assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn
+    assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn
+

 # Check that dead minority doesn't prevent the commits: execute insert n_inserts
 # times, with fault_probability chance of getting a wal acceptor down or up
 # along the way. 2 of 3 are always alive, so the work keeps going.
-def test_restarts(zenith_cli,
-                  pageserver: ZenithPageserver,
-                  postgres: PostgresFactory,
-                  wa_factory: WalAcceptorFactory):
+def test_restarts(zenith_env_builder: ZenithEnvBuilder):
    fault_probability = 0.01
    n_inserts = 1000
    n_acceptors = 3

-    wa_factory.start_n_new(n_acceptors)
+    zenith_env_builder.num_safekeepers = n_acceptors
+    env = zenith_env_builder.init()

-    zenith_cli.run(["branch", "test_wal_acceptors_restarts", "empty"])
-    pg = postgres.create_start('test_wal_acceptors_restarts',
-                               wal_acceptors=wa_factory.get_connstrs())
+    env.zenith_cli(["branch", "test_wal_acceptors_restarts", "main"])
+    pg = env.postgres.create_start('test_wal_acceptors_restarts')

    # we rely upon autocommit after each statement
    # as waiting for acceptors happens there
@@ -98,7 +198,7 @@ def test_restarts(zenith_cli,

        if random.random() <= fault_probability:
            if failed_node is None:
-                failed_node = wa_factory.instances[random.randrange(0, n_acceptors)]
+                failed_node = env.safekeepers[random.randrange(0, n_acceptors)]
                failed_node.stop()
            else:
                failed_node.start()
@@ -116,12 +216,12 @@ def delayed_wal_acceptor_start(wa):


 # When majority of acceptors is offline, commits are expected to be frozen
-def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
-    wa_factory.start_n_new(2)
+def test_unavailability(zenith_env_builder: ZenithEnvBuilder):
+    zenith_env_builder.num_safekeepers = 2
+    env = zenith_env_builder.init()

-    zenith_cli.run(["branch", "test_wal_acceptors_unavailability", "empty"])
-    pg = postgres.create_start('test_wal_acceptors_unavailability',
-                               wal_acceptors=wa_factory.get_connstrs())
+    env.zenith_cli(["branch", "test_wal_acceptors_unavailability", "main"])
+    pg = env.postgres.create_start('test_wal_acceptors_unavailability')

    # we rely upon autocommit after each statement
    # as waiting for acceptors happens there
@@ -133,9 +233,9 @@ def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
    cur.execute("INSERT INTO t values (1, 'payload')")

    # shutdown one of two acceptors, that is, majority
-    wa_factory.instances[0].stop()
+    env.safekeepers[0].stop()

-    proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[0], ))
+    proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[0], ))
    proc.start()

    start = time.time()
@@ -145,9 +245,9 @@ def test_unavailability(zenith_cli, postgres: PostgresFactory, wa_factory):
    proc.join()

    # for the world's balance, do the same with second acceptor
-    wa_factory.instances[1].stop()
+    env.safekeepers[1].stop()

-    proc = Process(target=delayed_wal_acceptor_start, args=(wa_factory.instances[1], ))
+    proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[1], ))
    proc.start()

    start = time.time()
@@ -186,17 +286,13 @@ def stop_value():


 # do inserts while concurrently getting up/down subsets of acceptors
-def test_race_conditions(zenith_cli,
-                         pageserver: ZenithPageserver,
-                         postgres: PostgresFactory,
-                         wa_factory,
-                         stop_value):
+def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):

-    wa_factory.start_n_new(3)
+    zenith_env_builder.num_safekeepers = 3
+    env = zenith_env_builder.init()

-    zenith_cli.run(["branch", "test_wal_acceptors_race_conditions", "empty"])
-    pg = postgres.create_start('test_wal_acceptors_race_conditions',
-                               wal_acceptors=wa_factory.get_connstrs())
+    env.zenith_cli(["branch", "test_wal_acceptors_race_conditions", "main"])
+    pg = env.postgres.create_start('test_wal_acceptors_race_conditions')

    # we rely upon autocommit after each statement
    # as waiting for acceptors happens there
@@ -205,7 +301,7 @@ def test_race_conditions(zenith_cli,

    cur.execute('CREATE TABLE t(key int primary key, value text)')

-    proc = Process(target=xmas_garland, args=(wa_factory.instances, stop_value))
+    proc = Process(target=xmas_garland, args=(env.safekeepers, stop_value))
    proc.start()

    for i in range(1000):
@@ -220,7 +316,8 @@ def test_race_conditions(zenith_cli,

 class ProposerPostgres:
    """Object for running safekeepers sync with walproposer"""
-    def __init__(self, pgdata_dir: str, pg_bin: PgBin, timeline_id: str, tenant_id: str):
+    def __init__(self, env: ZenithEnv, pgdata_dir: str, pg_bin, timeline_id: str, tenant_id: str):
+        self.env = env
        self.pgdata_dir: str = pgdata_dir
        self.pg_bin: PgBin = pg_bin
        self.timeline_id: str = timeline_id
@@ -266,16 +363,20 @@ class ProposerPostgres:


 # insert wal in all safekeepers and run sync on proposer
-def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorFactory):
-    wa_factory.start_n_new(3)
+def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin):
+
+    # We don't really need the full environment for this test; the
+    # safekeepers alone would be enough.
+    zenith_env_builder.num_safekeepers = 3
+    env = zenith_env_builder.init()

    timeline_id = uuid.uuid4().hex
    tenant_id = uuid.uuid4().hex

    # write config for proposer
-    pgdata_dir = os.path.join(repo_dir, "proposer_pgdata")
-    pg = ProposerPostgres(pgdata_dir, pg_bin, timeline_id, tenant_id)
-    pg.create_dir_config(wa_factory.get_connstrs())
+    pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata")
+    pg = ProposerPostgres(env, pgdata_dir, pg_bin, timeline_id, tenant_id)
+    pg.create_dir_config(env.get_safekeeper_connstrs())

    # valid lsn, which is not in the segment start, nor in zero segment
    epoch_start_lsn = 0x16B9188  # 0/16B9188
@@ -284,7 +385,7 @@ def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorF
    # append and commit WAL
    lsn_after_append = []
    for i in range(3):
-        res = wa_factory.instances[i].append_logical_message(
+        res = env.safekeepers[i].append_logical_message(
            tenant_id,
            timeline_id,
            {
@@ -308,13 +409,15 @@ def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorF
    assert all(lsn_after_sync == lsn for lsn in lsn_after_append)


-def test_timeline_status(zenith_cli, pageserver, postgres, wa_factory: WalAcceptorFactory):
-    wa_factory.start_n_new(1)
+def test_timeline_status(zenith_env_builder: ZenithEnvBuilder):

-    zenith_cli.run(["branch", "test_timeline_status", "empty"])
-    pg = postgres.create_start('test_timeline_status', wal_acceptors=wa_factory.get_connstrs())
+    zenith_env_builder.num_safekeepers = 1
+    env = zenith_env_builder.init()

-    wa = wa_factory.instances[0]
+    env.zenith_cli(["branch", "test_timeline_status", "main"])
+    pg = env.postgres.create_start('test_timeline_status')
+
+    wa = env.safekeepers[0]
    wa_http_cli = wa.http_client()
    wa_http_cli.check_status()

--- a/test_runner/batch_others/test_wal_acceptor_async.py
+++ b/test_runner/batch_others/test_wal_acceptor_async.py
@@ -1,9 +1,11 @@
 import asyncio
 import asyncpg
 import random
+import time

-from fixtures.zenith_fixtures import WalAcceptor, WalAcceptorFactory, ZenithPageserver, PostgresFactory, Postgres
+from fixtures.zenith_fixtures import ZenithEnvBuilder, Postgres, Safekeeper
 from fixtures.log_helper import getLogger
+from fixtures.utils import lsn_from_hex, lsn_to_hex
 from typing import List

 log = getLogger('root.wal_acceptor_async')
@@ -102,18 +104,58 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou
    await pg_conn.close()


+async def wait_for_lsn(safekeeper: Safekeeper,
+                       tenant_id: str,
+                       timeline_id: str,
+                       wait_lsn: str,
+                       polling_interval=1,
+                       timeout=60):
+    """
+    Poll the safekeeper's timeline_status every polling_interval seconds until
+    its flush_lsn is greater than or equal to wait_lsn (both hex LSN strings).
+    Raises RuntimeError if that takes longer than timeout seconds.
+    """
+    # Monotonic clock: the elapsed time must not jump if the wall clock is adjusted.
+    started_at = time.monotonic()
+    client = safekeeper.http_client()
+
+    flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn
+    log.info(
+        f'Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}'
+    )
+
+    while lsn_from_hex(wait_lsn) > lsn_from_hex(flush_lsn):
+        elapsed = time.monotonic() - started_at
+        if elapsed > timeout:
+            raise RuntimeError(
+                f"timed out waiting for safekeeper at port {safekeeper.port.pg} to reach {wait_lsn}, current lsn is {flush_lsn}"
+            )
+        # Yield to the event loop between polls so the other tasks keep running.
+        await asyncio.sleep(polling_interval)
+        flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn
+        log.debug(f'safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}')
+
+
 # This test will run several iterations and check progress in each of them.
 # On each iteration 1 acceptor is stopped, and 2 others should allow
 # background workers execute transactions. In the end, state should remain
 # consistent.
-async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_workers=10):
+async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_workers=10):
    n_accounts = 100
    init_amount = 100000
    max_transfer = 100
    period_time = 10
    iterations = 6

+    # Set timeout for this test at 5 minutes. It should be enough for test to complete
+    # and less than CircleCI's no_output_timeout, taking into account that this timeout
+    # is checked only at the beginning of every iteration.
+    test_timeout_at = time.monotonic() + 5 * 60
+
    pg_conn = await pg.connect_async()
+    tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant")
+    timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline")
+
    bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount)
    # create tables and initial balances
    await bank.initdb()
@@ -125,14 +167,20 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_
        workers.append(asyncio.create_task(worker))

    for it in range(iterations):
-        victim = acceptors[it % len(acceptors)]
+        assert time.monotonic() < test_timeout_at, 'test timed out'
+
+        victim_idx = it % len(acceptors)
+        victim = acceptors[victim_idx]
        victim.stop()

-        # Wait till previous victim recovers so it is ready for the next
-        # iteration by making any writing xact.
-        conn = await pg.connect_async()
-        await conn.execute('UPDATE bank_accs SET amount = amount WHERE uid = 1', timeout=120)
-        await conn.close()
+        flush_lsn = await pg_conn.fetchval('SELECT pg_current_wal_flush_lsn()')
+        flush_lsn = lsn_to_hex(flush_lsn)
+        log.info(f'Postgres flush_lsn {flush_lsn}')
+
+        # Wait until alive safekeepers catch up with postgres
+        for idx, safekeeper in enumerate(acceptors):
+            if idx != victim_idx:
+                await wait_for_lsn(safekeeper, tenant_id, timeline_id, flush_lsn)

        stats.reset()
        await asyncio.sleep(period_time)
@@ -151,18 +199,14 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_


 # restart acceptors one by one, while executing and validating bank transactions
-def test_restarts_under_load(zenith_cli,
-                             pageserver: ZenithPageserver,
-                             postgres: PostgresFactory,
-                             wa_factory: WalAcceptorFactory):
+def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder):
+    zenith_env_builder.num_safekeepers = 3
+    env = zenith_env_builder.init()

-    wa_factory.start_n_new(3)
+    env.zenith_cli(["branch", "test_wal_acceptors_restarts_under_load", "main"])
+    pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load')

-    zenith_cli.run(["branch", "test_wal_acceptors_restarts_under_load", "empty"])
-    pg = postgres.create_start('test_wal_acceptors_restarts_under_load',
-                               wal_acceptors=wa_factory.get_connstrs())
-
-    asyncio.run(run_restarts_under_load(pg, wa_factory.instances))
+    asyncio.run(run_restarts_under_load(pg, env.safekeepers))

    # TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
    pg.stop()
--- a/test_runner/batch_others/test_zenith_cli.py
+++ b/test_runner/batch_others/test_zenith_cli.py
@@ -1,27 +1,30 @@
 import json
 import uuid

-from fixtures.zenith_fixtures import ZenithCli, ZenithPageserver
+from psycopg2.extensions import cursor as PgCursor
+from fixtures.zenith_fixtures import ZenithEnv
+from typing import cast

 pytest_plugins = ("fixtures.zenith_fixtures")


-def helper_compare_branch_list(page_server_cur, zenith_cli, initial_tenant: str):
+def helper_compare_branch_list(page_server_cur: PgCursor, env: ZenithEnv, initial_tenant: str):
    """
    Compare branches list returned by CLI and directly via API.
    Filters out branches created by other tests.
    """

    page_server_cur.execute(f'branch_list {initial_tenant}')
-    branches_api = sorted(map(lambda b: b['name'], json.loads(page_server_cur.fetchone()[0])))
+    branches_api = sorted(
+        map(lambda b: cast(str, b['name']), json.loads(page_server_cur.fetchone()[0])))
    branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]

-    res = zenith_cli.run(["branch"])
+    res = env.zenith_cli(["branch"])
    res.check_returncode()
    branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
    branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]

-    res = zenith_cli.run(["branch", f"--tenantid={initial_tenant}"])
+    res = env.zenith_cli(["branch", f"--tenantid={initial_tenant}"])
    res.check_returncode()
    branches_cli_with_tenant_arg = sorted(
        map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
@@ -32,25 +35,26 @@ def helper_compare_branch_list(page_server_cur, zenith_cli, initial_tenant: str)
    assert branches_api == branches_cli == branches_cli_with_tenant_arg


-def test_cli_branch_list(pageserver: ZenithPageserver, zenith_cli):
-    page_server_conn = pageserver.connect()
+def test_cli_branch_list(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    page_server_conn = env.pageserver.connect()
    page_server_cur = page_server_conn.cursor()

    # Initial sanity check
-    helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
+    helper_compare_branch_list(page_server_cur, env, env.initial_tenant)

    # Create a branch for us
-    res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"])
+    res = env.zenith_cli(["branch", "test_cli_branch_list_main", "empty"])
    assert res.stderr == ''
-    helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
+    helper_compare_branch_list(page_server_cur, env, env.initial_tenant)

    # Create a nested branch
-    res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
+    res = env.zenith_cli(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"])
    assert res.stderr == ''
-    helper_compare_branch_list(page_server_cur, zenith_cli, pageserver.initial_tenant)
+    helper_compare_branch_list(page_server_cur, env, env.initial_tenant)

    # Check that all new branches are visible via CLI
-    res = zenith_cli.run(["branch"])
+    res = env.zenith_cli(["branch"])
    assert res.stderr == ''
    branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))

@@ -58,44 +62,46 @@ def test_cli_branch_list(pageserver: ZenithPageserver, zenith_cli):
    assert 'test_cli_branch_list_nested' in branches_cli


-def helper_compare_tenant_list(page_server_cur, zenith_cli: ZenithCli):
+def helper_compare_tenant_list(page_server_cur: PgCursor, env: ZenithEnv):
    page_server_cur.execute(f'tenant_list')
-    tenants_api = sorted(json.loads(page_server_cur.fetchone()[0]))
+    tenants_api = sorted(
+        map(lambda t: cast(str, t['id']), json.loads(page_server_cur.fetchone()[0])))

-    res = zenith_cli.run(["tenant", "list"])
+    res = env.zenith_cli(["tenant", "list"])
    assert res.stderr == ''
-    tenants_cli = sorted(res.stdout.splitlines())
+    tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))

    assert tenants_api == tenants_cli


-def test_cli_tenant_list(pageserver: ZenithPageserver, zenith_cli: ZenithCli):
-    page_server_conn = pageserver.connect()
+def test_cli_tenant_list(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+    page_server_conn = env.pageserver.connect()
    page_server_cur = page_server_conn.cursor()

    # Initial sanity check
-    helper_compare_tenant_list(page_server_cur, zenith_cli)
+    helper_compare_tenant_list(page_server_cur, env)

    # Create new tenant
    tenant1 = uuid.uuid4().hex
-    res = zenith_cli.run(["tenant", "create", tenant1])
+    res = env.zenith_cli(["tenant", "create", tenant1])
    res.check_returncode()

    # check tenant1 appeared
-    helper_compare_tenant_list(page_server_cur, zenith_cli)
+    helper_compare_tenant_list(page_server_cur, env)

    # Create new tenant
    tenant2 = uuid.uuid4().hex
-    res = zenith_cli.run(["tenant", "create", tenant2])
+    res = env.zenith_cli(["tenant", "create", tenant2])
    res.check_returncode()

    # check tenant2 appeared
-    helper_compare_tenant_list(page_server_cur, zenith_cli)
+    helper_compare_tenant_list(page_server_cur, env)

-    res = zenith_cli.run(["tenant", "list"])
+    res = env.zenith_cli(["tenant", "list"])
    res.check_returncode()
-    tenants = sorted(res.stdout.splitlines())
+    tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))

-    assert pageserver.initial_tenant in tenants
+    assert env.initial_tenant in tenants
    assert tenant1 in tenants
    assert tenant2 in tenants
--- a/test_runner/batch_pg_regress/test_isolation.py
+++ b/test_runner/batch_pg_regress/test_isolation.py
@@ -1,26 +1,20 @@
 import os

 from fixtures.utils import mkdir_if_needed
-from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
+from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir

 pytest_plugins = ("fixtures.zenith_fixtures")


-def test_isolation(pageserver: ZenithPageserver,
-                   postgres: PostgresFactory,
-                   pg_bin,
-                   zenith_cli,
-                   test_output_dir,
-                   pg_distrib_dir,
-                   base_dir,
-                   capsys):
+def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys):
+    env = zenith_simple_env

    # Create a branch for us
-    zenith_cli.run(["branch", "test_isolation", "empty"])
+    env.zenith_cli(["branch", "test_isolation", "empty"])

    # Connect to postgres and create a database called "regression".
    # isolation tests use prepared transactions, so enable them
-    pg = postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100'])
+    pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100'])
    pg.safe_psql('CREATE DATABASE isolation_regression')

    # Create some local directories for pg_isolation_regress to run in.
@@ -44,7 +38,7 @@ def test_isolation(pageserver: ZenithPageserver,
        '--schedule={}'.format(schedule),
    ]

-    env = {
+    env_vars = {
        'PGPORT': str(pg.port),
        'PGUSER': pg.username,
        'PGHOST': pg.host,
@@ -54,4 +48,4 @@ def test_isolation(pageserver: ZenithPageserver,
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
-        pg_bin.run(pg_isolation_regress_command, env=env, cwd=runpath)
+        pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)
--- a/test_runner/batch_pg_regress/test_pg_regress.py
+++ b/test_runner/batch_pg_regress/test_pg_regress.py
@@ -1,25 +1,19 @@
 import os

 from fixtures.utils import mkdir_if_needed
-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
+from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, base_dir, pg_distrib_dir

 pytest_plugins = ("fixtures.zenith_fixtures")


-def test_pg_regress(pageserver: ZenithPageserver,
-                    postgres: PostgresFactory,
-                    pg_bin,
-                    zenith_cli,
-                    test_output_dir,
-                    pg_distrib_dir,
-                    base_dir,
-                    capsys):
+def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys):
+    env = zenith_simple_env

    # Create a branch for us
-    zenith_cli.run(["branch", "test_pg_regress", "empty"])
+    env.zenith_cli(["branch", "test_pg_regress", "empty"])

    # Connect to postgres and create a database called "regression".
-    pg = postgres.create_start('test_pg_regress')
+    pg = env.postgres.create_start('test_pg_regress')
    pg.safe_psql('CREATE DATABASE regression')

    # Create some local directories for pg_regress to run in.
@@ -44,7 +38,7 @@ def test_pg_regress(pageserver: ZenithPageserver,
        '--inputdir={}'.format(src_path),
    ]

-    env = {
+    env_vars = {
        'PGPORT': str(pg.port),
        'PGUSER': pg.username,
        'PGHOST': pg.host,
@@ -54,11 +48,11 @@ def test_pg_regress(pageserver: ZenithPageserver,
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
-        pg_bin.run(pg_regress_command, env=env, cwd=runpath)
+        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

        # checkpoint one more time to ensure that the lsn we get is the latest one
        pg.safe_psql('CHECKPOINT')
        lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]

        # Check that we restore the content of the datadir correctly
-        check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
+        check_restored_datadir_content(test_output_dir, env, pg)
--- a/test_runner/batch_pg_regress/test_zenith_regress.py
+++ b/test_runner/batch_pg_regress/test_zenith_regress.py
@@ -1,26 +1,23 @@
 import os

 from fixtures.utils import mkdir_if_needed
-from fixtures.zenith_fixtures import PageserverPort, PostgresFactory, check_restored_datadir_content
+from fixtures.zenith_fixtures import (ZenithEnv,
+                                      check_restored_datadir_content,
+                                      base_dir,
+                                      pg_distrib_dir)
 from fixtures.log_helper import log

 pytest_plugins = ("fixtures.zenith_fixtures")


-def test_zenith_regress(postgres: PostgresFactory,
-                        pg_bin,
-                        zenith_cli,
-                        test_output_dir,
-                        pg_distrib_dir,
-                        base_dir,
-                        capsys,
-                        pageserver_port: PageserverPort):
+def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys):
+    env = zenith_simple_env

    # Create a branch for us
-    zenith_cli.run(["branch", "test_zenith_regress", "empty"])
+    env.zenith_cli(["branch", "test_zenith_regress", "empty"])

    # Connect to postgres and create a database called "regression".
-    pg = postgres.create_start('test_zenith_regress')
+    pg = env.postgres.create_start('test_zenith_regress')
    pg.safe_psql('CREATE DATABASE regression')

    # Create some local directories for pg_regress to run in.
@@ -46,7 +43,7 @@ def test_zenith_regress(postgres: PostgresFactory,
    ]

    log.info(pg_regress_command)
-    env = {
+    env_vars = {
        'PGPORT': str(pg.port),
        'PGUSER': pg.username,
        'PGHOST': pg.host,
@@ -56,11 +53,11 @@ def test_zenith_regress(postgres: PostgresFactory,
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
-        pg_bin.run(pg_regress_command, env=env, cwd=runpath)
+        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

        # checkpoint one more time to ensure that the lsn we get is the latest one
        pg.safe_psql('CHECKPOINT')
        lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0]

        # Check that we restore the content of the datadir correctly
-        check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver_port.pg)
+        check_restored_datadir_content(test_output_dir, env, pg)
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -1 +1 @@
-pytest_plugins = ("fixtures.zenith_fixtures")
+pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -1,29 +1,22 @@
+import dataclasses
+import json
 import os
+from pathlib import Path
 import re
+import subprocess
 import timeit
-import pathlib
-import uuid
-import psycopg2
+import calendar
+import enum
+from datetime import datetime
 import pytest
 from _pytest.config import Config
-from _pytest.runner import CallInfo
 from _pytest.terminal import TerminalReporter
-import shutil
-import signal
-import subprocess
-import time
+import warnings

 from contextlib import contextmanager
-from contextlib import closing
-from pathlib import Path
-from dataclasses import dataclass

 # Type-related stuff
-from psycopg2.extensions import connection as PgConnection
-from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast
-from typing_extensions import Literal
-
-from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture)
+from typing import Iterator
 """
 This file contains fixtures for micro-benchmarks.

@@ -31,15 +24,15 @@ To use, declare the 'zenbenchmark' fixture in the test function. Run the
 bencmark, and then record the result by calling zenbenchmark.record. For example:

 import timeit
-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+from fixtures.zenith_fixtures import ZenithEnv

 pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")

-def test_mybench(postgres: PostgresFactory, pageserver: ZenithPageserver, zenbenchmark):
+def test_mybench(zenith_simple_env: ZenithEnv, zenbenchmark):

    # Initialize the test
    ...
-    
+
    # Run the test, timing how long it takes
    with zenbenchmark.record_duration('test_query'):
        cur.execute('SELECT test_query(...)')
@@ -54,39 +47,92 @@ in the test initialization, or measure disk usage after the test query.

 """

-# All the results are collected in this list, as a tuple:
-# (test_name: str, metric_name: str, metric_value: float, unit: str)
-#
-# TODO: It would perhaps be better to store the results as additional
-# properties in the pytest TestReport objects, to make them visible to
-# other pytest tools.
-global zenbenchmark_results
-zenbenchmark_results = []
+
+@dataclasses.dataclass
+class PgBenchRunResult:
+    scale: int
+    number_of_clients: int
+    number_of_threads: int
+    number_of_transactions_actually_processed: int
+    latency_average: float
+    latency_stddev: float
+    tps_including_connection_time: float
+    tps_excluding_connection_time: float
+    init_duration: float
+    init_start_timestamp: int
+    init_end_timestamp: int
+    run_duration: float
+    run_start_timestamp: int
+    run_end_timestamp: int
+    # scale..tps_* are parsed from pgbench's stdout; the init_*/run_* timings come from the caller
+    # TODO progress
+
+    @classmethod
+    def parse_from_output(
+        cls,
+        out: 'subprocess.CompletedProcess[str]',
+        init_duration: float,
+        init_start_timestamp: int,
+        init_end_timestamp: int,
+        run_duration: float,
+        run_start_timestamp: int,
+        run_end_timestamp: int,
+    ):
+        """Parse pgbench's summary from out.stdout; timings are passed in by the caller."""
+        stdout_lines = out.stdout.splitlines()
+        # most of these values are known from the test input, but take them from the output
+        # scaling factor: 5
+        assert "scaling factor" in stdout_lines[1]
+        scale = int(stdout_lines[1].split()[-1])
+        # number of clients: 1
+        assert "number of clients" in stdout_lines[3]
+        number_of_clients = int(stdout_lines[3].split()[-1])
+        # number of threads: 1
+        assert "number of threads" in stdout_lines[4]
+        number_of_threads = int(stdout_lines[4].split()[-1])
+        # number of transactions actually processed: 1000/1000 (processed/planned, or just processed)
+        assert "number of transactions actually processed" in stdout_lines[6]
+        number_of_transactions_actually_processed = int(stdout_lines[6].split()[-1].split("/")[0])
+        # latency average = 19.894 ms
+        assert "latency average" in stdout_lines[7]
+        latency_average = float(stdout_lines[7].split()[-2])
+        # latency stddev = 3.387 ms
+        assert "latency stddev" in stdout_lines[8]
+        latency_stddev = float(stdout_lines[8].split()[-2])
+        # tps = 50.219689 (including connections establishing)
+        assert "(including connections establishing)" in stdout_lines[9]
+        tps_including_connection_time = float(stdout_lines[9].split()[2])
+        # tps = 50.264435 (excluding connections establishing)
+        assert "(excluding connections establishing)" in stdout_lines[10]
+        tps_excluding_connection_time = float(stdout_lines[10].split()[2])
+
+        return cls(
+            scale=scale,
+            number_of_clients=number_of_clients,
+            number_of_threads=number_of_threads,
+            number_of_transactions_actually_processed=number_of_transactions_actually_processed,
+            latency_average=latency_average,
+            latency_stddev=latency_stddev,
+            tps_including_connection_time=tps_including_connection_time,
+            tps_excluding_connection_time=tps_excluding_connection_time,
+            init_duration=init_duration,
+            init_start_timestamp=init_start_timestamp,
+            init_end_timestamp=init_end_timestamp,
+            run_duration=run_duration,
+            run_start_timestamp=run_start_timestamp,
+            run_end_timestamp=run_end_timestamp,
+        )


-class ZenithBenchmarkResults:
-    """ An object for recording benchmark results. """
-    def __init__(self):
-        self.results = []
-
-    def record(self, test_name: str, metric_name: str, metric_value: float, unit: str):
-        """
-        Record a benchmark result.
-        """
-
-        self.results.append((test_name, metric_name, metric_value, unit))
-
-
-# Session scope fixture that initializes the results object
-@pytest.fixture(autouse=True, scope='session')
-def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
-    """
-    This is a python decorator for benchmark fixtures
-    """
-    global zenbenchmark_results
-    zenbenchmark_results = ZenithBenchmarkResults()
-
-    yield zenbenchmark_results
+@enum.unique
+class MetricReport(str, enum.Enum):  # mixing in str is a hack to make members JSON serializable
+    # The value is a constant test parameter rather than a measurement,
+    # e.g. the number of transactions or the number of clients.
+    TEST_PARAM = 'test_param'
+    # A reporter can use it to mark test runs with higher values as improvements.
+    HIGHER_IS_BETTER = 'higher_is_better'
+    # The same, but runs with lower values are the improvements.
+    LOWER_IS_BETTER = 'lower_is_better'


 class ZenithBenchmarker:
@@ -94,30 +140,109 @@ class ZenithBenchmarker:
    An object for recording benchmark results. This is created for each test
    function by the zenbenchmark fixture
    """
-    def __init__(self, results, request):
-        self.results = results
-        self.request = request
+    def __init__(self, property_recorder):
+        # property recorder here is a pytest fixture provided by junitxml module
+        # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property
+        self.property_recorder = property_recorder

-    def record(self, metric_name: str, metric_value: float, unit: str):
+    def record(
+        self,
+        metric_name: str,
+        metric_value: float,
+        unit: str,
+        report: MetricReport,
+    ):
        """
        Record a benchmark result.
        """
-        self.results.record(self.request.node.name, metric_name, metric_value, unit)
+        # just to namespace the value
+        name = f"zenith_benchmarker_{metric_name}"
+        self.property_recorder(
+            name,
+            {
+                "name": metric_name,
+                "value": metric_value,
+                "unit": unit,
+                "report": report,
+            },
+        )

    @contextmanager
-    def record_duration(self, metric_name):
+    def record_duration(self, metric_name: str):
        """
        Record a duration. Usage:
-        
+
        with zenbenchmark.record_duration('foobar_runtime'):
            foobar()   # measure this
-        
        """
        start = timeit.default_timer()
        yield
        end = timeit.default_timer()

-        self.results.record(self.request.node.name, metric_name, end - start, 's')
+        self.record(
+            metric_name=metric_name,
+            metric_value=end - start,
+            unit="s",
+            report=MetricReport.LOWER_IS_BETTER,
+        )
+
+    def record_pg_bench_result(self, pg_bench_result: PgBenchRunResult):
+        self.record("scale", pg_bench_result.scale, '', MetricReport.TEST_PARAM)
+        self.record("number_of_clients",
+                    pg_bench_result.number_of_clients,
+                    '',
+                    MetricReport.TEST_PARAM)
+        self.record("number_of_threads",
+                    pg_bench_result.number_of_threads,
+                    '',
+                    MetricReport.TEST_PARAM)
+        self.record(
+            "number_of_transactions_actually_processed",
+            pg_bench_result.number_of_transactions_actually_processed,
+            '',
+            # a test parameter: it is predefined by the test matrix and doesn't change across runs
+            report=MetricReport.TEST_PARAM,
+        )
+        self.record("latency_average",
+                    pg_bench_result.latency_average,
+                    unit="ms",
+                    report=MetricReport.LOWER_IS_BETTER)
+        self.record("latency_stddev",
+                    pg_bench_result.latency_stddev,
+                    unit="ms",
+                    report=MetricReport.LOWER_IS_BETTER)
+        self.record("tps_including_connection_time",
+                    pg_bench_result.tps_including_connection_time,
+                    '',
+                    report=MetricReport.HIGHER_IS_BETTER)
+        self.record("tps_excluding_connection_time",
+                    pg_bench_result.tps_excluding_connection_time,
+                    '',
+                    report=MetricReport.HIGHER_IS_BETTER)
+        self.record("init_duration",
+                    pg_bench_result.init_duration,
+                    unit="s",
+                    report=MetricReport.LOWER_IS_BETTER)
+        self.record("init_start_timestamp",
+                    pg_bench_result.init_start_timestamp,
+                    '',
+                    MetricReport.TEST_PARAM)
+        self.record("init_end_timestamp",
+                    pg_bench_result.init_end_timestamp,
+                    '',
+                    MetricReport.TEST_PARAM)
+        self.record("run_duration",
+                    pg_bench_result.run_duration,
+                    unit="s",
+                    report=MetricReport.LOWER_IS_BETTER)
+        self.record("run_start_timestamp",
+                    pg_bench_result.run_start_timestamp,
+                    '',
+                    MetricReport.TEST_PARAM)
+        self.record("run_end_timestamp",
+                    pg_bench_result.run_end_timestamp,
+                    '',
+                    MetricReport.TEST_PARAM)

    def get_io_writes(self, pageserver) -> int:
        """
@@ -137,6 +262,7 @@ class ZenithBenchmarker:
        matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$',
                            all_metrics,
                            re.MULTILINE)
+        assert matches
        return int(round(float(matches.group(1))))

    def get_peak_mem(self, pageserver) -> int:
@@ -147,9 +273,10 @@ class ZenithBenchmarker:
        all_metrics = pageserver.http_client().get_metrics()
        # See comment in get_io_writes()
        matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE)
+        assert matches
        return int(round(float(matches.group(1))))

-    def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str):
+    def get_timeline_size(self, repo_dir: Path, tenantid: str, timelineid: str):
        """
        Calculate the on-disk size of a timeline
        """
@@ -171,47 +298,90 @@ class ZenithBenchmarker:
        yield
        after = self.get_io_writes(pageserver)

-        self.results.record(self.request.node.name,
-                            metric_name,
-                            round((after - before) / (1024 * 1024)),
-                            'MB')
+        self.record(metric_name,
+                    round((after - before) / (1024 * 1024)),
+                    "MB",
+                    report=MetricReport.LOWER_IS_BETTER)


-@pytest.fixture(scope='function')
-def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
+@pytest.fixture(scope="function")
+def zenbenchmark(record_property) -> Iterator[ZenithBenchmarker]:
    """
    This is a python decorator for benchmark fixtures. It contains functions for
    recording measurements, and prints them out at the end.
    """
-    benchmarker = ZenithBenchmarker(zenbenchmark_global, request)
+    benchmarker = ZenithBenchmarker(record_property)
    yield benchmarker


+def pytest_addoption(parser):
+    """pytest hook: register the --out-dir command line option (stored as out_dir)."""
+    # The value is read back with config.getoption("out_dir") when results are reported.
+    parser.addoption("--out-dir",
+                     dest="out_dir",
+                     help="Directory to output performance tests results to.")
+
+
+def get_out_path(target_dir: Path, revision: str) -> Path:
+    """
+    Return target_dir / "<timestamp>_<revision>.json", where timestamp is the
+    current UTC time in whole seconds since the epoch. Asserts that no such
+    path exists yet.
+    """
+    # A timestamp (not a running counter) tells apart results for the same
+    # revision, so deleting older files cannot cause weird behaviour.
+    epoch_seconds = calendar.timegm(datetime.utcnow().utctimetuple())
+    out_path = target_dir.joinpath(f"{epoch_seconds}_{revision}.json")
+    assert not out_path.exists()
+    return out_path
+
+
 # Hook to print the results at the end
@pytest.hookimpl(hookwrapper=True)
 def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config):
     yield
+    revision = os.getenv("GITHUB_SHA", "local")  # falls back to "local" when unset
+    platform = os.getenv("PLATFORM", "local")  # falls back to "local" when unset
 
-    global zenbenchmark_results
+    terminalreporter.section("Benchmark results", "-")
 
-    if not zenbenchmark_results:
+    result = []  # one dict per passed test, dumped as JSON at the end
+    for test_report in terminalreporter.stats.get("passed", []):
+        result_entry = []
+
+        for _, recorded_property in test_report.user_properties:  # property key is ignored
+            terminalreporter.write("{}.{}: ".format(test_report.head_line,
+                                                    recorded_property["name"]))
+            unit = recorded_property["unit"]  # each property is indexed by name/unit/value
+            value = recorded_property["value"]
+            if unit == "MB":  # printed without decimals
+                terminalreporter.write("{0:,.0f}".format(value), green=True)
+            elif unit in ("s", "ms") and isinstance(value, float):  # 3 decimals
+                terminalreporter.write("{0:,.3f}".format(value), green=True)
+            elif isinstance(value, float):  # 4 decimals
+                terminalreporter.write("{0:,.4f}".format(value), green=True)
+            else:  # anything else is printed via str()
+                terminalreporter.write(str(value), green=True)
+            terminalreporter.line(" {}".format(unit))
+
+            result_entry.append(recorded_property)
+
+        result.append({
+            "suit": test_report.nodeid,
+            "total_duration": test_report.duration,
+            "data": result_entry,
+        })
+
+    out_dir = config.getoption("out_dir")
+    if out_dir is None:  # nothing is written to disk in this case
+        warnings.warn("no out dir provided to store performance test results")
         return
 
-    terminalreporter.section('Benchmark results', "-")
+    if not result:  # no passed tests, so no file is written
+        warnings.warn("no results to store (no passed test suites)")
+        return
 
-    for result in zenbenchmark_results.results:
-        func = result[0]
-        metric_name = result[1]
-        metric_value = result[2]
-        unit = result[3]
-
-        terminalreporter.write("{}.{}: ".format(func, metric_name))
-
-        if unit == 'MB':
-            terminalreporter.write("{0:,.0f}".format(metric_value), green=True)
-        elif unit == 's':
-            terminalreporter.write("{0:,.3f}".format(metric_value), green=True)
-        else:
-            terminalreporter.write("{0:,.4f}".format(metric_value), green=True)
-
-        terminalreporter.line(" {}".format(unit))
+    get_out_path(Path(out_dir), revision=revision).write_text(
+        json.dumps({
+            "revision": revision, "platform": platform, "result": result
+        }, indent=4))
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -63,3 +63,9 @@ def global_counter() -> int:
 def lsn_to_hex(num: int) -> str:
    """ Convert lsn from int to standard hex notation. """
    return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)
+
+
+def lsn_from_hex(lsn_hex: str) -> int:
+    """ Parse an lsn written as two '/'-separated hex halves into one int. """
+    high, low = (int(part, 16) for part in lsn_hex.split('/'))
+    return (high << 32) + low
--- a/test_runner/fixtures/zenith_fixtures.py
+++ b/test_runner/fixtures/zenith_fixtures.py
--- a/Show More
+++ b/Show More