Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-07 04:30:36 +00:00

Compare commits: buffered_r... to layered-ra... (133 commits)
Commits (133, SHA1 only):

f5432ea1ca, 066e3f1c69, 673c297949, e61732ca7c, cb4a8396fb, c77e30116e, 9d369f158c, 6ecd442fb9,
f3f059c1f8, 8388e14bbd, 5293e183c5, 93ff5f7ff0, 41dce68bdd, 7dece8e4a0, 37c85d5fd9, 6094236171,
bb5aba42eb, 450fb9eafe, 557e3024cd, bd34d7ecfc, 0a8c672630, b87ab17d05, d874675955, 5d37560308,
7cec13d1df, b7685eb6ba, c7f3b4e62c, 5bad2deff8, d39608c367, cba4da3f4d, 2669d140f8, f49ad33f1b,
670205e17a, f72d4814b1, 5ecf0664cc, 7cae265447, 5aa969a588, 93cc40584d, 130184fee9, d47f610606,
0650e51f0b, 737a557f09, 6f7ebe6e01, 70ab0d5b1f, 6ac76248cf, b32da3b42e, 0ccfc62e88, b55cf773a8,
43ded1c54b, f8702d4625, 44111e3ba3, 298bc588f9, 4ba521f53f, 431d32756b, 3d172d98a3, 849ac791a6,
de5e6a15ae, 0d6bf14ecb, d1e79c4af3, abb2ac5246, 99dbbe5f18, e7ca8ef5a8, 1ce4976e36, 9300107cdf,
9aaa02bc9a, 5603259c53, ce15c62f35, eaff0cd568, 587935ebed, 07dddfed28, 229dc7704f, 067f2ac814,
865870a8e5, d19263aec8, 6d742719a1, c75bc9b8b0, 33007cc0bb, 987833e0b9, f36acf00de, 956fc3dec9,
b38e841f2d, 3a0111c75e, 086a02ab92, 7ed39655dc, c6172dae47, 4ba783d0af, 0457fe81a9, fb524dd973,
8c6d2664c0, cdbbd15eb9, 85f8bf97f5, 83ed930bc2, 071e30cc53, e6ef27637b, b532470792, e0d7ecf91c,
edba2e9744, 7e552b645f, ea5900f155, 28ab40c8b7, d423142623, 1c0e85f9a0, 5bc09074ea, 1fac4a3c91,
1bc917324d, af429fb401, 710fe02d0b, de87aad990, 41d48719e1, d88377f9f0, ecd577c934, f43f8401ee,
1877bbc7cb, a064ebb64c, 4726870e8d, 3bbc106c70, 66eb081876, f291ab2b87, 66ec135676, 28af3e5008,
f337d73a6c, 57ce541521, e14f24034f, 04fb0a0342, 8c42dcc041, 9070a4dc02, 86a28458c6, c058d04250,
c310932121, ff563ff080, 7f9d2a7d05, 13f4e173c9, 85116a8375

.circleci/config.yml
@@ -1,20 +1,19 @@
|
||||
version: 2.1
|
||||
|
||||
orbs:
|
||||
python: circleci/python@1.4.0
|
||||
|
||||
executors:
|
||||
zenith-build-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
- image: cimg/rust:1.55.0
|
||||
- image: cimg/rust:1.56.1
|
||||
zenith-python-executor:
|
||||
docker:
|
||||
- image: cimg/python:3.7.10 # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CircleCI
|
||||
|
||||
jobs:
|
||||
check-codestyle:
|
||||
check-codestyle-rust:
|
||||
executor: zenith-build-executor
|
||||
steps:
|
||||
- checkout
|
||||
|
||||
- run:
|
||||
name: rustfmt
|
||||
when: always
|
||||
@@ -81,6 +80,8 @@ jobs:
|
||||
build_type:
|
||||
type: enum
|
||||
enum: ["debug", "release"]
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
- run:
|
||||
name: apt install dependencies
|
||||
@@ -116,16 +117,17 @@ jobs:
|
||||
- run:
|
||||
name: Rust build << parameters.build_type >>
|
||||
command: |
|
||||
export CARGO_INCREMENTAL=0
|
||||
BUILD_TYPE="<< parameters.build_type >>"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
echo "Build in debug mode"
|
||||
cargo build --bins --tests
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
echo "Build in release mode"
|
||||
cargo build --release --bins --tests
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS=--release
|
||||
fi
|
||||
|
||||
export CARGO_INCREMENTAL=0
|
||||
"${cov_prefix[@]}" cargo build $CARGO_FLAGS --bins --tests
|
||||
|
||||
- save_cache:
|
||||
name: Save rust cache
|
||||
key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
@@ -138,68 +140,100 @@ jobs:
|
||||
# has to run separately from cargo fmt section
|
||||
# since needs to run with dependencies
|
||||
- run:
|
||||
name: clippy
|
||||
name: cargo clippy
|
||||
command: |
|
||||
./run_clippy.sh
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" ./run_clippy.sh
|
||||
|
||||
# Run rust unit tests
|
||||
- run: cargo test
|
||||
- run:
|
||||
name: cargo test
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" cargo test
|
||||
|
||||
# Install the rust binaries, for use by test jobs
|
||||
# `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
|
||||
# FIXME: this is a really silly way to install; maybe we should just output
|
||||
# a tarball as an artifact? Or a .deb package?
|
||||
- run:
|
||||
name: cargo install
|
||||
name: Install rust binaries
|
||||
command: |
|
||||
export CARGO_INCREMENTAL=0
|
||||
BUILD_TYPE="<< parameters.build_type >>"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
echo "Install debug mode"
|
||||
CARGO_FLAGS="--debug"
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
echo "Install release mode"
|
||||
# The default is release mode; there is no --release flag.
|
||||
CARGO_FLAGS=""
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
binaries=$(
|
||||
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
|
||||
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
||||
)
|
||||
|
||||
test_exe_paths=$(
|
||||
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
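The two pipelines above extract names from cargo's JSON output; roughly, they behave like this (the example outputs are illustrative, not from an actual run):

```sh
# Binary targets of the workspace: jq keeps targets whose "kind" contains "bin".
cargo metadata --format-version=1 --no-deps \
  | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
# e.g. pageserver, zenith, and the other workspace binaries, one per line

# Compiled test executables: cargo emits one JSON message per artifact, and
# test binaries carry a non-null "executable" path.
cargo test --message-format=json --no-run \
  | jq -r '.executable | select(. != null)'
# e.g. target/debug/deps/pageserver-<hash>, one per line
```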
|
||||
|
||||
mkdir -p /tmp/zenith/bin
|
||||
mkdir -p /tmp/zenith/test_bin
|
||||
mkdir -p /tmp/zenith/etc
|
||||
|
||||
# Install target binaries
|
||||
for bin in $binaries; do
|
||||
SRC=target/$BUILD_TYPE/$bin
|
||||
DST=/tmp/zenith/bin/$bin
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
||||
done
|
||||
|
||||
# Install test executables (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
for bin in $test_exe_paths; do
|
||||
SRC=$bin
|
||||
DST=/tmp/zenith/test_bin/$(basename $bin)
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
||||
done
|
||||
fi
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith
|
||||
|
||||
# Install the postgres binaries, for use by test jobs
|
||||
# FIXME: this is a silly way to do "install"; maybe just output a standard
|
||||
# postgres package, whatever the favored form is (tarball? .deb package?)
|
||||
# Note that pg_regress needs some build artifacts that probably aren't
|
||||
# in the usual package...?
|
||||
- run:
|
||||
name: postgres install
|
||||
name: Install postgres binaries
|
||||
command: |
|
||||
cp -a tmp_install /tmp/zenith/pg_install
|
||||
|
||||
# Save the rust output binaries for other jobs in this workflow.
|
||||
# Save the rust binaries and coverage data for other jobs in this workflow.
|
||||
- persist_to_workspace:
|
||||
root: /tmp/zenith
|
||||
paths:
|
||||
- "*"
|
||||
|
||||
check-python:
|
||||
executor: python/default
|
||||
check-codestyle-python:
|
||||
executor: zenith-python-executor
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Install pipenv & deps
|
||||
working_directory: test_runner
|
||||
command: |
|
||||
pip install pipenv
|
||||
pipenv install --dev
|
||||
name: Install deps
|
||||
command: pipenv --python 3.7 install --dev
|
||||
- run:
|
||||
name: Run yapf to ensure code format
|
||||
working_directory: test_runner
|
||||
when: always
|
||||
command: pipenv run yapf --recursive --diff .
|
||||
- run:
|
||||
name: Run mypy to check types
|
||||
when: always
|
||||
command: pipenv run mypy .
|
||||
|
||||
run-pytest:
|
||||
#description: "Run pytest"
|
||||
executor: python/default
|
||||
executor: zenith-python-executor
|
||||
parameters:
|
||||
# pytest args to specify the tests to run.
|
||||
#
|
||||
@@ -225,6 +259,11 @@ jobs:
|
||||
run_in_parallel:
|
||||
type: boolean
|
||||
default: true
|
||||
save_perf_report:
|
||||
type: boolean
|
||||
default: false
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
@@ -234,20 +273,26 @@ jobs:
|
||||
steps:
|
||||
- run: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Install pipenv & deps
|
||||
working_directory: test_runner
|
||||
command: |
|
||||
pip install pipenv
|
||||
pipenv install
|
||||
name: Install deps
|
||||
command: pipenv --python 3.7 install
|
||||
- run:
|
||||
name: Run pytest
|
||||
working_directory: test_runner
|
||||
# pytest doesn't output test logs in real time, so CI job may fail with
|
||||
# `Too long with no output` error, if a test is running for a long time.
|
||||
# In that case, tests should have internal timeouts that are less than
|
||||
# no_output_timeout, specified here.
|
||||
no_output_timeout: 10m
|
||||
environment:
|
||||
- ZENITH_BIN: /tmp/zenith/bin
|
||||
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
|
||||
- TEST_OUTPUT: /tmp/test_output
|
||||
# this variable will be embedded in perf test report
|
||||
# and is needed to distinguish different environments
|
||||
- PLATFORM: zenith-local-ci
|
||||
command: |
|
||||
TEST_SELECTION="<< parameters.test_selection >>"
|
||||
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
|
||||
|
||||
TEST_SELECTION="test_runner/<< parameters.test_selection >>"
|
||||
EXTRA_PARAMS="<< parameters.extra_params >>"
|
||||
if [ -z "$TEST_SELECTION" ]; then
|
||||
echo "test_selection must be set"
|
||||
@@ -255,7 +300,22 @@ jobs:
|
||||
fi
|
||||
if << parameters.run_in_parallel >>; then
|
||||
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
|
||||
fi;
|
||||
fi
|
||||
if << parameters.save_perf_report >>; then
|
||||
if [[ $CIRCLE_BRANCH == "main" ]]; then
|
||||
mkdir -p "$PERF_REPORT_DIR"
|
||||
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
|
||||
fi
|
||||
fi
|
||||
|
||||
export GITHUB_SHA=$CIRCLE_SHA1
|
||||
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
# Run the tests.
|
||||
#
|
||||
# The junit.xml file allows CircleCI to display more fine-grained test information
|
||||
@@ -266,7 +326,21 @@ jobs:
|
||||
# -n4 uses four processes to run tests via pytest-xdist
|
||||
# -s (which would stop pytest from capturing output) is not used, because tests run
# in parallel and their logs would be mixed between different tests
|
||||
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
"${cov_prefix[@]}" pipenv run pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
-m "not remote_cluster" \
|
||||
-rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
|
||||
if << parameters.save_perf_report >>; then
|
||||
if [[ $CIRCLE_BRANCH == "main" ]]; then
|
||||
# TODO: reuse scripts/git-upload
|
||||
export REPORT_FROM="$PERF_REPORT_DIR"
|
||||
export REPORT_TO=local
|
||||
scripts/generate_and_push_perf_report.sh
|
||||
fi
|
||||
fi
|
||||
- run:
|
||||
# CircleCI artifacts are preserved one file at a time, so skipping
|
||||
# this step isn't a good idea. If you want to extract the
|
||||
@@ -282,6 +356,65 @@ jobs:
|
||||
# The store_test_results step tells CircleCI where to find the junit.xml file.
|
||||
- store_test_results:
|
||||
path: /tmp/test_output
|
||||
# Save coverage data (if any)
|
||||
- persist_to_workspace:
|
||||
root: /tmp/zenith
|
||||
paths:
|
||||
- "*"
|
||||
|
||||
coverage-report:
|
||||
executor: zenith-build-executor
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
- checkout
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
keys:
|
||||
# Require an exact match. While an out of date cache might speed up the build,
|
||||
# there's no way to clean out old packages, so the cache grows every time something
|
||||
# changes.
|
||||
- v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
|
||||
- run:
|
||||
name: Install llvm-tools
|
||||
command: |
|
||||
# TODO: install a proper symbol demangler, e.g. rustfilt
|
||||
# TODO: we should embed this into a docker image
|
||||
rustup component add llvm-tools-preview
|
||||
- run:
|
||||
name: Build coverage report
|
||||
command: |
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/coverage \
|
||||
--dir=/tmp/zenith/coverage report \
|
||||
--input-objects=/tmp/zenith/etc/binaries.list \
|
||||
--commit-url=$COMMIT_URL \
|
||||
--format=github
|
||||
- run:
|
||||
name: Upload coverage report
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/git-upload \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \
|
||||
--message="Add code coverage for $COMMIT_URL" \
|
||||
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
|
||||
|
||||
# Add link to the coverage report to the commit
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "$CI_ACCESS_TOKEN" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"success\",
|
||||
\"context\": \"zenith-coverage\",
|
||||
\"description\": \"Coverage report is ready\",
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
|
||||
# Build zenithdb/zenith:latest image and push it to Docker hub
|
||||
docker-image:
|
||||
@@ -298,7 +431,7 @@ jobs:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
|
||||
docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
|
||||
|
||||
# Trigger a new remote CI job
|
||||
remote-ci-trigger:
|
||||
@@ -347,8 +480,8 @@ jobs:
|
||||
workflows:
|
||||
build_and_test:
|
||||
jobs:
|
||||
- check-codestyle
|
||||
- check-python
|
||||
- check-codestyle-rust
|
||||
- check-codestyle-python
|
||||
- build-postgres:
|
||||
name: build-postgres-<< matrix.build_type >>
|
||||
matrix:
|
||||
@@ -383,8 +516,15 @@ workflows:
|
||||
build_type: release
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
requires:
|
||||
- build-zenith-release
|
||||
- coverage-report:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
requires:
|
||||
# TODO: consider adding more
|
||||
- other-tests-debug
|
||||
- docker-image:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
|
||||
.github/workflows/benchmarking.yml (vendored, new file, 114 lines)
@@ -0,0 +1,114 @@
|
||||
name: benchmarking
|
||||
|
||||
on:
|
||||
# uncomment to run on push for debugging your PR
|
||||
# push:
|
||||
# branches: [ mybranch ]
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '36 7 * * *' # run once a day; timezone is UTC
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
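Besides the daily cron trigger, `workflow_dispatch` lets the run be started by hand, for example with the GitHub CLI (repository name shown for illustration):

```sh
# Assumes the GitHub CLI (gh) is installed and authenticated.
gh workflow run benchmarking.yml --repo zenithdb/zenith
```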
|
||||
|
||||
env:
|
||||
BASE_URL: "https://console.zenith.tech"
|
||||
|
||||
jobs:
|
||||
bench:
|
||||
# This workflow runs on a self-hosted runner.
# Its environment is quite different from the usual GitHub-hosted runner.
# Probably the most important difference is that it does not start from a clean workspace each time:
# e.g. if you install system packages, they are not cleaned up, since you install them
# directly on the host machine rather than in a container.
# See the documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
|
||||
runs-on: [self-hosted, zenith-benchmarker]
|
||||
|
||||
env:
|
||||
PG_BIN: "/usr/pgsql-13/bin"
|
||||
|
||||
steps:
|
||||
- name: Checkout zenith repo
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# actions/setup-python@v2 is not working correctly on self-hosted runners
|
||||
# see https://github.com/actions/setup-python/issues/162
|
||||
# and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
|
||||
# so the simplest solution is to use the already-installed system Python and spin up virtualenvs for job runs.
# Python 3.7.10 is already installed on the machine, so use it to install pipenv and then use pipenv's virtualenvs.
|
||||
- name: Install pipenv & deps
|
||||
run: |
|
||||
python3 -m pip install --upgrade pipenv wheel
|
||||
# since pip/pipenv caches are reused, there shouldn't be any trouble with installing every time
|
||||
pipenv install
|
||||
|
||||
- name: Show versions
|
||||
run: |
|
||||
echo Python
|
||||
python3 --version
|
||||
pipenv run python3 --version
|
||||
echo Pipenv
|
||||
pipenv --version
|
||||
echo Pgbench
|
||||
$PG_BIN/pgbench --version
|
||||
|
||||
# FIXME: cluster setup is skipped due to various changes in the console API;
# for now a pre-created cluster is used. When the API regains some stability
# after the massive changes, dynamic cluster setup will be revived.
# So use the pre-created cluster. It needs to be started manually, but it stops
# automatically after 5 minutes of inactivity.
|
||||
- name: Setup cluster
|
||||
env:
|
||||
BENCHMARK_CONSOLE_USER_PASSWORD: "${{ secrets.BENCHMARK_CONSOLE_USER_PASSWORD }}"
|
||||
BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
|
||||
BENCHMARK_CLUSTER_ID: "${{ secrets.BENCHMARK_CLUSTER_ID }}"
|
||||
shell: bash
|
||||
run: |
|
||||
set -e
|
||||
|
||||
echo "Starting cluster"
|
||||
CLUSTER=$(curl -s --fail --show-error -X POST $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID/start \
|
||||
-H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
|
||||
echo $CLUSTER | python -m json.tool
|
||||
|
||||
echo "Waiting for cluster to become ready"
|
||||
sleep 10
|
||||
|
||||
echo "CLUSTER_ID=$BENCHMARK_CLUSTER_ID" >> $GITHUB_ENV
|
||||
CLUSTER=$(curl -s --fail --show-error -X GET $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID.json \
|
||||
-H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
|
||||
echo $CLUSTER | python -m json.tool
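Appending to `$GITHUB_ENV` makes `CLUSTER_ID` available to later steps of the same job as an ordinary environment variable; a hypothetical follow-up step could use it like this:

```sh
# Hypothetical later step in the same job.
echo "Benchmarking against cluster $CLUSTER_ID"
```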
|
||||
|
||||
- name: Run benchmark
|
||||
# pgbench is installed system-wide from the official repo
|
||||
# https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
|
||||
# via
|
||||
# sudo tee /etc/yum.repos.d/pgdg.repo<<EOF
|
||||
# [pgdg13]
|
||||
# name=PostgreSQL 13 for RHEL/CentOS 7 - x86_64
|
||||
# baseurl=https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
|
||||
# enabled=1
|
||||
# gpgcheck=0
|
||||
# EOF
|
||||
# sudo yum makecache
|
||||
# sudo yum install postgresql13-contrib
|
||||
# actual binaries are located in /usr/pgsql-13/bin/
|
||||
env:
|
||||
TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,15"
|
||||
PLATFORM: "zenith-staging"
|
||||
BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
|
||||
run: |
|
||||
mkdir -p perf-report-staging
|
||||
pipenv run pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
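The `TEST_PG_BENCH_*` matrices above are consumed by the pytest harness; very roughly, each scale/transaction combination amounts to a pgbench initialize-and-run against the remote cluster, along these lines (the exact flags live in the harness and are not shown here):

```sh
# Rough sketch of one matrix combination; BENCHMARK_CONNSTR comes from the
# repository secrets.
/usr/pgsql-13/bin/pgbench -i -s 10 "$BENCHMARK_CONNSTR"   # initialize at scale factor 10
/usr/pgsql-13/bin/pgbench -t 5000 "$BENCHMARK_CONNSTR"    # run 5000 transactions per client
```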
|
||||
|
||||
- name: Submit result
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
run: |
|
||||
REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
|
||||
.gitignore (vendored, 4 changed lines)
@@ -7,3 +7,7 @@ test_output/
|
||||
.vscode
|
||||
/.zenith
|
||||
/integration_tests/.zenith
|
||||
|
||||
# Coverage
|
||||
*.profraw
|
||||
*.profdata
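These patterns cover the intermediate files of LLVM source-based coverage produced via the `scripts/coverage` wrapper; outside that wrapper, raw profiles are typically merged and reported roughly like this (the binary path is illustrative, and it assumes `llvm-profdata`/`llvm-cov` from the llvm-tools-preview component are on `PATH`):

```sh
# Merge raw profiles from instrumented runs, then print a coverage summary.
llvm-profdata merge -sparse *.profraw -o coverage.profdata
llvm-cov report --instr-profile=coverage.profdata target/debug/pageserver
```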
|
||||
|
||||
.yapfignore (new file, 10 lines)
@@ -0,0 +1,10 @@
|
||||
# This file is only read when `yapf` is run from this directory.
|
||||
# Hence we only list top-level directories here to avoid confusion.
|
||||
# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43
|
||||
vendor/
|
||||
target/
|
||||
tmp_install/
|
||||
__pycache__/
|
||||
test_output/
|
||||
.zenith/
|
||||
.git/
|
||||
Cargo.lock (generated, 822 changed lines): file diff suppressed because it is too large.

Cargo.toml
@@ -15,7 +15,3 @@ members = [
|
||||
# This is useful for profiling and, to some extent, debug.
|
||||
# Besides, debug info should not affect the performance.
|
||||
debug = true
|
||||
panic = 'abort'
|
||||
|
||||
[profile.dev]
|
||||
panic = 'abort'
|
||||

Dockerfile
@@ -21,11 +21,15 @@ RUN rm -rf postgres_install/build
|
||||
# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
|
||||
#
|
||||
FROM zenithdb/build:buster AS build
|
||||
|
||||
ARG GIT_VERSION
|
||||
RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi
|
||||
|
||||
WORKDIR /zenith
|
||||
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
|
||||
COPY . .
|
||||
RUN cargo build --release
|
||||
RUN GIT_VERSION=$GIT_VERSION cargo build --release
|
||||
|
||||
#
|
||||
# Copy binaries to resulting image.
|
||||
|
||||
Pipfile (new file, 30 lines)
@@ -0,0 +1,30 @@
|
||||
[[source]]
|
||||
url = "https://pypi.python.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
pytest = ">=6.0.0"
|
||||
typing-extensions = "*"
|
||||
pyjwt = {extras = ["crypto"], version = "*"}
|
||||
requests = "*"
|
||||
pytest-xdist = "*"
|
||||
asyncpg = "*"
|
||||
cached-property = "*"
|
||||
psycopg2-binary = "*"
|
||||
jinja2 = "*"
|
||||
|
||||
[dev-packages]
|
||||
# Behavior may change slightly between versions. These are run continuously,
|
||||
# so we pin exact versions to avoid surprising breaks. Update if comfortable.
|
||||
yapf = "==0.31.0"
|
||||
mypy = "==0.910"
|
||||
# Non-pinned packages follow.
|
||||
pipenv = "*"
|
||||
flake8 = "*"
|
||||
types-requests = "*"
|
||||
types-psycopg2 = "*"
|
||||
|
||||
[requires]
|
||||
# we need at least 3.7, but pipenv doesn't let us say that directly
|
||||
python_version = "3"
|
||||
Pipfile.lock (generated, 1 changed line)
@@ -1 +0,0 @@
|
||||
./test_runner/Pipfile.lock
|
||||
Pipfile.lock (generated, new file, 652 lines)
@@ -0,0 +1,652 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"asyncpg": {
|
||||
"hashes": [
|
||||
"sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
|
||||
"sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
|
||||
"sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
|
||||
"sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
|
||||
"sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
|
||||
"sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
|
||||
"sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
|
||||
"sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
|
||||
"sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
|
||||
"sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
|
||||
"sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
|
||||
"sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
|
||||
"sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.24.0"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
|
||||
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"cached-property": {
|
||||
"hashes": [
|
||||
"sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
|
||||
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.5.2"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
|
||||
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
|
||||
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
|
||||
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
|
||||
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
|
||||
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
|
||||
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
|
||||
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
|
||||
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
|
||||
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
|
||||
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
|
||||
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
|
||||
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
|
||||
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
|
||||
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
|
||||
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
|
||||
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
|
||||
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
|
||||
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
|
||||
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
|
||||
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
|
||||
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
|
||||
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
|
||||
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
|
||||
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
|
||||
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
|
||||
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
|
||||
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
|
||||
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
|
||||
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
|
||||
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
|
||||
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
|
||||
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
|
||||
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
|
||||
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
|
||||
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
|
||||
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
|
||||
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
|
||||
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
|
||||
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
|
||||
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
|
||||
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
|
||||
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
|
||||
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
|
||||
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
|
||||
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
|
||||
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
|
||||
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
|
||||
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
|
||||
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
|
||||
],
|
||||
"version": "==1.15.0"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
|
||||
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.7"
|
||||
},
|
||||
"cryptography": {
|
||||
"hashes": [
|
||||
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
|
||||
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
|
||||
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
|
||||
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
|
||||
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
|
||||
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
|
||||
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
|
||||
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
|
||||
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
|
||||
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
|
||||
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
|
||||
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
|
||||
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
|
||||
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
|
||||
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
|
||||
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
|
||||
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
|
||||
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
|
||||
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
|
||||
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
|
||||
],
|
||||
"version": "==35.0.0"
|
||||
},
|
||||
"execnet": {
|
||||
"hashes": [
|
||||
"sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
|
||||
"sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.9.0"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
|
||||
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.8.1"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"jinja2": {
|
||||
"hashes": [
|
||||
"sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
|
||||
"sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.0.2"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
"sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
|
||||
"sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64",
|
||||
"sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b",
|
||||
"sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194",
|
||||
"sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567",
|
||||
"sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff",
|
||||
"sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724",
|
||||
"sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74",
|
||||
"sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646",
|
||||
"sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35",
|
||||
"sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6",
|
||||
"sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a",
|
||||
"sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6",
|
||||
"sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad",
|
||||
"sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26",
|
||||
"sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38",
|
||||
"sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac",
|
||||
"sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7",
|
||||
"sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6",
|
||||
"sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047",
|
||||
"sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75",
|
||||
"sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f",
|
||||
"sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b",
|
||||
"sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135",
|
||||
"sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8",
|
||||
"sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a",
|
||||
"sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a",
|
||||
"sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1",
|
||||
"sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9",
|
||||
"sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864",
|
||||
"sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914",
|
||||
"sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee",
|
||||
"sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f",
|
||||
"sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18",
|
||||
"sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8",
|
||||
"sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2",
|
||||
"sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d",
|
||||
"sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b",
|
||||
"sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b",
|
||||
"sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86",
|
||||
"sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6",
|
||||
"sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f",
|
||||
"sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb",
|
||||
"sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833",
|
||||
"sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28",
|
||||
"sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e",
|
||||
"sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415",
|
||||
"sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902",
|
||||
"sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f",
|
||||
"sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d",
|
||||
"sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9",
|
||||
"sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d",
|
||||
"sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145",
|
||||
"sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066",
|
||||
"sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c",
|
||||
"sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1",
|
||||
"sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a",
|
||||
"sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207",
|
||||
"sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f",
|
||||
"sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53",
|
||||
"sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd",
|
||||
"sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134",
|
||||
"sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85",
|
||||
"sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9",
|
||||
"sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5",
|
||||
"sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94",
|
||||
"sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509",
|
||||
"sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51",
|
||||
"sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966",
|
||||
"sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==21.2"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"psycopg2-binary": {
|
||||
"hashes": [
|
||||
"sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975",
|
||||
"sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd",
|
||||
"sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616",
|
||||
"sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2",
|
||||
"sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90",
|
||||
"sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a",
|
||||
"sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e",
|
||||
"sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d",
|
||||
"sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f",
|
||||
"sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed",
|
||||
"sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a",
|
||||
"sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140",
|
||||
"sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32",
|
||||
"sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759",
|
||||
"sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31",
|
||||
"sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e",
|
||||
"sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a",
|
||||
"sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c",
|
||||
"sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917",
|
||||
"sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf",
|
||||
"sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7",
|
||||
"sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0",
|
||||
"sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72",
|
||||
"sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698",
|
||||
"sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773",
|
||||
"sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68",
|
||||
"sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76",
|
||||
"sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4",
|
||||
"sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f",
|
||||
"sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a",
|
||||
"sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34",
|
||||
"sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce",
|
||||
"sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a",
|
||||
"sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
|
||||
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.10.0"
|
||||
},
|
||||
"pycparser": {
|
||||
"hashes": [
|
||||
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
|
||||
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.20"
|
||||
},
|
||||
"pyjwt": {
|
||||
"extras": [
|
||||
"crypto"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
|
||||
"sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.3.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
|
||||
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.7"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
|
||||
"sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==6.2.5"
|
||||
},
|
||||
"pytest-forked": {
|
||||
"hashes": [
|
||||
"sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
|
||||
"sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"pytest-xdist": {
|
||||
"hashes": [
|
||||
"sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
|
||||
"sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
|
||||
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.26.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
|
||||
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.7"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
|
||||
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.6.0"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"backports.entry-points-selectable": {
|
||||
"hashes": [
|
||||
"sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
|
||||
"sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
|
||||
],
|
||||
"markers": "python_version >= '2.7'",
|
||||
"version": "==1.1.0"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"distlib": {
|
||||
"hashes": [
|
||||
"sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
|
||||
"sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
|
||||
],
|
||||
"version": "==0.3.3"
|
||||
},
|
||||
"filelock": {
|
||||
"hashes": [
|
||||
"sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
|
||||
"sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.3.2"
|
||||
},
|
||||
"flake8": {
|
||||
"hashes": [
|
||||
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
|
||||
"sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.0.1"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
|
||||
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.8.1"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
|
||||
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"mypy": {
|
||||
"hashes": [
|
||||
"sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
|
||||
"sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
|
||||
"sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
|
||||
"sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
|
||||
"sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
|
||||
"sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
|
||||
"sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
|
||||
"sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
|
||||
"sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
|
||||
"sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
|
||||
"sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
|
||||
"sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
|
||||
"sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
|
||||
"sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
|
||||
"sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
|
||||
"sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
|
||||
"sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
|
||||
"sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
|
||||
"sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
|
||||
"sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
|
||||
"sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
|
||||
"sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
|
||||
"sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.910"
|
||||
},
|
||||
"mypy-extensions": {
|
||||
"hashes": [
|
||||
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
|
||||
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
|
||||
],
|
||||
"version": "==0.4.3"
|
||||
},
|
||||
"pipenv": {
|
||||
"hashes": [
|
||||
"sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
|
||||
"sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2021.5.29"
|
||||
},
|
||||
"platformdirs": {
|
||||
"hashes": [
|
||||
"sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
|
||||
"sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"pycodestyle": {
|
||||
"hashes": [
|
||||
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
|
||||
"sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==2.8.0"
|
||||
},
|
||||
"pyflakes": {
|
||||
"hashes": [
|
||||
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
|
||||
"sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.16.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typed-ast": {
|
||||
"hashes": [
|
||||
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
|
||||
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
|
||||
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
|
||||
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
|
||||
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
|
||||
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
|
||||
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
|
||||
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
|
||||
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
|
||||
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
|
||||
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
|
||||
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
|
||||
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
|
||||
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
|
||||
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
|
||||
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
|
||||
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
|
||||
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
|
||||
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
|
||||
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
|
||||
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
|
||||
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
|
||||
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
|
||||
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
|
||||
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
|
||||
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
|
||||
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
|
||||
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
|
||||
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
|
||||
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==1.4.3"
|
||||
},
|
||||
"types-psycopg2": {
|
||||
"hashes": [
|
||||
"sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
|
||||
"sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"types-requests": {
|
||||
"hashes": [
|
||||
"sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
|
||||
"sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.25.11"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"virtualenv": {
|
||||
"hashes": [
|
||||
"sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814",
|
||||
"sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==20.10.0"
|
||||
},
|
||||
"virtualenv-clone": {
|
||||
"hashes": [
|
||||
"sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
|
||||
"sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.5.7"
|
||||
},
|
||||
"yapf": {
|
||||
"hashes": [
|
||||
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
|
||||
"sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.31.0"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
|
||||
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.6.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
README.md (30 lines changed)
@@ -1,6 +1,6 @@
# Zenith

Zenith substitutes PostgreSQL storage layer and redistributes data across a cluster of nodes
Zenith is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute, and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.

## Architecture overview

@@ -32,8 +32,8 @@ libssl-dev clang pkg-config libpq-dev

To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.

To run the integration tests (not required to use the code), install
Python (3.6 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.7 or higher), and install python3 packages using `pipenv install` in the project directory.

2. Build zenith and patched postgres
```sh
@@ -47,17 +47,26 @@ make -j5

# Create repository in .zenith with proper paths to binaries and data
# Later that would be the responsibility of a package install script
> ./target/debug/zenith init
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
created main branch
pageserver init succeeded

# start pageserver
# start pageserver and safekeeper
> ./target/debug/zenith start
Starting pageserver at '127.0.0.1:64000' in .zenith
Starting pageserver at 'localhost:64000' in '.zenith'
Pageserver started
initializing for single for 7676
Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
Safekeeper started

# start postgres on top of the pageserver
# start postgres compute node
> ./target/debug/zenith pg start main
Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
Starting new postgres main on main...
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
waiting for server to start.... done
server started

# check list of running postgres instances
> ./target/debug/zenith pg list
@@ -108,10 +117,9 @@ postgres=# insert into t values(2,2);
INSERT 0 1
```

6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started:
6. If you want to run tests afterwards (see below), you have to stop all the running pageserver, safekeeper and postgres instances
you have just started. You can stop them all with one command:
```sh
> ./target/debug/zenith pg stop migration_check
> ./target/debug/zenith pg stop main
> ./target/debug/zenith stop
```

@@ -121,7 +129,7 @@ INSERT 0 1
git clone --recursive https://github.com/zenithdb/zenith.git
make # builds also postgres and installs it to ./tmp_install
cd test_runner
pytest
pipenv run pytest
```

## Documentation

@@ -7,24 +7,18 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
rand = "0.8.3"
tar = "0.4.33"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
toml = "0.5"
lazy_static = "1.4"
regex = "1"
anyhow = "1.0"
thiserror = "1"
bytes = "1.0.1"
nix = "0.20"
nix = "0.23"
url = "2.2.2"
hex = { version = "0.4.3", features = ["serde"] }
reqwest = { version = "0.11", features = ["blocking", "json"] }
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }

pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }
postgres_ffi = { path = "../postgres_ffi" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }

control_plane/safekeepers.conf (new file, 20 lines)
@@ -0,0 +1,20 @@
# Page server and three safekeepers.
[pageserver]
pg_port = 64000
http_port = 9898
auth_type = 'Trust'

[[safekeepers]]
name = 'sk1'
pg_port = 5454
http_port = 7676

[[safekeepers]]
name = 'sk2'
pg_port = 5455
http_port = 7677

[[safekeepers]]
name = 'sk3'
pg_port = 5456
http_port = 7678
control_plane/simple.conf (new file, 11 lines)
@@ -0,0 +1,11 @@
# Minimal zenith environment with one safekeeper. This is equivalent to the built-in
# defaults that you get with no --config
[pageserver]
pg_port = 64000
http_port = 9898
auth_type = 'Trust'

[[safekeepers]]
name = 'single'
pg_port = 5454
http_port = 7676
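These files are plain TOML that the `zenith` CLI reads with `serde` + `toml` into the `LocalEnv`, `PageServerConf` and `SafekeeperConf` structs shown later in this diff. As a rough, self-contained sketch only (the struct and field names below are simplified stand-ins, not the real control_plane types), parsing a file like `simple.conf` could look like this:

```rust
use serde::Deserialize;

// Hypothetical, simplified mirror of the config layout above; the real types
// live in control_plane/src/local_env.rs.
#[derive(Deserialize, Debug)]
struct PageserverSection {
    pg_port: u16,
    http_port: u16,
    auth_type: String, // the real code uses an AuthType enum here
}

#[derive(Deserialize, Debug)]
struct SafekeeperSection {
    name: String,
    pg_port: u16,
    http_port: u16,
}

#[derive(Deserialize, Debug)]
struct CliConfig {
    pageserver: PageserverSection,
    #[serde(default)]
    safekeepers: Vec<SafekeeperSection>, // empty when no [[safekeepers]] tables are given
}

fn main() -> anyhow::Result<()> {
    let text = std::fs::read_to_string("control_plane/simple.conf")?;
    let cfg: CliConfig = toml::from_str(&text)?;
    println!(
        "pageserver on pg port {}, {} safekeeper(s)",
        cfg.pageserver.pg_port,
        cfg.safekeepers.len()
    );
    Ok(())
}
```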
@@ -39,8 +39,6 @@ impl ComputeControlPlane {
|
||||
// | |- <tenant_id>
|
||||
// | | |- <branch name>
|
||||
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
|
||||
// TODO: since pageserver do not have config file yet we believe here that
|
||||
// it is running on default port. Change that when pageserver will have config.
|
||||
let pageserver = Arc::new(PageServerNode::from_env(&env));
|
||||
|
||||
let mut nodes = BTreeMap::default();
|
||||
@@ -75,15 +73,6 @@ impl ComputeControlPlane {
|
||||
.unwrap_or(self.base_port)
|
||||
}
|
||||
|
||||
pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
base_port: 65431,
|
||||
pageserver: Arc::clone(pageserver),
|
||||
nodes: BTreeMap::new(),
|
||||
env: local_env.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: see also parse_point_in_time in branches.rs.
|
||||
fn parse_point_in_time(
|
||||
&self,
|
||||
@@ -136,7 +125,7 @@ impl ComputeControlPlane {
|
||||
});
|
||||
|
||||
node.create_pgdata()?;
|
||||
node.setup_pg_conf(self.env.auth_type)?;
|
||||
node.setup_pg_conf(self.env.pageserver.auth_type)?;
|
||||
|
||||
self.nodes
|
||||
.insert((tenantid, node.name.clone()), Arc::clone(&node));
|
||||
@@ -210,17 +199,24 @@ impl PostgresNode {
|
||||
})
|
||||
}
|
||||
|
||||
fn sync_walkeepers(&self) -> Result<Lsn> {
|
||||
fn sync_safekeepers(&self, auth_token: &Option<String>) -> Result<Lsn> {
|
||||
let pg_path = self.env.pg_bin_dir().join("postgres");
|
||||
let sync_handle = Command::new(pg_path)
|
||||
.arg("--sync-safekeepers")
|
||||
let mut cmd = Command::new(&pg_path);
|
||||
|
||||
cmd.arg("--sync-safekeepers")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGDATA", self.pgdata().to_str().unwrap())
|
||||
.stdout(Stdio::piped())
|
||||
// Comment this to avoid capturing stderr (useful if command hangs)
|
||||
.stderr(Stdio::piped())
|
||||
.stderr(Stdio::piped());
|
||||
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
}
|
||||
|
||||
let sync_handle = cmd
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
@@ -235,7 +231,7 @@ impl PostgresNode {
|
||||
}
|
||||
|
||||
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
|
||||
println!("Walkeepers synced on {}", lsn);
|
||||
println!("Safekeepers synced on {}", lsn);
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
@@ -298,11 +294,14 @@ impl PostgresNode {
|
||||
conf.append("max_replication_slots", "10");
|
||||
conf.append("hot_standby", "on");
|
||||
conf.append("shared_buffers", "1MB");
|
||||
conf.append("max_wal_size", "100GB");
|
||||
conf.append("fsync", "off");
|
||||
conf.append("max_connections", "100");
|
||||
conf.append("wal_sender_timeout", "0");
|
||||
conf.append("wal_level", "replica");
|
||||
// wal_sender_timeout is the maximum time to wait for WAL replication.
|
||||
// It also defines how often the walreceiver will send a feedback message to the wal sender.
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
conf.append("max_replication_flush_lag", "160MB");
|
||||
conf.append("max_replication_apply_lag", "1500MB");
|
||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||
conf.append("port", &self.address.port().to_string());
|
||||
|
||||
@@ -327,8 +326,11 @@ impl PostgresNode {
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
format!("host={} port={} password={}", host, port, password)
|
||||
// NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
|
||||
// Also note that not all parameters are supported here, because in compute we substitute $ZENITH_AUTH_TOKEN:
// we parse this string and build it back with the token from the env var, and for simplicity the rebuild
// uses only the needed variables, namely host, port, user, password.
|
||||
format!("postgresql://no_user:{}@{}:{}", password, host, port)
|
||||
};
|
||||
conf.append("shared_preload_libraries", "zenith");
|
||||
conf.append_line("");
|
||||
@@ -340,9 +342,25 @@ impl PostgresNode {
|
||||
}
|
||||
conf.append_line("");
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg?
|
||||
conf.append("zenith.callmemaybe_connstring", &self.connstr());
|
||||
if !self.env.safekeepers.is_empty() {
|
||||
// Configure the node to connect to the safekeepers
|
||||
conf.append("synchronous_standby_names", "walproposer");
|
||||
|
||||
let wal_acceptors = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("wal_acceptors", &wal_acceptors);
|
||||
} else {
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
// This isn't really a supported configuration, but can be useful for
|
||||
// testing.
|
||||
conf.append("synchronous_standby_names", "pageserver");
|
||||
conf.append("zenith.callmemaybe_connstring", &self.connstr());
|
||||
}
|
||||
|
||||
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
|
||||
file.write_all(conf.to_string().as_bytes())?;
|
||||
@@ -350,7 +368,7 @@ impl PostgresNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_basebackup(&self) -> Result<()> {
|
||||
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
|
||||
let backup_lsn = if let Some(lsn) = self.lsn {
|
||||
Some(lsn)
|
||||
} else if self.uses_wal_proposer {
|
||||
@@ -358,7 +376,7 @@ impl PostgresNode {
|
||||
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
// when things would be more stable (TODO).
|
||||
let lsn = self.sync_walkeepers()?;
|
||||
let lsn = self.sync_safekeepers(auth_token)?;
|
||||
if lsn == Lsn(0) {
|
||||
None
|
||||
} else {
|
||||
@@ -409,7 +427,6 @@ impl PostgresNode {
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
}
|
||||
@@ -443,7 +460,7 @@ impl PostgresNode {
|
||||
fs::write(&postgresql_conf_path, postgresql_conf)?;
|
||||
|
||||
// 3. Load basebackup
|
||||
self.load_basebackup()?;
|
||||
self.load_basebackup(auth_token)?;
|
||||
|
||||
if self.lsn.is_some() {
|
||||
File::create(self.pgdata().join("standby.signal"))?;
|
||||
|
||||
@@ -13,6 +13,7 @@ use std::path::Path;
|
||||
pub mod compute;
|
||||
pub mod local_env;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
pub mod storage;
|
||||
|
||||
/// Read a PID file
|
||||
|
||||
@@ -7,46 +7,102 @@
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::env;
|
||||
use std::fmt::Write;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
|
||||
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::{opt_display_serde, ZTenantId};
|
||||
|
||||
//
|
||||
// This data structures represent deserialized zenith CLI config
|
||||
// These data structures represent the zenith CLI config
|
||||
//
|
||||
// It is deserialized from the .zenith/config file, or the config file passed
|
||||
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
|
||||
// an example.
|
||||
//
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct LocalEnv {
|
||||
// Pageserver connection settings
|
||||
pub pageserver_pg_port: u16,
|
||||
pub pageserver_http_port: u16,
|
||||
|
||||
// Base directory for both pageserver and compute nodes
|
||||
// Base directory for all the nodes (the pageserver, safekeepers and
|
||||
// compute nodes).
|
||||
//
|
||||
// This is not stored in the config file. Rather, this is the path where the
|
||||
// config file itself is. It is read from the ZENITH_REPO_DIR env variable or
|
||||
// '.zenith' if not given.
|
||||
#[serde(skip)]
|
||||
pub base_data_dir: PathBuf,
|
||||
|
||||
// Path to postgres distribution. It's expected that "bin", "include",
|
||||
// "lib", "share" from postgres distribution are there. If at some point
|
||||
// in time we will be able to run against vanilla postgres we may split that
|
||||
// to four separate paths and match OS-specific installation layout.
|
||||
#[serde(default)]
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
// Path to pageserver binary.
|
||||
#[serde(default)]
|
||||
pub zenith_distrib_dir: PathBuf,
|
||||
|
||||
// keeping tenant id in config to reduce copy paste when running zenith locally with single tenant
|
||||
#[serde(with = "hex")]
|
||||
pub tenantid: ZTenantId,
|
||||
// Default tenant ID to use with the 'zenith' command line utility, when
|
||||
// --tenantid is not explicitly specified.
|
||||
#[serde(with = "opt_display_serde")]
|
||||
#[serde(default)]
|
||||
pub default_tenantid: Option<ZTenantId>,
|
||||
|
||||
// jwt auth token used for communication with pageserver
|
||||
pub auth_token: String,
|
||||
// used to issue tokens during e.g pg start
|
||||
#[serde(default)]
|
||||
pub private_key_path: PathBuf,
|
||||
|
||||
pub pageserver: PageServerConf,
|
||||
|
||||
#[serde(default)]
|
||||
pub safekeepers: Vec<SafekeeperConf>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct PageServerConf {
|
||||
// Pageserver connection settings
|
||||
pub pg_port: u16,
|
||||
pub http_port: u16,
|
||||
|
||||
// used to determine which auth type is used
|
||||
pub auth_type: AuthType,
|
||||
|
||||
// used to issue tokens during e.g pg start
|
||||
pub private_key_path: PathBuf,
|
||||
// jwt auth token used for communication with pageserver
|
||||
pub auth_token: String,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
pg_port: 0,
|
||||
http_port: 0,
|
||||
auth_type: AuthType::Trust,
|
||||
auth_token: "".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct SafekeeperConf {
|
||||
pub name: String,
|
||||
pub pg_port: u16,
|
||||
pub http_port: u16,
|
||||
pub sync: bool,
|
||||
}
|
||||
|
||||
impl Default for SafekeeperConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
name: "".to_string(),
|
||||
pg_port: 0,
|
||||
http_port: 0,
|
||||
sync: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
@@ -62,6 +118,10 @@ impl LocalEnv {
|
||||
Ok(self.zenith_distrib_dir.join("pageserver"))
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> Result<PathBuf> {
|
||||
Ok(self.zenith_distrib_dir.join("safekeeper"))
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
self.base_data_dir.join("pgdatadirs").join("tenants")
|
||||
}
|
||||
@@ -76,6 +136,187 @@ impl LocalEnv {
|
||||
pub fn pageserver_data_dir(&self) -> PathBuf {
|
||||
self.base_data_dir.clone()
|
||||
}
|
||||
|
||||
pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf {
|
||||
self.base_data_dir.join("safekeepers").join(node_name)
|
||||
}
|
||||
|
||||
/// Create a LocalEnv from a config file.
|
||||
///
|
||||
/// Unlike 'load_config', this function fills in any defaults that are missing
|
||||
/// from the config file.
|
||||
pub fn create_config(toml: &str) -> Result<LocalEnv> {
|
||||
let mut env: LocalEnv = toml::from_str(toml)?;
|
||||
|
||||
// Find postgres binaries.
|
||||
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
|
||||
if env.pg_distrib_dir == Path::new("") {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
env.pg_distrib_dir = postgres_bin.into();
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
env.pg_distrib_dir = cwd.join("tmp_install")
|
||||
}
|
||||
}
|
||||
if !env.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
// Find zenith binaries.
|
||||
if env.zenith_distrib_dir == Path::new("") {
|
||||
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
}
|
||||
if !env.zenith_distrib_dir.join("pageserver").exists() {
|
||||
anyhow::bail!("Can't find pageserver binary.");
|
||||
}
|
||||
if !env.zenith_distrib_dir.join("safekeeper").exists() {
|
||||
anyhow::bail!("Can't find safekeeper binary.");
|
||||
}
|
||||
|
||||
// If no initial tenant ID was given, generate it.
|
||||
if env.default_tenantid.is_none() {
|
||||
env.default_tenantid = Some(ZTenantId::generate());
|
||||
}
|
||||
|
||||
env.base_data_dir = base_path();
|
||||
|
||||
Ok(env)
|
||||
}
|
||||
|
||||
/// Locate and load config
|
||||
pub fn load_config() -> Result<LocalEnv> {
|
||||
let repopath = base_path();
|
||||
|
||||
if !repopath.exists() {
|
||||
anyhow::bail!(
|
||||
"Zenith config is not found in {}. You need to run 'zenith init' first",
|
||||
repopath.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: check that it looks like a zenith repository
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
let mut env: LocalEnv = toml::from_str(config.as_str())?;
|
||||
|
||||
env.base_data_dir = repopath;
|
||||
|
||||
Ok(env)
|
||||
}
|
||||
|
||||
// this function is used only for testing purposes in the CLI, e.g. to generate tokens during init
|
||||
pub fn generate_auth_token(&self, claims: &Claims) -> Result<String> {
|
||||
let private_key_path = if self.private_key_path.is_absolute() {
|
||||
self.private_key_path.to_path_buf()
|
||||
} else {
|
||||
self.base_data_dir.join(&self.private_key_path)
|
||||
};
|
||||
|
||||
let key_data = fs::read(private_key_path)?;
|
||||
encode_from_key_file(claims, &key_data)
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init(&mut self) -> Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = &self.base_data_dir;
|
||||
if base_path == Path::new("") {
|
||||
anyhow::bail!("repository base path is missing");
|
||||
}
|
||||
if base_path.exists() {
|
||||
anyhow::bail!(
|
||||
"directory '{}' already exists. Perhaps already initialized?",
|
||||
base_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
fs::create_dir(&base_path)?;
|
||||
|
||||
// generate keys for jwt
|
||||
// openssl genrsa -out private_key.pem 2048
|
||||
let private_key_path;
|
||||
if self.private_key_path == PathBuf::new() {
|
||||
private_key_path = base_path.join("auth_private_key.pem");
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("genrsa")
|
||||
.args(&["-out", private_key_path.to_str().unwrap()])
|
||||
.arg("2048")
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
self.private_key_path = Path::new("auth_private_key.pem").to_path_buf();
|
||||
|
||||
let public_key_path = base_path.join("auth_public_key.pem");
|
||||
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("rsa")
|
||||
.args(&["-in", private_key_path.to_str().unwrap()])
|
||||
.arg("-pubout")
|
||||
.args(&["-outform", "PEM"])
|
||||
.args(&["-out", public_key_path.to_str().unwrap()])
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
self.pageserver.auth_token =
|
||||
self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
|
||||
fs::create_dir_all(self.pg_data_dirs_path())?;
|
||||
|
||||
for safekeeper in self.safekeepers.iter() {
|
||||
fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
|
||||
}
|
||||
|
||||
let mut conf_content = String::new();
|
||||
|
||||
// Currently, the user first passes a config file with 'zenith init --config=<path>'
|
||||
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
|
||||
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
|
||||
// a bit sad.
|
||||
write!(
|
||||
&mut conf_content,
|
||||
r#"# This file describes a local deployment of the page server
# and safekeeper nodes. It is read by the 'zenith' command-line
|
||||
# utility.
|
||||
"#
|
||||
)?;
|
||||
|
||||
// Convert the LocalEnv to a toml file.
|
||||
//
|
||||
// This could be as simple as this:
|
||||
//
|
||||
// conf_content += &toml::to_string_pretty(env)?;
|
||||
//
|
||||
// But it results in a "values must be emitted before tables". I'm not sure
|
||||
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
|
||||
// Maybe rust reorders the fields to avoid padding or something?
|
||||
// In any case, converting to toml::Value first, and serializing that, works.
|
||||
// See https://github.com/alexcrichton/toml-rs/issues/142
|
||||
conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
|
||||
|
||||
fs::write(base_path.join("config"), conf_content)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn base_path() -> PathBuf {
|
||||
@@ -84,119 +325,3 @@ fn base_path() -> PathBuf {
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init(
|
||||
pageserver_pg_port: u16,
|
||||
pageserver_http_port: u16,
|
||||
tenantid: ZTenantId,
|
||||
auth_type: AuthType,
|
||||
) -> Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = base_path();
|
||||
if base_path.exists() {
|
||||
anyhow::bail!(
|
||||
"{} already exists. Perhaps already initialized?",
|
||||
base_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
fs::create_dir(&base_path)?;
|
||||
|
||||
// ok, now check that expected binaries are present
|
||||
|
||||
// Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
|
||||
let pg_distrib_dir: PathBuf = {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
postgres_bin.into()
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
cwd.join("tmp_install")
|
||||
}
|
||||
};
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
// generate keys for jwt
|
||||
// openssl genrsa -out private_key.pem 2048
|
||||
let private_key_path = base_path.join("auth_private_key.pem");
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("genrsa")
|
||||
.args(&["-out", private_key_path.to_str().unwrap()])
|
||||
.arg("2048")
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
let public_key_path = base_path.join("auth_public_key.pem");
|
||||
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("rsa")
|
||||
.args(&["-in", private_key_path.to_str().unwrap()])
|
||||
.arg("-pubout")
|
||||
.args(&["-outform", "PEM"])
|
||||
.args(&["-out", public_key_path.to_str().unwrap()])
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
let auth_token =
|
||||
encode_from_key_path(&Claims::new(None, Scope::PageServerApi), &private_key_path)?;
|
||||
|
||||
// Find zenith binaries.
|
||||
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
if !zenith_distrib_dir.join("pageserver").exists() {
|
||||
anyhow::bail!("Can't find pageserver binary.",);
|
||||
}
|
||||
|
||||
let conf = LocalEnv {
|
||||
pageserver_pg_port,
|
||||
pageserver_http_port,
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir,
|
||||
base_data_dir: base_path,
|
||||
tenantid,
|
||||
auth_token,
|
||||
auth_type,
|
||||
private_key_path,
|
||||
};
|
||||
|
||||
fs::create_dir_all(conf.pg_data_dirs_path())?;
|
||||
|
||||
let toml = toml::to_string_pretty(&conf)?;
|
||||
fs::write(conf.base_data_dir.join("config"), toml)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Locate and load config
|
||||
pub fn load_config() -> Result<LocalEnv> {
|
||||
let repopath = base_path();
|
||||
|
||||
if !repopath.exists() {
|
||||
anyhow::bail!(
|
||||
"Zenith config is not found in {}. You need to run 'zenith init' first",
|
||||
repopath.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: check that it looks like a zenith repository
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
toml::from_str(config.as_str()).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
control_plane/src/safekeeper.rs (new file, 263 lines)
@@ -0,0 +1,263 @@
|
||||
use std::io::Write;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
|
||||
use anyhow::bail;
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::read_pidfile;
|
||||
use crate::storage::PageServerNode;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
#[error("Reqwest error: {0}")]
|
||||
Transport(#[from] reqwest::Error),
|
||||
|
||||
#[error("Error: {0}")]
|
||||
Response(String),
|
||||
}
|
||||
|
||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
fn error_from_body(self) -> Result<Self>;
|
||||
}
|
||||
|
||||
impl ResponseErrorMessageExt for Response {
|
||||
fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
}
|
||||
|
||||
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
|
||||
let url = self.url().to_owned();
|
||||
Err(SafekeeperHttpError::Response(
|
||||
match self.json::<HttpErrorBody>() {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Control routines for safekeeper.
|
||||
//
|
||||
// Used in CLI and tests.
|
||||
//
|
||||
#[derive(Debug)]
|
||||
pub struct SafekeeperNode {
|
||||
pub name: String,
|
||||
|
||||
pub conf: SafekeeperConf,
|
||||
|
||||
pub pg_connection_config: Config,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||
let pageserver = Arc::new(PageServerNode::from_env(env));
|
||||
|
||||
println!("initializing for {} for {}", conf.name, conf.http_port);
|
||||
|
||||
SafekeeperNode {
|
||||
name: conf.name.clone(),
|
||||
conf: conf.clone(),
|
||||
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
||||
env: env.clone(),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://localhost:{}/v1", conf.http_port),
|
||||
pageserver,
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to this safekeeper.
|
||||
fn safekeeper_connection_config(port: u16) -> Config {
|
||||
// TODO safekeeper authentication not implemented yet
|
||||
format!("postgresql://no_user@localhost:{}/no_db", port)
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn datadir_path(&self) -> PathBuf {
|
||||
self.env.safekeeper_data_dir(&self.name)
|
||||
}
|
||||
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
self.datadir_path().join("safekeeper.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
connection_address(&self.pg_connection_config),
|
||||
self.datadir_path().display()
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let listen_pg = format!("localhost:{}", self.conf.pg_port);
|
||||
let listen_http = format!("localhost:{}", self.conf.http_port);
|
||||
|
||||
let mut cmd = Command::new(self.env.safekeeper_bin()?);
|
||||
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
|
||||
.args(&["--listen-pg", &listen_pg])
|
||||
.args(&["--listen-http", &listen_http])
|
||||
.args(&["--recall", "1 second"])
|
||||
.arg("--daemonize")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1");
|
||||
if !self.conf.sync {
|
||||
cmd.arg("--no-sync");
|
||||
}
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Safekeeper failed to start. See '{}' for details.",
|
||||
self.datadir_path().join("safekeeper.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the safekeeper to start up. Wait until it is
|
||||
// open for business.
|
||||
const RETRIES: i8 = 15;
|
||||
for retries in 1..RETRIES {
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("\nSafekeeper started");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
match err {
|
||||
SafekeeperHttpError::Transport(err) => {
|
||||
if err.is_connect() && retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!(
|
||||
"Safekeeper not responding yet, err {} retrying ({})...",
|
||||
err, retries
|
||||
);
|
||||
}
|
||||
}
|
||||
SafekeeperHttpError::Response(msg) => {
|
||||
bail!("safekeeper failed to start: {} ", msg)
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("safekeeper failed to start in {} seconds", RETRIES);
|
||||
}
|
||||
|
||||
///
|
||||
/// Stop the server.
|
||||
///
|
||||
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
|
||||
/// Otherwise we use SIGTERM, triggering a clean shutdown
|
||||
///
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Safekeeper {} is already stopped", self.name);
|
||||
return Ok(());
|
||||
}
|
||||
let pid = read_pidfile(&pid_file)?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
|
||||
let sig = if immediate {
|
||||
println!("Stop safekeeper immediately");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
println!("Stop safekeeper gracefully");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"Safekeeper with pid {} does not exist, but a PID file was found",
|
||||
pid
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to safekeeper with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
let address = connection_address(&self.pg_connection_config);
|
||||
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if safekeeper flushes a lot of data
|
||||
for _ in 0..100 {
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Safekeeper stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("Safekeeper status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
println!("Safekeeper status is: {}", err);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Safekeeper still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
|
||||
bail!("Failed to stop safekeeper with pid {}", pid);
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
// TODO: authentication
|
||||
//if self.env.auth_type == AuthType::ZenithJWT {
|
||||
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
|
||||
//}
|
||||
self.http_client.request(method, url)
|
||||
}
|
||||
|
||||
pub fn check_status(&self) -> Result<()> {
|
||||
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -5,7 +5,8 @@ use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
|
||||
use anyhow::{anyhow, bail};
|
||||
use anyhow::bail;
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
|
||||
@@ -20,6 +21,7 @@ use zenith_utils::zid::ZTenantId;
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::read_pidfile;
|
||||
use pageserver::branches::BranchInfo;
|
||||
use pageserver::tenant_mgr::TenantInfo;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
@@ -62,7 +64,6 @@ impl ResponseErrorMessageExt for Response {
|
||||
//
|
||||
#[derive(Debug)]
|
||||
pub struct PageServerNode {
|
||||
pub kill_on_exit: bool,
|
||||
pub pg_connection_config: Config,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
@@ -71,34 +72,33 @@ pub struct PageServerNode {
|
||||
|
||||
impl PageServerNode {
|
||||
pub fn from_env(env: &LocalEnv) -> PageServerNode {
|
||||
let password = if env.auth_type == AuthType::ZenithJWT {
|
||||
&env.auth_token
|
||||
let password = if env.pageserver.auth_type == AuthType::ZenithJWT {
|
||||
&env.pageserver.auth_token
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
PageServerNode {
|
||||
kill_on_exit: false,
|
||||
pg_connection_config: Self::pageserver_connection_config(
|
||||
password,
|
||||
env.pageserver_pg_port,
|
||||
env.pageserver.pg_port,
|
||||
),
|
||||
env: env.clone(),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://localhost:{}/v1", env.pageserver_http_port),
|
||||
http_base_url: format!("http://localhost:{}/v1", env.pageserver.http_port),
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to the pageserver.
|
||||
fn pageserver_connection_config(password: &str, port: u16) -> Config {
|
||||
format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let listen_pg = format!("localhost:{}", self.env.pageserver_pg_port);
|
||||
let listen_http = format!("localhost:{}", self.env.pageserver_http_port);
|
||||
pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> {
|
||||
let listen_pg = format!("localhost:{}", self.env.pageserver.pg_port);
|
||||
let listen_http = format!("localhost:{}", self.env.pageserver.http_port);
|
||||
let mut args = vec![
|
||||
"--init",
|
||||
"-D",
|
||||
@@ -111,27 +111,29 @@ impl PageServerNode {
|
||||
&listen_http,
|
||||
];
|
||||
|
||||
if enable_auth {
|
||||
let auth_type_str = &self.env.pageserver.auth_type.to_string();
|
||||
if self.env.pageserver.auth_type != AuthType::Trust {
|
||||
args.extend(&["--auth-validation-public-key-path", "auth_public_key.pem"]);
|
||||
args.extend(&["--auth-type", "ZenithJWT"]);
|
||||
}
|
||||
args.extend(&["--auth-type", auth_type_str]);
|
||||
|
||||
if let Some(tenantid) = create_tenant {
|
||||
args.extend(&["--create-tenant", tenantid])
|
||||
}
|
||||
|
||||
let status = cmd
|
||||
.args(args)
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.status()
|
||||
.expect("pageserver init failed");
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
cmd.args(args).env_clear().env("RUST_BACKTRACE", "1");
|
||||
|
||||
if status.success() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow!("pageserver init failed"))
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!("pageserver init failed");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn repo_path(&self) -> PathBuf {
|
||||
@@ -152,10 +154,15 @@ impl PageServerNode {
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
cmd.args(&["-D", self.repo_path().to_str().unwrap()])
|
||||
.arg("-d")
|
||||
.arg("--daemonize")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1");
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
@@ -199,19 +206,43 @@ impl PageServerNode {
|
||||
bail!("pageserver failed to start in {} seconds", RETRIES);
|
||||
}
|
||||
|
||||
///
|
||||
/// Stop the server.
|
||||
///
|
||||
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
|
||||
/// Otherwise we use SIGTERM, triggering a clean shutdown
|
||||
///
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid = read_pidfile(&self.pid_file())?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
if immediate {
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Pageserver is already stopped");
|
||||
return Ok(());
|
||||
}
|
||||
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
|
||||
|
||||
let sig = if immediate {
|
||||
println!("Stop pageserver immediately");
|
||||
if kill(pid, Signal::SIGQUIT).is_err() {
|
||||
bail!("Failed to kill pageserver with pid {}", pid);
|
||||
}
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
println!("Stop pageserver gracefully");
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
Signal::SIGTERM
|
||||
};
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"Pageserver with pid {} does not exist, but a PID file was found",
|
||||
pid
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
let address = connection_address(&self.pg_connection_config);
|
||||
@@ -256,8 +287,8 @@ impl PageServerNode {
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
let mut builder = self.http_client.request(method, url);
|
||||
if self.env.auth_type == AuthType::ZenithJWT {
|
||||
builder = builder.bearer_auth(&self.env.auth_token)
|
||||
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
|
||||
builder = builder.bearer_auth(&self.env.pageserver.auth_token)
|
||||
}
|
||||
builder
|
||||
}
|
||||
@@ -269,7 +300,7 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn tenant_list(&self) -> Result<Vec<String>> {
|
||||
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
|
||||
Ok(self
|
||||
.http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
|
||||
.send()?
|
||||
@@ -332,12 +363,3 @@ impl PageServerNode {
|
||||
.json()?)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
// TODO Looks like this flag is never set
|
||||
if self.kill_on_exit {
|
||||
let _ = self.stop(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,11 +51,14 @@ Each PostgreSQL fork is considered a separate relish.

### Layer

Each layer corresponds to a specific version of a relish Segment in a range of LSNs.
A layer contains the data needed to reconstruct any page version within the
layer's Segment and range of LSNs.

There are two kinds of layers, in-memory and on-disk layers. In-memory
layers are used to ingest incoming WAL, and provide fast access
to the recent page versions. On-disk layers are stored as files on disk, and
are immutable.
are immutable. See pageserver/src/layered_repository/README.md for more.

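To make the segment-plus-LSN-range idea concrete, here is a rough sketch of what a layer interface could look like. This is an illustration only, not the actual `Layer` trait in the pageserver; the names and signatures are assumptions.

```rust
// Illustrative sketch only; the real code lives in pageserver/src/layered_repository.
pub struct Lsn(pub u64);
pub struct SegmentTag; // stand-in for the real (relish, segment number) identifier

pub trait Layer {
    /// The relish segment this layer covers.
    fn segment(&self) -> SegmentTag;
    /// The LSN range this layer covers (start inclusive, end exclusive).
    fn lsn_range(&self) -> (Lsn, Lsn);
    /// Reconstruct the page version of `blknum` at `lsn`, which must fall
    /// inside this layer's segment and LSN range.
    fn get_page_at_lsn(&self, blknum: u32, lsn: Lsn) -> anyhow::Result<Vec<u8>>;
}
```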
### Layer file (on-disk layer)

Layered repository on-disk format is based on immutable files. The
docs/settings.md (new file, 128 lines)
@@ -0,0 +1,128 @@
## Pageserver

### listen_pg_addr

Network interface and port number to listen at for connections from
the compute nodes and safekeepers. The default is `127.0.0.1:64000`.

### listen_http_addr

Network interface and port number to listen at for admin connections.
The default is `127.0.0.1:9898`.

### checkpoint_distance

`checkpoint_distance` is the amount of incoming WAL that is held in
the open layer before it's flushed to local disk. It puts an upper
bound on how much WAL needs to be re-processed after a pageserver
crash. It is a soft limit: the pageserver can momentarily go above it,
but it will trigger a checkpoint operation to get it back below the
limit.

`checkpoint_distance` also determines how much WAL needs to be kept
durable in the safekeeper. The safekeeper must have capacity to hold
this much WAL, with some headroom; otherwise you can get stuck in a
situation where the safekeeper is full and stops accepting new WAL,
but the pageserver is not flushing out and releasing the space in the
safekeeper because it hasn't reached checkpoint_distance yet.

`checkpoint_distance` also controls how often the WAL is uploaded to
S3.

The unit is # of bytes.

### checkpoint_period

The pageserver checks whether `checkpoint_distance` has been reached
every `checkpoint_period` seconds. The default is 1 s, which should be
fine.
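
Put together, the two settings describe a simple background policy: wake up every `checkpoint_period`, and flush the open layer once the WAL accumulated since the last flush exceeds `checkpoint_distance`. A minimal sketch of that logic (not the actual pageserver code; the callbacks are placeholders):

```rust
use std::time::Duration;

// Minimal sketch of the checkpointing policy described above.
fn checkpoint_loop(
    checkpoint_distance: u64,    // bytes of WAL
    checkpoint_period: Duration, // how often to check
    mut wal_since_last_checkpoint: impl FnMut() -> u64,
    mut flush_open_layer: impl FnMut(),
) {
    loop {
        // Soft limit: we may momentarily be above it; flushing brings us back below.
        if wal_since_last_checkpoint() >= checkpoint_distance {
            flush_open_layer();
        }
        std::thread::sleep(checkpoint_period);
    }
}
```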

### gc_horizon

`gc_horizon` determines how much history is retained, to allow
branching and read replicas at an older point in time. The unit is #
of bytes of WAL. Page versions older than this are garbage collected
away.

### gc_period

Interval at which garbage collection is triggered. The default is 100 s.
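
In other words, the GC cutoff is simply the last WAL position minus `gc_horizon`. A one-line sketch of that arithmetic (assuming LSNs are byte offsets into the WAL, as Postgres defines them):

```rust
// Page versions older than the returned LSN are eligible for garbage collection.
fn gc_cutoff(last_record_lsn: u64, gc_horizon: u64) -> u64 {
    last_record_lsn.saturating_sub(gc_horizon)
}
```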

### superuser

Name of the initial superuser role, passed to initdb when a new tenant
is initialized. It doesn't affect anything after initialization. The
default is 'zenith_admin'. Note: the console depends on that, so if
you change it, bad things will happen.

### page_cache_size

Size of the page cache, to hold materialized page versions. The unit is
the number of 8 kB blocks. The default is 8192, which means 64 MB.
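
The arithmetic behind the default, spelled out:

```rust
const PAGE_CACHE_SIZE: usize = 8192;  // default, in 8 kB blocks
const BLOCK_SIZE: usize = 8 * 1024;   // 8 kB
const CACHE_BYTES: usize = PAGE_CACHE_SIZE * BLOCK_SIZE; // 67_108_864 bytes = 64 MB
```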

### max_file_descriptors

Max number of file descriptors to hold open concurrently for accessing
layer files. This should be kept well below the process/container/OS
limit (see `ulimit -n`), as the pageserver also needs file descriptors
for other files and for sockets for incoming connections.

### postgres-distrib

A directory with a Postgres installation to use during pageserver activities.
Inside that dir, a `bin/postgres` binary should be present.

The default distrib dir is `./tmp_install/`.

### workdir (-D)

A directory in the file system where the pageserver will store its files.
The default is `./.zenith/`.

### Remote storage

There's a way to automatically back up and restore some of the pageserver's data from the working dir to remote storage.
The backup system is disabled by default and can be enabled for either of the currently available storages:

#### Local FS storage

##### remote-storage-local-path

Pageserver can back up and restore some of its workdir contents to another directory.
For that, only a path to that directory needs to be specified as a parameter.

#### S3 storage

Pageserver can back up and restore some of its workdir contents to S3.
A full set of S3 credentials is needed for that as parameters:

##### remote-storage-s3-bucket

Name of the bucket to connect to, example: "some-sample-bucket".

##### remote-storage-region

Name of the region where the bucket is located, example: "eu-north-1"

##### remote-storage-access-key

Access key to connect to the bucket ("login" part of the credentials), example: "AKIAIOSFODNN7EXAMPLE"

##### remote-storage-secret-access-key

Secret access key to connect to the bucket ("password" part of the credentials), example: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

#### General remote storage configuration

Pageserver allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used.
No default values are used for the remote storage configuration parameters.

##### remote-storage-max-concurrent-sync

Max number of concurrent connections to open for uploading to or
downloading from S3.
The default value is 100.
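
Conceptually, the mutually exclusive parameter sets above collapse into a single optional storage choice. A hedged sketch of that validation (type and field names are illustrative, not the actual pageserver config types):

```rust
use std::path::PathBuf;

// Illustrative only: at most one remote storage may be configured.
enum RemoteStorage {
    LocalFs { local_path: PathBuf },
    AwsS3 { bucket_name: String, bucket_region: String },
}

fn pick_remote_storage(
    local_path: Option<PathBuf>,
    bucket_name: Option<String>,
    bucket_region: Option<String>,
) -> anyhow::Result<Option<RemoteStorage>> {
    match (local_path, bucket_name) {
        // Mixing parameters from both storages is an error.
        (Some(_), Some(_)) => anyhow::bail!("more than one remote storage configured"),
        (Some(local_path), None) => Ok(Some(RemoteStorage::LocalFs { local_path })),
        (None, Some(bucket_name)) => {
            let bucket_region =
                bucket_region.ok_or_else(|| anyhow::anyhow!("bucket_region is required for S3"))?;
            Ok(Some(RemoteStorage::AwsS3 { bucket_name, bucket_region }))
        }
        // No remote storage parameters at all: the backup system stays disabled.
        (None, None) => Ok(None),
    }
}
```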

## Safekeeper

TODO
@@ -79,3 +79,61 @@ Helpers for exposing Prometheus metrics from the server.
`/zenith_utils`:

Helpers that are shared between other crates in this repository.

## Using Python
Note that Debian/Ubuntu Python packages are stale, as commonly happens,
so manual installation of dependencies is not recommended.

A single virtual environment with all dependencies is described in the single `Pipfile`.

### Prerequisites
- Install Python 3.7 (the minimal supported version)
  - A later version (e.g. 3.8) is OK if you don't write Python code
  - You can install Python 3.7 separately, e.g.:
    ```bash
    # In Ubuntu
    sudo add-apt-repository ppa:deadsnakes/ppa
    sudo apt update
    sudo apt install python3.7
    ```
- Install `pipenv`
  - The exact version of `pipenv` is not important, you can use the Debian/Ubuntu package `pipenv`.
- Install dependencies via either
  * `pipenv --python 3.7 install --dev` if you will write Python code, or
  * `pipenv install` if you only want to run Python scripts and don't have Python 3.7.

Run `pipenv shell` to activate the virtual environment.
Alternatively, use `pipenv run` to run a single command in the venv, e.g. `pipenv run pytest`.

### Obligatory checks
We force code formatting via `yapf` and type hints via `mypy`.
Run the following commands in the repository's root (next to `setup.cfg`):

```bash
pipenv run yapf -ri . # All code is reformatted
pipenv run mypy . # Ensure there are no typing errors
```

**WARNING**: do not run `mypy` from a directory other than the root of the repository.
Otherwise it will not find its configuration.

Also consider:

* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
* Adding more type hints to your code to avoid `Any`.

### Changing dependencies
You have to update `Pipfile.lock` if you have changed `Pipfile`:

```bash
pipenv --python 3.7 install --dev # Re-create venv for Python 3.7 and install recent pipenv inside
pipenv run pipenv --version # Should be at least 2021.5.29
pipenv run pipenv lock # Regenerate Pipfile.lock
```

As the minimal supported version is Python 3.7 and we use it in CI,
you have to use a Python 3.7 environment when updating `Pipfile.lock`.
Otherwise some back-compatibility packages will be missing.

It is also important to run a recent `pipenv`.
Older versions remove markers from `Pipfile.lock`.

@@ -5,7 +5,7 @@ authors = ["Stas Kelvich <stas@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
bookfile = "^0.3"
|
||||
bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" }
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] }
|
||||
tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
@@ -32,21 +32,22 @@ serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
toml = "0.5"
|
||||
scopeguard = "1.1.0"
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
async-trait = "0.1"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.27"
|
||||
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
|
||||
signal-hook = "0.3.10"
|
||||
url = "2"
|
||||
nix = "0.23"
|
||||
#yakv = { path = "../../yakv" }
|
||||
yakv = "0.2.7"
|
||||
lz4_flex = "0.9.0"
|
||||
once_cell = "1.8.0"
|
||||
|
||||
rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] }
|
||||
async-compression = {version = "0.3", features = ["zstd", "tokio"]}
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
parking_lot = "0.11.2"
|
||||
|
||||
[dev-dependencies]
|
||||
hex-literal = "0.3"
|
||||
tempfile = "3.2"
|
||||
|
||||
@@ -9,7 +9,7 @@ The Page Server has a few different duties:

S3 is the main fault-tolerant storage of all data, as there are no Page Server
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
keeps track of WAL records which are not syncted to S3 yet.
keeps track of WAL records which are not synced to S3 yet.

The Page Server consists of multiple threads that operate on a shared
repository of page versions:
@@ -41,7 +41,7 @@ Legend:
+--+

....
. . Component that we will need, but doesn't exist at the moment. A TODO.
. . Component at its early development phase.
....

---> Data flow
@@ -116,13 +116,49 @@ Remove old on-disk layer files that are no longer needed according to the
PITR retention policy


TODO: Backup service
--------------------
### Backup service

The backup service is responsible for periodically pushing the chunks to S3.
The backup service is responsible for storing pageserver recovery data externally.

TODO: How/when do we restore from S3? Whenever we get a GetPage@LSN request for
a chunk we don't currently have? Or when an external Control Plane tells us?
Currently, the pageserver stores its files in a filesystem directory it's pointed to.
That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
Therefore, the server interacts with external, more reliable storage to back up and restore its state.

The code for storage support is extensible and can support arbitrary backends as long as they implement a certain Rust trait (a rough sketch follows the list below).
There are the following implementations present:
* local filesystem — to use in tests mainly
* AWS S3 - to use in production

Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs.
|
||||
|
||||
The backup service is disabled by default and can be enabled to interact with a single remote storage.
|
||||
|
||||
CLI examples:
|
||||
* Local FS: `${PAGESERVER_BIN} --remote-storage-local-path="/some/local/path/"`
|
||||
* AWS S3 : `${PAGESERVER_BIN} --remote-storage-s3-bucket="some-sample-bucket" --remote-storage-region="eu-north-1" --remote-storage-access-key="SOMEKEYAAAAASADSAH*#" --remote-storage-secret-access-key="SOMEsEcReTsd292v"`
|
||||
|
||||
For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
|
||||
For local S3 installations, refer to the their documentation for name format and credentials.
|
||||
|
||||
Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
|
||||
Required sections are:
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
bucket_name = 'some-sample-bucket'
|
||||
bucket_region = 'eu-north-1'
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
```
|
||||
|
||||
Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.
|
||||
|
||||
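To illustrate how such a `[remote_storage]` table maps onto the configuration type, here is a minimal, self-contained sketch. It assumes `serde` (with the `derive` feature) and the `toml` crate, both of which already appear in the crate's dependency list in this change; the type and field names mirror the TOML examples above but are simplified stand-ins, not the exact pageserver definitions.

```rust
use serde::Deserialize;

// Simplified stand-in for the pageserver's remote storage config.
// With `#[serde(untagged)]` the variant is chosen by which fields are present:
// `local_path` selects `Local`, `bucket_name`/`bucket_region` select `AwsS3`.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum RemoteStorage {
    Local {
        local_path: String,
    },
    AwsS3 {
        bucket_name: String,
        bucket_region: String,
        access_key_id: Option<String>,
        secret_access_key: Option<String>,
    },
}

#[derive(Debug, Deserialize)]
struct Config {
    remote_storage: Option<RemoteStorage>,
}

fn main() -> Result<(), toml::de::Error> {
    let cfg: Config = toml::from_str(
        r#"
[remote_storage]
bucket_name = 'some-sample-bucket'
bucket_region = 'eu-north-1'
"#,
    )?;
    // Prints the AwsS3 variant with the credentials left as None.
    println!("{:?}", cfg.remote_storage);
    Ok(())
}
```
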
TODO: Sharding
|
||||
--------------------
|
||||
@@ -10,7 +10,7 @@
|
||||
//! This module is responsible for creation of such tarball
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::Result;
|
||||
use anyhow::{Context, Result};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use log::*;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
@@ -242,10 +242,12 @@ impl<'a> Basebackup<'a> {
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)?;
|
||||
let pg_control_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)?;
|
||||
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)
|
||||
.context("failed get control bytes")?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
@@ -285,11 +287,7 @@ impl<'a> Basebackup<'a> {
|
||||
|
||||
//send wal segment
|
||||
let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
|
||||
|
||||
31
pageserver/src/bin/dump_layerfile.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
//! Main entry point for the dump_layerfile executable
|
||||
//!
|
||||
//! A handy tool for debugging, that's all.
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use pageserver::layered_repository::dump_layerfile_from_path;
|
||||
use pageserver::virtual_file;
|
||||
use std::path::PathBuf;
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith dump_layerfile utility")
|
||||
.about("Dump contents of one layer file, for debugging")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("path")
|
||||
.help("Path to file to dump")
|
||||
.required(true)
|
||||
.index(1),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
|
||||
dump_layerfile_from_path(&path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -5,33 +5,27 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
env,
|
||||
net::TcpListener,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
thread,
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use signal_hook::consts::signal::*;
|
||||
use signal_hook::consts::TERM_SIGNALS;
|
||||
use signal_hook::flag;
|
||||
use signal_hook::iterator::exfiltrator::WithOrigin;
|
||||
use signal_hook::iterator::SignalsInfo;
|
||||
use std::process::exit;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Arc;
|
||||
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use pageserver::{
|
||||
branches, defaults::*, http, page_service, relish_storage, tenant_mgr, PageServerConf,
|
||||
RelishStorageConfig, RelishStorageKind, S3Config, LOG_FILE_NAME,
|
||||
branches, defaults::*, http, page_cache, page_service, remote_storage, tenant_mgr,
|
||||
virtual_file, PageServerConf, RemoteStorageConfig, RemoteStorageKind, S3Config, LOG_FILE_NAME,
|
||||
};
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::postgres_backend;
|
||||
use zenith_utils::shutdown::exit_now;
|
||||
use zenith_utils::signals::{self, Signal};
|
||||
|
||||
use const_format::formatcp;
|
||||
|
||||
@@ -42,30 +36,31 @@ struct CfgFileParams {
|
||||
listen_http_addr: Option<String>,
|
||||
checkpoint_distance: Option<String>,
|
||||
checkpoint_period: Option<String>,
|
||||
upload_distance: Option<String>,
|
||||
upload_period: Option<String>,
|
||||
reconstruct_threshold: Option<String>,
|
||||
gc_horizon: Option<String>,
|
||||
gc_period: Option<String>,
|
||||
open_mem_limit: Option<String>,
|
||||
page_cache_size: Option<String>,
|
||||
max_file_descriptors: Option<String>,
|
||||
pg_distrib_dir: Option<String>,
|
||||
auth_validation_public_key_path: Option<String>,
|
||||
auth_type: Option<String>,
|
||||
relish_storage_max_concurrent_sync: Option<String>,
|
||||
remote_storage_max_concurrent_sync: Option<String>,
|
||||
remote_storage_max_sync_errors: Option<String>,
|
||||
/////////////////////////////////
|
||||
//// Don't put `Option<String>` and other "simple" values below.
|
||||
////
|
||||
/// `Option<RelishStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
|
||||
/// `Option<RemoteStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
|
||||
/// Values in TOML cannot be defined after tables (other tables can),
|
||||
/// and [`toml`] crate serializes all fields in the order of their appearance.
|
||||
////////////////////////////////
|
||||
relish_storage: Option<RelishStorage>,
|
||||
remote_storage: Option<RemoteStorage>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
// Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!).
|
||||
// See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples
|
||||
#[serde(untagged)]
|
||||
enum RelishStorage {
|
||||
enum RemoteStorage {
|
||||
Local {
|
||||
local_path: String,
|
||||
},
|
||||
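
As a side note on the ordering comment above, the following standalone sketch (assuming `serde` with `derive` and the `toml` crate; the struct names are illustrative only) shows why simple values must be declared before table-valued fields when serializing with the `toml` crate.

```rust
use serde::Serialize;

#[derive(Serialize)]
struct Nested {
    x: u32,
}

#[derive(Serialize)]
struct ValueAfterTable {
    nested: Nested, // serialized as a `[nested]` table...
    plain: u32,     // ...so this plain value would have to appear after it
}

#[derive(Serialize)]
struct ValueBeforeTable {
    plain: u32,
    nested: Nested,
}

fn main() {
    // Expected to fail: TOML cannot represent a plain key after a table.
    println!(
        "{:?}",
        toml::to_string(&ValueAfterTable { nested: Nested { x: 1 }, plain: 2 })
    );
    // Works: simple values first, tables afterwards.
    println!(
        "{}",
        toml::to_string(&ValueBeforeTable { plain: 2, nested: Nested { x: 1 } }).unwrap()
    );
}
```
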
@@ -86,36 +81,37 @@ impl CfgFileParams {
|
||||
arg_matches.value_of(arg_name).map(str::to_owned)
|
||||
};
|
||||
|
||||
let relish_storage = if let Some(local_path) = get_arg("relish-storage-local-path") {
|
||||
Some(RelishStorage::Local { local_path })
|
||||
let remote_storage = if let Some(local_path) = get_arg("remote-storage-local-path") {
|
||||
Some(RemoteStorage::Local { local_path })
|
||||
} else if let Some((bucket_name, bucket_region)) =
|
||||
get_arg("relish-storage-s3-bucket").zip(get_arg("relish-storage-region"))
|
||||
get_arg("remote-storage-s3-bucket").zip(get_arg("remote-storage-region"))
|
||||
{
|
||||
Some(RelishStorage::AwsS3 {
|
||||
Some(RemoteStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id: get_arg("relish-storage-access-key"),
|
||||
secret_access_key: get_arg("relish-storage-secret-access-key"),
|
||||
access_key_id: get_arg("remote-storage-access-key"),
|
||||
secret_access_key: get_arg("remote-storage-secret-access-key"),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Self {
|
||||
listen_pg_addr: get_arg("listen-pg"),
|
||||
listen_http_addr: get_arg("listen-http"),
|
||||
listen_pg_addr: get_arg("listen_pg_addr"),
|
||||
listen_http_addr: get_arg("listen_http_addr"),
|
||||
checkpoint_distance: get_arg("checkpoint_distance"),
|
||||
checkpoint_period: get_arg("checkpoint_period"),
|
||||
upload_distance: get_arg("upload_distance"),
|
||||
upload_period: get_arg("upload_period"),
|
||||
reconstruct_threshold: get_arg("reconstruct_threshold"),
|
||||
gc_horizon: get_arg("gc_horizon"),
|
||||
gc_period: get_arg("gc_period"),
|
||||
open_mem_limit: get_arg("open_mem_limit"),
|
||||
page_cache_size: get_arg("page_cache_size"),
|
||||
max_file_descriptors: get_arg("max_file_descriptors"),
|
||||
pg_distrib_dir: get_arg("postgres-distrib"),
|
||||
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
|
||||
auth_type: get_arg("auth-type"),
|
||||
relish_storage,
|
||||
relish_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
|
||||
remote_storage,
|
||||
remote_storage_max_concurrent_sync: get_arg("remote-storage-max-concurrent-sync"),
|
||||
remote_storage_max_sync_errors: get_arg("remote-storage-max-sync-errors"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,20 +123,23 @@ impl CfgFileParams {
|
||||
listen_http_addr: self.listen_http_addr.or(other.listen_http_addr),
|
||||
checkpoint_distance: self.checkpoint_distance.or(other.checkpoint_distance),
|
||||
checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
|
||||
upload_distance: self.upload_distance.or(other.upload_distance),
|
||||
upload_period: self.upload_period.or(other.upload_period),
|
||||
reconstruct_threshold: self.reconstruct_threshold.or(other.reconstruct_threshold),
|
||||
gc_horizon: self.gc_horizon.or(other.gc_horizon),
|
||||
gc_period: self.gc_period.or(other.gc_period),
|
||||
open_mem_limit: self.open_mem_limit.or(other.open_mem_limit),
|
||||
page_cache_size: self.page_cache_size.or(other.page_cache_size),
|
||||
max_file_descriptors: self.max_file_descriptors.or(other.max_file_descriptors),
|
||||
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.or(other.auth_validation_public_key_path),
|
||||
auth_type: self.auth_type.or(other.auth_type),
|
||||
relish_storage: self.relish_storage.or(other.relish_storage),
|
||||
relish_storage_max_concurrent_sync: self
|
||||
.relish_storage_max_concurrent_sync
|
||||
.or(other.relish_storage_max_concurrent_sync),
|
||||
remote_storage: self.remote_storage.or(other.remote_storage),
|
||||
remote_storage_max_concurrent_sync: self
|
||||
.remote_storage_max_concurrent_sync
|
||||
.or(other.remote_storage_max_concurrent_sync),
|
||||
remote_storage_max_sync_errors: self
|
||||
.remote_storage_max_sync_errors
|
||||
.or(other.remote_storage_max_sync_errors),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -167,20 +166,6 @@ impl CfgFileParams {
|
||||
None => DEFAULT_CHECKPOINT_PERIOD,
|
||||
};
|
||||
|
||||
let upload_distance: u64 = match self.upload_distance.as_ref() {
|
||||
Some(upload_distance_str) => upload_distance_str.parse()?,
|
||||
None => DEFAULT_UPLOAD_DISTANCE,
|
||||
};
|
||||
let upload_period = match self.upload_period.as_ref() {
|
||||
Some(upload_period_str) => humantime::parse_duration(upload_period_str)?,
|
||||
None => DEFAULT_UPLOAD_PERIOD,
|
||||
};
|
||||
|
||||
let reconstruct_threshold: u64 = match self.reconstruct_threshold.as_ref() {
|
||||
Some(reconstruct_threshold_str) => reconstruct_threshold_str.parse()?,
|
||||
None => DEFAULT_RECONSTRUCT_THRESHOLD,
|
||||
};
|
||||
|
||||
let gc_horizon: u64 = match self.gc_horizon.as_ref() {
|
||||
Some(horizon_str) => horizon_str.parse()?,
|
||||
None => DEFAULT_GC_HORIZON,
|
||||
@@ -190,6 +175,21 @@ impl CfgFileParams {
|
||||
None => DEFAULT_GC_PERIOD,
|
||||
};
|
||||
|
||||
let open_mem_limit: usize = match self.open_mem_limit.as_ref() {
|
||||
Some(open_mem_limit_str) => open_mem_limit_str.parse()?,
|
||||
None => DEFAULT_OPEN_MEM_LIMIT,
|
||||
};
|
||||
|
||||
let page_cache_size: usize = match self.page_cache_size.as_ref() {
|
||||
Some(page_cache_size_str) => page_cache_size_str.parse()?,
|
||||
None => DEFAULT_PAGE_CACHE_SIZE,
|
||||
};
|
||||
|
||||
let max_file_descriptors: usize = match self.max_file_descriptors.as_ref() {
|
||||
Some(max_file_descriptors_str) => max_file_descriptors_str.parse()?,
|
||||
None => DEFAULT_MAX_FILE_DESCRIPTORS,
|
||||
};
|
||||
|
||||
let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
|
||||
Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
|
||||
None => env::current_dir()?.join("tmp_install"),
|
||||
@@ -223,31 +223,34 @@ impl CfgFileParams {
|
||||
);
|
||||
}
|
||||
|
||||
let max_concurrent_sync = match self.relish_storage_max_concurrent_sync.as_deref() {
|
||||
Some(relish_storage_max_concurrent_sync) => {
|
||||
relish_storage_max_concurrent_sync.parse()?
|
||||
}
|
||||
None => DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
|
||||
let max_concurrent_sync = match self.remote_storage_max_concurrent_sync.as_deref() {
|
||||
Some(number_str) => number_str.parse()?,
|
||||
None => NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap(),
|
||||
};
|
||||
let relish_storage_config = self.relish_storage.as_ref().map(|storage_params| {
|
||||
let max_sync_errors = match self.remote_storage_max_sync_errors.as_deref() {
|
||||
Some(number_str) => number_str.parse()?,
|
||||
None => NonZeroU32::new(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap(),
|
||||
};
|
||||
let remote_storage_config = self.remote_storage.as_ref().map(|storage_params| {
|
||||
let storage = match storage_params.clone() {
|
||||
RelishStorage::Local { local_path } => {
|
||||
RelishStorageKind::LocalFs(PathBuf::from(local_path))
|
||||
RemoteStorage::Local { local_path } => {
|
||||
RemoteStorageKind::LocalFs(PathBuf::from(local_path))
|
||||
}
|
||||
RelishStorage::AwsS3 {
|
||||
RemoteStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
} => RelishStorageKind::AwsS3(S3Config {
|
||||
} => RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
}),
|
||||
};
|
||||
RelishStorageConfig {
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
storage,
|
||||
}
|
||||
});
|
||||
@@ -259,11 +262,11 @@ impl CfgFileParams {
|
||||
listen_http_addr,
|
||||
checkpoint_distance,
|
||||
checkpoint_period,
|
||||
upload_distance,
|
||||
upload_period,
|
||||
reconstruct_threshold,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
open_mem_limit,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
|
||||
superuser: String::from(DEFAULT_SUPERUSER),
|
||||
|
||||
@@ -273,7 +276,7 @@ impl CfgFileParams {
|
||||
|
||||
auth_validation_public_key_path,
|
||||
auth_type,
|
||||
relish_storage_config,
|
||||
remote_storage_config,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -282,18 +285,19 @@ fn main() -> Result<()> {
|
||||
zenith_metrics::set_common_metrics_prefix("pageserver");
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("listen-pg")
|
||||
Arg::with_name("listen_pg_addr")
|
||||
.short("l")
|
||||
.long("listen-pg")
|
||||
.alias("listen") // keep some compatibility
|
||||
.long("listen_pg_addr")
|
||||
.aliases(&["listen", "listen-pg"]) // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen-http")
|
||||
.long("listen-http")
|
||||
.alias("http_endpoint") // keep some compatibility
|
||||
Arg::with_name("listen_http_addr")
|
||||
.long("listen_http_addr")
|
||||
.aliases(&["http_endpoint", "listen-http"]) // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
|
||||
)
|
||||
@@ -322,24 +326,6 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Interval between checkpoint iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("upload_distance")
|
||||
.long("upload_distance")
|
||||
.takes_value(true)
|
||||
.help("Distance from current LSN to perform checkpoint of in-memory layers"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("upload_period")
|
||||
.long("upload_period")
|
||||
.takes_value(true)
|
||||
.help("Interval between upload iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("reconstruct_threshold")
|
||||
.long("reconstruct_threshold")
|
||||
.takes_value(true)
|
||||
.help("Minimal size of deltas after which page reconstruction (materialization) can be performed"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_horizon")
|
||||
.long("gc_horizon")
|
||||
@@ -352,6 +338,25 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("open_mem_limit")
|
||||
.long("open_mem_limit")
|
||||
.takes_value(true)
|
||||
.help("Amount of memory reserved for buffering incoming WAL"),
|
||||
)
|
||||
.arg(
|
||||
|
||||
Arg::with_name("page_cache_size")
|
||||
.long("page_cache_size")
|
||||
.takes_value(true)
|
||||
.help("Number of pages in the page cache"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("max_file_descriptors")
|
||||
.long("max_file_descriptors")
|
||||
.takes_value(true)
|
||||
.help("Max number of file descriptors to keep open for files"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("workdir")
|
||||
.short("D")
|
||||
@@ -385,45 +390,45 @@ fn main() -> Result<()> {
|
||||
.help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-local-path")
|
||||
.long("relish-storage-local-path")
|
||||
Arg::with_name("remote-storage-local-path")
|
||||
.long("remote-storage-local-path")
|
||||
.takes_value(true)
|
||||
.help("Path to the local directory, to be used as an external relish storage")
|
||||
.help("Path to the local directory, to be used as an external remote storage")
|
||||
.conflicts_with_all(&[
|
||||
"relish-storage-s3-bucket",
|
||||
"relish-storage-region",
|
||||
"relish-storage-access-key",
|
||||
"relish-storage-secret-access-key",
|
||||
"remote-storage-s3-bucket",
|
||||
"remote-storage-region",
|
||||
"remote-storage-access-key",
|
||||
"remote-storage-secret-access-key",
|
||||
]),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-s3-bucket")
|
||||
.long("relish-storage-s3-bucket")
|
||||
Arg::with_name("remote-storage-s3-bucket")
|
||||
.long("remote-storage-s3-bucket")
|
||||
.takes_value(true)
|
||||
.help("Name of the AWS S3 bucket to use an external relish storage")
|
||||
.requires("relish-storage-region"),
|
||||
.help("Name of the AWS S3 bucket to use an external remote storage")
|
||||
.requires("remote-storage-region"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-region")
|
||||
.long("relish-storage-region")
|
||||
Arg::with_name("remote-storage-region")
|
||||
.long("remote-storage-region")
|
||||
.takes_value(true)
|
||||
.help("Region of the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-access-key")
|
||||
.long("relish-storage-access-key")
|
||||
Arg::with_name("remote-storage-access-key")
|
||||
.long("remote-storage-access-key")
|
||||
.takes_value(true)
|
||||
.help("Credentials to access the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-secret-access-key")
|
||||
.long("relish-storage-secret-access-key")
|
||||
Arg::with_name("remote-storage-secret-access-key")
|
||||
.long("remote-storage-secret-access-key")
|
||||
.takes_value(true)
|
||||
.help("Credentials to access the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("relish-storage-max-concurrent-sync")
|
||||
.long("relish-storage-max-concurrent-sync")
|
||||
Arg::with_name("remote-storage-max-concurrent-sync")
|
||||
.long("remote-storage-max-concurrent-sync")
|
||||
.takes_value(true)
|
||||
.help("Maximum allowed concurrent synchronisations with storage"),
|
||||
)
|
||||
@@ -483,6 +488,11 @@ fn main() -> Result<()> {
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
|
||||
page_cache::init(conf);
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if init {
|
||||
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
|
||||
@@ -506,16 +516,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
|
||||
|
||||
let term_now = Arc::new(AtomicBool::new(false));
|
||||
for sig in TERM_SIGNALS {
|
||||
// When terminated by a second term signal, exit with exit code 1.
|
||||
// This will do nothing the first time (because term_now is false).
|
||||
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
|
||||
// But this will "arm" the above for the second time, by setting it to true.
|
||||
// The order of registering these is important, if you put this one first, it will
|
||||
// first arm and then terminate ‒ all in the first round.
|
||||
flag::register(*sig, Arc::clone(&term_now))?;
|
||||
}
|
||||
info!("version: {}", GIT_VERSION);
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -524,14 +525,15 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
"Starting pageserver http handler on {}",
|
||||
conf.listen_http_addr
|
||||
);
|
||||
let http_listener = TcpListener::bind(conf.listen_http_addr.clone())?;
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
|
||||
|
||||
info!(
|
||||
"Starting pageserver pg protocol handler on {}",
|
||||
conf.listen_pg_addr
|
||||
);
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_pg_addr.clone())?;
|
||||
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
|
||||
|
||||
// XXX: Don't spawn any threads before daemonizing!
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
@@ -546,21 +548,28 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
match daemonize.start() {
|
||||
// XXX: The parent process should exit abruptly right after
|
||||
// it has spawned a child to prevent coverage machinery from
|
||||
// dumping stats into a `profraw` file now owned by the child.
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => error!(%err, "could not daemonize"),
|
||||
}
|
||||
}
|
||||
|
||||
// keep join handles for spawned threads
|
||||
// don't spawn threads before daemonizing
|
||||
let mut join_handles = Vec::new();
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
let mut threads = Vec::new();
|
||||
|
||||
if let Some(handle) = relish_storage::run_storage_sync_thread(conf)? {
|
||||
join_handles.push(handle);
|
||||
let sync_startup = remote_storage::start_local_timeline_sync(conf)
|
||||
.context("Failed to set up local files sync with external storage")?;
|
||||
|
||||
if let Some(handle) = sync_startup.sync_loop_handle {
|
||||
threads.push(handle);
|
||||
}
|
||||
|
||||
// Initialize tenant manager.
|
||||
tenant_mgr::init(conf);
|
||||
tenant_mgr::set_timeline_states(conf, sync_startup.initial_timeline_states);
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
@@ -576,61 +585,55 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Spawn a new thread for the http endpoint
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
let cloned = auth.clone();
|
||||
let http_endpoint_thread = thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(move || {
|
||||
let router = http::make_router(conf, cloned);
|
||||
endpoint::serve_thread_main(router, http_listener)
|
||||
})?;
|
||||
|
||||
join_handles.push(http_endpoint_thread);
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(move || {
|
||||
let router = http::make_router(conf, cloned);
|
||||
endpoint::serve_thread_main(router, http_listener)
|
||||
})?,
|
||||
);
|
||||
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
let page_service_thread = thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(move || {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?;
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(move || {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?,
|
||||
);
|
||||
|
||||
for info in SignalsInfo::<WithOrigin>::new(TERM_SIGNALS)?.into_iter() {
|
||||
match info.signal {
|
||||
SIGQUIT => {
|
||||
info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode");
|
||||
exit(111);
|
||||
}
|
||||
SIGINT | SIGTERM => {
|
||||
info!("Got SIGINT/SIGTERM. Terminate gracefully in fast shutdown mode");
|
||||
// Terminate postgres backends
|
||||
postgres_backend::set_pgbackend_shutdown_requested();
|
||||
// Stop all tenants and flush their data
|
||||
tenant_mgr::shutdown_all_tenants()?;
|
||||
// Wait for pageservice thread to complete the job
|
||||
page_service_thread
|
||||
signals.handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
info!(
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
std::process::exit(111);
|
||||
}
|
||||
|
||||
Signal::Interrupt | Signal::Terminate => {
|
||||
info!(
|
||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
|
||||
postgres_backend::set_pgbackend_shutdown_requested();
|
||||
tenant_mgr::shutdown_all_tenants()?;
|
||||
endpoint::shutdown();
|
||||
|
||||
for handle in std::mem::take(&mut threads) {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
|
||||
// Shut down http router
|
||||
endpoint::shutdown();
|
||||
|
||||
// Wait for all threads
|
||||
for handle in join_handles.into_iter() {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
}
|
||||
info!("Pageserver shut down successfully completed");
|
||||
exit(0);
|
||||
}
|
||||
unknown_signal => {
|
||||
debug!("Unknown signal {}", unknown_signal);
|
||||
}
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -644,21 +647,24 @@ mod tests {
|
||||
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
|
||||
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
upload_distance: Some("upload_distance_VALUE".to_string()),
|
||||
upload_period: Some("upload_period_VALUE".to_string()),
|
||||
reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
|
||||
page_cache_size: Some("page_cache_size_VALUE".to_string()),
|
||||
max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
),
|
||||
auth_type: Some("auth_type_VALUE".to_string()),
|
||||
relish_storage: Some(RelishStorage::Local {
|
||||
local_path: "relish_storage_local_VALUE".to_string(),
|
||||
remote_storage: Some(RemoteStorage::Local {
|
||||
local_path: "remote_storage_local_VALUE".to_string(),
|
||||
}),
|
||||
relish_storage_max_concurrent_sync: Some(
|
||||
"relish_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
remote_storage_max_concurrent_sync: Some(
|
||||
"remote_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
remote_storage_max_sync_errors: Some(
|
||||
"remote_storage_max_sync_errors_VALUE".to_string(),
|
||||
),
|
||||
};
|
||||
|
||||
@@ -670,18 +676,19 @@ mod tests {
|
||||
listen_http_addr = 'listen_http_addr_VALUE'
|
||||
checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
upload_distance = 'upload_distance_VALUE'
|
||||
upload_period = 'upload_period_VALUE'
|
||||
reconstruct_threshold = 'reconstruct_threshold_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
open_mem_limit = 'open_mem_limit_VALUE'
|
||||
page_cache_size = 'page_cache_size_VALUE'
|
||||
max_file_descriptors = 'max_file_descriptors_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
|
||||
remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
|
||||
remote_storage_max_sync_errors = 'remote_storage_max_sync_errors_VALUE'
|
||||
|
||||
[relish_storage]
|
||||
local_path = 'relish_storage_local_VALUE'
|
||||
[remote_storage]
|
||||
local_path = 'remote_storage_local_VALUE'
|
||||
"#,
|
||||
toml_pretty_string
|
||||
);
|
||||
@@ -707,24 +714,27 @@ local_path = 'relish_storage_local_VALUE'
|
||||
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
|
||||
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
upload_distance: Some("upload_distance_VALUE".to_string()),
|
||||
upload_period: Some("upload_period_VALUE".to_string()),
|
||||
reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
|
||||
page_cache_size: Some("page_cache_size_VALUE".to_string()),
|
||||
max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
),
|
||||
auth_type: Some("auth_type_VALUE".to_string()),
|
||||
relish_storage: Some(RelishStorage::AwsS3 {
|
||||
remote_storage: Some(RemoteStorage::AwsS3 {
|
||||
bucket_name: "bucket_name_VALUE".to_string(),
|
||||
bucket_region: "bucket_region_VALUE".to_string(),
|
||||
access_key_id: Some("access_key_id_VALUE".to_string()),
|
||||
secret_access_key: Some("secret_access_key_VALUE".to_string()),
|
||||
}),
|
||||
relish_storage_max_concurrent_sync: Some(
|
||||
"relish_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
remote_storage_max_concurrent_sync: Some(
|
||||
"remote_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
remote_storage_max_sync_errors: Some(
|
||||
"remote_storage_max_sync_errors_VALUE".to_string(),
|
||||
),
|
||||
};
|
||||
|
||||
@@ -736,17 +746,18 @@ local_path = 'relish_storage_local_VALUE'
|
||||
listen_http_addr = 'listen_http_addr_VALUE'
|
||||
checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
upload_distance = 'upload_distance_VALUE'
|
||||
upload_period = 'upload_period_VALUE'
|
||||
reconstruct_threshold = 'reconstruct_threshold_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
open_mem_limit = 'open_mem_limit_VALUE'
|
||||
page_cache_size = 'page_cache_size_VALUE'
|
||||
max_file_descriptors = 'max_file_descriptors_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
|
||||
remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
|
||||
remote_storage_max_sync_errors = 'remote_storage_max_sync_errors_VALUE'
|
||||
|
||||
[relish_storage]
|
||||
[remote_storage]
|
||||
bucket_name = 'bucket_name_VALUE'
|
||||
bucket_region = 'bucket_region_VALUE'
|
||||
"#,
|
||||
@@ -759,7 +770,7 @@ bucket_region = 'bucket_region_VALUE'
|
||||
.expect("Failed to deserialize the prettified serialization result of the config");
|
||||
|
||||
let mut expected_params = params;
|
||||
expected_params.relish_storage = Some(RelishStorage::AwsS3 {
|
||||
expected_params.remote_storage = Some(RemoteStorage::AwsS3 {
|
||||
bucket_name: "bucket_name_VALUE".to_string(),
|
||||
bucket_region: "bucket_region_VALUE".to_string(),
|
||||
access_key_id: None,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
// TODO: move all paths construction to conf impl
|
||||
//
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use postgres_ffi::ControlFileData;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
@@ -21,9 +21,10 @@ use zenith_utils::logging;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{repository::Repository, PageServerConf};
|
||||
use crate::{repository::RepositoryTimeline, tenant_mgr};
|
||||
use crate::{restore_local_repo, LOG_FILE_NAME};
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -35,15 +36,14 @@ pub struct BranchInfo {
|
||||
pub ancestor_id: Option<String>,
|
||||
pub ancestor_lsn: Option<String>,
|
||||
pub current_logical_size: usize,
|
||||
pub current_logical_size_non_incremental: usize,
|
||||
pub current_logical_size_non_incremental: Option<usize>,
|
||||
}
|
||||
|
||||
impl BranchInfo {
|
||||
pub fn from_path<T: AsRef<Path>>(
|
||||
path: T,
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
repo: &Arc<dyn Repository>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
) -> Result<Self> {
|
||||
let name = path
|
||||
.as_ref()
|
||||
@@ -54,29 +54,29 @@ impl BranchInfo {
|
||||
.to_string();
|
||||
let timeline_id = std::fs::read_to_string(path)?.parse::<ZTimelineId>()?;
|
||||
|
||||
let timeline = repo.get_timeline(timeline_id)?;
|
||||
let timeline = match repo.get_timeline(timeline_id)? {
|
||||
RepositoryTimeline::Local(local_entry) => local_entry,
|
||||
RepositoryTimeline::Remote(_) => {
|
||||
bail!("Timeline {} is remote, no branches to display", timeline_id)
|
||||
}
|
||||
};
|
||||
|
||||
let ancestor_path = conf.ancestor_path(&timeline_id, tenantid);
|
||||
let mut ancestor_id: Option<String> = None;
|
||||
let mut ancestor_lsn: Option<String> = None;
|
||||
// we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
|
||||
let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() {
|
||||
Some(ancestor_id) => (
|
||||
Some(ancestor_id.to_string()),
|
||||
Some(timeline.get_ancestor_lsn().to_string()),
|
||||
),
|
||||
None => (None, None),
|
||||
};
|
||||
|
||||
if ancestor_path.exists() {
|
||||
let ancestor = std::fs::read_to_string(ancestor_path)?;
|
||||
let mut strings = ancestor.split('@');
|
||||
|
||||
ancestor_id = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
ancestor_lsn = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
}
|
||||
// non incremental size calculation can be heavy, so let it be optional
|
||||
// needed for tests to check size calculation
|
||||
let current_logical_size_non_incremental = include_non_incremental_logical_size
|
||||
.then(|| {
|
||||
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name,
|
||||
@@ -85,8 +85,7 @@ impl BranchInfo {
|
||||
ancestor_id,
|
||||
ancestor_lsn,
|
||||
current_logical_size: timeline.get_current_logical_size(),
|
||||
current_logical_size_non_incremental: timeline
|
||||
.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())?,
|
||||
current_logical_size_non_incremental,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -145,19 +144,23 @@ pub fn create_repo(
|
||||
|
||||
info!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
let tli = create_timeline(conf, None, &tenantid)?;
|
||||
// create a new timeline directory
|
||||
let timeline_id = ZTimelineId::generate();
|
||||
let timelinedir = conf.timeline_path(&timeline_id, &tenantid);
|
||||
|
||||
let repo = Arc::new(crate::buffered_repository::BufferedRepository::new(
|
||||
crashsafe_dir::create_dir(&timelinedir)?;
|
||||
|
||||
let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
|
||||
conf,
|
||||
wal_redo_manager,
|
||||
tenantid,
|
||||
false,
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
|
||||
// Load data into pageserver
|
||||
// TODO To implement zenith import we need to
|
||||
// move data loading out of create_repo()
|
||||
bootstrap_timeline(conf, tenantid, tli, repo.as_ref())?;
|
||||
bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?;
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
@@ -182,7 +185,6 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
.args(&["-D", initdbpath.to_str().unwrap()])
|
||||
.args(&["-U", &conf.superuser])
|
||||
.args(&["-E", "utf8"])
|
||||
.arg("--no-instructions")
|
||||
// This is only used for a temporary installation that is deleted shortly after,
|
||||
// so no need to fsync it
|
||||
@@ -225,13 +227,15 @@ fn bootstrap_timeline(
|
||||
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
let timeline = repo.create_empty_timeline(tli)?;
|
||||
// Initdb lsn will be equal to last_record_lsn which will be set after import.
|
||||
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
|
||||
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(
|
||||
&pgdata_path,
|
||||
timeline.writer().as_ref(),
|
||||
lsn,
|
||||
)?;
|
||||
timeline.checkpoint()?;
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
println!(
|
||||
"created initial timeline {} timeline.lsn {}",
|
||||
@@ -249,29 +253,38 @@ fn bootstrap_timeline(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_tenants(conf: &PageServerConf) -> Result<Vec<String>> {
|
||||
let tenants_dir = conf.tenants_path();
|
||||
|
||||
std::fs::read_dir(&tenants_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
ensure!(dir_entry.file_type()?.is_dir());
|
||||
Ok(dir_entry.file_name().to_str().unwrap().to_owned())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
|
||||
pub(crate) fn get_branches(
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
include_non_incremental_logical_size: bool,
|
||||
) -> Result<Vec<BranchInfo>> {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
|
||||
|
||||
// Each branch has a corresponding record (text file) in the refs/branches
|
||||
// with timeline_id.
|
||||
let branches_dir = conf.branches_path(tenantid);
|
||||
|
||||
std::fs::read_dir(&branches_dir)?
|
||||
std::fs::read_dir(&branches_dir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Found no branches directory '{}' for tenant {}",
|
||||
branches_dir.display(),
|
||||
tenantid
|
||||
)
|
||||
})?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
BranchInfo::from_path(dir_entry.path(), conf, tenantid, &repo)
|
||||
let dir_entry = dir_entry_res.with_context(|| {
|
||||
format!(
|
||||
"Failed to list branches directory '{}' content for tenant {}",
|
||||
branches_dir.display(),
|
||||
tenantid
|
||||
)
|
||||
})?;
|
||||
BranchInfo::from_path(
|
||||
dir_entry.path(),
|
||||
&repo,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
@@ -289,7 +302,10 @@ pub(crate) fn create_branch(
|
||||
}
|
||||
|
||||
let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
|
||||
let timeline = repo.get_timeline(startpoint.timelineid)?;
|
||||
let timeline = repo
|
||||
.get_timeline(startpoint.timelineid)?
|
||||
.local_timeline()
|
||||
.ok_or_else(|| anyhow!("Cannot branch off the timeline that's not present locally"))?;
|
||||
if startpoint.lsn == Lsn(0) {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = timeline.get_last_record_lsn();
|
||||
@@ -314,26 +330,26 @@ pub(crate) fn create_branch(
|
||||
);
|
||||
}
|
||||
|
||||
// create a new timeline directory for it
|
||||
let newtli = create_timeline(conf, Some(startpoint), tenantid)?;
|
||||
let new_timeline_id = ZTimelineId::generate();
|
||||
|
||||
// Let the Repository backend do its initialization
|
||||
repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;
|
||||
// Forward entire timeline creation routine to repository
|
||||
// backend, so it can do all needed initialization
|
||||
repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?;
|
||||
|
||||
// Remember the human-readable branch name for the new timeline.
|
||||
// FIXME: there's a race condition, if you create a branch with the same
|
||||
// name concurrently.
|
||||
let data = newtli.to_string();
|
||||
let data = new_timeline_id.to_string();
|
||||
fs::write(conf.branch_path(branchname, tenantid), data)?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name: branchname.to_string(),
|
||||
timeline_id: newtli,
|
||||
timeline_id: new_timeline_id,
|
||||
latest_valid_lsn: startpoint.lsn,
|
||||
ancestor_id: None,
|
||||
ancestor_lsn: None,
|
||||
ancestor_id: Some(startpoint.timelineid.to_string()),
|
||||
ancestor_lsn: Some(startpoint.lsn.to_string()),
|
||||
current_logical_size: 0,
|
||||
current_logical_size_non_incremental: 0,
|
||||
current_logical_size_non_incremental: Some(0),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -409,24 +425,3 @@ fn parse_point_in_time(
|
||||
|
||||
bail!("could not parse point-in-time {}", s);
|
||||
}
|
||||
|
||||
fn create_timeline(
|
||||
conf: &PageServerConf,
|
||||
ancestor: Option<PointInTime>,
|
||||
tenantid: &ZTenantId,
|
||||
) -> Result<ZTimelineId> {
|
||||
// Create initial timeline
|
||||
|
||||
let timelineid = ZTimelineId::generate();
|
||||
|
||||
let timelinedir = conf.timeline_path(&timelineid, tenantid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
fs::write(timelinedir.join("ancestor"), data)?;
|
||||
}
|
||||
|
||||
Ok(timelineid)
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
@@ -17,6 +17,98 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
/v1/timeline/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: List tenant timelines
|
||||
responses:
|
||||
"200":
|
||||
description: array of brief timeline descriptions
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
# currently, just a timeline id string, but when remote index gets to be accessed
|
||||
# remote/local timeline field would be added at least
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/timeline/{tenant_id}/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: Get timeline info for tenant's remote timeline
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no branch name
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/branch/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -25,6 +117,11 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
get:
|
||||
description: Get branches for tenant
|
||||
responses:
|
||||
@@ -73,6 +170,11 @@ paths:
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
get:
|
||||
description: Get branches for tenant
|
||||
responses:
|
||||
@@ -164,13 +266,13 @@ paths:
|
||||
description: Get tenants list
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
description: TenantInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
@@ -243,6 +345,16 @@ components:
|
||||
scheme: bearer
|
||||
bearerFormat: JWT
|
||||
schemas:
|
||||
TenantInfo:
|
||||
type: object
|
||||
required:
|
||||
- id
|
||||
- state
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
state:
|
||||
type: string
|
||||
BranchInfo:
|
||||
type: object
|
||||
required:
|
||||
@@ -250,7 +362,6 @@ components:
|
||||
- timeline_id
|
||||
- latest_valid_lsn
|
||||
- current_logical_size
|
||||
- current_logical_size_non_incremental
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
@@ -265,6 +376,36 @@ components:
|
||||
type: integer
|
||||
current_logical_size_non_incremental:
|
||||
type: integer
|
||||
TimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
- timeline_id
|
||||
- tenant_id
|
||||
- last_record_lsn
|
||||
- prev_record_lsn
|
||||
- start_lsn
|
||||
- disk_consistent_lsn
|
||||
properties:
|
||||
timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
ancestor_timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
last_record_lsn:
|
||||
type: string
|
||||
prev_record_lsn:
|
||||
type: string
|
||||
start_lsn:
|
||||
type: string
|
||||
disk_consistent_lsn:
|
||||
type: string
|
||||
timeline_state:
|
||||
type: string
|
||||
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use routerify::{ext::RequestExt, RouterBuilder};
|
||||
use serde::Serialize;
|
||||
use tracing::*;
|
||||
use zenith_utils::auth::JwtAuth;
|
||||
use zenith_utils::http::endpoint::attach_openapi_ui;
|
||||
@@ -18,10 +19,13 @@ use zenith_utils::http::{
|
||||
request::get_request_param,
|
||||
request::parse_request_param,
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{opt_display_serde, ZTimelineId};
|
||||
|
||||
use super::models::BranchCreateRequest;
|
||||
use super::models::TenantCreateRequest;
|
||||
use crate::branches::BranchInfo;
|
||||
use crate::repository::TimelineSyncState;
|
||||
use crate::{branches, tenant_mgr, PageServerConf, ZTenantId};
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -86,31 +90,144 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
Ok(json_response(StatusCode::CREATED, response_data)?)
|
||||
}
|
||||
|
||||
// Gate non incremental logical size calculation behind a flag
|
||||
// after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines
|
||||
// and tenants it can take noticeable amount of time. Also the value currently used only in tests
|
||||
fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
|
||||
request
|
||||
.uri()
|
||||
.query()
|
||||
.map(|v| {
|
||||
url::form_urlencoded::parse(v.as_bytes())
|
||||
.into_owned()
|
||||
.any(|(param, _)| param == "include-non-incremental-logical-size")
|
||||
})
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
|
||||
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
|
||||
|
||||
check_permission(&request, Some(tenantid))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
|
||||
crate::branches::get_branches(get_config(&request), &tenantid)
|
||||
crate::branches::get_branches(
|
||||
get_config(&request),
|
||||
&tenantid,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
// TODO add to swagger
|
||||
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
|
||||
let conf = get_state(&request).conf;
|
||||
let path = conf.branch_path(&branch_name, &tenantid);
|
||||
|
||||
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
BranchInfo::from_path(path, conf, &tenantid, &repo)
|
||||
BranchInfo::from_path(path, &repo, include_non_incremental_logical_size)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let conf = get_state(&request).conf;
|
||||
let timelines_dir = conf.timelines_path(&tenant_id);
|
||||
|
||||
let mut timelines_dir_contents =
|
||||
tokio::fs::read_dir(&timelines_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to list timelines dir '{}' contents",
|
||||
timelines_dir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut local_timelines = Vec::new();
|
||||
while let Some(entry) = timelines_dir_contents.next_entry().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to list timelines dir '{}' contents",
|
||||
timelines_dir.display()
|
||||
)
|
||||
})? {
|
||||
let entry_path = entry.path();
|
||||
let entry_type = entry.file_type().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to get file type of timeline dirs' entry '{}'",
|
||||
entry_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
if entry_type.is_dir() {
|
||||
match entry.file_name().to_string_lossy().parse::<ZTimelineId>() {
|
||||
Ok(timeline_id) => local_timelines.push(timeline_id.to_string()),
|
||||
Err(e) => error!(
|
||||
"Failed to get parse timeline id from timeline dirs' entry '{}': {}",
|
||||
entry_path.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(json_response(StatusCode::OK, local_timelines)?)
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct TimelineInfo {
|
||||
#[serde(with = "hex")]
|
||||
timeline_id: ZTimelineId,
|
||||
#[serde(with = "hex")]
|
||||
tenant_id: ZTenantId,
|
||||
#[serde(with = "opt_display_serde")]
|
||||
ancestor_timeline_id: Option<ZTimelineId>,
|
||||
last_record_lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
start_lsn: Lsn,
|
||||
disk_consistent_lsn: Lsn,
|
||||
timeline_state: Option<TimelineSyncState>,
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter =
|
||||
info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)
|
||||
.entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
match repo.get_timeline(timeline_id)?.local_timeline() {
|
||||
None => bail!("Timeline with id {} is not present locally", timeline_id),
|
||||
Some(timeline) => Ok::<_, anyhow::Error>(TimelineInfo {
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
|
||||
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
||||
last_record_lsn: timeline.get_last_record_lsn(),
|
||||
prev_record_lsn: timeline.get_prev_record_lsn(),
|
||||
start_lsn: timeline.get_start_lsn(),
|
||||
timeline_state: repo.get_timeline_state(timeline_id),
|
||||
}),
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -124,7 +241,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
crate::branches::get_tenants(get_config(&request))
|
||||
crate::tenant_mgr::list_tenants()
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -174,6 +291,11 @@ pub fn make_router(
|
||||
router
|
||||
.data(Arc::new(State::new(conf, auth)))
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/timeline/:tenant_id", timeline_list_handler)
|
||||
.get(
|
||||
"/v1/timeline/:tenant_id/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
)
|
||||
.get("/v1/branch/:tenant_id", branch_list_handler)
|
||||
.get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler)
|
||||
.post("/v1/branch", branch_create_handler)
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,12 +1,56 @@
# Overview

The on-disk format is based on immutable files. The page server receives a
stream of incoming WAL, parses the WAL records to determine which pages they
apply to, and accumulates the incoming changes in memory. Every now and then,
the accumulated changes are written out to new immutable files. This process is
called checkpointing. Old versions of on-disk files that are not needed by any
timeline are removed by the GC process.

The main responsibility of the Page Server is to process the incoming WAL, and
reprocess it into a format that allows reasonably quick access to any page
version.

The incoming WAL contains updates to arbitrary pages in the system. The
distribution depends on the workload: the updates could be totally random, or
there could be a long stream of updates to a single relation when data is bulk
loaded, for example, or something in between. The page server slices the
incoming WAL per relation and page, and packages the sliced WAL into
suitably-sized "layer files". The layer files contain all the history of the
database, back to some reasonable retention period. This system replaces the
base backups and the WAL archive used in a traditional PostgreSQL
installation. The layer files are immutable: they are not modified in place
after creation. New layer files are created for new incoming WAL, and old layer
files are removed when they are no longer needed. We could also replace layer
files with new files that contain the same information, merging small files for
example, but that hasn't been implemented yet.


Cloud Storage            Page Server                        Safekeeper
                  Local disk           Memory                  WAL

|AAAA|            |AAAA|AAAA|          |AA
|BBBB|            |BBBB|BBBB|          |
|CCCC|CCCC| <---- |CCCC|CCCC|CCCC| <--- |CC   <----   ADEBAABED
|DDDD|DDDD|       |DDDD|DDDD|          |DDD
|EEEE|            |EEEE|EEEE|EEEE|     |E


In this illustration, WAL is received as a stream from the Safekeeper, from the
right. It is immediately captured by the page server and stored quickly in
memory. The page server memory can be thought of as a quick "reorder buffer",
used to hold the incoming WAL and reorder it so that we keep the WAL records for
the same page and relation close to each other.

From the page server memory, whenever enough WAL has been accumulated for one
relation segment, it is moved to local disk, as a new layer file, and the memory
is released.

From the local disk, the layers are further copied to Cloud Storage for
long-term archival. After a layer has been copied to Cloud Storage, it can be
removed from local disk, although we currently keep everything locally for fast
access. If a layer is needed that isn't found locally, it is fetched from Cloud
Storage and stored on local disk.

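To make the flow above concrete, here is a minimal sketch of the "reorder buffer" idea: group incoming WAL per page, then flush the whole batch to a new immutable layer file once enough has accumulated. Everything in it (`PageKey`, `ReorderBuffer`, the byte threshold) is an invented simplification for illustration, not the pageserver's actual types or API.

```rust
// Illustrative only: slicing incoming WAL per page and flushing a batch once
// enough has accumulated. The real pageserver uses SegmentTag / InMemoryLayer
// and far more state than this.
use std::collections::BTreeMap;

type Lsn = u64;

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct PageKey {
    rel: u32,   // hypothetical relation id
    blkno: u32, // block number within the relation
}

#[derive(Default)]
struct ReorderBuffer {
    // Incoming WAL records, grouped per page and kept in LSN order.
    pending: BTreeMap<PageKey, Vec<(Lsn, Vec<u8>)>>,
    pending_bytes: usize,
}

impl ReorderBuffer {
    fn ingest(&mut self, key: PageKey, lsn: Lsn, rec: Vec<u8>) {
        self.pending_bytes += rec.len();
        self.pending.entry(key).or_default().push((lsn, rec));
    }

    /// Once enough WAL has accumulated, hand the whole batch to the caller to
    /// write out as an immutable layer file, and release the memory.
    fn take_if_full(&mut self, threshold: usize) -> Option<BTreeMap<PageKey, Vec<(Lsn, Vec<u8>)>>> {
        if self.pending_bytes < threshold {
            return None;
        }
        self.pending_bytes = 0;
        Some(std::mem::take(&mut self.pending))
    }
}
```
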
# Terms used in layered repository

@@ -14,32 +58,9 @@ process.
- Segment - one slice of a Relish that is stored in a LayeredTimeline.
- Layer - specific version of a relish Segment in a range of LSNs.

Layers can be InMemory or OnDisk:
- InMemory layer is not durably stored and needs to be rebuilt from WAL on pageserver start.
- OnDisk layer is durably stored.
# Layer map

OnDisk layers can be Image or Delta:
- ImageLayer represents an image or a snapshot of a segment at one particular LSN.
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs.

Dropped segments are always represented on disk by DeltaLayer.

LSN range defined by start_lsn and end_lsn:
- start_lsn is inclusive.
- end_lsn is exclusive.

For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen
in-memory layer or a delta layer, it is a valid end bound. An image
layer represents a snapshot at one LSN, so end_lsn is always the
snapshot LSN + 1.

Layers can be open or historical:
- Open layer is a writeable one. Only an InMemory layer can be open.
  FIXME: If an open layer is dropped, it is not writeable, so it should be turned into historical,
  but this is not implemented yet - see bug #569.
- Historical layer is one that cannot be modified anymore. Currently only OnDisk layers can be historical.

- LayerMap - a map that tracks what layers exist for all the relishes in a timeline.
The LayerMap tracks what layers exist for all the relishes in a timeline.

LayerMap consists of two data structures:
- segs - All the layers keyed by segment tag
@@ -54,8 +75,55 @@ TODO: Are there any exceptions to this?
For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN,
including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap.

TODO:
Describe GC and checkpoint interval settings.

# Different kinds of layers

A layer can be in different states:

- Open - a layer that new WAL records can be appended to.
- Closed - a layer that is read-only; no new WAL records can be appended to it.
- Historic: synonym for Closed.
- InMemory: A layer that needs to be rebuilt from WAL on pageserver start.
  To avoid OOM errors, InMemory layers can be spilled to disk into an ephemeral file.
- OnDisk: A layer that is stored on disk. If its end-LSN is older than
  disk_consistent_lsn, it is known to be fully flushed and fsync'd to local disk.
- Frozen layer: an in-memory layer that is Closed.

TODO: Clarify the difference between Closed, Historic and Frozen.

There are two kinds of OnDisk layers (see the sketch below):
- ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN.
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one
  relish segment.

Dropped segments are always represented on disk by DeltaLayer.

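The states and layer kinds above can be summarised in a small sketch. The enum below is illustrative only and much simpler than the real `Layer` trait and the `InMemoryLayer`/`ImageLayer`/`DeltaLayer` implementations; it just captures the distinctions this section describes.

```rust
// Illustrative only: a simplified model of the layer kinds described above.
// The real pageserver types carry much more state (tenant/timeline ids,
// bookfile handles, page version maps, etc.).
use std::ops::Range;

type Lsn = u64;

enum InMemoryState {
    Open,   // still accepts new WAL records
    Frozen, // closed, waiting to be written out as on-disk layers
}

enum SketchLayer {
    // Not durably stored; rebuilt from WAL (or an ephemeral spill file) on restart.
    InMemory {
        start_lsn: Lsn, // inclusive; end is implicitly MAX_LSN while open
        state: InMemoryState,
    },
    // A full image of one segment at exactly one LSN (end_lsn = lsn + 1).
    Image {
        lsn: Lsn,
    },
    // WAL records / page images for one segment over [start_lsn, end_lsn).
    Delta {
        lsn_range: Range<Lsn>, // start inclusive, end exclusive
        dropped: bool,         // dropped segments are represented by a DeltaLayer
    },
}
```
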
# Layer life cycle

LSN range defined by start_lsn and end_lsn:
- start_lsn is inclusive.
- end_lsn is exclusive.

For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen in-memory
layer or a delta layer, it is a valid end bound. An image layer represents a
snapshot at one LSN, so end_lsn is always the snapshot LSN + 1.

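A tiny sketch of the containment rule this implies, under the assumption of a plain `u64` LSN type: a layer covers an LSN exactly when `start_lsn <= lsn < end_lsn`, with `MAX_LSN` standing in for the open upper bound of an open in-memory layer. The names here are illustrative, not the actual API.

```rust
// Half-open LSN range as described above: start inclusive, end exclusive.
type Lsn = u64;
const MAX_LSN: Lsn = u64::MAX;

struct LsnRange {
    start_lsn: Lsn, // inclusive
    end_lsn: Lsn,   // exclusive (MAX_LSN for an open in-memory layer)
}

impl LsnRange {
    fn covers(&self, lsn: Lsn) -> bool {
        self.start_lsn <= lsn && lsn < self.end_lsn
    }

    /// An open in-memory layer has no end bound yet.
    fn for_open_inmemory(start_lsn: Lsn) -> Self {
        LsnRange { start_lsn, end_lsn: MAX_LSN }
    }

    /// An image layer taken at `lsn` covers exactly that one LSN.
    fn for_image(lsn: Lsn) -> Self {
        LsnRange { start_lsn: lsn, end_lsn: lsn + 1 }
    }
}
```
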
Every layer starts its life as an Open In-Memory layer. When the page server
receives the first WAL record for a segment, it creates a new In-Memory layer
for it, and puts it to the layer map. Later, when the layer is old enough, its
contents are written to disk, as On-Disk layers. This process is called
"evicting" a layer.

Layer eviction is a two-step process: First, the layer is marked as closed, so
that it no longer accepts new WAL records, and the layer map is updated
accordingly. If a new WAL record for that segment arrives after this step, a new
Open layer is created to hold it. After this first step, the layer is in the
Closed InMemory state. This first step is called "freezing" the layer.

In the second step, new Delta and Image layers are created, containing all the
data in the Frozen InMemory layer. When the new layers are ready, the original
frozen layer is replaced with the new layers in the layer map, and the original
frozen layer is dropped, releasing the memory.

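The two-step eviction can be sketched roughly as follows. The types and the `LayerMap` shape here are stand-ins for illustration; the real code does this under the timeline's locks and writes actual Delta and Image files in step two.

```rust
// Illustrative sketch of freeze-then-evict, not the actual pageserver code.
struct InMemoryLayer { frozen: bool /* plus buffered WAL, LSNs, ... */ }
struct OnDiskLayer;                   // stands in for a written Delta or Image layer
struct LayerMap { open: Option<InMemoryLayer>, historic: Vec<OnDiskLayer> }

impl LayerMap {
    /// Step 1: "freeze" the open layer so it stops accepting new WAL.
    /// Any WAL arriving after this goes into a brand new open layer.
    fn freeze_open_layer(&mut self) -> Option<InMemoryLayer> {
        let mut layer = self.open.take()?;
        layer.frozen = true;
        Some(layer)
    }

    /// Step 2: replace the frozen layer with the on-disk layers created from
    /// its contents, then drop it to release the memory.
    fn evict(&mut self, frozen: InMemoryLayer, written: Vec<OnDiskLayer>) {
        assert!(frozen.frozen);
        self.historic.extend(written);
        drop(frozen); // memory released
    }
}
```
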
# Layer files (On-disk layers)
@@ -366,6 +434,8 @@ is a newer layer file there. TODO: This optimization hasn't been
implemented! The GC algorithm will currently keep the file on the
'main' branch anyway, for as long as the child branch exists.

TODO:
Describe GC and checkpoint interval settings.

# TODO: On LSN ranges

@@ -1,16 +1,17 @@
|
||||
use std::{fs::File, io::Write};
|
||||
use std::io::{Read, Write};
|
||||
use std::os::unix::prelude::FileExt;
|
||||
|
||||
use anyhow::Result;
|
||||
use bookfile::{BookWriter, BoundedReader, ChapterId, ChapterWriter};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct BlobRange {
|
||||
offset: u64,
|
||||
size: usize,
|
||||
pub offset: u64,
|
||||
pub size: usize,
|
||||
}
|
||||
|
||||
pub fn read_blob(reader: &BoundedReader<&'_ File>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
pub fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
let mut buf = vec![0u8; range.size];
|
||||
reader.read_exact_at(&mut buf, range.offset)?;
|
||||
Ok(buf)
|
||||
@@ -28,14 +29,14 @@ impl<W: Write> BlobWriter<W> {
|
||||
Self { writer, offset: 0 }
|
||||
}
|
||||
|
||||
pub fn write_blob(&mut self, blob: &[u8]) -> Result<BlobRange> {
|
||||
self.writer.write_all(blob)?;
|
||||
pub fn write_blob_from_reader(&mut self, r: &mut impl Read) -> Result<BlobRange> {
|
||||
let len = std::io::copy(r, &mut self.writer)?;
|
||||
|
||||
let range = BlobRange {
|
||||
offset: self.offset,
|
||||
size: blob.len(),
|
||||
size: len as usize,
|
||||
};
|
||||
self.offset += blob.len() as u64;
|
||||
self.offset += len as u64;
|
||||
Ok(range)
|
||||
}
|
||||
|
||||
|
||||
@@ -39,9 +39,12 @@
|
||||
//!
|
||||
use crate::layered_repository::blob::BlobWriter;
|
||||
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
|
||||
use crate::repository::{PageReconstructData, PageReconstructResult, PageVersion};
|
||||
use crate::waldecoder;
|
||||
use crate::layered_repository::page_versions::PageVersions;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walrecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
@@ -52,7 +55,6 @@ use zenith_utils::vec_map::VecMap;
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -117,16 +119,16 @@ impl From<&DeltaLayer> for Summary {
|
||||
pub struct DeltaLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub seg: SegmentTag,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
//
|
||||
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
|
||||
// start is inclusive, and end is exclusive.
|
||||
//
|
||||
pub start_lsn: Lsn,
|
||||
pub end_lsn: Lsn,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
|
||||
dropped: bool,
|
||||
|
||||
@@ -138,12 +140,25 @@ pub struct DeltaLayerInner {
|
||||
/// loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
book: Option<Book<VirtualFile>>,
|
||||
|
||||
/// All versions of all pages in the file are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: VecMap<(u32, Lsn), BlobRange>,
|
||||
page_version_metas: VecMap<(SegmentTag, u32, Lsn), BlobRange>,
|
||||
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
relsizes: VecMap<(SegmentTag, Lsn), u32>,
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
fn get_seg_size(&self, seg: SegmentTag, lsn: Lsn) -> Result<u32> {
|
||||
let slice = self.relsizes.slice_range((seg, Lsn(0))..=(seg, lsn));
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
Ok(*entry)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("could not find seg size in delta layer"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -178,30 +193,48 @@ impl Layer for DeltaLayer {
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
let mut need_image = true;
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
assert!(seg.blknum_in_seg(blknum));
|
||||
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if &self.end_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
// TODO: avoid opening the file for each read
|
||||
let (_path, book) = self.open_book()?;
|
||||
let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
let inner = self.load()?;
|
||||
let page_version_reader = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.expect("should be loaded in load call above")
|
||||
.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
|
||||
// Scan the metadata BTreeMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let minkey = (seg, blknum, Lsn(0));
|
||||
let maxkey = (seg, blknum, lsn);
|
||||
let iter = inner
|
||||
.page_version_metas
|
||||
.slice_range((Included(&minkey), Included(&maxkey)))
|
||||
.iter()
|
||||
.rev();
|
||||
for ((_blknum, pv_lsn), blob_range) in iter {
|
||||
for ((_seg, _blknum, pv_lsn), blob_range) in iter {
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if pv_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
|
||||
|
||||
match pv {
|
||||
@@ -223,6 +256,15 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
// If we didn't find any records for this, check if the request is beyond EOF
|
||||
if need_image
|
||||
&& reconstruct_data.records.is_empty()
|
||||
&& seg.rel.is_blocky()
|
||||
&& blknum - seg.segno * RELISH_SEG_SIZE >= inner.get_seg_size(seg, lsn)?
|
||||
{
|
||||
return Ok(PageReconstructResult::Missing(self.start_lsn));
|
||||
}
|
||||
|
||||
// release metadata lock and close the file
|
||||
}
|
||||
|
||||
@@ -236,7 +278,7 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
fn get_seg_size(&self, seg: SegmentTag, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
@@ -245,19 +287,13 @@ impl Layer for DeltaLayer {
|
||||
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.load()?;
|
||||
let slice = inner
|
||||
.relsizes
|
||||
.slice_range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
Ok(*entry)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("could not find seg size in delta layer"))
|
||||
}
|
||||
inner.get_seg_size(seg, lsn)
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
fn get_seg_exists(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
assert_eq!(self.seg, seg, "range get_seg_exists not supported"); // TODO
|
||||
|
||||
// Is the requested LSN after the rel was dropped?
|
||||
if self.dropped && lsn >= self.end_lsn {
|
||||
return Ok(false);
|
||||
@@ -276,6 +312,11 @@ impl Layer for DeltaLayer {
|
||||
inner.page_version_metas = VecMap::default();
|
||||
inner.relsizes = VecMap::default();
|
||||
inner.loaded = false;
|
||||
|
||||
// Note: we keep the Book open. Is that a good idea? The virtual file
|
||||
// machinery has its own rules for closing the file descriptor if it's not
|
||||
// needed, but the Book struct uses up some memory, too.
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -289,6 +330,10 @@ impl Layer for DeltaLayer {
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
@@ -298,24 +343,30 @@ impl Layer for DeltaLayer {
|
||||
|
||||
println!("--- relsizes ---");
|
||||
let inner = self.load()?;
|
||||
for (k, v) in inner.relsizes.as_slice() {
|
||||
println!(" {}: {}", k, v);
|
||||
for ((seg, lsn), v) in inner.relsizes.as_slice() {
|
||||
println!(" {}@{}: {}", seg, lsn, v);
|
||||
}
|
||||
println!("--- page versions ---");
|
||||
let (_path, book) = self.open_book()?;
|
||||
|
||||
let path = self.path();
|
||||
let file = std::fs::File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
for ((seg, blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
let mut desc = String::new();
|
||||
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
let pv = PageVersion::des(&buf)?;
|
||||
|
||||
write!(&mut desc, "{}", seg)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
let wal_desc = walrecord::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
@@ -334,19 +385,6 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
pub fn versions(&self) -> Result<Vec<(u32, Lsn, PageVersion)>> {
|
||||
let mut versions: Vec<(u32, Lsn, PageVersion)> = Vec::new();
|
||||
let inner = self.load()?;
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
versions.push((*blk, *lsn, PageVersion::des(&buf)?));
|
||||
}
|
||||
Ok(versions)
|
||||
}
|
||||
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
timelineid: ZTimelineId,
|
||||
@@ -362,14 +400,14 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
/// Create a new delta file, using the given page versions and relsizes.
|
||||
/// The page versions are passed by an iterator; the iterator must return
|
||||
/// page versions in blknum+lsn order.
|
||||
/// The page versions are passed in a PageVersions struct. If 'cutoff' is
|
||||
/// given, only page versions with LSN < cutoff are included.
|
||||
///
|
||||
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
|
||||
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||
/// expedient.
|
||||
/// This is used to write the in-memory layer to disk. The page_versions and
|
||||
/// relsizes are thus passed in the same format as they are in the in-memory
|
||||
/// layer, as that's expedient.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn create<'a>(
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
@@ -377,14 +415,22 @@ impl DeltaLayer {
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
nosync: bool,
|
||||
page_versions: &PageVersions,
|
||||
cutoff: Option<Lsn>,
|
||||
relsizes: &[(Lsn, u32)],
|
||||
) -> Result<DeltaLayer> {
|
||||
if seg.rel.is_blocky() {
|
||||
assert!(!relsizes.is_empty());
|
||||
}
|
||||
|
||||
let relsizes = {
|
||||
let mut m = VecMap::default();
|
||||
for &(lsn, size) in relsizes {
|
||||
m.append((seg, lsn), size).unwrap();
|
||||
}
|
||||
m
|
||||
};
|
||||
|
||||
let delta_layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -394,31 +440,37 @@ impl DeltaLayer {
|
||||
end_lsn,
|
||||
dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: true,
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes,
|
||||
}),
|
||||
};
|
||||
let mut inner = delta_layer.inner.lock().unwrap();
|
||||
|
||||
// Write the in-memory btreemaps into a file
|
||||
let path = delta_layer.path();
|
||||
|
||||
// Write the data into a file
|
||||
//
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let file = File::create(&path)?;
|
||||
let path = delta_layer.path();
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
|
||||
|
||||
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
|
||||
|
||||
for (blknum, lsn, page_version) in page_versions {
|
||||
let buf = PageVersion::ser(page_version)?;
|
||||
let blob_range = page_version_writer.write_blob(&buf)?;
|
||||
let page_versions_iter = page_versions.ordered_page_version_iter(cutoff);
|
||||
for (blknum, lsn, pos) in page_versions_iter {
|
||||
let blob_range =
|
||||
page_version_writer.write_blob_from_reader(&mut page_versions.reader(pos)?)?;
|
||||
|
||||
inner
|
||||
.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.append((seg, blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
@@ -452,9 +504,8 @@ impl DeltaLayer {
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
if !nosync {
|
||||
writer.get_ref().sync_all()?;
|
||||
}
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
drop(inner);
|
||||
@@ -462,15 +513,6 @@ impl DeltaLayer {
|
||||
Ok(delta_layer)
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = self.path();
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
Ok((path, book))
|
||||
}
|
||||
|
||||
///
|
||||
/// Load the contents of the file into memory
|
||||
///
|
||||
@@ -482,7 +524,14 @@ impl DeltaLayer {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let (path, book) = self.open_book()?;
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.book.is_none() {
|
||||
let file = VirtualFile::open(&path)?;
|
||||
inner.book = Some(Book::new(file)?);
|
||||
}
|
||||
let book = inner.book.as_ref().unwrap();
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
@@ -517,11 +566,9 @@ impl DeltaLayer {
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
*inner = DeltaLayerInner {
|
||||
loaded: true,
|
||||
page_version_metas,
|
||||
relsizes,
|
||||
};
|
||||
inner.page_version_metas = page_version_metas;
|
||||
inner.relsizes = relsizes;
|
||||
inner.loaded = true;
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
@@ -537,12 +584,13 @@ impl DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
seg: filename.start_seg,
|
||||
start_lsn: filename.start_lsn,
|
||||
end_lsn: filename.end_lsn,
|
||||
dropped: filename.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
}),
|
||||
@@ -552,7 +600,10 @@ impl DeltaLayer {
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<Self> {
|
||||
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
{
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
|
||||
@@ -566,6 +617,7 @@ impl DeltaLayer {
|
||||
dropped: summary.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
}),
|
||||
@@ -574,7 +626,11 @@ impl DeltaLayer {
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_seg: self.seg,
|
||||
end_seg: SegmentTag {
|
||||
rel: self.seg.rel,
|
||||
segno: self.seg.segno + 1,
|
||||
},
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
|
||||
pageserver/src/layered_repository/ephemeral_file.rs (new file, 298 lines)
@@ -0,0 +1,298 @@
|
||||
//! Implementation of an append-only file data structure
|
||||
//! used to keep in-memory layers spilled on disk.
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::page_cache::{ReadBufResult, WriteBufResult};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::PageServerConf;
|
||||
use lazy_static::lazy_static;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom, Write};
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
lazy_static! {
|
||||
///
|
||||
/// This is the global cache of file descriptors (File objects).
|
||||
///
|
||||
static ref EPHEMERAL_FILES: RwLock<EphemeralFiles> = RwLock::new(EphemeralFiles {
|
||||
next_file_id: 1,
|
||||
files: HashMap::new(),
|
||||
});
|
||||
}
|
||||
|
||||
pub struct EphemeralFiles {
|
||||
next_file_id: u64,
|
||||
|
||||
files: HashMap<u64, Arc<VirtualFile>>,
|
||||
}
|
||||
|
||||
pub struct EphemeralFile {
|
||||
file_id: u64,
|
||||
_tenantid: ZTenantId,
|
||||
_timelineid: ZTimelineId,
|
||||
file: Arc<VirtualFile>,
|
||||
|
||||
pos: u64,
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
pub fn create(
|
||||
conf: &PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<EphemeralFile, std::io::Error> {
|
||||
let mut l = EPHEMERAL_FILES.write().unwrap();
|
||||
let file_id = l.next_file_id;
|
||||
l.next_file_id += 1;
|
||||
|
||||
let filename = conf
|
||||
.timeline_path(&timelineid, &tenantid)
|
||||
.join(PathBuf::from(format!("ephemeral-{}", file_id)));
|
||||
|
||||
let file = VirtualFile::open_with_options(
|
||||
&filename,
|
||||
OpenOptions::new().read(true).write(true).create(true),
|
||||
)?;
|
||||
let file_rc = Arc::new(file);
|
||||
l.files.insert(file_id, file_rc.clone());
|
||||
|
||||
Ok(EphemeralFile {
|
||||
file_id,
|
||||
_tenantid: tenantid,
|
||||
_timelineid: timelineid,
|
||||
file: file_rc,
|
||||
pos: 0,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
|
||||
let mut off = 0;
|
||||
while off < PAGE_SZ {
|
||||
let n = self
|
||||
.file
|
||||
.read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
|
||||
|
||||
if n == 0 {
|
||||
// Reached EOF. Fill the rest of the buffer with zeros.
|
||||
const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
|
||||
|
||||
buf[off..].copy_from_slice(&ZERO_BUF[off..]);
|
||||
break;
|
||||
}
|
||||
|
||||
off += n as usize;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl FileExt for EphemeralFile {
|
||||
fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
let len = min(PAGE_SZ - off, dstbuf.len());
|
||||
|
||||
let read_guard;
|
||||
let mut write_guard;
|
||||
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache.read_ephemeral_buf(self.file_id, blkno) {
|
||||
ReadBufResult::Found(guard) => {
|
||||
read_guard = guard;
|
||||
read_guard.as_ref()
|
||||
}
|
||||
ReadBufResult::NotFound(guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
write_guard = guard;
|
||||
self.fill_buffer(write_guard.deref_mut(), blkno)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// And then fall through to read the requested slice from the
|
||||
// buffer.
|
||||
write_guard.as_ref()
|
||||
}
|
||||
};
|
||||
|
||||
dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
let len = min(PAGE_SZ - off, srcbuf.len());
|
||||
|
||||
let mut write_guard;
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache.write_ephemeral_buf(self.file_id, blkno) {
|
||||
WriteBufResult::Found(guard) => {
|
||||
write_guard = guard;
|
||||
write_guard.deref_mut()
|
||||
}
|
||||
WriteBufResult::NotFound(guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
// TODO: if we're overwriting the whole page, no need to read it in first
|
||||
write_guard = guard;
|
||||
self.fill_buffer(write_guard.deref_mut(), blkno)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// And then fall through to modify it.
|
||||
write_guard.deref_mut()
|
||||
}
|
||||
};
|
||||
|
||||
buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
|
||||
write_guard.mark_dirty();
|
||||
Ok(len)
|
||||
}
|
||||
}
|
||||
|
||||
impl Write for EphemeralFile {
|
||||
fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
|
||||
let n = self.write_at(buf, self.pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> Result<(), std::io::Error> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for EphemeralFile {
|
||||
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(offset) => {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(_offset) => {
|
||||
return Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
"SeekFrom::End not supported by EphemeralFile",
|
||||
));
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
if pos < 0 {
|
||||
return Err(Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
"offset would be negative",
|
||||
));
|
||||
}
|
||||
if pos > u64::MAX as i128 {
|
||||
return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
|
||||
}
|
||||
self.pos = pos as u64;
|
||||
}
|
||||
}
|
||||
Ok(self.pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// drop all pages from page cache
|
||||
let cache = page_cache::get();
|
||||
cache.drop_buffers_for_ephemeral(self.file_id);
|
||||
|
||||
// remove entry from the hash map
|
||||
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
|
||||
|
||||
// unlink file
|
||||
// FIXME: print error
|
||||
let _ = std::fs::remove_file(&self.file.path);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64)?;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
"could not write back page, not found in ephemeral files hash",
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
|
||||
fn repo_harness(
|
||||
test_name: &str,
|
||||
) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
|
||||
|
||||
Ok((conf, tenantid, timelineid))
|
||||
}
|
||||
|
||||
// Helper function to slurp the contents of a file, at the given offset,
// into a string
|
||||
fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, Error> {
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0u8);
|
||||
|
||||
efile.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
Ok(String::from_utf8_lossy(&buf)
|
||||
.trim_end_matches('\0')
|
||||
.to_string())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_files() -> Result<(), Error> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;
|
||||
|
||||
let mut file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
file_a.write_all(b"foo")?;
|
||||
assert_eq!("foo", read_string(&file_a, 0, 20)?);
|
||||
|
||||
file_a.write_all(b"bar")?;
|
||||
assert_eq!("foobar", read_string(&file_a, 0, 20)?);
|
||||
|
||||
// Open a lot of files, enough to cause some page evictions.
|
||||
let mut efiles = Vec::new();
|
||||
for fileno in 0..100 {
|
||||
let mut efile = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
efile.write_all(format!("file {}", fileno).as_bytes())?;
|
||||
assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
|
||||
efiles.push((fileno, efile));
|
||||
}
|
||||
|
||||
// Check that all the files can still be read from. Use them in random order for
|
||||
// good measure.
|
||||
efiles.as_mut_slice().shuffle(&mut thread_rng());
|
||||
for (fileno, efile) in efiles.iter_mut() {
|
||||
assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -5,7 +5,7 @@ use crate::layered_repository::storage_layer::SegmentTag;
|
||||
use crate::relish::*;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use std::fmt;
|
||||
use std::fmt::{self, Write};
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -13,10 +13,111 @@ use anyhow::Result;
|
||||
use log::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::metadata::METADATA_FILE_NAME;
|
||||
|
||||
fn parse_seg(input: &mut &str) -> Option<SegmentTag> {
|
||||
let rel = if let Some(rest) = input.strip_prefix("rel_") {
|
||||
let mut parts = rest.splitn(5, '_');
|
||||
let rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
*input = parts.next()?;
|
||||
debug_assert!(parts.next().is_none());
|
||||
rel
|
||||
} else if let Some(rest) = input.strip_prefix("pg_xact_") {
|
||||
let (segno, rest) = rest.split_once('_')?;
|
||||
*input = rest;
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(segno, 16).ok()?,
|
||||
}
|
||||
} else if let Some(rest) = input.strip_prefix("pg_multixact_members_") {
|
||||
let (segno, rest) = rest.split_once('_')?;
|
||||
*input = rest;
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(segno, 16).ok()?,
|
||||
}
|
||||
} else if let Some(rest) = input.strip_prefix("pg_multixact_offsets_") {
|
||||
let (segno, rest) = rest.split_once('_')?;
|
||||
*input = rest;
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(segno, 16).ok()?,
|
||||
}
|
||||
} else if let Some(rest) = input.strip_prefix("pg_filenodemap_") {
|
||||
let mut parts = rest.splitn(3, '_');
|
||||
let rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
*input = parts.next()?;
|
||||
debug_assert!(parts.next().is_none());
|
||||
rel
|
||||
} else if let Some(rest) = input.strip_prefix("pg_twophase_") {
|
||||
let (xid, rest) = rest.split_once('_')?;
|
||||
*input = rest;
|
||||
RelishTag::TwoPhase {
|
||||
xid: xid.parse::<u32>().ok()?,
|
||||
}
|
||||
} else if let Some(rest) = input.strip_prefix("pg_control_checkpoint_") {
|
||||
*input = rest;
|
||||
RelishTag::Checkpoint
|
||||
} else if let Some(rest) = input.strip_prefix("pg_control_") {
|
||||
*input = rest;
|
||||
RelishTag::ControlFile
|
||||
} else {
|
||||
return None;
|
||||
};
|
||||
|
||||
let (segno, rest) = input.split_once('_')?;
|
||||
*input = rest;
|
||||
|
||||
Some(SegmentTag {
|
||||
rel,
|
||||
segno: segno.parse().ok()?,
|
||||
})
|
||||
}
|
||||
|
||||
fn write_seg(seg: &SegmentTag) -> String {
|
||||
let mut s = match seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
|
||||
write!(&mut s, "_{}", seg.segno).unwrap();
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct DeltaFileName {
|
||||
pub seg: SegmentTag,
|
||||
pub start_seg: SegmentTag,
|
||||
pub end_seg: SegmentTag,
|
||||
pub start_lsn: Lsn,
|
||||
pub end_lsn: Lsn,
|
||||
pub dropped: bool,
|
||||
@@ -36,59 +137,12 @@ impl DeltaFileName {
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::TwoPhase {
|
||||
xid: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Checkpoint;
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::ControlFile;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
|
||||
let segno = parts.next()?.parse::<u32>().ok()?;
|
||||
|
||||
let seg = SegmentTag { rel, segno };
|
||||
let mut rest = fname;
|
||||
let start_seg = parse_seg(&mut rest)?;
|
||||
let end_seg = parse_seg(&mut rest)?;
|
||||
debug_assert!(start_seg < end_seg);
|
||||
|
||||
let mut parts = rest.split('_');
|
||||
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
|
||||
@@ -105,7 +159,8 @@ impl DeltaFileName {
|
||||
}
|
||||
|
||||
Some(DeltaFileName {
|
||||
seg,
|
||||
start_seg,
|
||||
end_seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
@@ -115,36 +170,14 @@ impl DeltaFileName {
|
||||
|
||||
impl fmt::Display for DeltaFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
let start_seg = write_seg(&self.start_seg);
|
||||
let end_seg = write_seg(&self.end_seg);
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{}_{}_{:016X}_{:016X}{}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
start_seg,
|
||||
end_seg,
|
||||
u64::from(self.start_lsn),
|
||||
u64::from(self.end_lsn),
|
||||
if self.dropped { "_DROPPED" } else { "" }
|
||||
@@ -154,7 +187,8 @@ impl fmt::Display for DeltaFileName {
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct ImageFileName {
|
||||
pub seg: SegmentTag,
|
||||
pub start_seg: SegmentTag,
|
||||
pub end_seg: SegmentTag,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -169,103 +203,31 @@ impl ImageFileName {
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::TwoPhase {
|
||||
xid: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Checkpoint;
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::ControlFile;
|
||||
} else {
|
||||
let mut rest = fname;
|
||||
let start_seg = parse_seg(&mut rest)?;
|
||||
let end_seg = parse_seg(&mut rest)?;
|
||||
debug_assert!(start_seg < end_seg);
|
||||
|
||||
if rest.contains('_') {
|
||||
return None;
|
||||
}
|
||||
|
||||
let segno = parts.next()?.parse::<u32>().ok()?;
|
||||
let lsn = Lsn::from_hex(rest).ok()?;
|
||||
|
||||
let seg = SegmentTag { rel, segno };
|
||||
|
||||
let lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
|
||||
if parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(ImageFileName { seg, lsn })
|
||||
Some(ImageFileName {
|
||||
start_seg,
|
||||
end_seg,
|
||||
lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ImageFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
let start_seg = write_seg(&self.start_seg);
|
||||
let end_seg = write_seg(&self.end_seg);
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{}_{}_{:016X}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
u64::from(self.lsn),
|
||||
)
|
||||
write!(f, "{}_{}_{:016X}", start_seg, end_seg, u64::from(self.lsn),)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -290,7 +252,7 @@ pub fn list_files(
|
||||
deltafiles.push(deltafilename);
|
||||
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
|
||||
imgfiles.push(imgfilename);
|
||||
} else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") {
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
|
||||
pageserver/src/layered_repository/global_layer_map.rs (new file, 142 lines)
@@ -0,0 +1,142 @@
|
||||
//!
|
||||
//! Global registry of open layers.
|
||||
//!
|
||||
//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
|
||||
//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
|
||||
//! in-memory layers in the system, and know when we need to evict some to release
|
||||
//! memory.
|
||||
//!
|
||||
//! Each layer is assigned a unique ID when it's registered in the global registry.
|
||||
//! The ID can be used to locate the layer later, without having to hold locks.
|
||||
//!
|
||||
|
||||
use std::sync::atomic::{AtomicU8, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use super::inmemory_layer::InMemoryLayer;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
const MAX_USAGE_COUNT: u8 = 5;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref GLOBAL_LAYER_MAP: RwLock<InMemoryLayers> =
|
||||
RwLock::new(InMemoryLayers::default());
|
||||
}
|
||||
|
||||
// TODO these types can probably be smaller
|
||||
#[derive(PartialEq, Eq, Clone, Copy)]
|
||||
pub struct LayerId {
|
||||
index: usize,
|
||||
tag: u64, // to avoid ABA problem
|
||||
}
|
||||
|
||||
enum SlotData {
|
||||
Occupied(Arc<InMemoryLayer>),
|
||||
/// Vacant slots form a linked list, the value is the index
|
||||
/// of the next vacant slot in the list.
|
||||
Vacant(Option<usize>),
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
tag: u64,
|
||||
data: SlotData,
|
||||
usage_count: AtomicU8, // for clock algorithm
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct InMemoryLayers {
|
||||
slots: Vec<Slot>,
|
||||
num_occupied: usize,
|
||||
|
||||
// Head of free-slot list.
|
||||
next_empty_slot_idx: Option<usize>,
|
||||
}
|
||||
|
||||
impl InMemoryLayers {
|
||||
pub fn insert(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
let slot_idx = match self.next_empty_slot_idx {
|
||||
Some(slot_idx) => slot_idx,
|
||||
None => {
|
||||
let idx = self.slots.len();
|
||||
self.slots.push(Slot {
|
||||
tag: 0,
|
||||
data: SlotData::Vacant(None),
|
||||
usage_count: AtomicU8::new(0),
|
||||
});
|
||||
idx
|
||||
}
|
||||
};
|
||||
let slots_len = self.slots.len();
|
||||
|
||||
let slot = &mut self.slots[slot_idx];
|
||||
|
||||
match slot.data {
|
||||
SlotData::Occupied(_) => {
|
||||
panic!("an occupied slot was in the free list");
|
||||
}
|
||||
SlotData::Vacant(next_empty_slot_idx) => {
|
||||
self.next_empty_slot_idx = next_empty_slot_idx;
|
||||
}
|
||||
}
|
||||
|
||||
slot.data = SlotData::Occupied(layer);
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
self.num_occupied += 1;
|
||||
assert!(self.num_occupied <= slots_len);
|
||||
|
||||
LayerId {
|
||||
index: slot_idx,
|
||||
tag: slot.tag,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
|
||||
let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
|
||||
if slot.tag != layer_id.tag {
|
||||
return None;
|
||||
}
|
||||
|
||||
if let SlotData::Occupied(layer) = &slot.data {
|
||||
let _ = slot.usage_count.fetch_update(
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
|old_usage_count| {
|
||||
if old_usage_count < MAX_USAGE_COUNT {
|
||||
Some(old_usage_count + 1)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
);
|
||||
Some(Arc::clone(layer))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// TODO this won't be a public API in the future
|
||||
pub fn remove(&mut self, layer_id: &LayerId) {
|
||||
let slot = &mut self.slots[layer_id.index];
|
||||
|
||||
if slot.tag != layer_id.tag {
|
||||
return;
|
||||
}
|
||||
|
||||
match &slot.data {
|
||||
SlotData::Occupied(_layer) => {
|
||||
// TODO evict the layer
|
||||
}
|
||||
SlotData::Vacant(_) => unimplemented!(),
|
||||
}
|
||||
|
||||
slot.data = SlotData::Vacant(self.next_empty_slot_idx);
|
||||
self.next_empty_slot_idx = Some(layer_id.index);
|
||||
|
||||
assert!(self.num_occupied > 0);
|
||||
self.num_occupied -= 1;
|
||||
|
||||
slot.tag = slot.tag.wrapping_add(1);
|
||||
}
|
||||
}
|
||||
@@ -21,33 +21,39 @@
|
||||
//!
|
||||
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
|
||||
//!
|
||||
use crate::layered_repository::blob::read_blob;
|
||||
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{Layer, SegmentTag, RELISH_SEG_SIZE};
|
||||
use crate::repository::{PageReconstructData, PageReconstructResult};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, SegmentTag,
|
||||
};
|
||||
use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::RELISH_SEG_SIZE;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::convert::TryInto;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::blob::BlobRange;
|
||||
|
||||
// Magic constant to identify a Zenith segment image file
|
||||
pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1;
|
||||
|
||||
/// Contains each block in block # order
|
||||
const BLOCKY_IMAGES_CHAPTER: u64 = 1;
|
||||
const NONBLOCKY_IMAGE_CHAPTER: u64 = 2;
|
||||
const BLOB_CHAPTER: u64 = 4;
|
||||
const META_CHAPTER: u64 = 5;
|
||||
|
||||
/// Contains the [`Summary`] struct
|
||||
const SUMMARY_CHAPTER: u64 = 3;
|
||||
@@ -84,29 +90,31 @@ const BLOCK_SIZE: usize = 8192;
|
||||
///
|
||||
pub struct ImageLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub seg: SegmentTag,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
// This entry contains an image of all pages as of this LSN
|
||||
pub lsn: Lsn,
|
||||
lsn: Lsn,
|
||||
|
||||
inner: Mutex<ImageLayerInner>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum ImageType {
|
||||
Blocky { num_blocks: u32 },
|
||||
NonBlocky,
|
||||
pub struct ImageLayerInner {
|
||||
/// If None, the 'image_type' has not been loaded into memory yet.
|
||||
book: Option<Book<VirtualFile>>,
|
||||
|
||||
meta: VecMap<SegmentTag, BlobRange>,
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If false, the 'image_type' has not been
|
||||
/// loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
/// Derived from filename and bookfile chapter metadata
|
||||
image_type: ImageType,
|
||||
impl ImageLayerInner {
|
||||
fn get_seg_blob_range(&self, seg: SegmentTag) -> Result<BlobRange> {
|
||||
self.meta
|
||||
.as_slice()
|
||||
.binary_search_by_key(&&seg, |(seg, _meta)| seg)
|
||||
.map(|idx| self.meta.as_slice()[idx].1.clone())
|
||||
.map_err(|_| anyhow!("segment not found in ImageLayer"))
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
@@ -142,36 +150,46 @@ impl Layer for ImageLayer {
|
||||
/// Look up given page in the file
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
assert!(lsn >= self.lsn);
|
||||
|
||||
match cached_img_lsn {
|
||||
Some(cached_lsn) if self.lsn <= cached_lsn => return Ok(PageReconstructResult::Cached),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let base_blknum = blknum % RELISH_SEG_SIZE;
|
||||
|
||||
let (_path, book) = self.open_book()?;
|
||||
let blob_range = inner.get_seg_blob_range(seg)?;
|
||||
|
||||
let buf = match &inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => {
|
||||
if base_blknum >= *num_blocks {
|
||||
return Ok(PageReconstructResult::Missing(lsn));
|
||||
}
|
||||
let chapter = inner.book.as_ref().unwrap().chapter_reader(BLOB_CHAPTER)?;
|
||||
|
||||
let mut buf = vec![0u8; BLOCK_SIZE];
|
||||
let offset = BLOCK_SIZE as u64 * base_blknum as u64;
|
||||
|
||||
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
chapter.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
buf
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
ensure!(base_blknum == 0);
|
||||
book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?.into_vec()
|
||||
let buf = if seg.rel.is_blocky() {
|
||||
// Check if the request is beyond EOF
|
||||
if base_blknum >= get_num_blocks(&blob_range) {
|
||||
return Ok(PageReconstructResult::Missing(lsn));
|
||||
}
|
||||
|
||||
let mut buf = vec![0u8; BLOCK_SIZE];
|
||||
|
||||
let block_offset = BLOCK_SIZE as u64 * base_blknum as u64;
|
||||
assert!(block_offset + BLOCK_SIZE as u64 <= blob_range.size as u64);
|
||||
|
||||
let offset = blob_range.offset + block_offset;
|
||||
|
||||
chapter.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
buf
|
||||
} else {
|
||||
ensure!(base_blknum == 0);
|
||||
read_blob(&chapter, &blob_range)?
|
||||
};
|
||||
|
||||
reconstruct_data.page_img = Some(Bytes::from(buf));
|
||||
@@ -179,27 +197,29 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Get size of the segment
|
||||
fn get_seg_size(&self, _lsn: Lsn) -> Result<u32> {
|
||||
let inner = self.load()?;
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => Ok(num_blocks),
|
||||
ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")),
|
||||
fn get_seg_size(&self, seg: SegmentTag, _lsn: Lsn) -> Result<u32> {
|
||||
if !self.seg.rel.is_blocky() {
|
||||
bail!("get_seg_size called for non-blocky segment");
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let blob_range = inner.get_seg_blob_range(seg)?;
|
||||
Ok(get_num_blocks(&blob_range))
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
|
||||
Ok(true)
|
||||
fn get_seg_exists(&self, seg: SegmentTag, _lsn: Lsn) -> Result<bool> {
|
||||
let inner = self.load()?;
|
||||
|
||||
Ok(inner
|
||||
.meta
|
||||
.as_slice()
|
||||
.binary_search_by_key(&&seg, |(seg, _meta)| seg)
|
||||
.is_ok())
|
||||
}
|
||||
|
||||
///
|
||||
/// Release most of the memory used by this layer. If it's accessed again later,
|
||||
/// it will need to be loaded back.
|
||||
///
|
||||
fn unload(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.image_type = ImageType::Blocky { num_blocks: 0 };
|
||||
inner.loaded = false;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -213,6 +233,10 @@ impl Layer for ImageLayer {
|
||||
false
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
@@ -222,12 +246,11 @@ impl Layer for ImageLayer {
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
|
||||
ImageType::NonBlocky => {
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
println!("non-blocky ({} bytes)", chapter.len());
|
||||
for (seg, blob_range) in inner.meta.as_slice() {
|
||||
if seg.rel.is_blocky() {
|
||||
println!("{} ({}) blocks ", seg, get_num_blocks(blob_range));
|
||||
} else {
|
||||
println!("{} non-blocky ({} bytes)", seg, blob_range.size);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,60 +274,66 @@ impl ImageLayer {
|
||||
}
|
||||
|
||||
/// Create a new image file, using the given array of pages.
|
||||
pub fn create(
|
||||
fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
lsn: Lsn,
|
||||
base_images: Vec<Bytes>,
|
||||
nosync: bool,
|
||||
) -> Result<ImageLayer> {
|
||||
let image_type = if seg.rel.is_blocky() {
|
||||
let num_blocks: u32 = base_images.len().try_into()?;
|
||||
ImageType::Blocky { num_blocks }
|
||||
} else {
|
||||
assert_eq!(base_images.len(), 1);
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
|
||||
let layer = ImageLayer {
|
||||
let mut layer = ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
loaded: true,
|
||||
image_type: image_type.clone(),
|
||||
book: None,
|
||||
meta: VecMap::default(),
|
||||
}),
|
||||
};
|
||||
let inner = layer.inner.lock().unwrap();
|
||||
|
||||
// Write the images into a file
|
||||
let path = layer.path();
|
||||
//
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let file = File::create(&path)?;
|
||||
let path = layer.path();
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
|
||||
|
||||
let book = match &image_type {
|
||||
ImageType::Blocky { .. } => {
|
||||
let mut chapter = book.new_chapter(BLOCKY_IMAGES_CHAPTER);
|
||||
for block_bytes in base_images {
|
||||
assert_eq!(block_bytes.len(), BLOCK_SIZE);
|
||||
chapter.write_all(&block_bytes)?;
|
||||
}
|
||||
chapter.close()?
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
let mut chapter = book.new_chapter(NONBLOCKY_IMAGE_CHAPTER);
|
||||
chapter.write_all(&base_images[0])?;
|
||||
chapter.close()?
|
||||
let mut blob_chapter = book.new_chapter(BLOB_CHAPTER);
|
||||
|
||||
let size = if seg.rel.is_blocky() {
|
||||
for block_bytes in &base_images {
|
||||
assert_eq!(block_bytes.len(), BLOCK_SIZE);
|
||||
blob_chapter.write_all(block_bytes)?;
|
||||
}
|
||||
BLOCK_SIZE * base_images.len()
|
||||
} else {
|
||||
assert_eq!(base_images.len(), 1);
|
||||
blob_chapter.write_all(&base_images[0])?;
|
||||
base_images[0].len()
|
||||
};
|
||||
|
||||
let book = blob_chapter.close()?;
|
||||
|
||||
let inner = layer.inner.get_mut().unwrap();
|
||||
|
||||
inner
|
||||
.meta
|
||||
.append(seg, BlobRange { offset: 0, size })
|
||||
.unwrap();
|
||||
|
||||
let mut meta_chapter = book.new_chapter(META_CHAPTER);
|
||||
inner.meta.ser_into(&mut meta_chapter)?;
|
||||
let book = meta_chapter.close()?;
|
||||
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid,
|
||||
@@ -318,17 +347,13 @@ impl ImageLayer {
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
if !nosync {
|
||||
writer.get_ref().sync_all()?;
|
||||
}
|
||||
trace!("saved {}", path.display());
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
drop(inner);
|
||||
trace!("saved {}", path.display());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
/*
|
||||
// Create a new image file by materializing every page in a source layer
|
||||
// at given LSN.
|
||||
pub fn create_from_src(
|
||||
@@ -337,13 +362,14 @@ impl ImageLayer {
|
||||
src: &dyn Layer,
|
||||
lsn: Lsn,
|
||||
) -> Result<ImageLayer> {
|
||||
// TODO needs to become an image of all segments in the layer
|
||||
let seg = src.get_seg_tag();
|
||||
let timelineid = timeline.timelineid;
|
||||
|
||||
let startblk;
|
||||
let size;
|
||||
if seg.rel.is_blocky() {
|
||||
size = src.get_seg_size(lsn)?;
|
||||
size = src.get_seg_size(seg, lsn)?;
|
||||
startblk = seg.segno * RELISH_SEG_SIZE;
|
||||
} else {
|
||||
size = 1;
|
||||
@@ -366,7 +392,6 @@ impl ImageLayer {
|
||||
|
||||
Self::create(conf, timelineid, timeline.tenantid, seg, lsn, base_images)
|
||||
}
|
||||
*/
|
||||
|
||||
///
|
||||
/// Load the contents of the file into memory
|
||||
@@ -375,11 +400,19 @@ impl ImageLayer {
|
||||
// quick exit if already loaded
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
|
||||
if inner.loaded {
|
||||
if inner.book.is_some() {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let (path, book) = self.open_book()?;
|
||||
let path = self.path();
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
|
||||
let book = Book::new(file).with_context(|| {
|
||||
format!(
|
||||
"Failed to open virtual file '{}' as a bookfile",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
@@ -406,36 +439,18 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
let image_type = if self.seg.rel.is_blocky() {
|
||||
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
let images_len = chapter.len();
|
||||
ensure!(images_len % BLOCK_SIZE as u64 == 0);
|
||||
let num_blocks: u32 = (images_len / BLOCK_SIZE as u64).try_into()?;
|
||||
ImageType::Blocky { num_blocks }
|
||||
} else {
|
||||
let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
let meta = VecMap::des(&book.read_chapter(META_CHAPTER)?)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
*inner = ImageLayerInner {
|
||||
loaded: true,
|
||||
image_type,
|
||||
book: Some(book),
|
||||
meta,
|
||||
};
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = self.path();
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
Ok((path, book))
|
||||
}
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -447,11 +462,11 @@ impl ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
seg: filename.start_seg,
|
||||
lsn: filename.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
book: None,
|
||||
meta: VecMap::default(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -459,7 +474,10 @@ impl ImageLayer {
|
||||
/// Create an ImageLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<ImageLayer> {
|
||||
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<ImageLayer>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
{
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
|
||||
@@ -470,15 +488,19 @@ impl ImageLayer {
|
||||
seg: summary.seg,
|
||||
lsn: summary.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
book: None,
|
||||
meta: VecMap::default(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
start_seg: self.seg,
|
||||
end_seg: SegmentTag {
|
||||
rel: self.seg.rel,
|
||||
segno: self.seg.segno + 1,
|
||||
},
|
||||
lsn: self.lsn,
|
||||
}
|
||||
}
|
||||
@@ -493,3 +515,9 @@ impl ImageLayer {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Must only be called for blob ranges of blocky relishes.
|
||||
fn get_num_blocks(blob_range: &BlobRange) -> u32 {
|
||||
assert_eq!(blob_range.size % BLOCK_SIZE, 0);
|
||||
(blob_range.size / BLOCK_SIZE).try_into().unwrap()
|
||||
}
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
//! An in-memory layer stores recently received PageVersions.
|
||||
//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited
|
||||
//! and layers can be spilled to disk into ephemeral files.
|
||||
//!
|
||||
//! An in-memory layer stores recently received page versions in memory. The page versions
|
||||
//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
|
||||
//! And there's another BTreeMap to track the size of the relation.
|
||||
//!
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
use crate::layered_repository::filename::DeltaFileName;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
|
||||
@@ -12,14 +15,13 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
use super::page_versions::PageVersions;
|
||||
|
||||
@@ -47,7 +49,7 @@ pub struct InMemoryLayer {
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// Frozen in-memory layers have an exclusive end LSN.
|
||||
/// Frozen layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
@@ -90,8 +92,9 @@ impl InMemoryLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
// An in-memory layer doesn't really have a filename as it's not stored on disk,
|
||||
// but we construct a filename as if it was a delta layer
|
||||
// An in-memory layer can be spilled to disk into an ephemeral file.
|
||||
// This function is used only for debugging, so we don't need to be very precise.
|
||||
// Construct a filename as if it was a delta layer.
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
@@ -103,7 +106,11 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
let delta_filename = DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_seg: self.seg,
|
||||
end_seg: SegmentTag {
|
||||
rel: self.seg.rel,
|
||||
segno: self.seg.segno + 1,
|
||||
},
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn,
|
||||
dropped: inner.dropped,
|
||||
@@ -113,6 +120,10 @@ impl Layer for InMemoryLayer {
|
||||
PathBuf::from(format!("inmem-{}", delta_filename))
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
@@ -143,10 +154,14 @@ impl Layer for InMemoryLayer {
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
assert_eq!(self.seg, seg); // TODO
|
||||
|
||||
let mut need_image = true;
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
@@ -160,23 +175,41 @@ impl Layer for InMemoryLayer {
|
||||
.get_block_lsn_range(blknum, ..=lsn)
|
||||
.iter()
|
||||
.rev();
|
||||
for (entry_lsn, entry) in iter {
|
||||
if let Some(img) = &entry.page_image {
|
||||
reconstruct_data.page_img = Some(img.clone());
|
||||
need_image = false;
|
||||
break;
|
||||
} else if let Some(rec) = &entry.record {
|
||||
reconstruct_data.records.push((*entry_lsn, rec.clone()));
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
for (entry_lsn, pos) in iter {
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if entry_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = inner.page_versions.get_page_version(*pos)?;
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
reconstruct_data.page_img = Some(img);
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
PageVersion::Wal(rec) => {
|
||||
reconstruct_data.records.push((*entry_lsn, rec.clone()));
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we didn't find any records for this, check if the request is beyond EOF
|
||||
if need_image
|
||||
&& reconstruct_data.records.is_empty()
|
||||
&& self.seg.rel.is_blocky()
|
||||
&& blknum - self.seg.segno * RELISH_SEG_SIZE >= self.get_seg_size(seg, lsn)?
|
||||
{
|
||||
return Ok(PageReconstructResult::Missing(self.start_lsn));
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
}
|
||||
|
||||
@@ -194,7 +227,9 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
fn get_seg_size(&self, seg: SegmentTag, lsn: Lsn) -> Result<u32> {
|
||||
assert_eq!(self.seg, seg);
|
||||
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
@@ -206,7 +241,9 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
fn get_seg_exists(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
assert_eq!(self.seg, seg);
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// If the segment was created after the requested LSN,
|
||||
@@ -215,9 +252,13 @@ impl Layer for InMemoryLayer {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
// Is the requested LSN after the segment was dropped?
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
return Ok(false);
|
||||
if inner.dropped {
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
panic!("dropped in-memory layer with no end LSN");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,13 +276,17 @@ impl Layer for InMemoryLayer {
|
||||
/// Nothing to do here. When you drop the last reference to the layer, it will
|
||||
/// be deallocated.
|
||||
fn delete(&self) -> Result<()> {
|
||||
Ok(())
|
||||
panic!("can't delete an InMemoryLayer")
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.incremental
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
@@ -261,14 +306,14 @@ impl Layer for InMemoryLayer {
|
||||
println!("segsizes {}: {}", k, v);
|
||||
}
|
||||
|
||||
for (blknum, lsn, pv) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
println!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
blknum,
|
||||
lsn,
|
||||
pv.page_image.is_some(),
|
||||
pv.record.is_some()
|
||||
);
|
||||
for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
let pv = inner.page_versions.get_page_version(pos)?;
|
||||
let pv_description = match pv {
|
||||
PageVersion::Page(_img) => "page",
|
||||
PageVersion::Wal(_rec) => "wal",
|
||||
};
|
||||
|
||||
println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -281,12 +326,6 @@ pub struct LayersOnDisk {
|
||||
pub image_layers: Vec<ImageLayer>,
|
||||
}
|
||||
|
||||
impl LayersOnDisk {
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.delta_layers.is_empty() && self.image_layers.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
/// Return the oldest page version that's stored in this layer
|
||||
pub fn get_oldest_pending_lsn(&self) -> Lsn {
|
||||
@@ -317,6 +356,8 @@ impl InMemoryLayer {
|
||||
segsizes.append(start_lsn, 0).unwrap();
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
@@ -328,7 +369,7 @@ impl InMemoryLayer {
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::default(),
|
||||
page_versions: PageVersions::new(file),
|
||||
segsizes,
|
||||
}),
|
||||
})
|
||||
@@ -337,32 +378,18 @@ impl InMemoryLayer {
|
||||
// Write operations
|
||||
|
||||
/// Remember new page version, as a WAL record over previous version
|
||||
pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> u32 {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
PageVersion {
|
||||
page_image: None,
|
||||
record: Some(rec),
|
||||
},
|
||||
)
|
||||
pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
|
||||
}
|
||||
|
||||
/// Remember new page version, as a full page image
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> u32 {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
PageVersion {
|
||||
page_image: Some(img),
|
||||
record: None,
|
||||
},
|
||||
)
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Page(img))
|
||||
}
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> u32 {
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<u32> {
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
trace!(
|
||||
@@ -376,7 +403,7 @@ impl InMemoryLayer {
|
||||
|
||||
inner.assert_writeable();
|
||||
|
||||
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
|
||||
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv)?;
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
@@ -412,10 +439,7 @@ impl InMemoryLayer {
|
||||
// subsequent call to initialize the gap page.
|
||||
let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize;
|
||||
for gapblknum in gapstart..blknum {
|
||||
let zeropv = PageVersion {
|
||||
page_image: Some(ZERO_PAGE.clone()),
|
||||
record: None,
|
||||
};
|
||||
let zeropv = PageVersion::Page(ZERO_PAGE.clone());
|
||||
trace!(
|
||||
"filling gap blk {} with zeros for write of {}",
|
||||
gapblknum,
|
||||
@@ -423,7 +447,7 @@ impl InMemoryLayer {
|
||||
);
|
||||
let old = inner
|
||||
.page_versions
|
||||
.append_or_update_last(gapblknum, lsn, zeropv);
|
||||
.append_or_update_last(gapblknum, lsn, zeropv)?;
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
|
||||
if old.is_some() {
|
||||
@@ -435,11 +459,11 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
|
||||
return newsize - oldsize;
|
||||
return Ok(newsize - oldsize);
|
||||
}
|
||||
}
|
||||
|
||||
0
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
/// Remember that the relation was truncated at given LSN
|
||||
@@ -456,7 +480,7 @@ impl InMemoryLayer {
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
assert!(segsize < oldsize);
|
||||
|
||||
let old = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
|
||||
let (old, _delta_size) = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
@@ -504,10 +528,12 @@ impl InMemoryLayer {
|
||||
// Copy the segment size at the start LSN from the predecessor layer.
|
||||
let mut segsizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
let size = src.get_seg_size(start_lsn)?;
|
||||
let size = src.get_seg_size(seg, start_lsn)?;
|
||||
segsizes.append(start_lsn, size).unwrap();
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
@@ -519,7 +545,7 @@ impl InMemoryLayer {
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::default(),
|
||||
page_versions: PageVersions::new(file),
|
||||
segsizes,
|
||||
}),
|
||||
})
|
||||
@@ -589,8 +615,9 @@ impl InMemoryLayer {
|
||||
self.start_lsn,
|
||||
end_lsn_exclusive,
|
||||
true,
|
||||
inner.page_versions.ordered_page_version_iter(None),
|
||||
inner.segsizes.clone(),
|
||||
&inner.page_versions,
|
||||
None,
|
||||
inner.segsizes.as_slice(),
|
||||
)?;
|
||||
trace!(
|
||||
"freeze: created delta layer for dropped segment {} {}-{}",
|
||||
@@ -606,13 +633,9 @@ impl InMemoryLayer {
|
||||
|
||||
// Since `end_lsn` is inclusive, subtract 1.
|
||||
// We want to make an ImageLayer for the last included LSN,
|
||||
// so the DeltaLayer should exlcude that LSN.
|
||||
// so the DeltaLayer should exclude that LSN.
|
||||
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
|
||||
|
||||
let mut page_versions = inner
|
||||
.page_versions
|
||||
.ordered_page_version_iter(Some(end_lsn_inclusive));
|
||||
|
||||
let mut delta_layers = Vec::new();
|
||||
|
||||
if self.start_lsn != end_lsn_inclusive {
|
||||
@@ -626,8 +649,9 @@ impl InMemoryLayer {
|
||||
self.start_lsn,
|
||||
end_lsn_inclusive,
|
||||
false,
|
||||
page_versions,
|
||||
segsizes,
|
||||
&inner.page_versions,
|
||||
Some(end_lsn_inclusive),
|
||||
segsizes.as_slice(), // TODO avoid copy above
|
||||
)?;
|
||||
delta_layers.push(delta_layer);
|
||||
trace!(
|
||||
@@ -637,7 +661,11 @@ impl InMemoryLayer {
|
||||
end_lsn_inclusive
|
||||
);
|
||||
} else {
|
||||
assert!(page_versions.next().is_none());
|
||||
assert!(inner
|
||||
.page_versions
|
||||
.ordered_page_version_iter(None)
|
||||
.next()
|
||||
.is_none());
|
||||
}
|
||||
|
||||
drop(inner);
|
||||
|
||||
@@ -21,6 +21,8 @@ use std::sync::Arc;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
|
||||
|
||||
lazy_static! {
|
||||
static ref NUM_INMEMORY_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
|
||||
@@ -68,7 +70,9 @@ impl LayerMap {
|
||||
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
|
||||
segentry.open.as_ref().map(Arc::clone)
|
||||
segentry
|
||||
.open_layer_id
|
||||
.and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
|
||||
}
|
||||
|
||||
///
|
||||
@@ -77,7 +81,7 @@ impl LayerMap {
|
||||
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
|
||||
segentry.update_open(Arc::clone(&layer));
|
||||
let layer_id = segentry.update_open(Arc::clone(&layer));
|
||||
|
||||
let oldest_pending_lsn = layer.get_oldest_pending_lsn();
|
||||
|
||||
@@ -89,7 +93,7 @@ impl LayerMap {
|
||||
// Also add it to the binary heap
|
||||
let open_layer_entry = OpenLayerEntry {
|
||||
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
|
||||
layer,
|
||||
layer_id,
|
||||
generation: self.current_generation,
|
||||
};
|
||||
self.open_layers.push(open_layer_entry);
|
||||
@@ -97,24 +101,35 @@ impl LayerMap {
|
||||
NUM_INMEMORY_LAYERS.inc();
|
||||
}
|
||||
|
||||
/// Remove the oldest in-memory layer
|
||||
pub fn pop_oldest_open(&mut self) {
|
||||
// Pop it from the binary heap
|
||||
let oldest_entry = self.open_layers.pop().unwrap();
|
||||
let segtag = oldest_entry.layer.get_seg_tag();
|
||||
/// Remove an open in-memory layer
|
||||
pub fn remove_open(&mut self, layer_id: LayerId) {
|
||||
// Note: we don't try to remove the entry from the binary heap.
|
||||
// It will be removed lazily by peek_oldest_open() when it's made it to
|
||||
// the top of the heap.
|
||||
|
||||
// Also remove it from the SegEntry of this segment
|
||||
let mut segentry = self.segs.get_mut(&segtag).unwrap();
|
||||
if Arc::ptr_eq(segentry.open.as_ref().unwrap(), &oldest_entry.layer) {
|
||||
segentry.open = None;
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!oldest_entry.layer.is_writeable());
|
||||
assert!(oldest_entry.layer.is_dropped());
|
||||
let layer_opt = {
|
||||
let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
|
||||
let layer_opt = global_map.get(&layer_id);
|
||||
global_map.remove(&layer_id);
|
||||
// TODO it's bad that a ref can still exist after being evicted from cache
|
||||
layer_opt
|
||||
};
|
||||
|
||||
if let Some(layer) = layer_opt {
|
||||
let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
|
||||
|
||||
if segentry.open_layer_id == Some(layer_id) {
|
||||
// Also remove it from the SegEntry of this segment
|
||||
segentry.open_layer_id = None;
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!layer.is_writeable());
|
||||
assert!(layer.is_dropped());
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
}
|
||||
|
||||
///
|
||||
@@ -154,7 +169,7 @@ impl LayerMap {
|
||||
if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
|
||||
&& (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
|
||||
{
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
if let Some(exists) = segentry.exists_at_lsn(*seg, lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
@@ -162,7 +177,7 @@ impl LayerMap {
|
||||
}
|
||||
_ => {
|
||||
if tag == None {
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
if let Some(exists) = segentry.exists_at_lsn(*seg, lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
@@ -192,17 +207,24 @@ impl LayerMap {
|
||||
/// to avoid incorrectly making it visible.
|
||||
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
Ok(if let Some(segentry) = self.segs.get(&seg) {
|
||||
segentry.exists_at_lsn(lsn)?.unwrap_or(false)
|
||||
segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false)
|
||||
} else {
|
||||
false
|
||||
})
|
||||
}
|
||||
|
||||
/// Return the oldest in-memory layer, along with its generation number.
|
||||
pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
|
||||
self.open_layers
|
||||
.peek()
|
||||
.map(|oldest_entry| (Arc::clone(&oldest_entry.layer), oldest_entry.generation))
|
||||
pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<InMemoryLayer>, u64)> {
|
||||
let global_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
|
||||
while let Some(oldest_entry) = self.open_layers.peek() {
|
||||
if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
|
||||
return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
|
||||
} else {
|
||||
self.open_layers.pop();
|
||||
}
|
||||
}
|
||||
None
|
||||
}
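// A small illustrative sketch (not pageserver code) of the lazy-deletion pattern
// that remove_open()/peek_oldest_open() rely on: entries are never removed from
// the BinaryHeap directly; stale ones are discarded only when they surface at the
// top. The `alive` set here is a stand-in for the GLOBAL_LAYER_MAP lookup.
use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashSet};

struct LazyMinHeap {
    heap: BinaryHeap<Reverse<u64>>,
    alive: HashSet<u64>,
}

impl LazyMinHeap {
    fn new() -> Self {
        LazyMinHeap { heap: BinaryHeap::new(), alive: HashSet::new() }
    }

    fn push(&mut self, v: u64) {
        self.alive.insert(v);
        self.heap.push(Reverse(v));
    }

    /// Mark an entry as removed without touching the heap.
    fn remove(&mut self, v: u64) {
        self.alive.remove(&v);
    }

    /// Pop stale entries until the top of the heap is a live one.
    fn peek_oldest(&mut self) -> Option<u64> {
        while let Some(Reverse(v)) = self.heap.peek().copied() {
            if self.alive.contains(&v) {
                return Some(v);
            }
            self.heap.pop();
        }
        None
    }
}

fn main() {
    let mut h = LazyMinHeap::new();
    h.push(30);
    h.push(10);
    h.push(20);
    h.remove(10); // lazily removed: the entry still sits in the heap
    assert_eq!(h.peek_oldest(), Some(20)); // the stale 10 is skipped and popped
}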
|
||||
|
||||
/// Increment the generation number used to stamp open in-memory layers. Layers
|
||||
@@ -225,8 +247,12 @@ impl LayerMap {
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let Some(open) = &segentry.open {
|
||||
open.dump()?;
|
||||
if let Some(open) = &segentry.open_layer_id {
|
||||
if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
|
||||
layer.dump()?;
|
||||
} else {
|
||||
println!("layer not found in global map");
|
||||
}
|
||||
}
|
||||
|
||||
for layer in segentry.historic.iter() {
|
||||
@@ -259,26 +285,26 @@ impl IntervalItem for dyn Layer {
|
||||
/// IntervalTree.
|
||||
#[derive(Default)]
|
||||
struct SegEntry {
|
||||
open: Option<Arc<InMemoryLayer>>,
|
||||
open_layer_id: Option<LayerId>,
|
||||
historic: IntervalTree<dyn Layer>,
|
||||
}
|
||||
|
||||
impl SegEntry {
|
||||
/// Does the segment exist at given LSN?
|
||||
/// Return None if object is not found in this SegEntry.
|
||||
fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
|
||||
fn exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<Option<bool>> {
|
||||
if let Some(layer) = self.get(lsn) {
|
||||
Ok(Some(layer.get_seg_exists(lsn)?))
|
||||
Ok(Some(layer.get_seg_exists(seg, lsn)?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
if let Some(open) = &self.open {
|
||||
if open.get_start_lsn() <= lsn {
|
||||
let x: Arc<dyn Layer> = Arc::clone(open) as _;
|
||||
return Some(x);
|
||||
if let Some(open_layer_id) = &self.open_layer_id {
|
||||
let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
|
||||
if open_layer.get_start_lsn() <= lsn {
|
||||
return Some(open_layer);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -297,11 +323,16 @@ impl SegEntry {
|
||||
// Set new open layer for a SegEntry.
|
||||
// It's ok to rewrite previous open layer,
|
||||
// but only if it is not writeable anymore.
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
if let Some(prev_open) = &self.open {
|
||||
assert!(!prev_open.is_writeable());
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
if let Some(prev_open_layer_id) = &self.open_layer_id {
|
||||
if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
|
||||
{
|
||||
assert!(!prev_open_layer.is_writeable());
|
||||
}
|
||||
}
|
||||
self.open = Some(layer);
|
||||
let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
|
||||
self.open_layer_id = Some(open_layer_id);
|
||||
open_layer_id
|
||||
}
|
||||
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
@@ -316,9 +347,9 @@ impl SegEntry {
|
||||
/// recently-added entries (i.e after last call to increment_generation()) from older
|
||||
/// entries with the same 'oldest_pending_lsn'.
|
||||
struct OpenLayerEntry {
|
||||
pub oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
|
||||
pub generation: u64,
|
||||
pub layer: Arc<InMemoryLayer>,
|
||||
oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
|
||||
generation: u64,
|
||||
layer_id: LayerId,
|
||||
}
|
||||
impl Ord for OpenLayerEntry {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
@@ -383,6 +414,13 @@ mod tests {
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
lazy_static! {
|
||||
static ref DUMMY_TIMELINEID: ZTimelineId =
|
||||
ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
|
||||
static ref DUMMY_TENANTID: ZTenantId =
|
||||
ZTenantId::from_str("00000000000000000000000000000000").unwrap();
|
||||
}
|
||||
|
||||
/// Construct a dummy InMemoryLayer for testing
|
||||
fn dummy_inmem_layer(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -393,8 +431,8 @@ mod tests {
|
||||
Arc::new(
|
||||
InMemoryLayer::create(
|
||||
conf,
|
||||
ZTimelineId::from_str("00000000000000000000000000000000").unwrap(),
|
||||
ZTenantId::from_str("00000000000000000000000000000000").unwrap(),
|
||||
*DUMMY_TIMELINEID,
|
||||
*DUMMY_TENANTID,
|
||||
SegmentTag {
|
||||
rel: TESTREL_A,
|
||||
segno,
|
||||
@@ -410,6 +448,7 @@ mod tests {
|
||||
fn test_open_layers() -> Result<()> {
|
||||
let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;
|
||||
|
||||
let mut layers = LayerMap::default();
|
||||
|
||||
@@ -426,10 +465,10 @@ mod tests {
|
||||
// A helper function (closure) to pop the next oldest open entry from the layer map,
|
||||
// and assert that it is what we'd expect
|
||||
let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
|
||||
let (l, generation) = layers.peek_oldest_open().unwrap();
|
||||
let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
|
||||
assert!(l.get_seg_tag().segno == expected_segno);
|
||||
assert!(generation == expected_generation);
|
||||
layers.pop_oldest_open();
|
||||
layers.remove_open(layer_id);
|
||||
};
|
||||
|
||||
assert_pop_layer(0, gen1); // 0x100
|
||||
|
||||
226 pageserver/src/layered_repository/metadata.rs Normal file
@@ -0,0 +1,226 @@
|
||||
//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
|
||||
//! has metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Later, the file is used in [`crate::remote_storage::storage_sync`] as a part of
|
||||
//! external storage import and export operations.
|
||||
//!
|
||||
//! The module contains all structs and helper methods related to timeline metadata.
|
||||
|
||||
use std::{convert::TryInto, path::PathBuf};
|
||||
|
||||
use anyhow::ensure;
|
||||
use zenith_utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
layered_repository::{METADATA_CHECKSUM_SIZE, METADATA_MAX_DATA_SIZE, METADATA_MAX_SAFE_SIZE},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
|
||||
// lot. We only store it in the metadata file when we flush *all* the
|
||||
// in-memory data so that 'last_record_lsn' is the same as
|
||||
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
|
||||
// soon as we reprocess at least one record, we will have a valid
|
||||
// 'prev_record_lsn' value in memory again. This is only really needed when
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where a certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
impl TimelineMetadata {
|
||||
pub fn new(
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
ancestor_timeline,
|
||||
ancestor_lsn,
|
||||
latest_gc_cutoff_lsn,
|
||||
initdb_lsn,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
ensure!(
|
||||
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
|
||||
"metadata bytes size is wrong"
|
||||
);
|
||||
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(
|
||||
calculated_checksum == expected_checksum,
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
|
||||
let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
|
||||
/// [`Lsn`] that corresponds to the timeline directory
|
||||
/// contents, stored locally in the pageserver workdir.
|
||||
pub fn disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
|
||||
pub fn prev_record_lsn(&self) -> Option<Lsn> {
|
||||
self.prev_record_lsn
|
||||
}
|
||||
|
||||
pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
|
||||
self.ancestor_timeline
|
||||
}
|
||||
|
||||
pub fn ancestor_lsn(&self) -> Lsn {
|
||||
self.ancestor_lsn
|
||||
}
|
||||
|
||||
pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
|
||||
self.latest_gc_cutoff_lsn
|
||||
}
|
||||
|
||||
pub fn initdb_lsn(&self) -> Lsn {
|
||||
self.initdb_lsn
|
||||
}
|
||||
}
|
||||
|
||||
/// This module is for direct conversion of metadata to bytes and back.
|
||||
/// For a certain metadata, besides the conversion itself a few verification steps have to
|
||||
/// be done, so all serde derives are hidden from the user, to avoid accidental
|
||||
/// verification-less metadata creation.
|
||||
mod serialize {
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
|
||||
|
||||
use super::TimelineMetadata;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct SeTimelineMetadata<'a> {
|
||||
disk_consistent_lsn: &'a Lsn,
|
||||
prev_record_lsn: &'a Option<Lsn>,
|
||||
ancestor_timeline: &'a Option<ZTimelineId>,
|
||||
ancestor_lsn: &'a Lsn,
|
||||
latest_gc_cutoff_lsn: &'a Lsn,
|
||||
initdb_lsn: &'a Lsn,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
|
||||
fn from(other: &'a TimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: &other.disk_consistent_lsn,
|
||||
prev_record_lsn: &other.prev_record_lsn,
|
||||
ancestor_timeline: &other.ancestor_timeline,
|
||||
ancestor_lsn: &other.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: &other.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: &other.initdb_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub(super) struct DeTimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl From<DeTimelineMetadata> for TimelineMetadata {
|
||||
fn from(other: DeTimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: other.disk_consistent_lsn,
|
||||
prev_record_lsn: other.prev_record_lsn,
|
||||
ancestor_timeline: other.ancestor_timeline,
|
||||
ancestor_lsn: other.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: other.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: other.initdb_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::repository::repo_harness::TIMELINE_ID;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn metadata_serializes_correctly() {
|
||||
let original_metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn(0x200),
|
||||
prev_record_lsn: Some(Lsn(0x100)),
|
||||
ancestor_timeline: Some(TIMELINE_ID),
|
||||
ancestor_lsn: Lsn(0),
|
||||
latest_gc_cutoff_lsn: Lsn(0),
|
||||
initdb_lsn: Lsn(0),
|
||||
};
|
||||
|
||||
let metadata_bytes = original_metadata
|
||||
.to_bytes()
|
||||
.expect("Should serialize correct metadata to bytes");
|
||||
|
||||
let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.expect("Should deserialize its own bytes");
|
||||
|
||||
assert_eq!(
|
||||
deserialized_metadata, original_metadata,
|
||||
"Metadata that was serialized to bytes and deserialized back should not change"
|
||||
);
|
||||
}
|
||||
}
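// An illustrative sketch of the fixed-size buffer layout that from_bytes() and
// to_bytes() above assume: the first MAX_DATA bytes hold the serialized struct,
// zero-padded, and the trailing 4 bytes hold a little-endian crc32c of the data
// portion. The constants here are stand-ins chosen for the example, not the
// pageserver's real METADATA_* values; only the crc32c crate usage matches the
// code above.
use std::convert::TryInto;

const MAX_SAFE: usize = 64;
const CHECKSUM: usize = 4;
const MAX_DATA: usize = MAX_SAFE - CHECKSUM;

fn seal(data: &[u8]) -> Vec<u8> {
    assert!(data.len() <= MAX_DATA);
    let mut buf = data.to_vec();
    buf.resize(MAX_SAFE, 0u8); // zero-pad up to the fixed size
    let checksum = crc32c::crc32c(&buf[..MAX_DATA]);
    buf[MAX_DATA..].copy_from_slice(&u32::to_le_bytes(checksum));
    buf
}

fn unseal(buf: &[u8]) -> Result<&[u8], &'static str> {
    if buf.len() != MAX_SAFE {
        return Err("wrong size");
    }
    let expected = u32::from_le_bytes(buf[MAX_DATA..].try_into().unwrap());
    if crc32c::crc32c(&buf[..MAX_DATA]) != expected {
        return Err("checksum mismatch");
    }
    Ok(&buf[..MAX_DATA])
}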
|
||||
@@ -1,40 +1,78 @@
|
||||
//!
|
||||
//! Data structure to ingest incoming WAL into an append-only file.
|
||||
//!
|
||||
//! - The file is considered temporary, and will be discarded on crash
|
||||
//! - based on a B-tree
|
||||
//!
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::{collections::HashMap, ops::RangeBounds, slice};
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use std::cmp::min;
|
||||
use std::io::Seek;
|
||||
|
||||
use zenith_utils::{lsn::Lsn, vec_map::VecMap};
|
||||
|
||||
use super::storage_layer::PageVersion;
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
|
||||
const EMPTY_SLICE: &[(Lsn, PageVersion)] = &[];
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct PageVersions(HashMap<u32, VecMap<Lsn, PageVersion>>);
|
||||
const EMPTY_SLICE: &[(Lsn, u64)] = &[];
|
||||
|
||||
pub struct PageVersions {
|
||||
map: HashMap<u32, VecMap<Lsn, u64>>,
|
||||
|
||||
/// The PageVersion structs are stored in a serialized format in this file.
|
||||
/// Each serialized PageVersion is preceded by a 'u32' length field.
|
||||
/// The 'map' stores offsets into this file.
|
||||
file: EphemeralFile,
|
||||
}
|
||||
|
||||
impl PageVersions {
|
||||
pub fn new(file: EphemeralFile) -> PageVersions {
|
||||
PageVersions {
|
||||
map: HashMap::new(),
|
||||
file,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append_or_update_last(
|
||||
&mut self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
page_version: PageVersion,
|
||||
) -> Option<PageVersion> {
|
||||
let map = self.0.entry(blknum).or_insert_with(VecMap::default);
|
||||
map.append_or_update_last(lsn, page_version).unwrap()
|
||||
) -> Result<Option<u64>> {
|
||||
// remember starting position
|
||||
let pos = self.file.stream_position()?;
|
||||
|
||||
// make room for the 'length' field by seeking past it; it is filled in below.
|
||||
self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap();
|
||||
|
||||
page_version.ser_into(&mut self.file).unwrap();
|
||||
|
||||
// write the 'length' field.
|
||||
let len = self.file.stream_position()? - pos - 4;
|
||||
let lenbuf = u32::to_ne_bytes(len as u32);
|
||||
self.file.write_all_at(&lenbuf, pos)?;
|
||||
|
||||
let map = self.map.entry(blknum).or_insert_with(VecMap::default);
|
||||
Ok(map.append_or_update_last(lsn, pos as u64).unwrap().0)
|
||||
}
|
||||
|
||||
/// Get all [`PageVersion`]s in a block
|
||||
pub fn get_block_slice(&self, blknum: u32) -> &[(Lsn, PageVersion)] {
|
||||
self.0
|
||||
fn get_block_slice(&self, blknum: u32) -> &[(Lsn, u64)] {
|
||||
self.map
|
||||
.get(&blknum)
|
||||
.map(VecMap::as_slice)
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
}
|
||||
|
||||
/// Get a range of [`PageVersions`] in a block
|
||||
pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
range: R,
|
||||
) -> &[(Lsn, PageVersion)] {
|
||||
self.0
|
||||
pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(&self, blknum: u32, range: R) -> &[(Lsn, u64)] {
|
||||
self.map
|
||||
.get(&blknum)
|
||||
.map(|vec_map| vec_map.slice_range(range))
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
@@ -43,7 +81,7 @@ impl PageVersions {
|
||||
/// Iterate through [`PageVersion`]s in (block, lsn) order.
|
||||
/// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
|
||||
pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
|
||||
let mut ordered_blocks: Vec<u32> = self.0.keys().cloned().collect();
|
||||
let mut ordered_blocks: Vec<u32> = self.map.keys().cloned().collect();
|
||||
ordered_blocks.sort_unstable();
|
||||
|
||||
let slice = ordered_blocks
|
||||
@@ -59,6 +97,40 @@ impl PageVersions {
|
||||
cur_slice_iter: slice.iter(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a 'Read' that reads the page version at given offset.
|
||||
pub fn reader(&self, pos: u64) -> Result<PageVersionReader, std::io::Error> {
|
||||
// read length
|
||||
let mut lenbuf = [0u8; 4];
|
||||
self.file.read_exact_at(&mut lenbuf, pos)?;
|
||||
let len = u32::from_ne_bytes(lenbuf);
|
||||
|
||||
Ok(PageVersionReader {
|
||||
file: &self.file,
|
||||
pos: pos + 4,
|
||||
end_pos: pos + 4 + len as u64,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_page_version(&self, pos: u64) -> Result<PageVersion> {
|
||||
let mut reader = self.reader(pos)?;
|
||||
Ok(PageVersion::des_from(&mut reader)?)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageVersionReader<'a> {
|
||||
file: &'a EphemeralFile,
|
||||
pos: u64,
|
||||
end_pos: u64,
|
||||
}
|
||||
|
||||
impl<'a> std::io::Read for PageVersionReader<'a> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||
let len = min(buf.len(), (self.end_pos - self.pos) as usize);
|
||||
let n = self.file.read_at(&mut buf[..len], self.pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct OrderedPageVersionIter<'a> {
|
||||
@@ -69,7 +141,7 @@ pub struct OrderedPageVersionIter<'a> {
|
||||
|
||||
cutoff_lsn: Option<Lsn>,
|
||||
|
||||
cur_slice_iter: slice::Iter<'a, (Lsn, PageVersion)>,
|
||||
cur_slice_iter: slice::Iter<'a, (Lsn, u64)>,
|
||||
}
|
||||
|
||||
impl OrderedPageVersionIter<'_> {
|
||||
@@ -83,14 +155,14 @@ impl OrderedPageVersionIter<'_> {
|
||||
}
|
||||
|
||||
impl<'a> Iterator for OrderedPageVersionIter<'a> {
|
||||
type Item = (u32, Lsn, &'a PageVersion);
|
||||
type Item = (u32, Lsn, u64);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some((lsn, page_version)) = self.cur_slice_iter.next() {
|
||||
if let Some((lsn, pos)) = self.cur_slice_iter.next() {
|
||||
if self.is_lsn_before_cutoff(lsn) {
|
||||
let blknum = self.ordered_blocks[self.cur_block_idx];
|
||||
return Some((blknum, *lsn, page_version));
|
||||
return Some((blknum, *lsn, *pos));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -104,22 +176,50 @@ impl<'a> Iterator for OrderedPageVersionIter<'a> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use bytes::Bytes;
|
||||
|
||||
const EMPTY_PAGE_VERSION: PageVersion = PageVersion {
|
||||
page_image: None,
|
||||
record: None,
|
||||
};
|
||||
use super::*;
|
||||
use crate::PageServerConf;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
fn repo_harness(test_name: &str) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId)> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
|
||||
|
||||
Ok((conf, tenantid, timelineid))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ordered_iter() {
|
||||
let mut page_versions = PageVersions::default();
|
||||
fn test_ordered_iter() -> Result<()> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("test_ordered_iter")?;
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
let mut page_versions = PageVersions::new(file);
|
||||
|
||||
const BLOCKS: u32 = 1000;
|
||||
const LSNS: u64 = 50;
|
||||
|
||||
let empty_page = Bytes::from_static(&[0u8; 8192]);
|
||||
let empty_page_version = PageVersion::Page(empty_page);
|
||||
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..LSNS {
|
||||
let old = page_versions.append_or_update_last(blknum, Lsn(lsn), EMPTY_PAGE_VERSION);
|
||||
let old = page_versions.append_or_update_last(
|
||||
blknum,
|
||||
Lsn(lsn),
|
||||
empty_page_version.clone(),
|
||||
)?;
|
||||
assert!(old.is_none());
|
||||
}
|
||||
}
|
||||
@@ -146,5 +246,7 @@ mod tests {
|
||||
}
|
||||
assert!(iter.next().is_none());
|
||||
assert!(iter.next().is_none()); // should be robust against excessive next() calls
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,9 +3,10 @@
|
||||
//!
|
||||
|
||||
use crate::relish::RelishTag;
|
||||
use crate::repository::{PageReconstructData, PageReconstructResult};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
@@ -44,6 +45,45 @@ impl SegmentTag {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Represents a version of a page at a specific LSN. The LSN is the key of the
|
||||
/// entry in the 'page_versions' hash; it is not duplicated here.
|
||||
///
|
||||
/// A page version can be stored as a full page image, or as a WAL record that needs
|
||||
/// to be applied over the previous page version to reconstruct this version.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum PageVersion {
|
||||
Page(Bytes),
|
||||
Wal(WALRecord),
|
||||
}
|
||||
|
||||
///
|
||||
/// Data needed to reconstruct a page version
|
||||
///
|
||||
/// 'page_img' is the old base image of the page to start the WAL replay with.
|
||||
/// It can be None, if the first WAL record initializes the page (will_init)
|
||||
/// 'records' contains the records to apply over the base image.
|
||||
///
|
||||
pub struct PageReconstructData {
|
||||
pub records: Vec<(Lsn, WALRecord)>,
|
||||
pub page_img: Option<Bytes>,
|
||||
}
|
||||
|
||||
/// Return value from Layer::get_page_reconstruct_data
|
||||
pub enum PageReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue(Lsn),
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing(Lsn),
|
||||
/// Use the cached image at `cached_img_lsn` as the base image
|
||||
Cached,
|
||||
}
|
||||
|
||||
///
|
||||
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
|
||||
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
@@ -89,6 +129,9 @@ pub trait Layer: Send + Sync {
|
||||
/// of the *relish*, not the beginning of the segment. The requested
|
||||
/// 'blknum' must be covered by this segment.
|
||||
///
|
||||
/// `cached_img_lsn` should be set to a cached page image's lsn < `lsn`.
|
||||
/// This function will only return data after `cached_img_lsn`.
|
||||
///
|
||||
/// See PageReconstructResult for possible return values. The collected data
|
||||
/// is appended to reconstruct_data; the caller should pass an empty struct
|
||||
/// on first call. If this returns PageReconstructResult::Continue, look up
|
||||
@@ -96,16 +139,18 @@ pub trait Layer: Send + Sync {
|
||||
/// to collect more data.
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult>;
|
||||
|
||||
/// Return size of the segment at given LSN. (Only for blocky relations.)
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32>;
|
||||
fn get_seg_size(&self, seg: SegmentTag, lsn: Lsn) -> Result<u32>;
|
||||
|
||||
/// Does the segment exist at the given LSN, or was it dropped before it?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool>;
|
||||
fn get_seg_exists(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool>;
|
||||
|
||||
/// Does this layer only contain some data for the segment (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
@@ -113,6 +158,9 @@ pub trait Layer: Send + Sync {
|
||||
/// the previous non-incremental layer.
|
||||
fn is_incremental(&self) -> bool;
|
||||
|
||||
/// Returns true for layers that are represented in memory.
|
||||
fn is_in_memory(&self) -> bool;
|
||||
|
||||
/// Release memory used by this layer. There is no corresponding 'load'
|
||||
/// function, that's done implicitly when you call one of the get-functions.
|
||||
fn unload(&self) -> Result<()>;
|
||||
|
||||
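
Annotation (not part of the diff): the Layer::get_page_reconstruct_data contract above is easiest to see from the caller's side. The sketch below assumes a hypothetical `layer_for` layer-map lookup and a `wal_redo` replay helper; it only illustrates how a repository might walk the layer chain until it has a base image plus the WAL records to apply.

// Sketch only. `layer_for` and `wal_redo` are hypothetical stand-ins for the
// repository's layer map lookup and the WAL-redo machinery.
fn reconstruct_page(
    mut layer: std::sync::Arc<dyn Layer>,
    seg: SegmentTag,
    blknum: u32,
    mut lsn: Lsn,
) -> anyhow::Result<bytes::Bytes> {
    let mut data = PageReconstructData {
        records: Vec::new(),
        page_img: None,
    };
    loop {
        match layer.get_page_reconstruct_data(seg, blknum, lsn, None, &mut data)? {
            // All records (and possibly a base image) collected; go apply them.
            PageReconstructResult::Complete => break,
            // This layer was incremental; continue in the predecessor layer.
            PageReconstructResult::Continue(cont_lsn) => {
                layer = layer_for(seg, cont_lsn)?; // hypothetical lookup
                lsn = cont_lsn;
            }
            // No base image found below this LSN; treat it as an error here.
            PageReconstructResult::Missing(missing_lsn) => {
                anyhow::bail!("no page image found for block {} at {}", blknum, missing_lsn)
            }
            // The base image will come from the page cache (cached_img_lsn).
            PageReconstructResult::Cached => break,
        }
    }
    wal_redo(blknum, data) // hypothetical: replay data.records over data.page_img
}
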
@@ -1,6 +1,8 @@
|
||||
use layered_repository::TIMELINES_SEGMENT_NAME;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -9,18 +11,19 @@ use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
|
||||
pub mod basebackup;
|
||||
pub mod branches;
|
||||
pub mod buffered_repository;
|
||||
pub mod http;
|
||||
pub mod layered_repository;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod relish;
|
||||
pub mod relish_storage;
|
||||
pub mod remote_storage;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod tenant_mgr;
|
||||
pub mod toast_store;
|
||||
pub mod waldecoder;
|
||||
pub mod tenant_threads;
|
||||
pub mod virtual_file;
|
||||
pub mod walreceiver;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
pub mod defaults {
|
||||
@@ -32,20 +35,22 @@ pub mod defaults {
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
|
||||
// Minimal size of a WAL record chain needed to trigger materialization of the page
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);
|
||||
|
||||
pub const DEFAULT_UPLOAD_DISTANCE: u64 = 1024 * 1024 * 1024;
|
||||
pub const DEFAULT_UPLOAD_PERIOD: Duration = Duration::from_secs(3600);
|
||||
|
||||
pub const DEFAULT_RECONSTRUCT_THRESHOLD: u64 = 0;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 1024;
|
||||
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
|
||||
pub const DEFAULT_OPEN_MEM_LIMIT: usize = 128 * 1024 * 1024;
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -69,14 +74,15 @@ pub struct PageServerConf {
|
||||
// page server crashes.
|
||||
pub checkpoint_distance: u64,
|
||||
pub checkpoint_period: Duration,
|
||||
pub upload_period: Duration,
|
||||
pub upload_distance: u64,
|
||||
pub reconstruct_threshold: u64,
|
||||
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
pub superuser: String,
|
||||
|
||||
pub open_mem_limit: usize,
|
||||
pub page_cache_size: usize,
|
||||
pub max_file_descriptors: usize,
|
||||
|
||||
// Repository directory, relative to current working directory.
|
||||
// Normally, the page server changes the current working directory
|
||||
// to the repository, and 'workdir' is always '.'. But we don't do
|
||||
@@ -90,7 +96,7 @@ pub struct PageServerConf {
|
||||
pub auth_type: AuthType,
|
||||
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub relish_storage_config: Option<RelishStorageConfig>,
|
||||
pub remote_storage_config: Option<RemoteStorageConfig>,
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
@@ -123,17 +129,13 @@ impl PageServerConf {
|
||||
}
|
||||
|
||||
fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.tenant_path(tenantid).join("timelines")
|
||||
self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timelines_path(tenantid).join(timelineid.to_string())
|
||||
}
|
||||
|
||||
fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timeline_path(timelineid, tenantid).join("ancestor")
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
@@ -157,11 +159,11 @@ impl PageServerConf {
|
||||
daemonize: false,
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
checkpoint_period: Duration::from_secs(10),
|
||||
upload_distance: defaults::DEFAULT_UPLOAD_DISTANCE,
|
||||
upload_period: defaults::DEFAULT_UPLOAD_PERIOD,
|
||||
reconstruct_threshold: defaults::DEFAULT_RECONSTRUCT_THRESHOLD,
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
open_mem_limit: defaults::DEFAULT_OPEN_MEM_LIMIT,
|
||||
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
|
||||
max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
superuser: "zenith_admin".to_string(),
|
||||
@@ -169,23 +171,34 @@ impl PageServerConf {
|
||||
pg_distrib_dir: "".into(),
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
relish_storage_config: None,
|
||||
remote_storage_config: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// External relish storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RelishStorageConfig {
|
||||
/// Limits the number of concurrent sync operations between pageserver and relish storage.
|
||||
pub max_concurrent_sync: usize,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RelishStorageKind,
|
||||
/// Config for the Repository checkpointer
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum CheckpointConfig {
|
||||
// Flush in-memory data that is older than this
|
||||
Distance(u64),
|
||||
// Flush all in-memory data
|
||||
Forced,
|
||||
}
|
||||
|
||||
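
Annotation (not part of the diff): a hedged illustration of how the two CheckpointConfig variants above might be chosen. The `timeline.checkpoint(...)` call is an assumed repository-side entry point and is not part of this hunk.

// Sketch only: periodic checkpoints flush data older than checkpoint_distance,
// while a shutdown or explicit request flushes everything.
let cconf = if shutting_down {
    CheckpointConfig::Forced
} else {
    CheckpointConfig::Distance(conf.checkpoint_distance)
};
timeline.checkpoint(cconf)?; // assumed API taking a CheckpointConfig
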
/// A kind of a relish storage to connect to, with its connection configuration.
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum RelishStorageKind {
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between pageserver and the remote storage.
|
||||
pub max_concurrent_sync: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored relish data into.
|
||||
LocalFs(PathBuf),
|
||||
|
||||
pageserver/src/page_cache.rs (new file, 778 lines)
@@ -0,0 +1,778 @@
|
||||
//!
|
||||
//! Global page cache
|
||||
//!
|
||||
//! The page cache uses up most of the memory in the page server. It is shared
|
||||
//! by all tenants, and it is used to store different kinds of pages. Sharing
|
||||
//! the cache allows memory to be dynamically allocated where it's needed the
|
||||
//! most.
|
||||
//!
|
||||
//! The page cache consists of fixed-size buffers, 8 kB each to match the
|
||||
//! PostgreSQL buffer size, and a Slot struct for each buffer to contain
|
||||
//! information about what's stored in the buffer.
|
||||
//!
|
||||
//! # Locking
|
||||
//!
|
||||
//! There are two levels of locking involved: There's one lock for the "mapping"
|
||||
//! from page identifier (tenant ID, timeline ID, rel, block, LSN) to the buffer
|
||||
//! slot, and a separate lock on each slot. To read or write the contents of a
|
||||
//! slot, you must hold the lock on the slot in read or write mode,
|
||||
//! respectively. To change the mapping of a slot, i.e. to evict a page or to
|
||||
//! assign a buffer for a page, you must hold the mapping lock and the lock on
|
||||
//! the slot at the same time.
|
||||
//!
|
||||
//! Whenever you need to hold both locks simultaneously, the slot lock must be
|
||||
//! acquired first. This consistent ordering avoids deadlocks. To look up a page
|
||||
//! in the cache, you would first look up the mapping, while holding the mapping
|
||||
//! lock, and then lock the slot. You must release the mapping lock in between,
|
||||
//! to obey the lock ordering and avoid deadlock.
|
||||
//!
|
||||
//! A slot can momentarily have invalid contents, even if it's already been
|
||||
//! inserted to the mapping, but you must hold the write-lock on the slot until
|
||||
//! the contents are valid. If you need to release the lock without initializing
|
||||
//! the contents, you must remove the mapping first. We make that easy for the
|
||||
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
|
||||
//! page, the caller must explicitly call guard.mark_valid() after it has
|
||||
//! initialized it. If the guard is dropped without calling mark_valid(), the
|
||||
//! mapping is automatically removed and the slot is marked free.
|
||||
//!
|
||||
|
||||
use std::{
|
||||
collections::{hash_map::Entry, HashMap},
|
||||
convert::TryInto,
|
||||
sync::{
|
||||
atomic::{AtomicU8, AtomicUsize, Ordering},
|
||||
RwLock, RwLockReadGuard, RwLockWriteGuard,
|
||||
},
|
||||
};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::error;
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::layered_repository::writeback_ephemeral_file;
|
||||
use crate::{relish::RelTag, PageServerConf};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
const TEST_PAGE_CACHE_SIZE: usize = 10;
|
||||
|
||||
///
|
||||
/// Initialize the page cache. This must be called once at page server startup.
|
||||
///
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
if PAGE_CACHE
|
||||
.set(PageCache::new(conf.page_cache_size))
|
||||
.is_err()
|
||||
{
|
||||
panic!("page cache already initialized");
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the page cache.
|
||||
///
|
||||
pub fn get() -> &'static PageCache {
|
||||
//
|
||||
// In unit tests, page server startup doesn't happen and no one calls
|
||||
// page_cache::init(). Initialize it here with a tiny cache, so that the
|
||||
// page cache is usable in unit tests.
|
||||
//
|
||||
if cfg!(test) {
|
||||
PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
|
||||
} else {
|
||||
PAGE_CACHE.get().expect("page cache not initialized")
|
||||
}
|
||||
}
|
||||
|
||||
pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize;
|
||||
const MAX_USAGE_COUNT: u8 = 5;
|
||||
|
||||
///
|
||||
/// CacheKey uniquely identifies a "thing" to cache in the page cache.
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
enum CacheKey {
|
||||
MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey,
|
||||
lsn: Lsn,
|
||||
},
|
||||
EphemeralPage {
|
||||
file_id: u64,
|
||||
blkno: u32,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
struct MaterializedPageHashKey {
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Version {
|
||||
lsn: Lsn,
|
||||
slot_idx: usize,
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
inner: RwLock<SlotInner>,
|
||||
usage_count: AtomicU8,
|
||||
}
|
||||
|
||||
struct SlotInner {
|
||||
key: Option<CacheKey>,
|
||||
buf: &'static mut [u8; PAGE_SZ],
|
||||
dirty: bool,
|
||||
}
|
||||
|
||||
impl Slot {
|
||||
/// Increment usage count on the buffer, with ceiling at MAX_USAGE_COUNT.
|
||||
fn inc_usage_count(&self) {
|
||||
let _ = self
|
||||
.usage_count
|
||||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
|
||||
if val == MAX_USAGE_COUNT {
|
||||
None
|
||||
} else {
|
||||
Some(val + 1)
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Decrement usage count on the buffer, unless it's already zero. Returns
|
||||
/// the old usage count.
|
||||
fn dec_usage_count(&self) -> u8 {
|
||||
let count_res =
|
||||
self.usage_count
|
||||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
|
||||
if val == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(val - 1)
|
||||
}
|
||||
});
|
||||
|
||||
match count_res {
|
||||
Ok(usage_count) => usage_count,
|
||||
Err(usage_count) => usage_count,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageCache {
|
||||
/// This contains the mapping from the cache key to buffer slot that currently
|
||||
/// contains the page, if any.
|
||||
///
|
||||
/// TODO: This is protected by a single lock. If that becomes a bottleneck,
|
||||
/// this HashMap can be replaced with a more concurrent version, there are
|
||||
/// plenty of such crates around.
|
||||
///
|
||||
/// If you add support for caching different kinds of objects, each object kind
|
||||
/// can have a separate mapping map, next to this field.
|
||||
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
|
||||
|
||||
ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
|
||||
|
||||
/// The actual buffers with their metadata.
|
||||
slots: Box<[Slot]>,
|
||||
|
||||
/// Index of the next candidate to evict, for the Clock replacement algorithm.
|
||||
/// This is interpreted modulo the page cache size.
|
||||
next_evict_slot: AtomicUsize,
|
||||
}
|
||||
|
||||
///
|
||||
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
|
||||
/// until the guard is dropped.
|
||||
///
|
||||
pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
|
||||
|
||||
impl std::ops::Deref for PageReadGuard<'_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0.buf
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked
|
||||
/// until the guard is dropped.
|
||||
///
|
||||
/// Counterintuitively, this is used even for a read, if the requested page is not
|
||||
/// currently found in the page cache. In that case, the caller of lock_for_read()
|
||||
/// is expected to fill in the page contents and call mark_valid(). Similarly
|
||||
/// lock_for_write() can return an invalid buffer that the caller is expected to
|
||||
/// initialize.
|
||||
///
|
||||
pub struct PageWriteGuard<'i> {
|
||||
inner: RwLockWriteGuard<'i, SlotInner>,
|
||||
|
||||
// Are the page contents currently valid?
|
||||
valid: bool,
|
||||
}
|
||||
|
||||
impl std::ops::DerefMut for PageWriteGuard<'_> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.inner.buf
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for PageWriteGuard<'_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.inner.buf
|
||||
}
|
||||
}
|
||||
|
||||
impl PageWriteGuard<'_> {
|
||||
/// Mark that the buffer contents are now valid.
|
||||
pub fn mark_valid(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
assert!(
|
||||
!self.valid,
|
||||
"mark_valid called on a buffer that was already valid"
|
||||
);
|
||||
self.valid = true;
|
||||
}
|
||||
pub fn mark_dirty(&mut self) {
|
||||
// only ephemeral pages can be dirty ATM.
|
||||
assert!(matches!(
|
||||
self.inner.key,
|
||||
Some(CacheKey::EphemeralPage { .. })
|
||||
));
|
||||
self.inner.dirty = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageWriteGuard<'_> {
|
||||
///
|
||||
/// If the buffer was allocated for a page that was not already in the
|
||||
/// cache, but the lock_for_read/write() caller dropped the buffer without
|
||||
/// initializing it, remove the mapping from the page cache.
|
||||
///
|
||||
fn drop(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
if !self.valid {
|
||||
let self_key = self.inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
self.inner.key = None;
|
||||
self.inner.dirty = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// lock_for_read() return value
|
||||
pub enum ReadBufResult<'a> {
|
||||
Found(PageReadGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
/// lock_for_write() return value
|
||||
pub enum WriteBufResult<'a> {
|
||||
Found(PageWriteGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
impl PageCache {
|
||||
//
|
||||
// Section 1.1: Public interface functions for looking up and memorizing materialized page
|
||||
// versions in the page cache
|
||||
//
|
||||
|
||||
/// Look up a materialized page version.
|
||||
///
|
||||
/// The 'lsn' is an upper bound, this will return the latest version of
|
||||
/// the given block, but not newer than 'lsn'. Returns the actual LSN of the
|
||||
/// returned page.
|
||||
pub fn lookup_materialized_page(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
) -> Option<(Lsn, PageReadGuard)> {
|
||||
let mut cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
rel_tag,
|
||||
blknum,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
|
||||
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
|
||||
if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
|
||||
Some((lsn, guard))
|
||||
} else {
|
||||
panic!("unexpected key type in slot");
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Store an image of the given page in the cache.
|
||||
///
|
||||
pub fn memorize_materialized_page(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
img: &[u8],
|
||||
) {
|
||||
let cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
rel_tag,
|
||||
blknum,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
|
||||
match self.lock_for_write(&cache_key) {
|
||||
WriteBufResult::Found(write_guard) => {
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(*write_guard == img);
|
||||
}
|
||||
WriteBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(img);
|
||||
write_guard.mark_valid();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
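
Annotation (not part of the diff): a sketch of how a GetPage@LSN path might combine the two calls above. `collect_records_since` and `apply_wal_records` are hypothetical helpers standing in for the WAL-redo side.

// Sketch only.
let cache = page_cache::get();
// Start from the newest cached image at or below the requested LSN, if any.
let (base_lsn, base_img) =
    match cache.lookup_materialized_page(tenant_id, timeline_id, rel_tag, blknum, lsn) {
        Some((cached_lsn, guard)) => (Some(cached_lsn), Some(bytes::Bytes::copy_from_slice(&*guard))),
        None => (None, None),
    };
// Replay the WAL newer than the cached image...
let records = collect_records_since(base_lsn, lsn)?; // hypothetical
let page = apply_wal_records(base_img, records)?; // hypothetical
// ...and remember the result so the next lookup at this LSN is a cache hit.
cache.memorize_materialized_page(tenant_id, timeline_id, rel_tag, blknum, lsn, &page);
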
// Section 1.2: Public interface functions for working with Ephemeral pages.
|
||||
|
||||
pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
|
||||
let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key)
|
||||
}
|
||||
|
||||
pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult {
|
||||
let cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_write(&cache_key)
|
||||
}
|
||||
|
||||
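
Annotation (not part of the diff): a minimal sketch of how an EphemeralFile reader might use read_ephemeral_buf above. `read_block_from_file` is an assumed helper for the cache-miss path.

// Sketch only.
fn read_ephemeral_page(file_id: u64, blkno: u32, out: &mut [u8; PAGE_SZ]) -> anyhow::Result<()> {
    let cache = page_cache::get();
    match cache.read_ephemeral_buf(file_id, blkno) {
        ReadBufResult::Found(guard) => out.copy_from_slice(&*guard),
        ReadBufResult::NotFound(mut write_guard) => {
            // Cache miss: fill the buffer from the on-disk ephemeral file,
            // then mark it valid so later readers get a hit.
            read_block_from_file(file_id, blkno, &mut *write_guard)?; // assumed helper
            write_guard.mark_valid();
            out.copy_from_slice(&*write_guard);
        }
    }
    Ok(())
}
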
/// Immediately drop all buffers belonging to given file, without writeback
|
||||
pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
|
||||
for slot_idx in 0..self.slots.len() {
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
if let Some(key) = &inner.key {
|
||||
match key {
|
||||
CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(key);
|
||||
inner.key = None;
|
||||
inner.dirty = false;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 2: Internal interface functions for lookup/update.
|
||||
//
|
||||
// To add support for a new kind of "thing" to cache, you will need
|
||||
// to add public interface routines above, and code to deal with the
|
||||
// "mappings" after this section. But the routines in this section should
|
||||
// not require changes.
|
||||
|
||||
/// Look up a page in the cache.
|
||||
///
|
||||
/// If the search criteria is not exact, *cache_key is updated with the
/// exact key of the returned page. (For materialized pages, that means
|
||||
/// that the LSN in 'cache_key' is updated with the LSN of the returned page
|
||||
/// version.)
|
||||
///
|
||||
/// If no page is found, returns None and *cache_key is left unmodified.
|
||||
///
|
||||
fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
|
||||
let cache_key_orig = cache_key.clone();
|
||||
if let Some(slot_idx) = self.search_mapping(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.read().unwrap();
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
return Some(PageReadGuard(inner));
|
||||
} else {
|
||||
// search_mapping might have modified the search key; restore it.
|
||||
*cache_key = cache_key_orig;
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Return a locked buffer for given block.
|
||||
///
|
||||
/// Like try_lock_for_read(), if the search criteria is not exact and the
|
||||
/// page is already found in the cache, *cache_key is updated.
|
||||
///
|
||||
/// If the page is not found in the cache, this allocates a new buffer for
|
||||
/// it. The caller may then initialize the buffer with the contents, and
|
||||
/// call mark_valid().
|
||||
///
|
||||
/// Example usage:
|
||||
///
|
||||
/// ```ignore
|
||||
/// let cache = page_cache::get();
|
||||
///
|
||||
/// match cache.lock_for_read(&key) {
|
||||
/// ReadBufResult::Found(read_guard) => {
|
||||
/// // The page was found in cache. Use it
|
||||
/// },
|
||||
/// ReadBufResult::NotFound(write_guard) => {
|
||||
/// // The page was not found in cache. Read it from disk into the
|
||||
/// // buffer.
|
||||
/// //read_my_page_from_disk(write_guard);
|
||||
///
|
||||
/// // The buffer contents are now valid. Tell the page cache.
|
||||
/// write_guard.mark_valid();
|
||||
/// },
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult {
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
|
||||
return ReadBufResult::Found(read_guard);
|
||||
}
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self.find_victim();
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return ReadBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
valid: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up a page in the cache and lock it in write mode. If it's not
|
||||
/// found, returns None.
|
||||
///
|
||||
/// When locking a page for writing, the search criteria is always "exact".
|
||||
fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
|
||||
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().unwrap();
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
return Some(PageWriteGuard { inner, valid: true });
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Return a write-locked buffer for given block.
|
||||
///
|
||||
/// Similar to lock_for_read(), but the returned buffer is write-locked and
|
||||
/// may be modified by the caller even if it's already found in the cache.
|
||||
fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(write_guard) = self.try_lock_for_write(cache_key) {
|
||||
return WriteBufResult::Found(write_guard);
|
||||
}
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self.find_victim();
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return WriteBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
valid: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 3: Mapping functions
|
||||
//
|
||||
|
||||
/// Search for a page in the cache using the given search key.
|
||||
///
|
||||
/// Returns the slot index, if any. If the search criteria is not exact,
|
||||
/// *cache_key is updated with the actual key of the found page.
|
||||
///
|
||||
/// NOTE: We don't hold any lock on the mapping on return, so the slot might
|
||||
/// get recycled for an unrelated page immediately after this function
|
||||
/// returns. The caller is responsible for re-checking that the slot still
|
||||
/// contains the page with the same key before using it.
|
||||
///
|
||||
fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
Ok(version_idx) => version_idx,
|
||||
Err(0) => return None,
|
||||
Err(version_idx) => version_idx - 1,
|
||||
};
|
||||
let version = &versions[version_idx];
|
||||
*lsn = version.lsn;
|
||||
Some(version.slot_idx)
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Search for a page in the cache using the given search key.
|
||||
///
|
||||
/// Like 'search_mapping', but performs an "exact" search. Used for
|
||||
/// allocating a new buffer.
|
||||
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
|
||||
match key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
Some(versions[version_idx].slot_idx)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Remove mapping for given key.
|
||||
///
|
||||
fn remove_mapping(&self, old_key: &CacheKey) {
|
||||
match old_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: old_hash_key,
|
||||
lsn: old_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
|
||||
let versions = old_entry.get_mut();
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
|
||||
versions.remove(version_idx);
|
||||
if versions.is_empty() {
|
||||
old_entry.remove_entry();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
panic!("could not find old key in mapping")
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Insert mapping for given key.
|
||||
///
|
||||
/// If a mapping already existed for the given key, returns the slot index
|
||||
/// of the existing mapping and leaves it untouched.
|
||||
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
|
||||
match new_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: new_key,
|
||||
lsn: new_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
let versions = map.entry(new_key.clone()).or_default();
|
||||
match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
|
||||
Ok(version_idx) => Some(versions[version_idx].slot_idx),
|
||||
Err(version_idx) => {
|
||||
versions.insert(
|
||||
version_idx,
|
||||
Version {
|
||||
lsn: *new_lsn,
|
||||
slot_idx,
|
||||
},
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(slot_idx);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 4: Misc internal helpers
|
||||
//
|
||||
|
||||
/// Find a slot to evict.
|
||||
///
|
||||
/// On return, the slot is empty and write-locked.
|
||||
fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
|
||||
let iter_limit = self.slots.len() * 2;
|
||||
let mut iters = 0;
|
||||
loop {
|
||||
let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();
|
||||
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
if slot.dec_usage_count() == 0 || iters >= iter_limit {
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
|
||||
if let Some(old_key) = &inner.key {
|
||||
if inner.dirty {
|
||||
if let Err(err) = Self::writeback(old_key, inner.buf) {
|
||||
// Writing the page to disk failed.
|
||||
//
|
||||
// FIXME: What to do here, when? We could propagate the error to the
|
||||
// caller, but victim buffer is generally unrelated to the original
|
||||
// call. It can even belong to a different tenant. Currently, we
|
||||
// report the error to the log and continue the clock sweep to find
|
||||
// a different victim. But if the problem persists, the page cache
|
||||
// could fill up with dirty pages that we cannot evict, and we will
|
||||
// loop retrying the writebacks indefinitely.
|
||||
error!("writeback of buffer {:?} failed: {}", old_key, err);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(old_key);
|
||||
inner.dirty = false;
|
||||
inner.key = None;
|
||||
}
|
||||
return (slot_idx, inner);
|
||||
}
|
||||
|
||||
iters += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: _,
|
||||
lsn: _,
|
||||
} => {
|
||||
panic!("unexpected dirty materialized page");
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
writeback_ephemeral_file(*file_id, *blkno, buf)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize a new page cache
|
||||
///
|
||||
/// This should be called only once at page server startup.
|
||||
fn new(num_pages: usize) -> Self {
|
||||
assert!(num_pages > 0, "page cache size must be > 0");
|
||||
|
||||
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
|
||||
|
||||
let slots = page_buffer
|
||||
.chunks_exact_mut(PAGE_SZ)
|
||||
.map(|chunk| {
|
||||
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
|
||||
|
||||
Slot {
|
||||
inner: RwLock::new(SlotInner {
|
||||
key: None,
|
||||
buf,
|
||||
dirty: false,
|
||||
}),
|
||||
usage_count: AtomicU8::new(0),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
materialized_page_map: Default::default(),
|
||||
ephemeral_page_map: Default::default(),
|
||||
slots,
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10,7 +10,7 @@
|
||||
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
@@ -279,7 +279,8 @@ impl PageServerHandler {
|
||||
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
|
||||
.context("Cannot handle pagerequests for a remote timeline")?;
|
||||
|
||||
/* switch client to COPYBOTH */
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
@@ -301,17 +302,17 @@ impl PageServerHandler {
|
||||
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_exists"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_rel_exists_request(&*timeline, &req)
|
||||
self.handle_get_rel_exists_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_size"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_nblocks_request(&*timeline, &req)
|
||||
self.handle_get_nblocks_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_page_at_lsn"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(&*timeline, &req)
|
||||
self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
};
|
||||
|
||||
@@ -455,7 +456,13 @@ impl PageServerHandler {
|
||||
let _enter = span.enter();
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
|
||||
.context("Cannot handle basebackup request for a remote timeline")?;
|
||||
if let Some(lsn) = lsn {
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
}
|
||||
|
||||
// switch client to COPYOUT
|
||||
pgb.write_message(&BeMessage::CopyOutResponse)?;
|
||||
@@ -590,7 +597,8 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
|
||||
.context("Failed to fetch local timeline for callmemaybe requests")?;
|
||||
|
||||
walreceiver::launch_wal_receiver(self.conf, timelineid, &connstr, tenantid.to_owned());
|
||||
|
||||
@@ -630,14 +638,16 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
|
||||
let branches = crate::branches::get_branches(self.conf, &tenantid)?;
|
||||
// since these handlers for tenant/branch commands are deprecated (in favor of the HTTP-based ones),
// just pass `false` in place of 'include non-incremental logical size'
|
||||
let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
|
||||
let branches_buf = serde_json::to_vec(&branches)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("tenant_list") {
|
||||
let tenants = crate::branches::get_tenants(self.conf)?;
|
||||
let tenants = crate::tenant_mgr::list_tenants()?;
|
||||
let tenants_buf = serde_json::to_vec(&tenants)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
@@ -689,25 +699,69 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.unwrap_or(Ok(self.conf.gc_horizon))?;
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"meta_total"),
|
||||
RowDescriptor::int8_col(b"meta_removed"),
|
||||
RowDescriptor::int8_col(b"meta_dropped"),
|
||||
RowDescriptor::int8_col(b"pages_total"),
|
||||
RowDescriptor::int8_col(b"pages_removed"),
|
||||
RowDescriptor::int8_col(b"pages_dropped"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_removed"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"elapsed"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(result.meta_total.to_string().as_bytes()),
|
||||
Some(result.meta_removed.to_string().as_bytes()),
|
||||
Some(result.meta_dropped.to_string().as_bytes()),
|
||||
Some(result.pages_total.to_string().as_bytes()),
|
||||
Some(result.pages_removed.to_string().as_bytes()),
|
||||
Some(result.pages_dropped.to_string().as_bytes()),
|
||||
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_as_tombstone
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_as_tombstone
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.elapsed.as_millis().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
//! Abstractions for the page server to store its relish layer data in the external storage.
|
||||
//!
|
||||
//! The main purpose of this module subtree is to provide a set of abstractions to manage the storage state
//! in a way that is optimal for the page server.
|
||||
//!
|
||||
//! The abstractions hide multiple custom external storage API implementations,
|
||||
//! such as AWS S3, local filesystem, etc., located in the submodules.
|
||||
|
||||
mod local_fs;
|
||||
mod rust_s3;
|
||||
/// A queue-based storage with the background machinery behind it to synchronize
|
||||
/// local page server layer files with external storage.
|
||||
mod synced_storage;
|
||||
|
||||
use std::{path::Path, thread};
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
pub use self::synced_storage::schedule_timeline_upload;
|
||||
use self::{local_fs::LocalFs, rust_s3::RustS3};
|
||||
use crate::{PageServerConf, RelishStorageKind};
|
||||
|
||||
pub fn run_storage_sync_thread(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
match &config.relish_storage_config {
|
||||
Some(relish_storage_config) => {
|
||||
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
|
||||
match &relish_storage_config.storage {
|
||||
RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread(
|
||||
config,
|
||||
LocalFs::new(root.clone())?,
|
||||
max_concurrent_sync,
|
||||
),
|
||||
RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread(
|
||||
config,
|
||||
RustS3::new(s3_config)?,
|
||||
max_concurrent_sync,
|
||||
),
|
||||
}
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RelishStorage: Send + Sync {
|
||||
type RelishStoragePath;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath>;
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
|
||||
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
// rust_s3's `get_object_stream` method requires a `std::io::BufWriter` for some reason, not the async counterpart;
// that forces us to consume and return the writer to satisfy the blocking-operation async wrapper requirements
|
||||
to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>>;
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
|
||||
|
||||
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &mut tokio::io::BufReader<R>,
|
||||
to: &Self::RelishStoragePath,
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
fn strip_workspace_prefix<'a>(
|
||||
page_server_workdir: &'a Path,
|
||||
relish_local_path: &'a Path,
|
||||
) -> anyhow::Result<&'a Path> {
|
||||
relish_local_path
|
||||
.strip_prefix(page_server_workdir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Unexpected: relish local path '{}' is not relevant to server workdir",
|
||||
relish_local_path.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
@@ -1,189 +0,0 @@
|
||||
//! Local filesystem relish storage.
|
||||
//!
|
||||
//! The page server already stores layer data on its local disk when freezing it.
//! This storage serves as a way to
//!
//! * test things locally with a simple setup
//! * allow comparing both binary sets
//! * help validate the relish storage API
|
||||
|
||||
use std::{
|
||||
future::Future,
|
||||
io::Write,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::{fs, io};
|
||||
|
||||
use super::{strip_workspace_prefix, RelishStorage};
|
||||
|
||||
pub struct LocalFs {
|
||||
root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create a local FS relish storage; also creates the provided directory if it does not exist.
|
||||
pub fn new(root: PathBuf) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
"Failed to create all directories in the given root path {}",
|
||||
root.display(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok(Self { root })
|
||||
}
|
||||
|
||||
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
|
||||
if path.is_relative() {
|
||||
Ok(self.root.join(path))
|
||||
} else if path.starts_with(&self.root) {
|
||||
Ok(path.to_path_buf())
|
||||
} else {
|
||||
bail!(
|
||||
"Path '{}' does not belong to the current storage",
|
||||
path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RelishStorage for LocalFs {
|
||||
type RelishStoragePath = PathBuf;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
Ok(strip_workspace_prefix(page_server_workdir, relish_local_path)?.to_path_buf())
|
||||
}
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
Ok(get_all_files(&self.root).await?.into_iter().collect())
|
||||
}
|
||||
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
mut to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let updated_buffer = tokio::task::spawn_blocking(move || {
|
||||
let mut source = std::io::BufReader::new(
|
||||
std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
std::io::copy(&mut source, &mut to)
|
||||
.context("Failed to download the relish file")?;
|
||||
to.flush().context("Failed to flush the download buffer")?;
|
||||
Ok::<_, anyhow::Error>(to)
|
||||
})
|
||||
.await
|
||||
.context("Failed to spawn a blocking task")??;
|
||||
Ok(updated_buffer)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(tokio::fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish<R: io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &mut io::BufReader<R>,
|
||||
to: &Self::RelishStoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
create_target_directory(&target_file_path).await?;
|
||||
let mut destination = io::BufWriter::new(
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.open(&target_file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open target fs destination at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
|
||||
io::copy_buf(from, &mut destination)
|
||||
.await
|
||||
.context("Failed to upload relish to local storage")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = tokio::fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path = dir_entry.path();
|
||||
if file_type.is_symlink() {
|
||||
log::debug!("{:?} us a symlink, skipping", entry_path)
|
||||
} else if file_type.is_dir() {
|
||||
paths.extend(get_all_files(entry_path).await?.into_iter())
|
||||
} else {
|
||||
paths.push(dir_entry.path());
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path '{}' is not a directory", directory_path.display())
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
None => bail!(
|
||||
"Relish path '{}' has no parent directory",
|
||||
target_file_path.display()
|
||||
),
|
||||
};
|
||||
if !target_dir.exists() {
|
||||
tokio::fs::create_dir_all(target_dir).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,149 +0,0 @@
|
||||
//! A wrapper around the AWS S3 client library `rust_s3`, to be used as a relish storage.
|
||||
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
use s3::{bucket::Bucket, creds::Credentials, region::Region};
|
||||
|
||||
use crate::{
|
||||
relish_storage::{strip_workspace_prefix, RelishStorage},
|
||||
S3Config,
|
||||
};
|
||||
|
||||
const S3_FILE_SEPARATOR: char = '/';
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct S3ObjectKey(String);
|
||||
|
||||
impl S3ObjectKey {
|
||||
fn key(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 relish storage.
|
||||
pub struct RustS3 {
|
||||
bucket: Bucket,
|
||||
}
|
||||
|
||||
impl RustS3 {
|
||||
/// Creates the relish storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
let region = aws_config
|
||||
.bucket_region
|
||||
.parse::<Region>()
|
||||
.context("Failed to parse the s3 region from config")?;
|
||||
let credentials = Credentials::new(
|
||||
aws_config.access_key_id.as_deref(),
|
||||
aws_config.secret_access_key.as_deref(),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.context("Failed to create the s3 credentials")?;
|
||||
Ok(Self {
|
||||
bucket: Bucket::new_with_path_style(
|
||||
aws_config.bucket_name.as_str(),
|
||||
region,
|
||||
credentials,
|
||||
)
|
||||
.context("Failed to create the s3 bucket")?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RelishStorage for RustS3 {
|
||||
type RelishStoragePath = S3ObjectKey;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
|
||||
let mut key = String::new();
|
||||
for segment in relative_path {
|
||||
key.push(S3_FILE_SEPARATOR);
|
||||
key.push_str(&segment.to_string_lossy());
|
||||
}
|
||||
Ok(S3ObjectKey(key))
|
||||
}
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
let list_response = self
|
||||
.bucket
|
||||
.list(String::new(), None)
|
||||
.await
|
||||
.context("Failed to list s3 objects")?;
|
||||
|
||||
Ok(list_response
|
||||
.into_iter()
|
||||
.flat_map(|response| response.contents)
|
||||
.map(|s3_object| S3ObjectKey(s3_object.key))
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
mut to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>> {
|
||||
let code = self
|
||||
.bucket
|
||||
.get_object_stream(from.key(), &mut to)
|
||||
.await
|
||||
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during downloading object from directory, code: {}",
|
||||
code
|
||||
))
|
||||
} else {
|
||||
tokio::task::spawn_blocking(move || {
|
||||
to.flush().context("Failed to fluch the downoad buffer")?;
|
||||
Ok::<_, anyhow::Error>(to)
|
||||
})
|
||||
.await
|
||||
.context("Failed to joim the download buffer flush task")?
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let (_, code) = self
|
||||
.bucket
|
||||
.delete_object(path.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
|
||||
if code != 204 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-204 exit code during deleting object with key '{}', code: {}",
|
||||
path.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &mut tokio::io::BufReader<R>,
|
||||
to: &Self::RelishStoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let code = self
|
||||
.bucket
|
||||
.put_object_stream(from, to.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during creating object with key '{}', code: {}",
|
||||
to.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,57 +0,0 @@
|
||||
use std::time::Duration;
|
||||
use std::{collections::BinaryHeap, sync::Mutex, thread};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::{relish_storage::RelishStorage, PageServerConf};
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
static ref UPLOAD_QUEUE: Mutex<BinaryHeap<SyncTask>> = Mutex::new(BinaryHeap::new());
|
||||
}
|
||||
|
||||
pub fn schedule_timeline_upload(_local_timeline: ()) {
|
||||
// UPLOAD_QUEUE
|
||||
// .lock()
|
||||
// .unwrap()
|
||||
// .push(SyncTask::Upload(local_timeline))
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum SyncTask {}
|
||||
|
||||
pub fn run_storage_sync_thread<
|
||||
P: std::fmt::Debug,
|
||||
S: 'static + RelishStorage<RelishStoragePath = P>,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
relish_storage: S,
|
||||
max_concurrent_sync: usize,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let handle = thread::Builder::new()
|
||||
.name("Queue based relish storage sync".to_string())
|
||||
.spawn(move || {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
|
||||
log::debug!("Upload queue length: {}", queue_accessor.len());
|
||||
let next_task = queue_accessor.pop();
|
||||
drop(queue_accessor);
|
||||
match next_task {
|
||||
Some(task) => runtime.block_on(async {
|
||||
// suppress warnings
|
||||
let _ = (config, task, &relish_storage, max_concurrent_sync);
|
||||
todo!("omitted for brevity")
|
||||
}),
|
||||
None => {
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
log::debug!("Queue based relish storage sync thread shut down");
|
||||
Ok(())
|
||||
})?;
|
||||
Ok(Some(handle))
|
||||
}
|
||||
355
pageserver/src/remote_storage.rs
Normal file
@@ -0,0 +1,355 @@
|
||||
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
|
||||
//! This particular module serves as a public API border between pageserver and the internal storage machinery.
|
||||
//! No other modules from this tree are supposed to be used directly by the external code.
|
||||
//!
|
||||
//! There are a few components the storage machinery consists of:
|
||||
//! * the [`RemoteStorage`] trait, a CRUD-like generic abstraction to use for adapting external storages, with a few implementations:
|
||||
//! * [`local_fs`] allows using the local file system as an external storage
|
||||
//! * [`rust_s3`] uses an entire AWS S3 bucket as an external storage
|
||||
//!
|
||||
//! * synchronization logic in the [`storage_sync`] module that keeps the pageserver state (both the runtime one and the workdir files) and the storage state in sync.
|
||||
//! Synchronization internals are split into submodules
|
||||
//! * [`storage_sync::compression`] for a custom remote storage format used to store timeline files in archives
|
||||
//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files
|
||||
//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively
|
||||
//!
|
||||
//! * a public API to interact with the external world:
|
||||
//! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization
|
||||
//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue new upload and download tasks,
|
||||
//! to be processed by the async loop
|
||||
//!
|
||||
//! Here's a schematic overview of all interactions that the backup machinery and the rest of the pageserver perform:
|
||||
//!
|
||||
//! +------------------------+ +--------->-------+
|
||||
//! | | - - - (init async loop) - - - -> | |
|
||||
//! | | | |
|
||||
//! | | -------------------------------> | async |
|
||||
//! | pageserver | (enqueue timeline sync task) | upload/download |
|
||||
//! | | | loop |
|
||||
//! | | <------------------------------- | |
|
||||
//! | | (apply new timeline sync states) | |
|
||||
//! +------------------------+ +---------<-------+
|
||||
//! |
|
||||
//! |
|
||||
//! CRUD layer file operations |
|
||||
//! (upload/download/delete/list, etc.) |
|
||||
//! V
|
||||
//! +------------------------+
|
||||
//! | |
|
||||
//! | [`RemoteStorage`] impl |
|
||||
//! | |
|
||||
//! | pageserver assumes it |
|
||||
//! | owns exclusive write |
|
||||
//! | access to this storage |
|
||||
//! +------------------------+
|
||||
//!
|
||||
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
|
||||
//! The loop inits the storage connection and checks the remote files stored.
|
||||
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
|
||||
//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports remote-only timelines to the pageserver, so it can
|
||||
//! query their downloads later if they are accessed.
|
||||
//!
|
||||
//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
|
||||
//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint.
|
||||
//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either).
|
||||
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
|
||||
//!
|
||||
//! Synchronization logic is able to communicate back with updated timeline sync states, [`TimelineSyncState`],
|
||||
//! submitted via [`crate::tenant_mgr::set_timeline_states`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
|
||||
//! Such submissions happen in two cases:
|
||||
//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
|
||||
//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
|
||||
//!
|
||||
//! When the pageserver terminates, the upload loop finishes a current sync task (if any) and exits.
|
||||
//!
|
||||
//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at a given moment (identified with `disk_consistent_lsn`).
|
||||
//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
|
||||
//! by the storage upload, if enabled.
|
||||
//! Yet a timeline cannot alter already existing files, and normally cannot remove them either: only a GC process is capable of removing unused files.
|
||||
//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable":
|
||||
//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state
|
||||
//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
|
||||
//! when the newer image is downloaded
|
||||
//!
|
||||
//! To optimize S3 storage (and access), the sync loop compresses the checkpoint files before placing them to S3, and uncompresses them back, keeping track of timeline files and metadata.
|
||||
//! Also, the remote file list is queried once only, at startup, to avoid possible extra costs and latency issues.
|
||||
//!
|
||||
//! NOTES:
|
||||
//! * pageserver assumes it has exclusive write access to the remote storage. Multiple pageservers could be separated within the same storage if the implementation supports it
|
||||
//! (i.e. by using different directories in the local filesystem external storage), but that is entirely up to the storage implementation and not covered by the trait API.
|
||||
//!
|
||||
//! * the sync tasks may not be processed immediately after submission: if they error and get re-enqueued, their execution might be backed off to ensure the error cap is not exceeded too fast.
|
||||
//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.
|
||||
|
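//!
//! As an illustration only, here is a minimal sketch of the startup wiring described above,
//! using just the items declared in this module; the `config` value and the surrounding
//! error handling are assumed to come from the pageserver startup code:
//!
//! ```ignore
//! let SyncStartupData {
//!     initial_timeline_states,
//!     sync_loop_handle,
//! } = start_local_timeline_sync(config)?;
//! // `initial_timeline_states` tells the rest of the pageserver which timelines are
//! // already usable and which still have to be synchronized; `sync_loop_handle` is
//! // `None` when no remote storage (and hence no sync loop) is configured.
//! ```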
||||
mod local_fs;
|
||||
mod rust_s3;
|
||||
mod storage_sync;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
ffi, fs,
|
||||
path::{Path, PathBuf},
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::io;
|
||||
use tracing::{error, info};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download};
|
||||
use self::{local_fs::LocalFs, rust_s3::S3};
|
||||
use crate::{
|
||||
layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
repository::TimelineSyncState,
|
||||
PageServerConf, RemoteStorageKind,
|
||||
};
|
||||
|
||||
/// Any timeline has its own id and its own tenant it belongs to,
|
||||
/// the sync processes group timelines by both for simplicity.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TimelineSyncId(ZTenantId, ZTimelineId);
|
||||
|
||||
impl std::fmt::Display for TimelineSyncId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "(tenant id: {}, timeline id: {})", self.0, self.1)
|
||||
}
|
||||
}
|
||||
|
||||
/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
|
||||
/// Successful initialization includes the case when the sync loop is not started; the startup data is still returned then,
|
||||
/// to simplify the code receiving it.
|
||||
pub struct SyncStartupData {
|
||||
/// A sync state, derived from initial comparison of local timeline files and the remote archives,
|
||||
/// before any sync tasks are executed.
|
||||
/// To reuse the local file scan logic, the timeline states are returned even if no sync loop gets started during init:
|
||||
/// in this case, no remote files exist and all local timelines with correct metadata files are considered ready.
|
||||
pub initial_timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
|
||||
/// A handle to the sync loop, if it was started from the configuration provided.
|
||||
pub sync_loop_handle: Option<thread::JoinHandle<anyhow::Result<()>>>,
|
||||
}
|
||||
|
||||
/// Based on the config, initiates the remote storage connection and starts a separate thread
|
||||
/// that ensures that pageserver and the remote storage are in sync with each other.
|
||||
/// If no external storage configuration is given, no thread or storage initialization is done.
|
||||
/// Along with that, scans local (and, if the sync is enabled, remote) tenant files to determine the initial timeline states.
|
||||
pub fn start_local_timeline_sync(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<SyncStartupData> {
|
||||
let local_timeline_files = local_tenant_timeline_files(config)
|
||||
.context("Failed to collect local tenant timeline files")?;
|
||||
|
||||
match &config.remote_storage_config {
|
||||
Some(storage_config) => match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
LocalFs::new(root.clone(), &config.workdir)?,
|
||||
storage_config.max_concurrent_sync,
|
||||
storage_config.max_sync_errors,
|
||||
),
|
||||
RemoteStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
S3::new(s3_config, &config.workdir)?,
|
||||
storage_config.max_concurrent_sync,
|
||||
storage_config.max_sync_errors,
|
||||
),
|
||||
}
|
||||
.context("Failed to spawn the storage sync thread"),
|
||||
None => {
|
||||
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
|
||||
let mut initial_timeline_states: HashMap<
|
||||
ZTenantId,
|
||||
HashMap<ZTimelineId, TimelineSyncState>,
|
||||
> = HashMap::new();
|
||||
for TimelineSyncId(tenant_id, timeline_id) in local_timeline_files.into_keys() {
|
||||
initial_timeline_states
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.insert(timeline_id, TimelineSyncState::Ready);
|
||||
}
|
||||
Ok(SyncStartupData {
|
||||
initial_timeline_states,
|
||||
sync_loop_handle: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn local_tenant_timeline_files(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)>> {
|
||||
let mut local_tenant_timeline_files = HashMap::new();
|
||||
let tenants_dir = config.tenants_path();
|
||||
for tenants_dir_entry in fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
{
|
||||
match &tenants_dir_entry {
|
||||
Ok(tenants_dir_entry) => {
|
||||
match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) {
|
||||
Ok(collected_files) => {
|
||||
local_tenant_timeline_files.extend(collected_files.into_iter())
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
tenants_dir_entry,
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to list tenants dir entry {:?} in directory {}, reason: {:#}",
|
||||
tenants_dir_entry,
|
||||
tenants_dir.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(local_tenant_timeline_files)
|
||||
}
|
||||
|
||||
fn collect_timelines_for_tenant(
|
||||
config: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
) -> anyhow::Result<HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)>> {
|
||||
let mut timelines: HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)> = HashMap::new();
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(ffi::OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<ZTenantId>()
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
let timelines_dir = config.timelines_path(&tenant_id);
|
||||
|
||||
for timelines_dir_entry in fs::read_dir(&timelines_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to list timelines dir entry for tenant {}",
|
||||
tenant_id
|
||||
)
|
||||
})? {
|
||||
match timelines_dir_entry {
|
||||
Ok(timelines_dir_entry) => {
|
||||
let timeline_path = timelines_dir_entry.path();
|
||||
match collect_timeline_files(&timeline_path) {
|
||||
Ok((timeline_id, metadata, timeline_files)) => {
|
||||
timelines.insert(
|
||||
TimelineSyncId(tenant_id, timeline_id),
|
||||
(metadata, timeline_files),
|
||||
);
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to process timeline dir contents at '{}', reason: {:#}",
|
||||
timeline_path.display(),
|
||||
e
|
||||
),
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
"Failed to list timelines for entry tenant {}, reason: {:#}",
|
||||
tenant_id, e
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(timelines)
|
||||
}
|
||||
|
||||
fn collect_timeline_files(
|
||||
timeline_dir: &Path,
|
||||
) -> anyhow::Result<(ZTimelineId, TimelineMetadata, Vec<PathBuf>)> {
|
||||
let mut timeline_files = Vec::new();
|
||||
let mut timeline_metadata_path = None;
|
||||
|
||||
let timeline_id = timeline_dir
|
||||
.file_name()
|
||||
.and_then(ffi::OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<ZTimelineId>()
|
||||
.context("Could not parse timeline id out of the timeline dir name")?;
|
||||
let timeline_dir_entries =
|
||||
fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
|
||||
for entry in timeline_dir_entries {
|
||||
let entry_path = entry.context("Failed to list timeline dir entry")?.path();
|
||||
if entry_path.is_file() {
|
||||
if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) {
|
||||
timeline_metadata_path = Some(entry_path);
|
||||
} else {
|
||||
timeline_files.push(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let timeline_metadata_path = match timeline_metadata_path {
|
||||
Some(path) => path,
|
||||
None => bail!("No metadata file found in the timeline directory"),
|
||||
};
|
||||
let metadata = TimelineMetadata::from_bytes(
|
||||
&fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
|
||||
)
|
||||
.context("Failed to parse timeline metadata file bytes")?;
|
||||
|
||||
Ok((timeline_id, metadata, timeline_files))
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type StoragePath;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;
|
||||
|
||||
/// Gets the download path of the given storage file.
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;
|
||||
|
||||
/// Streams the local file contents into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the writer given.
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the writer given.
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
|
||||
}
|
||||
|
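// Illustration only: a minimal, hedged sketch (not used by the real sync code) of how a caller
// generic over `RemoteStorage` could mirror a single local file into the storage. The helper
// name `mirror_one_file` is hypothetical; it relies solely on the trait methods declared above.
#[allow(dead_code)]
async fn mirror_one_file<S: RemoteStorage>(storage: &S, local_path: &Path) -> anyhow::Result<()> {
    // Derive the storage key from the pageserver-relative part of the local path.
    let storage_path = storage.storage_path(local_path)?;
    // `tokio::fs::File` satisfies the `AsyncRead + Unpin + Send + Sync + 'static` bound of `upload`.
    let source = tokio::fs::File::open(local_path)
        .await
        .with_context(|| format!("Failed to open '{}' for upload", local_path.display()))?;
    storage.upload(source, &storage_path).await
}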
||||
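// Shared helper: both storage implementations use it to turn an absolute path under the
// pageserver workdir (or under the storage root) into its relative part.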
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
if prefix == path {
|
||||
anyhow::bail!(
|
||||
"Prefix and the path are equal, cannot strip: '{}'",
|
||||
prefix.display()
|
||||
)
|
||||
} else {
|
||||
path.strip_prefix(prefix).with_context(|| {
|
||||
format!(
|
||||
"Path '{}' is not prefixed with '{}'",
|
||||
path.display(),
|
||||
prefix.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
77
pageserver/src/remote_storage/README.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# Non-implementation details
|
||||
|
||||
This document describes the current state of the backup system in pageserver, its existing limitations and concerns, why some things are done the way they are, and the future development plans.
|
||||
A detailed description of how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../remote_storage.rs) and its submodules.
|
||||
Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
|
||||
|
||||
## Approach
|
||||
|
||||
Backup functionality is a new component that appeared well after the core DB functionality was implemented.
|
||||
Pageserver layer functionality is also quite volatile at the moment; there's a risk its local file management changes over time.
|
||||
|
||||
To avoid adding more chaos to that, the backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a separate async loop.
|
||||
This way, the backups are managed in the background without directly affecting other pageserver parts: the backup and restoration process may lag behind, but eventually catches up with reality. To track that, a set of Prometheus metrics is exposed from the pageserver.
|
||||
|
||||
## What's done
|
||||
|
||||
Current implementation
|
||||
* provides remote storage wrappers for AWS S3 and local FS
|
||||
* synchronizes the differences between local timelines and remote states as fast as possible
|
||||
* uploads new relishes, frozen by pageserver checkpoint thread
|
||||
* downloads and registers timelines that are found in the remote storage but missing locally, if those are requested via the pageserver (e.g. HTTP API, GC)
|
||||
* uses compression when dealing with files, for better S3 usage
|
||||
* maintains an index of what's stored remotely
|
||||
* evicts failing tasks and stops the corresponding timelines
|
||||
|
||||
The tasks are delayed with every retry and the retries are capped, to avoid poisonous tasks.
|
||||
After any task eviction, or any error at startup checks (e.g. obviously different and wrong local and remote states for the same timeline),
|
||||
the timeline has to be stopped from submitting further checkpoint upload tasks, which is done along with the corresponding timeline status change.
|
||||
|
||||
No real optimisation or performance testing has been done; the feature is disabled by default and gets polished over time.
|
||||
It's planned to deal with all questions that are currently open and prepare the feature to be enabled by default in cloud environments.
|
||||
|
||||
### Peculiarities
|
||||
|
||||
As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
|
||||
Here's the list of known compromises with comments:
|
||||
|
||||
* The remote storage file model is currently a custom archive format that's not possible to deserialize without our particular Rust code (including `serde`).
|
||||
We also don't optimize the archiving and pack every timeline checkpoint separately, so the size of the resulting blob that ends up on S3 could be arbitrary.
|
||||
But, it's a single blob, which is way better than storing ~780 small files separately.
|
||||
|
||||
* Archive index restoration requires reading every blob's head.
|
||||
This could be avoided by a background thread/future storing the serialized index in the remote storage.
|
||||
|
||||
* no proper file comparison
|
||||
|
||||
No file checksum verification is done currently, but it should be (AWS S3 returns file checksums during the `list` operation)
|
||||
|
||||
* sad rust-s3 api
|
||||
|
||||
rust-s3 is not very pleasant to use:
|
||||
1. it returns `anyhow::Result` and it's hard to distinguish "missing file" cases from "no connection" one, for instance
|
||||
2. at least one function in its API that we need (`get_object_stream`) has the `async` keyword yet blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091)
|
||||
3. it's a prerelease library with unclear maintenance status
|
||||
4. noisy on debug level
|
||||
|
||||
But it's already used in the project, so for now it's reused to avoid bloating the dependency tree.
|
||||
Based on a previous evaluation, even `rusoto-s3` could be a better choice over this library, but that needs further benchmarking.
|
||||
|
||||
|
||||
* gc is ignored
|
||||
|
||||
So far, we don't adjust the remote storage based on GC thread loop results; only the checkpointer loop affects the remote storage.
|
||||
Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives.
|
||||
|
||||
* branches implementation could be improved
|
||||
|
||||
Currently, there's code to sync the branches along with the timeline files: on upload, every local branch file that is missing remotely is uploaded,
|
||||
on the timeline download, missing remote branch files are downloaded.
|
||||
|
||||
A branch is a per-tenant entity, yet the current implementation requires synchronizing a timeline first to get the branch files locally.
|
||||
Currently, there's no other way to learn about the remote branch files, and the file contents are neither verified nor updated.
|
||||
|
||||
* no IT tests
|
||||
|
||||
Automated S3 testing is currently lacking, since there is no convenient way to enable backups during the tests.
|
||||
After it's fixed, benchmark runs should also be carried out to find bottlenecks.
|
||||
689
pageserver/src/remote_storage/local_fs.rs
Normal file
@@ -0,0 +1,689 @@
|
||||
//! Local filesystem acting as a remote storage.
|
||||
//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
|
||||
//!
|
||||
//! This storage is used in pageserver tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
future::Future,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tracing::*;
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage};
|
||||
|
||||
pub struct LocalFs {
|
||||
pageserver_workdir: &'static Path,
|
||||
root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create local FS storage, along with its root directory.
|
||||
pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
"Failed to create all directories in the given root path '{}'",
|
||||
root.display(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok(Self {
|
||||
pageserver_workdir,
|
||||
root,
|
||||
})
|
||||
}
|
||||
|
||||
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
|
||||
if path.is_relative() {
|
||||
Ok(self.root.join(path))
|
||||
} else if path.starts_with(&self.root) {
|
||||
Ok(path.to_path_buf())
|
||||
} else {
|
||||
bail!(
|
||||
"Path '{}' does not belong to the current storage",
|
||||
path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
type StoragePath = PathBuf;
|
||||
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
|
||||
Ok(self.root.join(
|
||||
strip_path_prefix(self.pageserver_workdir, local_path)
|
||||
.context("local path does not belong to this storage")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
|
||||
let relative_path = strip_path_prefix(&self.root, storage_path)
|
||||
.context("local path does not belong to this storage")?;
|
||||
Ok(self.pageserver_workdir.join(relative_path))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
|
||||
Ok(get_all_files(&self.root).await?.into_iter().collect())
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
create_target_directory(&target_file_path).await?;
|
||||
let mut destination = io::BufWriter::new(
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.open(&target_file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open target fs destination at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
|
||||
io::copy(&mut from, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload file to the local storage at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?;
|
||||
destination.flush().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload file to the local storage at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
io::copy(&mut source, to).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to download file '{}' from the local storage",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
source.flush().await?;
|
||||
Ok(())
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
ensure!(
|
||||
end_exclusive > start_inclusive,
|
||||
"Invalid range, start ({}) is bigger then end ({:?})",
|
||||
start_inclusive,
|
||||
end_exclusive
|
||||
);
|
||||
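// NB: a one-byte range (end_exclusive == start_inclusive + 1) is treated as empty and skipped;
// the fs tests below rely on this behaviour.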
if start_inclusive == end_exclusive.saturating_sub(1) {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")?;
|
||||
match end_exclusive {
|
||||
Some(end_exclusive) => {
|
||||
io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
|
||||
}
|
||||
None => io::copy(&mut source, to).await,
|
||||
}
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download file '{}' range from the local storage",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
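// A boxed future is returned because the function recurses into subdirectories,
// which a plain recursive `async fn` cannot express directly.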
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path = dir_entry.path();
|
||||
if file_type.is_symlink() {
|
||||
debug!("{:?} us a symlink, skipping", entry_path)
|
||||
} else if file_type.is_dir() {
|
||||
paths.extend(get_all_files(entry_path).await?.into_iter())
|
||||
} else {
|
||||
paths.push(dir_entry.path());
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path '{}' is not a directory", directory_path.display())
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
None => bail!(
|
||||
"File path '{}' has no parent directory",
|
||||
target_file_path.display()
|
||||
),
|
||||
};
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("storage_path_positive")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name");
|
||||
let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
|
||||
|
||||
assert_eq!(
|
||||
expected_path,
|
||||
storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
|
||||
"File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
|
||||
match storage.storage_path(mismatching_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected path '{}' to error, but got storage path: {:?}",
|
||||
mismatching_path.display(),
|
||||
wrong_path,
|
||||
),
|
||||
Err(e) => format!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("storage_path_negatives")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
};
|
||||
|
||||
let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
|
||||
assert!(error_string.contains("does not belong to this storage"));
|
||||
assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
|
||||
|
||||
let mismatching_path_str = "/something/else";
|
||||
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
|
||||
assert!(
|
||||
error_message.contains(mismatching_path_str),
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(error_message.contains("does not belong to this storage"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("local_path_positive")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let name = "not a metadata";
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
|
||||
assert_eq!(
|
||||
local_path,
|
||||
storage
|
||||
.local_path(
|
||||
&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?)
|
||||
)
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let local_metadata_path = repo_harness
|
||||
.timeline_path(&TIMELINE_ID)
|
||||
.join(METADATA_FILE_NAME);
|
||||
let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
|
||||
assert_eq!(
|
||||
local_metadata_path,
|
||||
storage
|
||||
.local_path(&remote_metadata_path)
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
#[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements
|
||||
fn local_path_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
|
||||
match storage.local_path(storage_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected local path input {:?} to cause an error, but got file path: {:?}",
|
||||
storage_path, wrong_path,
|
||||
),
|
||||
Err(e) => format!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("local_path_negatives")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
};
|
||||
|
||||
let totally_wrong_path = "wrong_wrong_wrong";
|
||||
let error_message = local_path_error(&storage, &PathBuf::from(totally_wrong_path));
|
||||
assert!(error_message.contains(totally_wrong_path));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
|
||||
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let dummy_storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
};
|
||||
|
||||
let storage_path = dummy_storage.storage_path(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&storage_path)?;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
"'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
use std::io::Write;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("upload_file")?;
|
||||
let storage = create_storage()?;
|
||||
|
||||
let source = create_file_for_upload(
|
||||
&storage.pageserver_workdir.join("whatever"),
|
||||
"whatever_contents",
|
||||
)
|
||||
.await?;
|
||||
let target_path = PathBuf::from("/").join("somewhere").join("else");
|
||||
match storage.upload(source, &target_path).await {
|
||||
Ok(()) => panic!("Should not allow storing files with wrong target path"),
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
assert!(message.contains(&target_path.display().to_string()));
|
||||
assert!(message.contains("does not belong to the current storage"));
|
||||
}
|
||||
}
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
|
||||
assert_eq!(
|
||||
storage.list().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
|
||||
assert_eq!(
|
||||
list_files_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
"Should list a two different files after second upload"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
|
||||
let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
|
||||
Ok(storage)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage.download(&upload_target, &mut content_bytes).await?;
|
||||
content_bytes.flush().await?;
|
||||
|
||||
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
contents,
|
||||
"We should upload and download the same contents"
|
||||
);
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage.download(&non_existing_path, &mut io::sink()).await {
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&non_existing_path.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file_range_positive")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(&upload_target, 0, None, &mut full_range_bytes)
|
||||
.await?;
|
||||
full_range_bytes.flush().await?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
String::from_utf8(full_range_bytes.into_inner().into_inner())?,
|
||||
"Download full range should return the whole upload"
|
||||
);
|
||||
|
||||
let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let same_byte = 1_000_000_000;
|
||||
storage
|
||||
.download_range(
|
||||
&upload_target,
|
||||
same_byte,
|
||||
Some(same_byte + 1), // exclusive end
|
||||
&mut zero_range_bytes,
|
||||
)
|
||||
.await?;
|
||||
zero_range_bytes.flush().await?;
|
||||
assert!(
|
||||
zero_range_bytes.into_inner().into_inner().is_empty(),
|
||||
"Zero byte range should not download any part of the file"
|
||||
);
|
||||
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
|
||||
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
first_part_local,
|
||||
first_part_remote.as_slice(),
|
||||
"First part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
Some((first_part_local.len() + second_part_local.len()) as u64),
|
||||
&mut second_part_remote,
|
||||
)
|
||||
.await?;
|
||||
second_part_remote.flush().await?;
|
||||
let second_part_remote = second_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
second_part_local,
|
||||
second_part_remote.as_slice(),
|
||||
"Second part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_negative() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file_range_negative")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let start = 10000;
|
||||
let end = 234;
|
||||
assert!(start > end, "Should test an incorrect range");
|
||||
match storage
|
||||
.download_range(&upload_target, start, Some(end), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading wrong ranges"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("Invalid range"));
|
||||
assert!(error_string.contains(&start.to_string()));
|
||||
assert!(error_string.contains(&end.to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage
|
||||
.download_range(&non_existing_path, 1, Some(3), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&non_existing_path.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("delete_file")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
match storage.delete(&upload_target).await {
|
||||
Ok(()) => panic!("Should not allow deleting non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&upload_target.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
harness: &RepoHarness,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
|
||||
let storage_path = storage.root.join(relative_timeline_path).join(name);
|
||||
storage
|
||||
.upload(
|
||||
create_file_for_upload(
|
||||
&storage.pageserver_workdir.join(name),
|
||||
&dummy_contents(name),
|
||||
)
|
||||
.await?,
|
||||
&storage_path,
|
||||
)
|
||||
.await?;
|
||||
Ok(storage_path)
|
||||
}
|
||||
|
||||
async fn create_file_for_upload(
|
||||
path: &Path,
|
||||
contents: &str,
|
||||
) -> anyhow::Result<io::BufReader<fs::File>> {
|
||||
std::fs::create_dir_all(path.parent().unwrap())?;
|
||||
let mut file_for_writing = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(path)?;
|
||||
write!(file_for_writing, "{}", contents)?;
|
||||
drop(file_for_writing);
|
||||
Ok(io::BufReader::new(
|
||||
fs::OpenOptions::new().read(true).open(&path).await?,
|
||||
))
|
||||
}
|
||||
|
||||
fn dummy_contents(name: &str) -> String {
|
||||
format!("contents for {}", name)
|
||||
}
|
||||
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
|
||||
let mut files = storage.list().await?;
|
||||
files.sort();
|
||||
Ok(files)
|
||||
}
|
||||
}
|
||||
373
pageserver/src/remote_storage/rust_s3.rs
Normal file
@@ -0,0 +1,373 @@
|
||||
//! AWS S3 storage wrapper around `rust_s3` library.
|
||||
//! Currently does not allow multiple pageservers to use the same bucket concurrently: objects are
|
||||
//! placed in the root of the bucket.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
use s3::{bucket::Bucket, creds::Credentials, region::Region};
|
||||
use tokio::io::{self, AsyncWriteExt};
|
||||
|
||||
use crate::{
|
||||
remote_storage::{strip_path_prefix, RemoteStorage},
|
||||
S3Config,
|
||||
};
|
||||
|
||||
const S3_FILE_SEPARATOR: char = '/';
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub struct S3ObjectKey(String);
|
||||
|
||||
impl S3ObjectKey {
|
||||
fn key(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
|
||||
fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf {
|
||||
pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>())
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3 {
|
||||
pageserver_workdir: &'static Path,
|
||||
bucket: Bucket,
|
||||
}
|
||||
|
||||
impl S3 {
|
||||
/// Creates the storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
let region = aws_config
|
||||
.bucket_region
|
||||
.parse::<Region>()
|
||||
.context("Failed to parse the s3 region from config")?;
|
||||
let credentials = Credentials::new(
|
||||
aws_config.access_key_id.as_deref(),
|
||||
aws_config.secret_access_key.as_deref(),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.context("Failed to create the s3 credentials")?;
|
||||
Ok(Self {
|
||||
bucket: Bucket::new_with_path_style(
|
||||
aws_config.bucket_name.as_str(),
|
||||
region,
|
||||
credentials,
|
||||
)
|
||||
.context("Failed to create the s3 bucket")?,
|
||||
pageserver_workdir,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3 {
|
||||
type StoragePath = S3ObjectKey;
|
||||
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
|
||||
let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
|
||||
let mut key = String::new();
|
||||
for segment in relative_path {
|
||||
key.push(S3_FILE_SEPARATOR);
|
||||
key.push_str(&segment.to_string_lossy());
|
||||
}
|
||||
Ok(S3ObjectKey(key))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
|
||||
Ok(storage_path.download_destination(self.pageserver_workdir))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
|
||||
let list_response = self
|
||||
.bucket
|
||||
.list(String::new(), None)
|
||||
.await
|
||||
.context("Failed to list s3 objects")?;
|
||||
|
||||
Ok(list_response
|
||||
.into_iter()
|
||||
.flat_map(|response| response.contents)
|
||||
.map(|s3_object| S3ObjectKey(s3_object.key))
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
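// The upload contents are buffered fully in memory here and then sent with a single `put_object` call.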
let mut upload_contents = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(&mut from, &mut upload_contents)
|
||||
.await
|
||||
.context("Failed to read the upload contents")?;
|
||||
upload_contents
|
||||
.flush()
|
||||
.await
|
||||
.context("Failed to read the upload contents")?;
|
||||
let upload_contents = upload_contents.into_inner().into_inner();
|
||||
|
||||
let (_, code) = self
|
||||
.bucket
|
||||
.put_object(to.key(), &upload_contents)
|
||||
.await
|
||||
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during creating object with key '{}', code: {}",
|
||||
to.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
let (data, code) = self
|
||||
.bucket
|
||||
.get_object(from.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during downloading object, code: {}",
|
||||
code
|
||||
))
|
||||
} else {
|
||||
// we don't have to write the vector into the destination this way, `to.write_all` would be enough.
|
||||
// but we want to prepare for a migration to `rusoto`, which has a streaming HTTP body here instead, with
|
||||
// which it makes more sense to use `io::copy`.
|
||||
io::copy(&mut data.as_slice(), to)
|
||||
.await
|
||||
.context("Failed to write downloaded data into the destination buffer")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
|
||||
// and needs both ends to be inclusive
|
||||
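// E.g. a caller range of [0, 10) is sent to S3 as the inclusive range 0..=9.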
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
|
||||
let (data, code) = self
|
||||
.bucket
|
||||
.get_object_range(from.key(), start_inclusive, end_inclusive)
|
||||
.await
|
||||
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
|
||||
if code != 206 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-206 exit code during downloading object range, code: {}",
|
||||
code
|
||||
))
|
||||
} else {
|
||||
// see `download` function above for the comment on why `Vec<u8>` buffer is copied this way
|
||||
io::copy(&mut data.as_slice(), to)
|
||||
.await
|
||||
.context("Failed to write downloaded range into the destination buffer")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
let (_, code) = self
|
||||
.bucket
|
||||
.delete_object(path.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
|
||||
if code != 204 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-204 exit code during deleting object with key '{}', code: {}",
|
||||
path.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn download_destination() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_destination")?;
|
||||
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
|
||||
let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
|
||||
|
||||
let key = S3ObjectKey(format!(
|
||||
"{}{}",
|
||||
S3_FILE_SEPARATOR,
|
||||
relative_path
|
||||
.iter()
|
||||
.map(|segment| segment.to_str().unwrap())
|
||||
.collect::<Vec<_>>()
|
||||
.join(&S3_FILE_SEPARATOR.to_string()),
|
||||
));
|
||||
|
||||
assert_eq!(
|
||||
local_path,
|
||||
key.download_destination(&repo_harness.conf.workdir),
|
||||
"Download destination should consist of s3 path joined with the pageserver workdir prefix"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("storage_path_positive")?;
|
||||
|
||||
let segment_1 = "matching";
|
||||
let segment_2 = "file";
|
||||
let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
|
||||
let expected_key = S3ObjectKey(format!(
|
||||
"{SEPARATOR}{}{SEPARATOR}{}",
|
||||
segment_1,
|
||||
segment_2,
|
||||
SEPARATOR = S3_FILE_SEPARATOR,
|
||||
));
|
||||
|
||||
let actual_key = dummy_storage(&repo_harness.conf.workdir)
|
||||
.storage_path(local_path)
|
||||
.expect("Matching path should map to S3 path normally");
|
||||
assert_eq!(
|
||||
expected_key,
|
||||
actual_key,
|
||||
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String {
|
||||
match storage.storage_path(mismatching_path) {
|
||||
Ok(wrong_key) => panic!(
|
||||
"Expected path '{}' to error, but got S3 key: {:?}",
|
||||
mismatching_path.display(),
|
||||
wrong_key,
|
||||
),
|
||||
Err(e) => e.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("storage_path_negatives")?;
|
||||
let storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
|
||||
let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
|
||||
assert!(
|
||||
error_message.contains("Prefix and the path are equal"),
|
||||
"Message '{}' does not contain the required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
let mismatching_path = PathBuf::from("somewhere").join("else");
|
||||
let error_message = storage_path_error(&storage, &mismatching_path);
|
||||
assert!(
|
||||
error_message.contains(mismatching_path.to_str().unwrap()),
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains("is not prefixed with"),
|
||||
"Message '{}' does not contain a required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("local_path_positive")?;
|
||||
let storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?;
|
||||
|
||||
let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata"));
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&repo_harness.conf.workdir),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME));
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&repo_harness.conf.workdir),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
|
||||
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
|
||||
|
||||
let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
|
||||
let key = dummy_storage.storage_path(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&key)?;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
"'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn dummy_storage(pageserver_workdir: &'static Path) -> S3 {
|
||||
S3 {
|
||||
pageserver_workdir,
|
||||
bucket: Bucket::new(
|
||||
"dummy-bucket",
|
||||
"us-east-1".parse().unwrap(),
|
||||
Credentials::anonymous().unwrap(),
|
||||
)
|
||||
.unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_key(relative_file_path: &Path) -> S3ObjectKey {
|
||||
S3ObjectKey(
|
||||
relative_file_path
|
||||
.iter()
|
||||
.fold(String::new(), |mut path_string, segment| {
|
||||
path_string.push(S3_FILE_SEPARATOR);
|
||||
path_string.push_str(segment.to_str().unwrap());
|
||||
path_string
|
||||
}),
|
||||
)
|
||||
}
|
||||
}
|
||||
pageserver/src/remote_storage/storage_sync.rs (new file, 1036 lines; diff suppressed because it is too large)
pageserver/src/remote_storage/storage_sync/compression.rs (new file, 611 lines):
|
||||
//! A set of structs to represent a compressed part of the timeline, and methods to asynchronously compress and uncompress a stream of data,
//! without holding the entire data in memory.
//! For the latter, both the compress and uncompress functions operate on buffered streams (currently a hardcoded size of [`ARCHIVE_STREAM_BUFFER_SIZE_BYTES`]),
//! not attempting to hold the entire archive in memory.
//!
//! The compression is done with the <a href="https://datatracker.ietf.org/doc/html/rfc8878">zstd</a> streaming algorithm via the `async-compression` crate.
//! The crate does not contain any knobs to tweak the compression, but it is one of the few that are both async and have an API to manage a part of an archive.
//! Zstd was picked as the best algorithm among the ones available in the crate, after testing the initial timeline file compression.
//!
//! Archiving is almost agnostic to timeline file types, with the exception of the metadata file, which is currently distinguished in the [un]compression code.
//! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting it.
//! When compressed, the metadata file is always required and stored as the last file in the archive stream.
//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other relishes are decompressed successfully first.
//!
//! Archive structure:
//! +----------------------------------------+
//! | header | file_1, ..., file_k, metadata |
//! +----------------------------------------+
//!
//! The archive consists of two separate zstd archives:
//! * the header archive, which contains all file names, their sizes and relative paths in the timeline directory.
//! The header is a Rust structure, serialized into bytes and compressed with zstd.
//! * the files archive, which has the metadata file as the last entry, all compressed with zstd into a single binary blob.
//!
//! The header offset is stored in the file name, along with the `disk_consistent_lsn` from the metadata file.
//! See [`parse_archive_name`] and [`ARCHIVE_EXTENSION`] for the name details, example: `00000000016B9150-.zst_9732`.
//! This way, the header can be retrieved without reading the entire archive file.
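// A minimal, hedged illustration of the naming scheme described above: the values mirror the
// doc-comment example and round-trip through `archive_name` and `parse_archive_name` below.
#[cfg(test)]
mod archive_name_example {
    use super::*;

    #[test]
    fn archive_name_round_trip() {
        // 0x16B9150 is the `disk_consistent_lsn`, 9732 is the compressed header size in bytes.
        let name = archive_name(Lsn(0x16B9150), 9732);
        assert_eq!(name, "00000000016B9150-.zst_9732");

        let (lsn, header_size) =
            parse_archive_name(Path::new(&name)).expect("example name should parse back");
        assert_eq!((lsn, header_size), (Lsn(0x16B9150), 9732));
    }
}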
|
||||
|
||||
use std::{
|
||||
collections::BTreeSet,
|
||||
future::Future,
|
||||
io::Cursor,
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncWriteExt},
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME};
|
||||
|
||||
use super::index::RelativePath;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct ArchiveHeader {
|
||||
/// All regular timeline files, excluding the metadata file.
|
||||
pub files: Vec<FileEntry>,
|
||||
// Metadata file name is known to the system, as its location relative to the timeline dir,
|
||||
// so no need to store anything but its size in bytes.
|
||||
pub metadata_file_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub struct FileEntry {
|
||||
/// Uncompressed file size, bytes.
|
||||
pub size: u64,
|
||||
/// A path, relative to the directory root, used when compressing the directory contents.
|
||||
pub subpath: RelativePath,
|
||||
}
|
||||
|
||||
const ARCHIVE_EXTENSION: &str = "-.zst_";
|
||||
const ARCHIVE_STREAM_BUFFER_SIZE_BYTES: usize = 4 * 1024 * 1024;
|
||||
|
||||
/// Streams an archive of the given files into a stream target, defined by the closure.
///
/// The closure approach is picked for cases like S3, where we need the name of the file before we can get a stream to write the bytes into.
/// The current idea is to place the header size in the name of the file, to enable fast partial remote file index restoration without actually reading remote storage file contents.
///
/// Performs the compression in multiple steps:
/// * prepares an archive header, stripping the `source_dir` prefix from the `files`
/// * generates the name of the archive
/// * prepares the archive producer future, knowing the header and the file list
/// An `impl AsyncRead` and `impl AsyncWrite` pair of connected streams is created to implement partial contents streaming.
/// The writer end goes into the archive producer future, to put the header and a stream of compressed files.
/// * prepares the archive consumer future, by executing the provided closure
/// The closure gets the reader end stream and the name of the file to create a future that streams the file contents elsewhere.
/// * runs and waits for both futures to complete
/// * on successful completion of both futures, the header, its size and the user-defined consumer future's return data are returned
/// Due to the design above, the archive name and related data are visible inside the consumer future only, so it's possible to return the data
/// needed for further processing.
|
||||
pub async fn archive_files_as_stream<Cons, ConsRet, Fut>(
|
||||
source_dir: &Path,
|
||||
files: impl Iterator<Item = &PathBuf>,
|
||||
metadata: &TimelineMetadata,
|
||||
create_archive_consumer: Cons,
|
||||
) -> anyhow::Result<(ArchiveHeader, u64, ConsRet)>
|
||||
where
|
||||
Cons: FnOnce(Box<dyn io::AsyncRead + Unpin + Send + Sync + 'static>, String) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
Fut: Future<Output = anyhow::Result<ConsRet>> + Send + 'static,
|
||||
ConsRet: Send + Sync + 'static,
|
||||
{
|
||||
let metadata_bytes = metadata
|
||||
.to_bytes()
|
||||
.context("Failed to create metadata bytes")?;
|
||||
let (archive_header, compressed_header_bytes) =
|
||||
prepare_header(source_dir, files, &metadata_bytes)
|
||||
.await
|
||||
.context("Failed to prepare file for archivation")?;
|
||||
|
||||
let header_size = compressed_header_bytes.len() as u64;
|
||||
let (write, read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
|
||||
let archive_filler = write_archive_contents(
|
||||
source_dir.to_path_buf(),
|
||||
archive_header.clone(),
|
||||
metadata_bytes,
|
||||
write,
|
||||
);
|
||||
let archive_name = archive_name(metadata.disk_consistent_lsn(), header_size);
|
||||
let archive_stream =
|
||||
Cursor::new(compressed_header_bytes).chain(ZstdEncoder::new(io::BufReader::new(read)));
|
||||
|
||||
let (archive_creation_result, archive_upload_result) = tokio::join!(
|
||||
tokio::spawn(archive_filler),
|
||||
tokio::spawn(async move {
|
||||
create_archive_consumer(Box::new(archive_stream), archive_name).await
|
||||
})
|
||||
);
|
||||
archive_creation_result
|
||||
.context("Failed to spawn archive creation future")?
|
||||
.context("Failed to create an archive")?;
|
||||
let upload_return_value = archive_upload_result
|
||||
.context("Failed to spawn archive upload future")?
|
||||
.context("Failed to upload the archive")?;
|
||||
|
||||
Ok((archive_header, header_size, upload_return_value))
|
||||
}
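// Hedged usage sketch: how a caller might wire a plain local-file consumer into
// `archive_files_as_stream`, mirroring the shape of the tests below. The function and the
// `target_dir` parameter are illustrative, not part of the public API.
#[cfg(test)]
mod archive_consumer_example {
    use super::*;

    #[allow(dead_code)]
    async fn archive_timeline_to_local_file(
        timeline_dir: &Path,
        files: &[PathBuf],
        metadata: &TimelineMetadata,
        target_dir: PathBuf,
    ) -> anyhow::Result<(ArchiveHeader, u64, PathBuf)> {
        archive_files_as_stream(
            timeline_dir,
            files.iter(),
            metadata,
            move |mut archive_stream, archive_name| async move {
                // The consumer learns the final archive name (with the header size embedded)
                // and decides where the produced bytes go; here they go to a local file.
                let target = target_dir.join(&archive_name);
                let mut file = fs::File::create(&target).await?;
                io::copy(&mut archive_stream, &mut file).await?;
                Ok(target)
            },
        )
        .await
    }
}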
|
||||
|
||||
/// Similar to [`archive_files_as_stream`], creates a pair of streams to uncompress the second part of the archive,
/// the one that contains the files and is located after the header.
/// S3 allows downloading partial file contents for a given file key (i.e. name); a closure is used to accommodate this retrieval.
/// The same concepts of two concurrent futures, a user-defined closure and its return value apply here, but the
/// producer and consumer ends are swapped, since decompression happens on this side instead.
|
||||
pub async fn uncompress_file_stream_with_index<Prod, ProdRet, Fut>(
|
||||
destination_dir: PathBuf,
|
||||
files_to_skip: Arc<BTreeSet<PathBuf>>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
header: ArchiveHeader,
|
||||
header_size: u64,
|
||||
create_archive_file_part: Prod,
|
||||
) -> anyhow::Result<ProdRet>
|
||||
where
|
||||
Prod: FnOnce(Box<dyn io::AsyncWrite + Unpin + Send + Sync + 'static>, String) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
Fut: Future<Output = anyhow::Result<ProdRet>> + Send + 'static,
|
||||
ProdRet: Send + Sync + 'static,
|
||||
{
|
||||
let (write, mut read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
|
||||
let archive_name = archive_name(disk_consistent_lsn, header_size);
|
||||
|
||||
let (archive_download_result, archive_uncompress_result) = tokio::join!(
|
||||
tokio::spawn(async move { create_archive_file_part(Box::new(write), archive_name).await }),
|
||||
tokio::spawn(async move {
|
||||
uncompress_with_header(&files_to_skip, &destination_dir, header, &mut read).await
|
||||
})
|
||||
);
|
||||
|
||||
let download_value = archive_download_result
|
||||
.context("Failed to spawn archive download future")?
|
||||
.context("Failed to download an archive")?;
|
||||
archive_uncompress_result
|
||||
.context("Failed to spawn archive uncompress future")?
|
||||
.context("Failed to uncompress the archive")?;
|
||||
|
||||
Ok(download_value)
|
||||
}
|
||||
|
||||
/// Reads archive header from the stream given:
|
||||
/// * parses the file name to get the header size
|
||||
/// * reads the exact amount of bytes
|
||||
/// * uncompresses and deserializes those
|
||||
pub async fn read_archive_header<A: io::AsyncRead + Send + Sync + Unpin>(
|
||||
archive_name: &str,
|
||||
from: &mut A,
|
||||
) -> anyhow::Result<ArchiveHeader> {
|
||||
let (_, header_size) = parse_archive_name(Path::new(archive_name))?;
|
||||
|
||||
let mut compressed_header_bytes = vec![0; header_size as usize];
|
||||
from.read_exact(&mut compressed_header_bytes)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to read header header from the archive {}",
|
||||
archive_name
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut header_bytes = Vec::new();
|
||||
ZstdDecoder::new(io::BufReader::new(compressed_header_bytes.as_slice()))
|
||||
.read_to_end(&mut header_bytes)
|
||||
.await
|
||||
.context("Failed to decompress a header from the archive")?;
|
||||
|
||||
Ok(ArchiveHeader::des(&header_bytes)
|
||||
.context("Failed to deserialize a header from the archive")?)
|
||||
}
|
||||
|
||||
/// Reads the archive metadata out of the archive name:
|
||||
/// * `disk_consistent_lsn` of the checkpoint that was archived
|
||||
/// * size of the archive header
|
||||
pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> {
|
||||
let archive_name = archive_path
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("Archive '{}' has no file name", archive_path.display()))?
|
||||
.to_string_lossy();
|
||||
let (lsn_str, header_size_str) =
|
||||
archive_name.rsplit_once(ARCHIVE_EXTENSION).ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Archive '{}' has incorrect extension, expected to contain '{}'",
|
||||
archive_path.display(),
|
||||
ARCHIVE_EXTENSION
|
||||
)
|
||||
})?;
|
||||
let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has an invalid disk consistent lsn in its extension",
|
||||
archive_path.display(),
|
||||
)
|
||||
})?;
|
||||
let header_size = header_size_str.parse::<u64>().with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has an invalid a header offset number in its extension",
|
||||
archive_path.display(),
|
||||
)
|
||||
})?;
|
||||
Ok((disk_consistent_lsn, header_size))
|
||||
}
|
||||
|
||||
fn archive_name(disk_consistent_lsn: Lsn, header_size: u64) -> String {
|
||||
let archive_name = format!(
|
||||
"{:016X}{ARCHIVE_EXTENSION}{}",
|
||||
u64::from(disk_consistent_lsn),
|
||||
header_size,
|
||||
ARCHIVE_EXTENSION = ARCHIVE_EXTENSION,
|
||||
);
|
||||
archive_name
|
||||
}
|
||||
|
||||
async fn uncompress_with_header(
|
||||
files_to_skip: &BTreeSet<PathBuf>,
|
||||
destination_dir: &Path,
|
||||
header: ArchiveHeader,
|
||||
archive_after_header: impl io::AsyncRead + Send + Sync + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Uncompressing archive into {}", destination_dir.display());
|
||||
let mut archive = ZstdDecoder::new(io::BufReader::new(archive_after_header));
|
||||
|
||||
if !destination_dir.exists() {
|
||||
fs::create_dir_all(&destination_dir)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create target directory at {}",
|
||||
destination_dir.display()
|
||||
)
|
||||
})?;
|
||||
} else if !destination_dir.is_dir() {
|
||||
bail!(
|
||||
"Destination path '{}' is not a valid directory",
|
||||
destination_dir.display()
|
||||
);
|
||||
}
|
||||
debug!("Will extract {} files from the archive", header.files.len());
|
||||
for entry in header.files {
|
||||
uncompress_entry(
|
||||
&mut archive,
|
||||
&entry.subpath.as_path(destination_dir),
|
||||
entry.size,
|
||||
files_to_skip,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to uncompress archive entry {:?}", entry))?;
|
||||
}
|
||||
uncompress_entry(
|
||||
&mut archive,
|
||||
&destination_dir.join(METADATA_FILE_NAME),
|
||||
header.metadata_file_size,
|
||||
files_to_skip,
|
||||
)
|
||||
.await
|
||||
.context("Failed to uncompress the metadata entry")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn uncompress_entry(
|
||||
archive: &mut ZstdDecoder<io::BufReader<impl io::AsyncRead + Send + Sync + Unpin>>,
|
||||
destination_path: &Path,
|
||||
entry_size: u64,
|
||||
files_to_skip: &BTreeSet<PathBuf>,
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(parent) = destination_path.parent() {
|
||||
fs::create_dir_all(parent).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create parent directory for {}",
|
||||
destination_path.display()
|
||||
)
|
||||
})?;
|
||||
};
|
||||
|
||||
if files_to_skip.contains(destination_path) {
|
||||
debug!("Skipping {}", destination_path.display());
|
||||
copy_n_bytes(entry_size, archive, &mut io::sink())
|
||||
.await
|
||||
.context("Failed to skip bytes in the archive")?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut destination =
|
||||
io::BufWriter::new(fs::File::create(&destination_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open file {} for extraction",
|
||||
destination_path.display()
|
||||
)
|
||||
})?);
|
||||
copy_n_bytes(entry_size, archive, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to write extracted archive contents into file {}",
|
||||
destination_path.display()
|
||||
)
|
||||
})?;
|
||||
destination
|
||||
.flush()
|
||||
.await
|
||||
.context("Failed to flush the streaming archive bytes")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn write_archive_contents(
|
||||
source_dir: PathBuf,
|
||||
header: ArchiveHeader,
|
||||
metadata_bytes: Vec<u8>,
|
||||
mut archive_input: io::DuplexStream,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Starting writing files into archive");
|
||||
for file_entry in header.files {
|
||||
let path = file_entry.subpath.as_path(&source_dir);
|
||||
let mut source_file =
|
||||
io::BufReader::new(fs::File::open(&path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open file for archiving to path {}",
|
||||
path.display()
|
||||
)
|
||||
})?);
|
||||
let bytes_written = io::copy(&mut source_file, &mut archive_input)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open add a file into archive, file path {}",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
file_entry.size == bytes_written,
|
||||
"File {} was written to the archive incompletely",
|
||||
path.display()
|
||||
);
|
||||
trace!(
|
||||
"Added file '{}' ({} bytes) into the archive",
|
||||
path.display(),
|
||||
bytes_written
|
||||
);
|
||||
}
|
||||
let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input)
|
||||
.await
|
||||
.with_context(|| "Failed to add metadata into the archive")?;
|
||||
ensure!(
|
||||
header.metadata_file_size == metadata_bytes_written,
|
||||
"Metadata file was written to the archive incompletely",
|
||||
);
|
||||
|
||||
archive_input
|
||||
.shutdown()
|
||||
.await
|
||||
.context("Failed to finalize the archive")?;
|
||||
debug!("Successfully streamed all files into the archive");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn prepare_header(
|
||||
source_dir: &Path,
|
||||
files: impl Iterator<Item = &PathBuf>,
|
||||
metadata_bytes: &[u8],
|
||||
) -> anyhow::Result<(ArchiveHeader, Vec<u8>)> {
|
||||
let mut archive_files = Vec::new();
|
||||
for file_path in files {
|
||||
let file_metadata = fs::metadata(file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata during archive indexing for {}",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
file_metadata.is_file(),
|
||||
"Archive indexed path {} is not a file",
|
||||
file_path.display()
|
||||
);
|
||||
|
||||
if file_path.file_name().and_then(|name| name.to_str()) != Some(METADATA_FILE_NAME) {
|
||||
let entry = FileEntry {
|
||||
subpath: RelativePath::new(source_dir, file_path).with_context(|| {
|
||||
format!(
|
||||
"File '{}' does not belong to pageserver workspace",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
size: file_metadata.len(),
|
||||
};
|
||||
archive_files.push(entry);
|
||||
}
|
||||
}
|
||||
|
||||
let header = ArchiveHeader {
|
||||
files: archive_files,
|
||||
metadata_file_size: metadata_bytes.len() as u64,
|
||||
};
|
||||
|
||||
debug!("Appending a header for {} files", header.files.len());
|
||||
let header_bytes = header.ser().context("Failed to serialize a header")?;
|
||||
debug!("Header bytes len {}", header_bytes.len());
|
||||
let mut compressed_header_bytes = Vec::new();
|
||||
ZstdEncoder::new(io::BufReader::new(header_bytes.as_slice()))
|
||||
.read_to_end(&mut compressed_header_bytes)
|
||||
.await
|
||||
.context("Failed to compress header bytes")?;
|
||||
debug!(
|
||||
"Compressed header bytes len {}",
|
||||
compressed_header_bytes.len()
|
||||
);
|
||||
Ok((header, compressed_header_bytes))
|
||||
}
|
||||
|
||||
async fn copy_n_bytes(
|
||||
n: u64,
|
||||
from: &mut (impl io::AsyncRead + Send + Sync + Unpin),
|
||||
into: &mut (impl io::AsyncWrite + Send + Sync + Unpin),
|
||||
) -> anyhow::Result<()> {
|
||||
let bytes_written = io::copy(&mut from.take(n), into).await?;
|
||||
ensure!(
|
||||
bytes_written == n,
|
||||
"Failed to read exactly {} bytes from the input, bytes written: {}",
|
||||
n,
|
||||
bytes_written,
|
||||
);
|
||||
Ok(())
|
||||
}
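// Hedged sketch of the contract above: `copy_n_bytes` copies exactly `n` bytes and errors out
// when the source is shorter, which the extraction code relies on to detect truncated entries.
#[cfg(test)]
mod copy_n_bytes_example {
    use super::*;

    #[tokio::test]
    async fn copies_exactly_n_bytes_or_errors() {
        let data = b"hello".to_vec();

        let mut out = Cursor::new(Vec::new());
        copy_n_bytes(5, &mut data.as_slice(), &mut out)
            .await
            .expect("source has exactly 5 bytes");
        assert_eq!(out.into_inner(), data);

        // Asking for more bytes than the source contains must fail.
        let mut sink = io::sink();
        assert!(copy_n_bytes(6, &mut data.as_slice(), &mut sink).await.is_err());
    }
}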
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tokio::{fs, io::AsyncSeekExt};
|
||||
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn compress_and_uncompress() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("compress_and_uncompress")?;
|
||||
let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
init_directory(
|
||||
&timeline_dir,
|
||||
vec![
|
||||
("first", "first_contents"),
|
||||
("second", "second_contents"),
|
||||
(METADATA_FILE_NAME, "wrong_metadata"),
|
||||
],
|
||||
)
|
||||
.await?;
|
||||
let timeline_files = list_file_paths_with_contents(&timeline_dir).await?;
|
||||
assert_eq!(
|
||||
timeline_files,
|
||||
vec![
|
||||
(
|
||||
timeline_dir.join("first"),
|
||||
FileContents::Text("first_contents".to_string())
|
||||
),
|
||||
(
|
||||
timeline_dir.join(METADATA_FILE_NAME),
|
||||
FileContents::Text("wrong_metadata".to_string())
|
||||
),
|
||||
(
|
||||
timeline_dir.join("second"),
|
||||
FileContents::Text("second_contents".to_string())
|
||||
),
|
||||
],
|
||||
"Initial timeline contents should contain two normal files and a wrong metadata file"
|
||||
);
|
||||
|
||||
let metadata = TimelineMetadata::new(Lsn(0x30), None, None, Lsn(0), Lsn(0), Lsn(0));
|
||||
let paths_to_archive = timeline_files
|
||||
.into_iter()
|
||||
.map(|(path, _)| path)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let tempdir = tempfile::tempdir()?;
|
||||
let base_path = tempdir.path().to_path_buf();
|
||||
let (header, header_size, archive_target) = archive_files_as_stream(
|
||||
&timeline_dir,
|
||||
paths_to_archive.iter(),
|
||||
&metadata,
|
||||
move |mut archive_streamer, archive_name| async move {
|
||||
let archive_target = base_path.join(&archive_name);
|
||||
let mut archive_file = fs::File::create(&archive_target).await?;
|
||||
io::copy(&mut archive_streamer, &mut archive_file).await?;
|
||||
Ok(archive_target)
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut file = fs::File::open(&archive_target).await?;
|
||||
file.seek(io::SeekFrom::Start(header_size)).await?;
|
||||
let target_dir = tempdir.path().join("extracted");
|
||||
uncompress_with_header(&BTreeSet::new(), &target_dir, header, file).await?;
|
||||
|
||||
let extracted_files = list_file_paths_with_contents(&target_dir).await?;
|
||||
|
||||
assert_eq!(
|
||||
extracted_files,
|
||||
vec![
|
||||
(
|
||||
target_dir.join("first"),
|
||||
FileContents::Text("first_contents".to_string())
|
||||
),
|
||||
(
|
||||
target_dir.join(METADATA_FILE_NAME),
|
||||
FileContents::Binary(metadata.to_bytes()?)
|
||||
),
|
||||
(
|
||||
target_dir.join("second"),
|
||||
FileContents::Text("second_contents".to_string())
|
||||
),
|
||||
],
|
||||
"Extracted files should contain all local timeline files besides its metadata, which should be taken from the arguments"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn init_directory(
|
||||
root: &Path,
|
||||
files_with_contents: Vec<(&str, &str)>,
|
||||
) -> anyhow::Result<()> {
|
||||
fs::create_dir_all(root).await?;
|
||||
for (file_name, contents) in files_with_contents {
|
||||
fs::File::create(root.join(file_name))
|
||||
.await?
|
||||
.write_all(contents.as_bytes())
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum FileContents {
|
||||
Text(String),
|
||||
Binary(Vec<u8>),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for FileContents {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Text(text) => f.debug_tuple("Text").field(text).finish(),
|
||||
Self::Binary(bytes) => f
|
||||
.debug_tuple("Binary")
|
||||
.field(&format!("{} bytes", bytes.len()))
|
||||
.finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_file_paths_with_contents(
|
||||
root: &Path,
|
||||
) -> anyhow::Result<Vec<(PathBuf, FileContents)>> {
|
||||
let mut file_paths = Vec::new();
|
||||
|
||||
let mut dir_listings = vec![fs::read_dir(root).await?];
|
||||
while let Some(mut dir_listing) = dir_listings.pop() {
|
||||
while let Some(entry) = dir_listing.next_entry().await? {
|
||||
let entry_path = entry.path();
|
||||
if entry_path.is_file() {
|
||||
let contents = match String::from_utf8(fs::read(&entry_path).await?) {
|
||||
Ok(text) => FileContents::Text(text),
|
||||
Err(e) => FileContents::Binary(e.into_bytes()),
|
||||
};
|
||||
file_paths.push((entry_path, contents));
|
||||
} else if entry_path.is_dir() {
|
||||
dir_listings.push(fs::read_dir(entry_path).await?);
|
||||
} else {
|
||||
info!(
|
||||
"Skipping path '{}' as it's not a file or a directory",
|
||||
entry_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
file_paths.sort();
|
||||
Ok(file_paths)
|
||||
}
|
||||
}
|
||||
pageserver/src/remote_storage/storage_sync/download.rs (new file, 370 lines):
|
||||
//! Timeline synchronization logic to put files from archives on the remote storage into pageserver's local directory.
//! Currently, tenant branch files are also downloaded, but this does not appear final.
|
||||
|
||||
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
|
||||
|
||||
use anyhow::{anyhow, ensure, Context};
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use tokio::{fs, sync::RwLock};
|
||||
use tracing::{debug, error, warn};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::{
|
||||
layered_repository::metadata::{metadata_path, TimelineMetadata},
|
||||
remote_storage::{
|
||||
storage_sync::{
|
||||
compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files,
|
||||
update_index_description, SyncKind, SyncTask,
|
||||
},
|
||||
RemoteStorage, TimelineSyncId,
|
||||
},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
use super::{
|
||||
index::{ArchiveId, RemoteTimeline, RemoteTimelineIndex},
|
||||
TimelineDownload,
|
||||
};
|
||||
|
||||
/// Attempts to download and uncompress files from all remote archives for the given timeline.
/// Timeline files that already exist locally are skipped during the download, but the local metadata file is
/// updated at the end of every checkpoint archive extraction.
///
/// Before any archives are considered, the branch files are checked locally and remotely, and all remote-only files are downloaded.
///
/// On an error, bumps the retries count and reschedules the download, with an updated archive skip list
/// (for any new successful archive downloads and extractions).
|
||||
pub(super) async fn download_timeline<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
|
||||
sync_id: TimelineSyncId,
|
||||
mut download: TimelineDownload,
|
||||
retries: u32,
|
||||
) -> Option<bool> {
|
||||
debug!("Downloading layers for sync id {}", sync_id);
|
||||
if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.0).await {
|
||||
error!(
|
||||
"Failed to download missing branches for sync id {}: {:#}",
|
||||
sync_id, e
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
|
||||
let TimelineSyncId(tenant_id, timeline_id) = sync_id;
|
||||
|
||||
let index_read = remote_assets.1.read().await;
|
||||
let remote_timeline = match index_read.timeline_entry(&sync_id) {
|
||||
None => {
|
||||
error!("Cannot download: no timeline is present in the index for given ids");
|
||||
return None;
|
||||
}
|
||||
Some(TimelineIndexEntry::Full(remote_timeline)) => Cow::Borrowed(remote_timeline),
|
||||
Some(TimelineIndexEntry::Description(_)) => {
|
||||
drop(index_read);
|
||||
debug!("Found timeline description for the given ids, downloading the full index");
|
||||
match update_index_description(
|
||||
remote_assets.as_ref(),
|
||||
&conf.timeline_path(&timeline_id, &tenant_id),
|
||||
sync_id,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(remote_timeline) => Cow::Owned(remote_timeline),
|
||||
Err(e) => {
|
||||
error!("Failed to download full timeline index: {:#}", e);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let mut archives_to_download = remote_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.filter(|remote_archive| !download.archives_to_skip.contains(remote_archive))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let archives_total = archives_to_download.len();
|
||||
debug!("Downloading {} archives of a timeline", archives_total);
|
||||
|
||||
while let Some(archive_id) = archives_to_download.pop() {
|
||||
match try_download_archive(
|
||||
conf,
|
||||
sync_id,
|
||||
Arc::clone(&remote_assets),
|
||||
remote_timeline.as_ref(),
|
||||
archive_id,
|
||||
Arc::clone(&download.files_to_skip),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
let archives_left = archives_to_download.len();
|
||||
error!(
|
||||
"Failed to download archive {:?} for tenant {} timeline {} : {:#}, requeueing the download ({} archives left out of {})",
|
||||
archive_id, tenant_id, timeline_id, e, archives_left, archives_total
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
Ok(()) => {
|
||||
debug!("Successfully downloaded archive {:?}", archive_id);
|
||||
download.archives_to_skip.insert(archive_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Finished downloading all timeline's archives");
|
||||
Some(true)
|
||||
}
|
||||
|
||||
async fn try_download_archive<
|
||||
P: Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
TimelineSyncId(tenant_id, timeline_id): TimelineSyncId,
|
||||
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
|
||||
remote_timeline: &RemoteTimeline,
|
||||
archive_id: ArchiveId,
|
||||
files_to_skip: Arc<BTreeSet<PathBuf>>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Downloading archive {:?}", archive_id);
|
||||
let archive_to_download = remote_timeline
|
||||
.archive_data(archive_id)
|
||||
.ok_or_else(|| anyhow!("Archive {:?} not found in remote storage", archive_id))?;
|
||||
let (archive_header, header_size) = remote_timeline
|
||||
.restore_header(archive_id)
|
||||
.context("Failed to restore header when downloading an archive")?;
|
||||
|
||||
match read_local_metadata(conf, timeline_id, tenant_id).await {
|
||||
Ok(local_metadata) => ensure!(
|
||||
// need to allow `<=` instead of `<` due to cases when a failed archive can be redownloaded
|
||||
local_metadata.disk_consistent_lsn() <= archive_to_download.disk_consistent_lsn(),
|
||||
"Cannot download archive with LSN {} since it's earlier than local LSN {}",
|
||||
archive_to_download.disk_consistent_lsn(),
|
||||
local_metadata.disk_consistent_lsn()
|
||||
),
|
||||
Err(e) => warn!("Failed to read local metadata file, assuing it's safe to override its with the download. Read: {:#}", e),
|
||||
}
|
||||
compression::uncompress_file_stream_with_index(
|
||||
conf.timeline_path(&timeline_id, &tenant_id),
|
||||
files_to_skip,
|
||||
archive_to_download.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
move |mut archive_target, archive_name| async move {
|
||||
let archive_local_path = conf
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(&archive_name);
|
||||
let remote_storage = &remote_assets.0;
|
||||
remote_storage
|
||||
.download_range(
|
||||
&remote_storage.storage_path(&archive_local_path)?,
|
||||
header_size,
|
||||
None,
|
||||
&mut archive_target,
|
||||
)
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn read_local_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: zenith_utils::zid::ZTimelineId,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
let local_metadata_path = metadata_path(conf, timeline_id, tenant_id);
|
||||
let local_metadata_bytes = fs::read(&local_metadata_path)
|
||||
.await
|
||||
.context("Failed to read local metadata file bytes")?;
|
||||
Ok(TimelineMetadata::from_bytes(&local_metadata_bytes)
|
||||
.context("Failed to read local metadata files bytes")?)
|
||||
}
|
||||
|
||||
async fn download_missing_branches<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index): &(S, RwLock<RemoteTimelineIndex>),
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_branches = tenant_branch_files(conf, tenant_id)
|
||||
.await
|
||||
.context("Failed to list local branch files for the tenant")?;
|
||||
let local_branches_dir = conf.branches_path(&tenant_id);
|
||||
if !local_branches_dir.exists() {
|
||||
fs::create_dir_all(&local_branches_dir)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create local branches directory at path '{}'",
|
||||
local_branches_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
if let Some(remote_branches) = index.read().await.branch_files(tenant_id) {
|
||||
let mut remote_only_branches_downloads = remote_branches
|
||||
.difference(&local_branches)
|
||||
.map(|remote_only_branch| async move {
|
||||
let branches_dir = conf.branches_path(&tenant_id);
|
||||
let remote_branch_path = remote_only_branch.as_path(&branches_dir);
|
||||
let storage_path =
|
||||
storage.storage_path(&remote_branch_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to derive a storage path for branch with local path '{}'",
|
||||
remote_branch_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut target_file = fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(&remote_branch_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create local branch file at '{}'",
|
||||
remote_branch_path.display()
|
||||
)
|
||||
})?;
|
||||
storage
|
||||
.download(&storage_path, &mut target_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download branch file from the remote path {:?}",
|
||||
storage_path
|
||||
)
|
||||
})?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut branch_downloads_failed = false;
|
||||
while let Some(download_result) = remote_only_branches_downloads.next().await {
|
||||
if let Err(e) = download_result {
|
||||
branch_downloads_failed = true;
|
||||
error!("Failed to download a branch file: {:#}", e);
|
||||
}
|
||||
}
|
||||
ensure!(
|
||||
!branch_downloads_failed,
|
||||
"Failed to download all branch files"
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use tempfile::tempdir;
|
||||
use tokio::fs;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
local_fs::LocalFs,
|
||||
storage_sync::test_utils::{
|
||||
assert_index_descriptions, assert_timeline_files_match, create_local_timeline,
|
||||
dummy_metadata, ensure_correct_timeline_upload, expect_timeline,
|
||||
},
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_download_timeline() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let tempdir_path = tempdir.path();
|
||||
let _ = zenith_utils::logging::init(tempdir_path.join("log.log"), false);
|
||||
|
||||
let repo_harness = RepoHarness::create("test_download_timeline")?;
|
||||
let sync_id = TimelineSyncId(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir_path.to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
));
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let storage = &remote_assets.0;
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let regular_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
let regular_timeline = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
dummy_metadata(Lsn(0x30)),
|
||||
)?;
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
regular_timeline,
|
||||
)
|
||||
.await;
|
||||
fs::remove_dir_all(®ular_timeline_path).await?;
|
||||
let remote_regular_timeline = expect_timeline(index, sync_id).await;
|
||||
|
||||
download_timeline(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
TimelineDownload {
|
||||
files_to_skip: Arc::new(BTreeSet::new()),
|
||||
archives_to_skip: BTreeSet::new(),
|
||||
},
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(
|
||||
index,
|
||||
RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
remote_assets
|
||||
.0
|
||||
.list()
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
),
|
||||
)
|
||||
.await;
|
||||
assert_timeline_files_match(&repo_harness, TIMELINE_ID, remote_regular_timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
pageserver/src/remote_storage/storage_sync/index.rs (new file, 408 lines):
|
||||
//! In-memory index to track the tenant files on the remote storage, mitigating the storage format differences between the local and remote files.
//! Able to restore itself from the storage archive data and reconstruct archive indices on demand.
//!
//! The index is intended to be portable, so it deliberately does not store any local paths inside.
//! This way, in the future, the index could be restored quickly from its serialized stored form.
|
||||
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::debug;
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
layered_repository::TIMELINES_SEGMENT_NAME,
|
||||
remote_storage::{
|
||||
storage_sync::compression::{parse_archive_name, FileEntry},
|
||||
TimelineSyncId,
|
||||
},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
use super::compression::ArchiveHeader;
|
||||
|
||||
/// A part of the filesystem path, that needs a root to become a path again.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
pub struct RelativePath(String);
|
||||
|
||||
impl RelativePath {
|
||||
/// Attempts to strip off the base from path, producing a relative path or an error.
|
||||
pub fn new<P: AsRef<Path>>(base: &Path, path: P) -> anyhow::Result<Self> {
|
||||
let relative = path
|
||||
.as_ref()
|
||||
.strip_prefix(base)
|
||||
.context("path is not relative to base")?;
|
||||
Ok(RelativePath(relative.to_string_lossy().to_string()))
|
||||
}
|
||||
|
||||
/// Joins the relative path with the base path.
|
||||
pub fn as_path(&self, base: &Path) -> PathBuf {
|
||||
base.join(&self.0)
|
||||
}
|
||||
}
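// Hedged sketch of the round trip described above; the concrete paths are illustrative only.
#[cfg(test)]
mod relative_path_example {
    use super::*;

    #[test]
    fn relative_path_round_trip() {
        let base = Path::new("/tmp/some-tenant");
        let full = base.join("timelines").join("some-timeline").join("layer_1");

        let relative = RelativePath::new(base, &full).expect("path lies under the base");
        assert_eq!(relative.as_path(base), full);

        // A path outside the base cannot be made relative to it.
        assert!(RelativePath::new(Path::new("/elsewhere"), &full).is_err());
    }
}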
|
||||
|
||||
/// An index to track tenant files that exist on the remote storage.
|
||||
/// Currently, timeline archives and branch files are tracked.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteTimelineIndex {
|
||||
branch_files: HashMap<ZTenantId, HashSet<RelativePath>>,
|
||||
timeline_files: HashMap<TimelineSyncId, TimelineIndexEntry>,
|
||||
}
|
||||
|
||||
impl RemoteTimelineIndex {
|
||||
/// Attempts to parse file paths (not checking the file contents) and find files
/// that can be tracked with the index.
/// On parse failures, logs the error and continues, so an empty index can be created even from unsuitable paths.
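/// For example (paths are illustrative): an entry under the tenant's branches directory becomes a
/// branch file entry, while `<timelines dir>/<timeline_id>/00000000016B9150-.zst_9732` becomes an
/// archive description keyed by the `disk_consistent_lsn` parsed from that name.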
|
||||
pub fn try_parse_descriptions_from_paths<P: AsRef<Path>>(
|
||||
conf: &'static PageServerConf,
|
||||
paths: impl Iterator<Item = P>,
|
||||
) -> Self {
|
||||
let mut index = Self {
|
||||
branch_files: HashMap::new(),
|
||||
timeline_files: HashMap::new(),
|
||||
};
|
||||
for path in paths {
|
||||
if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) {
|
||||
debug!(
|
||||
"Failed to parse path '{}' as index entry: {:#}",
|
||||
path.as_ref().display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
index
|
||||
}
|
||||
|
||||
pub fn timeline_entry(&self, id: &TimelineSyncId) -> Option<&TimelineIndexEntry> {
|
||||
self.timeline_files.get(id)
|
||||
}
|
||||
|
||||
pub fn timeline_entry_mut(&mut self, id: &TimelineSyncId) -> Option<&mut TimelineIndexEntry> {
|
||||
self.timeline_files.get_mut(id)
|
||||
}
|
||||
|
||||
pub fn add_timeline_entry(&mut self, id: TimelineSyncId, entry: TimelineIndexEntry) {
|
||||
self.timeline_files.insert(id, entry);
|
||||
}
|
||||
|
||||
pub fn all_sync_ids(&self) -> impl Iterator<Item = TimelineSyncId> + '_ {
|
||||
self.timeline_files.keys().copied()
|
||||
}
|
||||
|
||||
pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) {
|
||||
self.branch_files
|
||||
.entry(tenant_id)
|
||||
.or_insert_with(HashSet::new)
|
||||
.insert(path);
|
||||
}
|
||||
|
||||
pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet<RelativePath>> {
|
||||
self.branch_files.get(&tenant_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum TimelineIndexEntry {
|
||||
/// An archive found on the remote storage, but not yet downloaded, only a metadata from its storage path is available, without archive contents.
|
||||
Description(BTreeMap<ArchiveId, ArchiveDescription>),
|
||||
/// Full archive metadata, including the file list, parsed from the archive header.
|
||||
Full(RemoteTimeline),
|
||||
}
|
||||
|
||||
impl TimelineIndexEntry {
|
||||
pub fn uploaded_checkpoints(&self) -> BTreeSet<Lsn> {
|
||||
match self {
|
||||
TimelineIndexEntry::Description(description) => {
|
||||
description.keys().map(|archive_id| archive_id.0).collect()
|
||||
}
|
||||
TimelineIndexEntry::Full(remote_timeline) => remote_timeline
|
||||
.checkpoint_archives
|
||||
.keys()
|
||||
.map(|archive_id| archive_id.0)
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct ArchiveId(pub(super) Lsn);
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
struct FileId(ArchiveId, ArchiveEntryNumber);
|
||||
|
||||
type ArchiveEntryNumber = usize;
|
||||
|
||||
/// All archives and files in them, representing a certain timeline.
|
||||
/// Uses file and archive IDs to reference those without ownership issues.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct RemoteTimeline {
|
||||
timeline_files: BTreeMap<FileId, FileEntry>,
|
||||
checkpoint_archives: BTreeMap<ArchiveId, CheckpointArchive>,
|
||||
}
|
||||
|
||||
/// Archive metadata, enough to restore a header with the timeline data.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct CheckpointArchive {
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_file_size: u64,
|
||||
files: BTreeSet<FileId>,
|
||||
archive_header_size: u64,
|
||||
}
|
||||
|
||||
impl CheckpointArchive {
|
||||
pub fn disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimeline {
|
||||
pub fn empty() -> Self {
|
||||
Self {
|
||||
timeline_files: BTreeMap::new(),
|
||||
checkpoint_archives: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn checkpoints(&self) -> impl Iterator<Item = Lsn> + '_ {
|
||||
self.checkpoint_archives
|
||||
.values()
|
||||
.map(CheckpointArchive::disk_consistent_lsn)
|
||||
}
|
||||
|
||||
/// Lists all relish files in the given remote timeline. Omits the metadata file.
|
||||
pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet<PathBuf> {
|
||||
self.timeline_files
|
||||
.values()
|
||||
.map(|file_entry| file_entry.subpath.as_path(timeline_dir))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn contains_checkpoint_at(&self, disk_consistent_lsn: Lsn) -> bool {
|
||||
self.checkpoint_archives
|
||||
.contains_key(&ArchiveId(disk_consistent_lsn))
|
||||
}
|
||||
|
||||
pub fn archive_data(&self, archive_id: ArchiveId) -> Option<&CheckpointArchive> {
|
||||
self.checkpoint_archives.get(&archive_id)
|
||||
}
|
||||
|
||||
/// Restores a header of a certain remote archive from the memory data.
|
||||
/// Returns the header and its compressed size in the archive, both can be used to uncompress that archive.
|
||||
pub fn restore_header(&self, archive_id: ArchiveId) -> anyhow::Result<(ArchiveHeader, u64)> {
|
||||
let archive = self
|
||||
.checkpoint_archives
|
||||
.get(&archive_id)
|
||||
.ok_or_else(|| anyhow!("Archive {:?} not found", archive_id))?;
|
||||
|
||||
let mut header_files = Vec::with_capacity(archive.files.len());
|
||||
for (expected_archive_position, archive_file) in archive.files.iter().enumerate() {
|
||||
let &FileId(archive_id, archive_position) = archive_file;
|
||||
ensure!(
|
||||
expected_archive_position == archive_position,
|
||||
"Archive header is corrupt, file # {} from archive {:?} header is missing",
|
||||
expected_archive_position,
|
||||
archive_id,
|
||||
);
|
||||
|
||||
let timeline_file = self.timeline_files.get(archive_file).ok_or_else(|| {
|
||||
anyhow!(
|
||||
"File with id {:?} not found for archive {:?}",
|
||||
archive_file,
|
||||
archive_id
|
||||
)
|
||||
})?;
|
||||
header_files.push(timeline_file.clone());
|
||||
}
|
||||
|
||||
Ok((
|
||||
ArchiveHeader {
|
||||
files: header_files,
|
||||
metadata_file_size: archive.metadata_file_size,
|
||||
},
|
||||
archive.archive_header_size,
|
||||
))
|
||||
}
|
||||
|
||||
/// Updates (creates, if necessary) the data about certain archive contents.
|
||||
pub fn update_archive_contents(
|
||||
&mut self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
header: ArchiveHeader,
|
||||
header_size: u64,
|
||||
) {
|
||||
let archive_id = ArchiveId(disk_consistent_lsn);
|
||||
let mut common_archive_files = BTreeSet::new();
|
||||
for (file_index, file_entry) in header.files.into_iter().enumerate() {
|
||||
let file_id = FileId(archive_id, file_index);
|
||||
self.timeline_files.insert(file_id, file_entry);
|
||||
common_archive_files.insert(file_id);
|
||||
}
|
||||
|
||||
let metadata_file_size = header.metadata_file_size;
|
||||
self.checkpoint_archives
|
||||
.entry(archive_id)
|
||||
.or_insert_with(|| CheckpointArchive {
|
||||
metadata_file_size,
|
||||
files: BTreeSet::new(),
|
||||
archive_header_size: header_size,
|
||||
disk_consistent_lsn,
|
||||
})
|
||||
.files
|
||||
.extend(common_archive_files.into_iter());
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata about a timeline checkpoint archive, parsed from its remote storage path.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ArchiveDescription {
|
||||
pub header_size: u64,
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
pub archive_name: String,
|
||||
}
|
||||
|
||||
fn try_parse_index_entry(
|
||||
index: &mut RemoteTimelineIndex,
|
||||
conf: &'static PageServerConf,
|
||||
path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenants_dir = conf.tenants_path();
|
||||
let tenant_id = path
|
||||
.strip_prefix(&tenants_dir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Path '{}' does not belong to tenants directory '{}'",
|
||||
path.display(),
|
||||
tenants_dir.display(),
|
||||
)
|
||||
})?
|
||||
.iter()
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("Found no tenant id in path '{}'", path.display()))?
|
||||
.to_string_lossy()
|
||||
.parse::<ZTenantId>()
|
||||
.with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;
|
||||
|
||||
let branches_path = conf.branches_path(&tenant_id);
|
||||
let timelines_path = conf.timelines_path(&tenant_id);
|
||||
match (
|
||||
RelativePath::new(&branches_path, &path),
|
||||
path.strip_prefix(&timelines_path),
|
||||
) {
|
||||
(Ok(_), Ok(_)) => bail!(
|
||||
"Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes",
|
||||
path.display(),
|
||||
branches_path.display(),
|
||||
timelines_path.display()
|
||||
),
|
||||
(Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry),
|
||||
(Err(_), Ok(timelines_subpath)) => {
|
||||
let mut segments = timelines_subpath.iter();
|
||||
let timeline_id = segments
|
||||
.next()
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"{} directory of tenant {} (path '{}') is not an index entry",
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
tenant_id,
|
||||
path.display()
|
||||
)
|
||||
})?
|
||||
.to_string_lossy()
|
||||
.parse::<ZTimelineId>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse timeline id from path '{}'", path.display())
|
||||
})?;
|
||||
|
||||
let (disk_consistent_lsn, header_size) =
|
||||
parse_archive_name(path).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse archive name out in path '{}'",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let archive_name = path
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("Archive '{}' has no file name", path.display()))?
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
|
||||
let sync_id = TimelineSyncId(tenant_id, timeline_id);
|
||||
let timeline_index_entry = index
|
||||
.timeline_files
|
||||
.entry(sync_id)
|
||||
.or_insert_with(|| TimelineIndexEntry::Description(BTreeMap::new()));
|
||||
match timeline_index_entry {
|
||||
TimelineIndexEntry::Description(descriptions) => {
|
||||
descriptions.insert(
|
||||
ArchiveId(disk_consistent_lsn),
|
||||
ArchiveDescription {
|
||||
header_size,
|
||||
disk_consistent_lsn,
|
||||
archive_name,
|
||||
},
|
||||
);
|
||||
}
|
||||
TimelineIndexEntry::Full(_) => {
|
||||
bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
(Err(branches_error), Err(timelines_strip_error)) => {
|
||||
bail!(
|
||||
"Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'",
|
||||
path.display(),
|
||||
branches_error,
|
||||
timelines_strip_error,
|
||||
)
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn header_restoration_preserves_file_order() {
|
||||
let header = ArchiveHeader {
|
||||
files: vec![
|
||||
FileEntry {
|
||||
size: 5,
|
||||
subpath: RelativePath("one".to_string()),
|
||||
},
|
||||
FileEntry {
|
||||
size: 1,
|
||||
subpath: RelativePath("two".to_string()),
|
||||
},
|
||||
FileEntry {
|
||||
size: 222,
|
||||
subpath: RelativePath("zero".to_string()),
|
||||
},
|
||||
],
|
||||
metadata_file_size: 5,
|
||||
};
|
||||
|
||||
let lsn = Lsn(1);
|
||||
let mut remote_timeline = RemoteTimeline::empty();
|
||||
remote_timeline.update_archive_contents(lsn, header.clone(), 15);
|
||||
|
||||
let (restored_header, _) = remote_timeline
|
||||
.restore_header(ArchiveId(lsn))
|
||||
.expect("Should be able to restore header from a valid remote timeline");
|
||||
|
||||
assert_eq!(
|
||||
header, restored_header,
|
||||
"Header restoration should preserve file order"
|
||||
);
|
||||
}
|
||||
}
|
||||
566
pageserver/src/remote_storage/storage_sync/upload.rs
Normal file
@@ -0,0 +1,566 @@
|
||||
//! Timeline synchronization logic to compress new timeline files from checkpoints and upload them to the remote storage.
//! Currently, tenant branch files are also uploaded, but this does not appear to be final.
|
||||
|
||||
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
|
||||
|
||||
use anyhow::{ensure, Context};
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use tokio::{fs, sync::RwLock};
|
||||
use tracing::{debug, error, warn};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
storage_sync::{
|
||||
compression,
|
||||
index::{RemoteTimeline, TimelineIndexEntry},
|
||||
sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask,
|
||||
},
|
||||
RemoteStorage, TimelineSyncId,
|
||||
},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoint};
|
||||
|
||||
/// Attempts to compress and upload the given checkpoint files.
/// No extra checks for overlapping files are made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
///
/// Before the checkpoint files are uploaded, any local branch files that are missing remotely are uploaded first.
///
/// On an error, bumps the retries count and reschedules the entire task.
/// On success, populates the index data with the new upload.
|
||||
pub(super) async fn upload_timeline_checkpoint<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
|
||||
sync_id: TimelineSyncId,
|
||||
new_checkpoint: NewCheckpoint,
|
||||
retries: u32,
|
||||
) -> Option<bool> {
|
||||
debug!("Uploading checkpoint for sync id {}", sync_id);
|
||||
if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.0).await {
|
||||
error!(
|
||||
"Failed to upload missing branches for sync id {}: {:#}",
|
||||
sync_id, e
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Upload(new_checkpoint),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn();
|
||||
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let TimelineSyncId(tenant_id, timeline_id) = sync_id;
|
||||
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let index_read = index.read().await;
|
||||
let remote_timeline = match index_read.timeline_entry(&sync_id) {
|
||||
None => None,
|
||||
Some(TimelineIndexEntry::Full(remote_timeline)) => Some(Cow::Borrowed(remote_timeline)),
|
||||
Some(TimelineIndexEntry::Description(_)) => {
|
||||
debug!("Found timeline description for the given ids, downloading the full index");
|
||||
match update_index_description(remote_assets.as_ref(), &timeline_dir, sync_id).await {
|
||||
Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)),
|
||||
Err(e) => {
|
||||
error!("Failed to download full timeline index: {:#}", e);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Upload(new_checkpoint),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let already_contains_upload_lsn = remote_timeline
|
||||
.as_ref()
|
||||
.map(|remote_timeline| remote_timeline.contains_checkpoint_at(new_upload_lsn))
|
||||
.unwrap_or(false);
|
||||
if already_contains_upload_lsn {
|
||||
warn!(
|
||||
"Received a checkpoint with Lsn {} that's already been uploaded to remote storage, skipping the upload.",
|
||||
new_upload_lsn
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
let already_uploaded_files = remote_timeline
|
||||
.map(|timeline| timeline.stored_files(&timeline_dir))
|
||||
.unwrap_or_default();
|
||||
drop(index_read);
|
||||
|
||||
match try_upload_checkpoint(
|
||||
config,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
&new_checkpoint,
|
||||
already_uploaded_files,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((archive_header, header_size)) => {
|
||||
let mut index_write = index.write().await;
|
||||
match index_write.timeline_entry_mut(&sync_id) {
|
||||
Some(TimelineIndexEntry::Full(remote_timeline)) => {
|
||||
remote_timeline.update_archive_contents(
|
||||
new_checkpoint.metadata.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
);
|
||||
}
|
||||
None | Some(TimelineIndexEntry::Description(_)) => {
|
||||
let mut new_timeline = RemoteTimeline::empty();
|
||||
new_timeline.update_archive_contents(
|
||||
new_checkpoint.metadata.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
);
|
||||
index_write.add_timeline_entry(sync_id, TimelineIndexEntry::Full(new_timeline));
|
||||
}
|
||||
}
|
||||
debug!("Checkpoint uploaded successfully");
|
||||
Some(true)
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to upload checkpoint: {:#}, requeueing the upload",
|
||||
e
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Upload(new_checkpoint),
|
||||
));
|
||||
Some(false)
|
||||
}
|
||||
}
|
||||
}
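A rough standalone sketch of the requeue-on-failure behaviour described in the doc comment above, with a plain VecDeque standing in for sync_queue and a fake try_upload; all names below are illustrative, not the crate's API:

use std::collections::VecDeque;

// Simplified stand-ins for the sync task machinery.
#[derive(Debug)]
struct SyncTask {
    retries: u32,
    payload: String,
}

fn try_upload(task: &SyncTask) -> Result<(), String> {
    // Pretend uploads fail for the first two attempts.
    if task.retries < 2 {
        Err("transient storage error".into())
    } else {
        Ok(())
    }
}

fn main() {
    let mut queue = VecDeque::new();
    queue.push_back(SyncTask { retries: 0, payload: "checkpoint@0x40".into() });

    while let Some(task) = queue.pop_front() {
        match try_upload(&task) {
            Ok(()) => println!("uploaded {} after {} retries", task.payload, task.retries),
            Err(e) => {
                // On error, requeue the whole task with a bumped retry counter.
                eprintln!("upload failed ({e}), rescheduling");
                queue.push_back(SyncTask { retries: task.retries + 1, ..task });
            }
        }
    }
}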
|
||||
|
||||
async fn try_upload_checkpoint<
|
||||
P: Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
|
||||
sync_id: TimelineSyncId,
|
||||
new_checkpoint: &NewCheckpoint,
|
||||
files_to_skip: BTreeSet<PathBuf>,
|
||||
) -> anyhow::Result<(ArchiveHeader, u64)> {
|
||||
let TimelineSyncId(tenant_id, timeline_id) = sync_id;
|
||||
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let files_to_upload = new_checkpoint
|
||||
.layers
|
||||
.iter()
|
||||
.filter(|&path_to_upload| {
|
||||
if files_to_skip.contains(path_to_upload) {
|
||||
error!(
|
||||
"Skipping file upload '{}', since it was already uploaded",
|
||||
path_to_upload.display()
|
||||
);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ensure!(!files_to_upload.is_empty(), "No files to upload");
|
||||
|
||||
compression::archive_files_as_stream(
|
||||
&timeline_dir,
|
||||
files_to_upload.into_iter(),
|
||||
&new_checkpoint.metadata,
|
||||
move |archive_streamer, archive_name| async move {
|
||||
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
|
||||
let remote_storage = &remote_assets.0;
|
||||
remote_storage
|
||||
.upload(
|
||||
archive_streamer,
|
||||
&remote_storage.storage_path(&timeline_dir.join(&archive_name))?,
|
||||
)
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await
|
||||
.map(|(header, header_size, _)| (header, header_size))
|
||||
}
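The filtering step above boils down to a set-membership check against the already uploaded files. A tiny standalone illustration (the paths and names are made up):

use std::collections::BTreeSet;
use std::path::PathBuf;

fn main() {
    // Files the remote index already knows about for this timeline.
    let already_uploaded: BTreeSet<PathBuf> =
        ["tl/a", "tl/b"].iter().map(|s| PathBuf::from(s)).collect();
    // Layer files listed in the new checkpoint.
    let checkpoint_layers: Vec<PathBuf> =
        ["tl/b", "tl/c"].iter().map(|s| PathBuf::from(s)).collect();

    // Keep only the layers that are not in remote storage yet.
    let to_upload: Vec<&PathBuf> = checkpoint_layers
        .iter()
        .filter(|path| !already_uploaded.contains(*path))
        .collect();

    assert_eq!(to_upload, vec![&PathBuf::from("tl/c")]);
    assert!(!to_upload.is_empty(), "nothing to upload would be an error");
}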
|
||||
|
||||
async fn upload_missing_branches<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
(storage, index): &(S, RwLock<RemoteTimelineIndex>),
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_branches = tenant_branch_files(config, tenant_id)
|
||||
.await
|
||||
.context("Failed to list local branch files for the tenant")?;
|
||||
let index_read = index.read().await;
|
||||
let remote_branches = index_read
|
||||
.branch_files(tenant_id)
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
drop(index_read);
|
||||
|
||||
let mut branch_uploads = local_branches
|
||||
.difference(&remote_branches)
|
||||
.map(|local_only_branch| async move {
|
||||
let local_branch_path = local_only_branch.as_path(&config.branches_path(&tenant_id));
|
||||
let storage_path = storage.storage_path(&local_branch_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to derive a storage path for branch with local path '{}'",
|
||||
local_branch_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_branch_file = fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&local_branch_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open local branch file {} for reading",
|
||||
local_branch_path.display()
|
||||
)
|
||||
})?;
|
||||
storage
|
||||
.upload(local_branch_file, &storage_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload branch file to the remote path {:?}",
|
||||
storage_path
|
||||
)
|
||||
})?;
|
||||
Ok::<_, anyhow::Error>(local_only_branch)
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut branch_uploads_failed = false;
|
||||
while let Some(upload_result) = branch_uploads.next().await {
|
||||
match upload_result {
|
||||
Ok(local_only_branch) => index
|
||||
.write()
|
||||
.await
|
||||
.add_branch_file(tenant_id, local_only_branch.clone()),
|
||||
Err(e) => {
|
||||
error!("Failed to upload branch file: {:#}", e);
|
||||
branch_uploads_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ensure!(!branch_uploads_failed, "Failed to upload all branch files");
|
||||
|
||||
Ok(())
|
||||
}
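The function above follows a common pattern: compute the local-minus-remote set difference and drive the uploads concurrently through FuturesUnordered, recording failures without cancelling the remaining uploads. A standalone sketch of that pattern, assuming the futures and tokio crates are available (upload_one is a stand-in, not the real storage call):

use std::collections::BTreeSet;

use futures::{stream::FuturesUnordered, StreamExt};

// Stand-in for uploading one branch file; the real code opens the file and
// streams it to remote storage.
async fn upload_one(branch: &str) -> Result<&str, String> {
    if branch.contains("bad") {
        Err(format!("failed to upload {branch}"))
    } else {
        Ok(branch)
    }
}

#[tokio::main]
async fn main() {
    let local: BTreeSet<&str> = ["main", "feature", "bad-branch"].into_iter().collect();
    let remote: BTreeSet<&str> = ["main"].into_iter().collect();

    // Upload only the branches missing remotely, all concurrently.
    let mut uploads = local
        .difference(&remote)
        .copied()
        .map(|branch| upload_one(branch))
        .collect::<FuturesUnordered<_>>();

    // Drain the uploads as they complete, remembering whether any of them failed.
    let mut any_failed = false;
    while let Some(result) = uploads.next().await {
        match result {
            Ok(branch) => println!("uploaded branch file {branch}"),
            Err(e) => {
                eprintln!("{e}");
                any_failed = true;
            }
        }
    }
    assert!(any_failed, "this sketch deliberately includes one failing upload");
}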
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
local_fs::LocalFs,
|
||||
storage_sync::{
|
||||
index::ArchiveId,
|
||||
test_utils::{
|
||||
assert_index_descriptions, create_local_timeline, dummy_metadata,
|
||||
ensure_correct_timeline_upload, expect_timeline,
|
||||
},
|
||||
},
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn reupload_timeline() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("reupload_timeline")?;
|
||||
let sync_id = TimelineSyncId(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
));
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let first_upload_metadata = dummy_metadata(Lsn(0x10));
|
||||
let first_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
first_upload_metadata.clone(),
|
||||
)?;
|
||||
let local_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
first_checkpoint,
|
||||
)
|
||||
.await;
|
||||
|
||||
let uploaded_timeline = expect_timeline(index, sync_id).await;
|
||||
let uploaded_archives = uploaded_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
uploaded_archives.len(),
|
||||
1,
|
||||
"Only one archive is expected after a first upload"
|
||||
);
|
||||
let first_uploaded_archive = uploaded_archives.first().copied().unwrap();
|
||||
assert_eq!(
|
||||
uploaded_timeline.checkpoints().last(),
|
||||
Some(first_upload_metadata.disk_consistent_lsn()),
|
||||
"Metadata that was uploaded, should have its Lsn stored"
|
||||
);
|
||||
assert_eq!(
|
||||
uploaded_timeline
|
||||
.archive_data(uploaded_archives.first().copied().unwrap())
|
||||
.unwrap()
|
||||
.disk_consistent_lsn(),
|
||||
first_upload_metadata.disk_consistent_lsn(),
|
||||
"Uploaded archive should have corresponding Lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
uploaded_timeline.stored_files(&local_timeline_path),
|
||||
vec![local_timeline_path.join("a"), local_timeline_path.join("b")]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
"Should have all files from the first checkpoint"
|
||||
);
|
||||
|
||||
let second_upload_metadata = dummy_metadata(Lsn(0x40));
|
||||
let second_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["b", "c"],
|
||||
second_upload_metadata.clone(),
|
||||
)?;
|
||||
assert!(
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
< second_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
second_checkpoint,
|
||||
)
|
||||
.await;
|
||||
|
||||
let updated_timeline = expect_timeline(index, sync_id).await;
|
||||
let mut updated_archives = updated_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
2,
|
||||
"Two archives are expected after a successful update of the upload"
|
||||
);
|
||||
updated_archives.retain(|archive_id| archive_id != &first_uploaded_archive);
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
1,
|
||||
"Only one new archive is expected among the uploaded"
|
||||
);
|
||||
let second_uploaded_archive = updated_archives.last().copied().unwrap();
|
||||
assert_eq!(
|
||||
updated_timeline.checkpoints().max(),
|
||||
Some(second_upload_metadata.disk_consistent_lsn()),
|
||||
"Metadata that was uploaded, should have its Lsn stored"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline
|
||||
.archive_data(second_uploaded_archive)
|
||||
.unwrap()
|
||||
.disk_consistent_lsn(),
|
||||
second_upload_metadata.disk_consistent_lsn(),
|
||||
"Uploaded archive should have corresponding Lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline.stored_files(&local_timeline_path),
|
||||
vec![
|
||||
local_timeline_path.join("a"),
|
||||
local_timeline_path.join("b"),
|
||||
local_timeline_path.join("c"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
"Should have all files from both checkpoints without duplicates"
|
||||
);
|
||||
|
||||
let third_upload_metadata = dummy_metadata(Lsn(0x20));
|
||||
let third_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["d"],
|
||||
third_upload_metadata.clone(),
|
||||
)?;
|
||||
assert_ne!(
|
||||
third_upload_metadata.disk_consistent_lsn(),
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
assert!(
|
||||
third_upload_metadata.disk_consistent_lsn()
|
||||
< second_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
third_checkpoint,
|
||||
)
|
||||
.await;
|
||||
|
||||
let updated_timeline = expect_timeline(index, sync_id).await;
|
||||
let mut updated_archives = updated_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
3,
|
||||
"Three archives are expected after two successful updates of the upload"
|
||||
);
|
||||
updated_archives.retain(|archive_id| {
|
||||
archive_id != &first_uploaded_archive && archive_id != &second_uploaded_archive
|
||||
});
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
1,
|
||||
"Only one new archive is expected among the uploaded"
|
||||
);
|
||||
let third_uploaded_archive = updated_archives.last().copied().unwrap();
|
||||
assert!(
|
||||
updated_timeline.checkpoints().max().unwrap()
|
||||
> third_upload_metadata.disk_consistent_lsn(),
|
||||
"Should not influence the last lsn by uploading an older checkpoint"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline
|
||||
.archive_data(third_uploaded_archive)
|
||||
.unwrap()
|
||||
.disk_consistent_lsn(),
|
||||
third_upload_metadata.disk_consistent_lsn(),
|
||||
"Uploaded archive should have corresponding Lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline.stored_files(&local_timeline_path),
|
||||
vec![
|
||||
local_timeline_path.join("a"),
|
||||
local_timeline_path.join("b"),
|
||||
local_timeline_path.join("c"),
|
||||
local_timeline_path.join("d"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
"Should have all files from three checkpoints without duplicates"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn reupload_timeline_rejected() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("reupload_timeline_rejected")?;
|
||||
let sync_id = TimelineSyncId(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
));
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let storage = &remote_assets.0;
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let first_upload_metadata = dummy_metadata(Lsn(0x10));
|
||||
let first_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
first_upload_metadata.clone(),
|
||||
)?;
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
first_checkpoint,
|
||||
)
|
||||
.await;
|
||||
let after_first_uploads = RemoteTimelineIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
remote_assets
|
||||
.0
|
||||
.list()
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
);
|
||||
|
||||
let normal_upload_metadata = dummy_metadata(Lsn(0x20));
|
||||
assert_ne!(
|
||||
normal_upload_metadata.disk_consistent_lsn(),
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
|
||||
let checkpoint_with_no_files = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&[],
|
||||
normal_upload_metadata.clone(),
|
||||
)?;
|
||||
upload_timeline_checkpoint(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
checkpoint_with_no_files,
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(index, after_first_uploads.clone()).await;
|
||||
|
||||
let checkpoint_with_uploaded_lsn = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["something", "new"],
|
||||
first_upload_metadata.clone(),
|
||||
)?;
|
||||
upload_timeline_checkpoint(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
checkpoint_with_uploaded_lsn,
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(index, after_first_uploads.clone()).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
use crate::relish::*;
|
||||
use crate::CheckpointConfig;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::ops::{AddAssign, Deref};
|
||||
@@ -15,18 +16,35 @@ use zenith_utils::zid::ZTimelineId;
|
||||
pub trait Repository: Send + Sync {
|
||||
fn shutdown(&self) -> Result<()>;
|
||||
|
||||
/// Updates timeline based on the new sync state, received from the remote storage synchronization.
|
||||
/// See [`crate::remote_storage`] for more details about the synchronization.
|
||||
fn set_timeline_state(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
new_state: TimelineSyncState,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Gets current synchronization state of the timeline.
|
||||
/// See [`crate::remote_storage`] for more details about the synchronization.
|
||||
fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option<TimelineSyncState>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline>;
|
||||
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
/// The initdb lsn is provided so that the timeline implementation can check some operations against it.
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// perform one garbage collection iteration.
|
||||
/// garbage collection is periodically performed by gc thread,
|
||||
/// but it can be explicitly requested through page server api.
|
||||
/// perform one garbage collection iteration, removing old data files from disk.
|
||||
/// this function is periodically called by gc thread.
|
||||
/// also it can be explicitly requested through page server api 'do_gc' command.
|
||||
///
|
||||
/// 'timelineid' specifies the timeline to GC, or None for all.
|
||||
/// `horizon` specifies the delta from the last lsn within which all object versions are preserved (the pitr interval).
|
||||
@@ -39,32 +57,91 @@ pub trait Repository: Send + Sync {
|
||||
horizon: u64,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult>;
|
||||
|
||||
/// perform one checkpoint iteration, flushing in-memory data to disk.
/// this function is periodically called by the checkpointer thread.
|
||||
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
|
||||
}
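For intuition, the two new sync-state methods amount to a keyed lookup table that the storage sync loop writes and other pageserver code reads. A toy sketch with plain standard-library types (none of the names below are the real pageserver types):

use std::collections::HashMap;
use std::sync::Mutex;

// Toy stand-ins: the real types are ZTimelineId and TimelineSyncState.
type TimelineId = u128;

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SyncState { Ready, AwaitsDownload, CloudOnly, Evicted }

#[derive(Default)]
struct SyncStateTracker {
    states: Mutex<HashMap<TimelineId, SyncState>>,
}

impl SyncStateTracker {
    // Mirrors set_timeline_state: overwrite the state reported by the sync loop.
    fn set_timeline_state(&self, timeline_id: TimelineId, new_state: SyncState) {
        self.states.lock().unwrap().insert(timeline_id, new_state);
    }

    // Mirrors get_timeline_state: None means the sync loop has not reported this timeline yet.
    fn get_timeline_state(&self, timeline_id: TimelineId) -> Option<SyncState> {
        self.states.lock().unwrap().get(&timeline_id).copied()
    }
}

fn main() {
    let tracker = SyncStateTracker::default();
    tracker.set_timeline_state(1, SyncState::AwaitsDownload);
    tracker.set_timeline_state(1, SyncState::Ready);
    assert_eq!(tracker.get_timeline_state(1), Some(SyncState::Ready));
    assert_eq!(tracker.get_timeline_state(2), None);
}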
|
||||
|
||||
/// A timeline, that belongs to the current repository.
|
||||
pub enum RepositoryTimeline {
|
||||
/// Timeline, with its files present locally in pageserver's working directory.
|
||||
/// Loaded into pageserver's memory and ready to be used.
|
||||
Local(Arc<dyn Timeline>),
|
||||
/// Timeline, found on the pageserver's remote storage, but not yet downloaded locally.
|
||||
Remote(ZTimelineId),
|
||||
}
|
||||
|
||||
impl RepositoryTimeline {
|
||||
pub fn local_timeline(&self) -> Option<Arc<dyn Timeline>> {
|
||||
if let Self::Local(local_timeline) = self {
|
||||
Some(Arc::clone(local_timeline))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
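A toy version of the Local/Remote split and the local_timeline accessor, mirroring how the tests later in this diff call repo.get_timeline(..)?.local_timeline(); the types below are simplified stand-ins, not the crate's:

use std::sync::Arc;

// Stand-in: the real variants hold Arc<dyn Timeline> and a ZTimelineId.
struct Timeline { name: &'static str }

enum RepositoryTimeline {
    Local(Arc<Timeline>),
    Remote(u128),
}

impl RepositoryTimeline {
    fn local_timeline(&self) -> Option<Arc<Timeline>> {
        match self {
            Self::Local(t) => Some(Arc::clone(t)),
            Self::Remote(_) => None,
        }
    }
}

fn main() {
    let local = RepositoryTimeline::Local(Arc::new(Timeline { name: "main" }));
    let remote = RepositoryTimeline::Remote(42);

    // Callers that need a usable timeline check for the Local variant first.
    assert_eq!(local.local_timeline().map(|t| t.name), Some("main"));
    assert!(remote.local_timeline().is_none());
}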
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
|
||||
pub enum TimelineSyncState {
|
||||
/// No further downloads from the remote storage are needed.
|
||||
/// The timeline state is up-to-date or ahead of the remote storage one,
|
||||
/// ready to be used in any pageserver operation.
|
||||
Ready,
|
||||
/// Timeline is scheduled for download, but its current local state is not up to date with the remote storage.
/// The timeline is not ready to be used in any pageserver operations; otherwise its local state might diverge from the remote version,
/// making it impossible to sync it further.
AwaitsDownload,
/// Timeline was not in the pageserver's local working directory, but was found on the remote storage, ready to be downloaded.
/// Cannot be used in any pageserver operations, since it is entirely absent locally.
CloudOnly,
|
||||
/// Timeline was evicted from the pageserver's local working directory due to conflicting remote and local states or too many errors during the synchronization.
|
||||
/// Such timelines cannot have their state synchronized further.
|
||||
Evicted,
|
||||
}
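Based only on the variant documentation above, a caller would gate local operations roughly as follows; the usable_locally helper is hypothetical, not part of the trait:

#[derive(Clone, Copy)]
enum TimelineSyncState { Ready, AwaitsDownload, CloudOnly, Evicted }

// A timeline may serve pageserver operations only when its local state
// cannot diverge from the remote one anymore.
fn usable_locally(state: TimelineSyncState) -> bool {
    match state {
        TimelineSyncState::Ready => true,
        TimelineSyncState::AwaitsDownload
        | TimelineSyncState::CloudOnly
        | TimelineSyncState::Evicted => false,
    }
}

fn main() {
    assert!(usable_locally(TimelineSyncState::Ready));
    assert!(!usable_locally(TimelineSyncState::AwaitsDownload));
    assert!(!usable_locally(TimelineSyncState::CloudOnly));
    assert!(!usable_locally(TimelineSyncState::Evicted));
}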
|
||||
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default, Debug)]
|
||||
#[derive(Default)]
|
||||
pub struct GcResult {
|
||||
pub meta_removed: u64, // removed versions beyond PITR interval for which new page image exists
|
||||
pub meta_dropped: u64, // removed versions beyond PITR interval of dropped relations
|
||||
pub meta_total: u64, // total number of metaobject version histories
|
||||
pub ondisk_relfiles_total: u64,
|
||||
pub ondisk_relfiles_needed_by_cutoff: u64,
|
||||
pub ondisk_relfiles_needed_by_branches: u64,
|
||||
pub ondisk_relfiles_not_updated: u64,
|
||||
pub ondisk_relfiles_needed_as_tombstone: u64,
|
||||
pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped
|
||||
|
||||
pub pages_removed: u64, // removed versions beyond PITR interval for which new page image exists
|
||||
pub pages_dropped: u64, // removed versions beyond PITR interval of dropped relations
|
||||
pub pages_total: u64, // total number of page version histories
|
||||
pub ondisk_nonrelfiles_total: u64,
|
||||
pub ondisk_nonrelfiles_needed_by_cutoff: u64,
|
||||
pub ondisk_nonrelfiles_needed_by_branches: u64,
|
||||
pub ondisk_nonrelfiles_not_updated: u64,
|
||||
pub ondisk_nonrelfiles_needed_as_tombstone: u64,
|
||||
pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped
|
||||
|
||||
pub elapsed: Duration,
|
||||
}
|
||||
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.meta_total += other.meta_total;
|
||||
self.meta_removed += other.meta_removed;
|
||||
self.meta_dropped += other.meta_dropped;
|
||||
self.pages_total += other.pages_total;
|
||||
self.pages_removed += other.pages_removed;
|
||||
self.pages_dropped += other.pages_dropped;
|
||||
self.ondisk_relfiles_total += other.ondisk_relfiles_total;
|
||||
self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff;
|
||||
self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches;
|
||||
self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated;
|
||||
self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone;
|
||||
self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
|
||||
self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped;
|
||||
|
||||
self.ondisk_nonrelfiles_total += other.ondisk_nonrelfiles_total;
|
||||
self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff;
|
||||
self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches;
|
||||
self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated;
|
||||
self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone;
|
||||
self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed;
|
||||
self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped;
|
||||
|
||||
self.elapsed += other.elapsed;
|
||||
}
|
||||
}
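The AddAssign impl exists so per-timeline GC results can be folded into a repository-wide total with `+=`. A trimmed-down sketch of that aggregation, keeping only a few of the counters:

use std::ops::AddAssign;
use std::time::Duration;

// A cut-down stand-in for GcResult.
#[derive(Default, Debug)]
struct GcResult {
    ondisk_relfiles_removed: u64,
    pages_removed: u64,
    elapsed: Duration,
}

impl AddAssign for GcResult {
    fn add_assign(&mut self, other: Self) {
        self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
        self.pages_removed += other.pages_removed;
        self.elapsed += other.elapsed;
    }
}

fn main() {
    // Per-timeline results get folded into one repository-wide total with `+=`.
    let per_timeline = vec![
        GcResult { ondisk_relfiles_removed: 3, pages_removed: 10, elapsed: Duration::from_millis(5) },
        GcResult { ondisk_relfiles_removed: 1, pages_removed: 4, elapsed: Duration::from_millis(2) },
    ];

    let mut total = GcResult::default();
    for result in per_timeline {
        total += result;
    }
    assert_eq!(total.ondisk_relfiles_removed, 4);
    assert_eq!(total.pages_removed, 14);
    println!("{:?}", total);
}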
|
||||
@@ -93,17 +170,16 @@ pub trait Timeline: Send + Sync {
|
||||
|
||||
/// Get a list of all existing relations
|
||||
/// Pass RelTag to get relation objects or None to get nonrels.
|
||||
fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
/// Get a list of all existing non-relational objects
|
||||
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
///
|
||||
/// Export data as delta and image layers between 'start_lsn' and 'end_lsn'. The
/// start is inclusive, and the end is exclusive.
|
||||
///
|
||||
fn export_timeline(&self, start_lsn: Lsn, end_lsn: Lsn) -> Result<()>;
|
||||
/// Get the ancestor's timeline id
|
||||
fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId>;
|
||||
|
||||
/// Get the LSN where this branch was created
|
||||
fn get_ancestor_lsn(&self) -> Lsn;
|
||||
@@ -120,6 +196,7 @@ pub trait Timeline: Send + Sync {
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
fn get_disk_consistent_lsn(&self) -> Lsn;
|
||||
|
||||
/// Mutate the timeline with a [`TimelineWriter`].
|
||||
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
|
||||
@@ -129,7 +206,11 @@ pub trait Timeline: Send + Sync {
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
|
||||
|
||||
///
|
||||
/// Check that it is valid to request operations with that lsn.
|
||||
fn check_lsn_is_in_scope(&self, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Retrieve current logical size of the timeline
|
||||
///
|
||||
@@ -140,6 +221,9 @@ pub trait Timeline: Send + Sync {
|
||||
/// Does the same as get_current_logical_size but computed on demand.
/// Used in tests to ensure that the incremental and non-incremental variants match.
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
|
||||
|
||||
/// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline.
|
||||
fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline;
|
||||
}
|
||||
|
||||
/// Various functions to mutate the timeline.
|
||||
@@ -167,16 +251,6 @@ pub trait TimelineWriter: Deref<Target = dyn Timeline> {
|
||||
/// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
|
||||
/// Previous last record LSN is stored alongside the latest and can be read.
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn);
|
||||
|
||||
///
|
||||
/// Complete all delayed commits and advance disk_consistent_lsn
|
||||
///
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
|
||||
///
|
||||
/// Import data from layer files
|
||||
///
|
||||
fn import_timeline(&self, snapshot_lsn: Lsn) -> Result<()>;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
@@ -189,57 +263,104 @@ pub struct WALRecord {
|
||||
pub main_data_offset: u32,
|
||||
}
|
||||
|
||||
impl WALRecord {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(self.will_init as u8);
|
||||
buf.put_u32(self.main_data_offset);
|
||||
buf.put_u32(self.rec.len() as u32);
|
||||
buf.put_slice(&self.rec[..]);
|
||||
#[cfg(test)]
|
||||
pub mod repo_harness {
|
||||
use bytes::BytesMut;
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use crate::{
|
||||
layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
|
||||
walredo::{WalRedoError, WalRedoManager},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use hex_literal::hex;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
pub const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
pub const NEW_TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
|
||||
/// Convenience function to create a page image with the given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
pub fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(8192, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> WALRecord {
|
||||
let will_init = buf.get_u8() != 0;
|
||||
let main_data_offset = buf.get_u32();
|
||||
let rec_len = buf.get_u32() as usize;
|
||||
let rec = buf.split_to(rec_len);
|
||||
WALRecord {
|
||||
will_init,
|
||||
rec,
|
||||
main_data_offset,
|
||||
|
||||
pub struct RepoHarness {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_id: ZTenantId,
|
||||
}
|
||||
|
||||
impl RepoHarness {
|
||||
pub fn create(test_name: &'static str) -> Result<Self> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
fs::create_dir_all(conf.branches_path(&tenant_id))?;
|
||||
|
||||
Ok(Self { conf, tenant_id })
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Box<dyn Repository> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
Box::new(LayeredRepository::new(
|
||||
self.conf,
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
false,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum PageVersion {
|
||||
/// an 8kb page image
|
||||
Page(Bytes),
|
||||
/// WAL record to get from previous page version to this one.
|
||||
Wal(WALRecord),
|
||||
}
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager;
|
||||
|
||||
///
|
||||
/// Data needed to reconstruct a page version
|
||||
///
|
||||
/// 'page_img' is the old base image of the page to start the WAL replay with.
|
||||
/// It can be None, if the first WAL record initializes the page (will_init)
|
||||
/// 'records' contains the records to apply over the base image.
|
||||
///
|
||||
pub struct PageReconstructData {
|
||||
pub records: Vec<(Lsn, WALRecord)>,
|
||||
pub page_img: Option<Bytes>,
|
||||
}
|
||||
|
||||
/// Return value from Layer::get_page_reconstruct_data
|
||||
pub enum PageReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue(Lsn),
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing(Lsn),
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} blk {} to get to {}, with {} and {} records",
|
||||
rel,
|
||||
blknum,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -248,29 +369,21 @@ pub enum PageReconstructResult {
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::buffered_repository::{BufferedRepository, METADATA_FILE_NAME};
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use hex_literal::hex;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use crate::layered_repository::metadata::METADATA_FILE_NAME;
|
||||
|
||||
const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
const NEW_TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
use super::repo_harness::*;
|
||||
use super::*;
|
||||
use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
|
||||
use std::fs;
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
const TESTREL_A_REL_TAG: RelTag = RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
});
|
||||
};
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(TESTREL_A_REL_TAG);
|
||||
const TESTREL_B: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
@@ -278,16 +391,6 @@ mod tests {
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(8192, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn assert_current_logical_size(timeline: &Arc<dyn Timeline>, lsn: Lsn) {
|
||||
let incremental = timeline.get_current_logical_size();
|
||||
let non_incremental = timeline
|
||||
@@ -299,45 +402,6 @@ mod tests {
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
|
||||
|
||||
struct RepoHarness {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
}
|
||||
|
||||
impl RepoHarness {
|
||||
fn create(test_name: &'static str) -> Result<Self> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join("timelines"))?;
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
|
||||
Ok(Self { conf, tenant_id })
|
||||
}
|
||||
|
||||
fn load(&self) -> Box<dyn Repository> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
Box::new(BufferedRepository::new(
|
||||
self.conf,
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
false,
|
||||
))
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_relsize")?.load();
|
||||
@@ -345,7 +409,7 @@ mod tests {
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
@@ -463,7 +527,7 @@ mod tests {
|
||||
let repo = RepoHarness::create("test_drop_extend")?.load();
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
@@ -500,7 +564,7 @@ mod tests {
|
||||
let repo = RepoHarness::create("test_truncate_extend")?.load();
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
|
||||
//from storage_layer.rs
|
||||
@@ -599,7 +663,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_large_rel")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
|
||||
let mut lsn = 0x10;
|
||||
@@ -662,7 +726,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_list_rels_drop() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_list_rels_drop")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
const TESTDB: u32 = 111;
|
||||
|
||||
@@ -682,7 +746,10 @@ mod tests {
|
||||
|
||||
// Create a branch, check that the relation is visible there
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
|
||||
Some(timeline) => timeline,
|
||||
None => panic!("Should have a local timeline"),
|
||||
};
|
||||
let new_writer = newtline.writer();
|
||||
|
||||
assert!(newtline
|
||||
@@ -704,7 +771,7 @@ mod tests {
|
||||
.contains(&TESTREL_A));
|
||||
|
||||
// Run checkpoint and garbage collection and check that it's still not visible
|
||||
newtline.checkpoint()?;
|
||||
newtline.checkpoint(CheckpointConfig::Forced)?;
|
||||
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
|
||||
|
||||
assert!(!newtline
|
||||
@@ -720,7 +787,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_branch")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
@@ -740,7 +807,10 @@ mod tests {
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
|
||||
Some(timeline) => timeline,
|
||||
None => panic!("Should have a local timeline"),
|
||||
};
|
||||
let new_writer = newtline.writer();
|
||||
|
||||
new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
|
||||
@@ -769,13 +839,235 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn make_some_layers(tline: &Arc<dyn Timeline>, start_lsn: Lsn) -> Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
// Create a relation on the timeline
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
// try to branch at lsn 25, should fail because we already garbage collected the data
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(err
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("we might've already garbage collected needed data"))
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
|
||||
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(&err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(&err
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("is earlier than initdb lsn"));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_get_page_at_lsn_for_garbage_collected_pages() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_get_page_at_lsn_for_garbage_collected_pages")?
|
||||
.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) {
|
||||
Ok(_) => panic!("request for page should have failed"),
|
||||
Err(err) => assert!(err
|
||||
.to_string()
|
||||
.contains("tried to request a page version that was garbage collected")),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
|
||||
Some(timeline) => timeline,
|
||||
None => panic!("Should have a local timeline"),
|
||||
};
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
|
||||
let harness = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?;
|
||||
let repo = harness.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
|
||||
Some(timeline) => timeline,
|
||||
None => panic!("Should have a local timeline"),
|
||||
};
|
||||
|
||||
make_some_layers(&newtline, Lsn(0x60))?;
|
||||
|
||||
// run gc on parent
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
// check that the layer in parent before the branching point is still there
|
||||
let tline_dir = harness.conf.timeline_path(&TIMELINE_ID, &harness.tenant_id);
|
||||
|
||||
let expected_image_layer_path = tline_dir.join(format!(
|
||||
"rel_{}_{}_{}_{}_{}_rel_{}_{}_{}_{}_{}_{:016X}_{:016X}",
|
||||
TESTREL_A_REL_TAG.spcnode,
|
||||
TESTREL_A_REL_TAG.dbnode,
|
||||
TESTREL_A_REL_TAG.relnode,
|
||||
TESTREL_A_REL_TAG.forknum,
|
||||
0, // seg is 0
|
||||
TESTREL_A_REL_TAG.spcnode,
|
||||
TESTREL_A_REL_TAG.dbnode,
|
||||
TESTREL_A_REL_TAG.relnode,
|
||||
TESTREL_A_REL_TAG.forknum,
|
||||
1, // end seg is 1
|
||||
0x20,
|
||||
0x30,
|
||||
));
|
||||
assert!(fs::metadata(&expected_image_layer_path).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_beyond_eof() -> Result<()> {
|
||||
let harness = RepoHarness::create("test_read_beyond_eof")?;
|
||||
let repo = harness.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
Lsn(0x60),
|
||||
TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x50))),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x60));
|
||||
}
|
||||
|
||||
// Test read before rel creation. Should error out.
|
||||
assert!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err());
|
||||
|
||||
// Read block beyond end of relation at different points in time.
|
||||
// These reads should fall into different delta, image, and in-memory layers.
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE);
|
||||
|
||||
// Test on an in-memory layer with no preceding layer
|
||||
{
|
||||
let writer = tline.writer();
|
||||
writer.put_page_image(
|
||||
TESTREL_B,
|
||||
0,
|
||||
Lsn(0x70),
|
||||
TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x70));
|
||||
}
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn corrupt_metadata() -> Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
drop(repo);
|
||||
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
|
||||
@@ -789,7 +1081,11 @@ mod tests {
|
||||
|
||||
let new_repo = harness.load();
|
||||
let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
|
||||
assert!(err.to_string().contains("checksum"));
|
||||
assert_eq!(err.to_string(), "failed to load metadata");
|
||||
assert_eq!(
|
||||
err.source().unwrap().to_string(),
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -800,7 +1096,12 @@ mod tests {
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
// Create a timeline with disk_consistent_lsn = 8000
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
|
||||
let writer = tline.writer();
|
||||
writer.advance_last_record_lsn(Lsn(0x8000));
|
||||
drop(writer);
|
||||
repo.checkpoint_iteration(CheckpointConfig::Forced)?;
|
||||
drop(repo);
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
@@ -814,68 +1115,86 @@ mod tests {
|
||||
Ok(())
|
||||
};
|
||||
|
||||
let image_filename = format!("pg_control_0_{:016X}", 8000);
|
||||
let delta_filename = format!("pg_control_0_{:016X}_{:016X}", 8000, 8008);
|
||||
|
||||
make_empty_file(&image_filename)?;
|
||||
make_empty_file(&delta_filename)?;
|
||||
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
let check_old = |filename: &str, num: u32| {
|
||||
// Helper function to check that a relation file exists, and a corresponding
|
||||
// <filename>.0.old file does not.
|
||||
let assert_exists = |filename: &str| {
|
||||
let path = timeline_path.join(filename);
|
||||
assert!(!path.exists());
|
||||
assert!(path.exists(), "file {} was removed", filename);
|
||||
|
||||
let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
|
||||
assert!(backup_path.exists());
|
||||
// Check that there is no .old file
|
||||
let backup_path = timeline_path.join(format!("{}.0.old", filename));
|
||||
assert!(
|
||||
!backup_path.exists(),
|
||||
"unexpected backup file {}",
|
||||
backup_path.display()
|
||||
);
|
||||
};
|
||||
|
||||
check_old(&image_filename, 0);
|
||||
check_old(&delta_filename, 0);
|
||||
// Helper function to check that a relation file does *not* exist, and a corresponding
// <filename>.<num>.old file does.
|
||||
let assert_is_renamed = |filename: &str, num: u32| {
|
||||
let path = timeline_path.join(filename);
|
||||
assert!(
|
||||
!path.exists(),
|
||||
"file {} was not removed as expected",
|
||||
filename
|
||||
);
|
||||
|
||||
make_empty_file(&image_filename)?;
|
||||
make_empty_file(&delta_filename)?;
|
||||
let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
|
||||
assert!(
|
||||
backup_path.exists(),
|
||||
"backup file {} was not created",
|
||||
backup_path.display()
|
||||
);
|
||||
};
|
||||
|
||||
// These files are considered to be in the future and will be renamed out
|
||||
// of the way
|
||||
let future_filenames = vec![
|
||||
format!("pg_control_0_pg_control_1_{:016X}", 0x8001),
|
||||
format!("pg_control_0_pg_control_1_{:016X}_{:016X}", 0x8001, 0x8008),
|
||||
];
|
||||
// But these are not:
|
||||
let past_filenames = vec![
|
||||
format!("pg_control_0_pg_control_1_{:016X}", 0x8000),
|
||||
format!("pg_control_0_pg_control_1_{:016X}_{:016X}", 0x7000, 0x8001),
|
||||
];
|
||||
|
||||
for filename in future_filenames.iter().chain(past_filenames.iter()) {
|
||||
make_empty_file(filename)?;
|
||||
}
|
||||
|
||||
// Load the timeline. This will cause the files in the "future" to be renamed
|
||||
// away.
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
for filename in future_filenames.iter() {
|
||||
assert_is_renamed(filename, 0);
|
||||
}
|
||||
for filename in past_filenames.iter() {
|
||||
assert_exists(filename);
|
||||
}
|
||||
|
||||
// Create the future files again, and load again. They should be renamed to
|
||||
// *.1.old this time.
|
||||
for filename in future_filenames.iter() {
|
||||
make_empty_file(filename)?;
|
||||
}
|
||||
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
check_old(&image_filename, 0);
|
||||
check_old(&delta_filename, 0);
|
||||
check_old(&image_filename, 1);
|
||||
check_old(&delta_filename, 1);
|
||||
for filename in future_filenames.iter() {
|
||||
assert_is_renamed(filename, 0);
|
||||
assert_is_renamed(filename, 1);
|
||||
}
|
||||
for filename in past_filenames.iter() {
|
||||
assert_exists(filename);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager;
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} blk {} to get to {}, with {} and {} records",
|
||||
rel,
|
||||
blknum,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,9 +16,10 @@ use tracing::*;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
use crate::waldecoder::*;
|
||||
use crate::walrecord::*;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::waldecoder::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::Oid;
|
||||
use postgres_ffi::{pg_constants, CheckPoint, ControlFileData};
|
||||
@@ -126,6 +127,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
|
||||
// Import WAL. This is needed even when starting from a shutdown checkpoint, because
|
||||
@@ -139,7 +141,6 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
lsn,
|
||||
&mut pg_control.checkPointCopy.clone(),
|
||||
)?;
|
||||
writer.checkpoint()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2,20 +2,19 @@
|
||||
//! page server.
|
||||
|
||||
use crate::branches;
|
||||
use crate::buffered_repository::BufferedRepository;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::repository::{Repository, Timeline, TimelineSyncState};
|
||||
use crate::tenant_threads;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::{debug, info};
|
||||
use std::collections::HashMap;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::fmt;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::thread::JoinHandle;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
lazy_static! {
|
||||
@@ -27,31 +26,28 @@ struct Tenant {
|
||||
repo: Option<Arc<dyn Repository>>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum TenantState {
|
||||
// This tenant only exists in cloud storage. It cannot be accessed.
|
||||
CloudOnly,
|
||||
// This tenant exists in cloud storage, and we are currently downloading it to local disk.
|
||||
// It cannot be accessed yet, not until it's been fully downloaded to local disk.
|
||||
Downloading,
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum TenantState {
|
||||
// All data for this tenant is complete on local disk, but we haven't loaded the Repository,
|
||||
// Timeline and Layer structs into memory yet, so it cannot be accessed yet.
|
||||
//Ready,
|
||||
// This tenant exists on local disk, and the layer map has been loaded into memory.
|
||||
// The local disk might have some newer files that don't exist in cloud storage yet.
|
||||
Active,
|
||||
// Tenant is active, but there is no walreceiver connection.
|
||||
Idle,
|
||||
// This tenant exists on local disk, and the layer map has been loaded into memory.
|
||||
// The local disk might have some newer files that don't exist in cloud storage yet.
|
||||
// The tenant cannot be accessed anymore for any reason except graceful shutdown.
|
||||
//Stopping,
|
||||
Stopping,
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantState {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
TenantState::CloudOnly => f.write_str("CloudOnly"),
|
||||
TenantState::Downloading => f.write_str("Downloading"),
|
||||
TenantState::Active => f.write_str("Active"),
|
||||
TenantState::Idle => f.write_str("Idle"),
|
||||
TenantState::Stopping => f.write_str("Stopping"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -60,107 +56,71 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
|
||||
TENANTS.lock().unwrap()
|
||||
}
|
||||
|
||||
struct TenantHandleEntry {
|
||||
checkpointer_handle: Option<JoinHandle<()>>,
|
||||
uploader_handle: Option<JoinHandle<()>>,
|
||||
gc_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
// Logically these handles belong to Repository,
|
||||
// but it's just simpler to store them separately
|
||||
lazy_static! {
|
||||
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
|
||||
let tenantid =
|
||||
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
|
||||
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
let tenant = Tenant {
|
||||
state: TenantState::CloudOnly,
|
||||
repo: None,
|
||||
};
|
||||
m.insert(tenantid, tenant);
|
||||
}
|
||||
|
||||
init_repo(conf, tenantid);
|
||||
info!("initialized storage for tenant: {}", &tenantid);
|
||||
/// Updates tenants' repositories, changing their timelines' state in memory.
|
||||
pub fn set_timeline_states(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
|
||||
) {
|
||||
if timeline_states.is_empty() {
|
||||
debug!("no timeline state updates to perform");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(BufferedRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
true,
|
||||
));
|
||||
|
||||
let checkpointer_handle = BufferedRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
let gc_handle = BufferedRepository::launch_gc_thread(conf, repo.clone());
|
||||
let uploader_handle = BufferedRepository::launch_upload_thread(conf, repo.clone());
|
||||
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
let h = TenantHandleEntry {
|
||||
checkpointer_handle: Some(checkpointer_handle),
|
||||
gc_handle: Some(gc_handle),
|
||||
uploader_handle: Some(uploader_handle),
|
||||
};
|
||||
|
||||
handles.insert(tenant_id, h);
|
||||
info!("Updating states for {} timelines", timeline_states.len());
|
||||
trace!("States: {:?}", timeline_states);
|
||||
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenant_id).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Active;
|
||||
}
|
||||
|
||||
// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer.
|
||||
// Relevant PR: https://github.com/zenithdb/zenith/pull/686
|
||||
pub fn register_relish_download(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) {
|
||||
log::info!(
|
||||
"Registering new download, tenant id {}, timeline id: {}",
|
||||
tenant_id,
|
||||
timeline_id
|
||||
);
|
||||
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
let mut tenant = m.get_mut(&tenant_id).unwrap();
|
||||
tenant.state = TenantState::Downloading;
|
||||
match &tenant.repo {
|
||||
Some(repo) => init_timeline(repo.as_ref(), timeline_id),
|
||||
None => {
|
||||
log::info!("Initialize new repo");
|
||||
}
|
||||
for (tenant_id, timeline_states) in timeline_states {
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo: None,
|
||||
});
|
||||
if let Err(e) = put_timelines_into_tenant(conf, tenant, tenant_id, timeline_states) {
|
||||
error!(
|
||||
"Failed to update timeline states for tenant {}: {:#}",
|
||||
tenant_id, e
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// init repo updates Tenant state
|
||||
init_repo(conf, tenant_id);
|
||||
let new_repo = get_repository_for_tenant(tenant_id).unwrap();
|
||||
init_timeline(new_repo.as_ref(), timeline_id);
|
||||
}
|
||||
|
||||
fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
|
||||
match repo.get_timeline(timeline_id) {
|
||||
Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id),
|
||||
Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e),
|
||||
fn put_timelines_into_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant: &mut Tenant,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_states: HashMap<ZTimelineId, TimelineSyncState>,
|
||||
) -> anyhow::Result<()> {
|
||||
let repo = match tenant.repo.as_ref() {
|
||||
Some(repo) => Arc::clone(repo),
|
||||
None => {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo: Arc<dyn Repository> = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
tenant.repo = Some(Arc::clone(&repo));
|
||||
repo
|
||||
}
|
||||
};
|
||||
|
||||
for (timeline_id, timeline_state) in timeline_states {
|
||||
repo.set_timeline_state(timeline_id, timeline_state)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to update timeline {} state to {:?}",
|
||||
timeline_id, timeline_state
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Check this flag in the thread loops to know when to exit
|
||||
@@ -168,29 +128,23 @@ pub fn shutdown_requested() -> bool {
|
||||
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn stop_tenant_threads(tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
if let Some(h) = handles.get_mut(&tenantid) {
|
||||
h.checkpointer_handle.take().map(JoinHandle::join);
|
||||
debug!("checkpointer for tenant {} has stopped", tenantid);
|
||||
h.uploader_handle.take().map(JoinHandle::join);
|
||||
debug!("uploader for tenant {} has stopped", tenantid);
|
||||
h.gc_handle.take().map(JoinHandle::join);
|
||||
debug!("gc for tenant {} has stopped", tenantid);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shutdown_all_tenants() -> Result<()> {
|
||||
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
|
||||
|
||||
let tenantids = list_tenantids()?;
|
||||
|
||||
for tenantid in &tenantids {
|
||||
set_tenant_state(*tenantid, TenantState::Stopping)?;
|
||||
}
|
||||
|
||||
for tenantid in tenantids {
|
||||
stop_tenant_threads(tenantid);
|
||||
// Wait for checkpointer and GC to finish their job
|
||||
tenant_threads::wait_for_tenant_threads_to_stop(tenantid);
|
||||
|
||||
let repo = get_repository_for_tenant(tenantid)?;
|
||||
debug!("shutdown tenant {}", tenantid);
|
||||
repo.shutdown()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -198,37 +152,51 @@ pub fn create_repository_for_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<()> {
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
// First check that the tenant doesn't exist already
|
||||
if m.get(&tenantid).is_some() {
|
||||
bail!("tenant {} already exists", tenantid);
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
||||
let repo = Some(branches::create_repo(conf, tenantid, wal_redo_manager)?);
|
||||
|
||||
match access_tenants().entry(tenantid) {
|
||||
hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid),
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
});
|
||||
}
|
||||
let tenant = Tenant {
|
||||
state: TenantState::CloudOnly,
|
||||
repo: None,
|
||||
};
|
||||
m.insert(tenantid, tenant);
|
||||
}
|
||||
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
||||
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
|
||||
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Active;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
|
||||
Some(access_tenants().get(&tenantid)?.state)
|
||||
}
|
||||
|
||||
pub fn set_tenant_state(tenantid: ZTenantId, newstate: TenantState) -> Result<TenantState> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid);
|
||||
|
||||
match tenant {
|
||||
Some(tenant) => {
|
||||
if newstate == TenantState::Idle && tenant.state != TenantState::Active {
|
||||
// Only an Active tenant can become Idle
|
||||
return Ok(tenant.state);
|
||||
}
|
||||
info!("set_tenant_state: {} -> {}", tenant.state, newstate);
|
||||
tenant.state = newstate;
|
||||
Ok(tenant.state)
|
||||
}
|
||||
None => bail!("Tenant not found for id {}", tenantid),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
|
||||
let m = access_tenants();
|
||||
let tenant = m
|
||||
.get(&tenantid)
|
||||
.ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid));
|
||||
.ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid))?;
|
||||
|
||||
match &tenant.unwrap().repo {
|
||||
match &tenant.repo {
|
||||
Some(repo) => Ok(Arc::clone(repo)),
|
||||
None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid),
|
||||
}
|
||||
@@ -239,16 +207,37 @@ pub fn get_timeline_for_tenant(
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<Arc<dyn Timeline>> {
|
||||
get_repository_for_tenant(tenantid)?
|
||||
.get_timeline(timelineid)
|
||||
.with_context(|| format!("cannot fetch timeline {}", timelineid))
|
||||
.get_timeline(timelineid)?
|
||||
.local_timeline()
|
||||
.ok_or_else(|| anyhow!("cannot fetch timeline {}", timelineid))
|
||||
}
|
||||
|
||||
fn list_tenantids() -> Result<Vec<ZTenantId>> {
|
||||
let m = access_tenants();
|
||||
m.iter()
|
||||
access_tenants()
|
||||
.iter()
|
||||
.map(|v| {
|
||||
let (tenantid, _) = v;
|
||||
Ok(*tenantid)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TenantInfo {
|
||||
#[serde(with = "hex")]
|
||||
pub id: ZTenantId,
|
||||
pub state: TenantState,
|
||||
}
|
||||
|
||||
pub fn list_tenants() -> Result<Vec<TenantInfo>> {
|
||||
access_tenants()
|
||||
.iter()
|
||||
.map(|v| {
|
||||
let (id, tenant) = v;
|
||||
Ok(TenantInfo {
|
||||
id: *id,
|
||||
state: tenant.state,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
150
pageserver/src/tenant_threads.rs
Normal file
@@ -0,0 +1,150 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as checkpointer and GC
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
use std::thread::JoinHandle;
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
struct TenantHandleEntry {
|
||||
checkpointer_handle: Option<JoinHandle<()>>,
|
||||
gc_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
// Preserve handles to wait for thread completion
|
||||
// at shutdown
|
||||
lazy_static! {
|
||||
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref TENANT_THREADS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
"tenant_threads_count",
|
||||
"Number of live tenant threads",
|
||||
&["tenant_thread_type"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
// Launch checkpointer and GC for the tenant.
|
||||
// It's possible that the threads are already running;
// if so, just don't spawn new ones.
|
||||
pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
let h = handles
|
||||
.entry(tenantid)
|
||||
.or_insert_with(|| TenantHandleEntry {
|
||||
checkpointer_handle: None,
|
||||
gc_handle: None,
|
||||
});
|
||||
|
||||
if h.checkpointer_handle.is_none() {
|
||||
h.checkpointer_handle = std::thread::Builder::new()
|
||||
.name("Checkpointer thread".into())
|
||||
.spawn(move || {
|
||||
checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
|
||||
if h.gc_handle.is_none() {
|
||||
h.gc_handle = std::thread::Builder::new()
|
||||
.name("GC thread".into())
|
||||
.spawn(move || {
|
||||
gc_loop(tenantid, conf).expect("GC thread died");
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
if let Some(h) = handles.get_mut(&tenantid) {
|
||||
h.checkpointer_handle.take().map(JoinHandle::join);
|
||||
trace!("checkpointer for tenant {} has stopped", tenantid);
|
||||
h.gc_handle.take().map(JoinHandle::join);
|
||||
trace!("gc for tenant {} has stopped", tenantid);
|
||||
}
|
||||
handles.remove(&tenantid);
|
||||
}
|
||||
|
||||
///
|
||||
/// Checkpointer thread's main loop
|
||||
///
|
||||
fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
let gauge = TENANT_THREADS_COUNT.with_label_values(&["checkpointer"]);
|
||||
gauge.inc();
|
||||
scopeguard::defer! {
|
||||
gauge.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
}
|
||||
|
||||
std::thread::sleep(conf.checkpoint_period);
|
||||
trace!("checkpointer thread for tenant {} waking up", tenantid);
|
||||
|
||||
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
|
||||
// bytes of WAL since last checkpoint.
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
|
||||
}
|
||||
|
||||
trace!(
|
||||
"checkpointer thread stopped for tenant {} state is {:?}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// GC thread's main loop
|
||||
///
|
||||
fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
let gauge = TENANT_THREADS_COUNT.with_label_values(&["gc"]);
|
||||
gauge.inc();
|
||||
scopeguard::defer! {
|
||||
gauge.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
}
|
||||
|
||||
trace!("gc thread for tenant {} waking up", tenantid);
|
||||
|
||||
// Garbage collect old files that are not needed for PITR anymore
|
||||
if conf.gc_horizon > 0 {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
repo.gc_iteration(None, conf.gc_horizon, false).unwrap();
|
||||
}
|
||||
|
||||
// TODO Write this in a more adequate way using
// condvar.wait_timeout() or something (see the sketch after this function)
|
||||
let mut sleep_time = conf.gc_period.as_secs();
|
||||
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active)
|
||||
{
|
||||
sleep_time -= 1;
|
||||
std::thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
trace!(
|
||||
"GC thread stopped for tenant {} state is {:?}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
Ok(())
|
||||
}
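// The TODO in gc_loop() above mentions condvar.wait_timeout(); the helper below
// sketches that approach. The shared (Mutex<bool>, Condvar) "stop" pair is
// hypothetical -- nothing in this file creates or notifies it -- so this is an
// illustration of the technique, not a drop-in replacement.
// Returns true if shutdown was signalled, false if the period elapsed.
fn sleep_until_period_or_stop(
    stop_pair: &std::sync::Arc<(std::sync::Mutex<bool>, std::sync::Condvar)>,
    period: std::time::Duration,
) -> bool {
    let (lock, cvar) = &**stop_pair;
    let mut stopped = lock.lock().unwrap();
    while !*stopped {
        // wait_timeout atomically releases the lock and sleeps; it wakes up either
        // when the shutdown path notifies the condvar or when `period` elapses.
        let (guard, timeout) = cvar.wait_timeout(stopped, period).unwrap();
        stopped = guard;
        if timeout.timed_out() {
            // Period elapsed without a stop request: caller runs one more iteration.
            return false;
        }
    }
    // Stop was requested and signalled; the caller should exit its loop promptly.
    true
}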
|
||||
@@ -1,268 +0,0 @@
|
||||
use anyhow::{anyhow, Result};
|
||||
use lz4_flex;
|
||||
use std::convert::TryInto;
|
||||
use std::ops::{Bound, RangeBounds};
|
||||
use std::path::Path;
|
||||
|
||||
use yakv::storage::{
|
||||
Key,
|
||||
Select,
|
||||
// ReadOnlyTransaction,
|
||||
Snapshot,
|
||||
Storage,
|
||||
StorageConfig,
|
||||
StorageIterator,
|
||||
Transaction,
|
||||
Value,
|
||||
};
|
||||
|
||||
const TOAST_SEGMENT_SIZE: usize = 2000;
|
||||
const CACHE_SIZE: usize = 1024; // 8Mb
|
||||
|
||||
///
|
||||
/// Toast storage consists of two KV databases: one for storing the main index
/// and a second for storing sliced BLOBs (values larger than 2 KB).
|
||||
/// BLOBs and main data are stored in different databases to improve
|
||||
/// data locality and reduce key size for TOAST segments.
|
||||
///
|
||||
pub struct ToastStore {
|
||||
db: Storage, // key-value database
|
||||
}
|
||||
|
||||
pub struct ToastIterator<'a> {
|
||||
iter: StorageIterator<'a>,
|
||||
}
|
||||
|
||||
pub struct ToastSnapshot<'a> {
|
||||
// tx: ReadOnlyTransaction<'a>,
|
||||
tx: Snapshot<'a>,
|
||||
}
|
||||
|
||||
impl<'a> ToastSnapshot<'a> {
|
||||
pub fn range<R: RangeBounds<Key>>(&self, range: R) -> ToastIterator<'_> {
|
||||
let from = match range.start_bound() {
|
||||
Bound::Included(key) => {
|
||||
let mut key = key.clone();
|
||||
key.extend_from_slice(&[0u8; 4]);
|
||||
Bound::Included(key)
|
||||
}
|
||||
Bound::Excluded(key) => {
|
||||
let mut key = key.clone();
|
||||
key.extend_from_slice(&[0u8; 4]);
|
||||
Bound::Excluded(key)
|
||||
}
|
||||
_ => Bound::Unbounded,
|
||||
};
|
||||
let till = match range.end_bound() {
|
||||
Bound::Included(key) => {
|
||||
let mut key = key.clone();
|
||||
key.extend_from_slice(&[0xFFu8; 4]);
|
||||
Bound::Included(key)
|
||||
}
|
||||
Bound::Excluded(key) => {
|
||||
let mut key = key.clone();
|
||||
key.extend_from_slice(&[0xFFu8; 4]);
|
||||
Bound::Excluded(key)
|
||||
}
|
||||
_ => Bound::Unbounded,
|
||||
};
|
||||
ToastIterator {
|
||||
iter: self.tx.range((from, till)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> ToastIterator<'_> {
|
||||
self.range(..)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for ToastIterator<'a> {
|
||||
type Item = Result<(Key, Value)>;
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut toast: Option<Vec<u8>> = None;
|
||||
let mut next_segno = 0u16;
|
||||
for elem in &mut self.iter {
|
||||
let res = if let Ok((key, value)) = elem {
|
||||
let key_len = key.len();
|
||||
let n_segments =
|
||||
u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
|
||||
let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
|
||||
let key = key[..key_len - 4].to_vec();
|
||||
if n_segments != 0 {
|
||||
// TOAST
|
||||
assert_eq!(segno, next_segno);
|
||||
if next_segno == 0 {
|
||||
toast = Some(Vec::with_capacity(n_segments as usize * TOAST_SEGMENT_SIZE))
|
||||
}
|
||||
toast.as_mut().unwrap().extend_from_slice(&value);
|
||||
next_segno = segno + 1;
|
||||
if next_segno != n_segments {
|
||||
continue;
|
||||
}
|
||||
let res = lz4_flex::decompress_size_prepended(&toast.unwrap());
|
||||
if let Ok(decompressed_data) = res {
|
||||
Ok((key, decompressed_data))
|
||||
} else {
|
||||
Err(anyhow!(res.unwrap_err()))
|
||||
}
|
||||
} else {
|
||||
Ok((key, value))
|
||||
}
|
||||
} else {
|
||||
elem
|
||||
};
|
||||
return Some(res);
|
||||
}
|
||||
assert_eq!(next_segno, 0);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for ToastIterator<'a> {
|
||||
fn next_back(&mut self) -> Option<Self::Item> {
|
||||
let mut toast: Option<Vec<u8>> = None;
|
||||
let mut next_segno = 0u16;
|
||||
while let Some(elem) = self.iter.next_back() {
|
||||
if let Ok((key, value)) = elem {
|
||||
assert!(!value.is_empty());
|
||||
let key_len = key.len();
|
||||
let n_segments =
|
||||
u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
|
||||
let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
|
||||
let key = key[..key_len - 4].to_vec();
|
||||
if n_segments != 0 {
|
||||
// TOAST
|
||||
assert!(segno + 1 == next_segno || next_segno == 0);
|
||||
if next_segno == 0 {
|
||||
let len = (n_segments - 1) as usize * TOAST_SEGMENT_SIZE + value.len();
|
||||
let mut vec = vec![0u8; len];
|
||||
vec[len - value.len()..].copy_from_slice(&value);
|
||||
toast = Some(vec);
|
||||
} else {
|
||||
toast.as_mut().unwrap()[segno as usize * TOAST_SEGMENT_SIZE
|
||||
..(segno + 1) as usize * TOAST_SEGMENT_SIZE]
|
||||
.copy_from_slice(&value);
|
||||
}
|
||||
next_segno = segno;
|
||||
if next_segno == 0 {
|
||||
let toast = toast.unwrap();
|
||||
assert!(!toast.is_empty());
|
||||
let res = lz4_flex::decompress_size_prepended(&toast);
|
||||
return Some(if let Ok(decompressed_data) = res {
|
||||
Ok((key, decompressed_data))
|
||||
} else {
|
||||
Err(anyhow!(res.unwrap_err()))
|
||||
});
|
||||
}
|
||||
} else {
|
||||
return Some(Ok((key, value)));
|
||||
}
|
||||
} else {
|
||||
return Some(elem);
|
||||
}
|
||||
}
|
||||
assert_eq!(next_segno, 0);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// FIXME-KK: not using WAL now. Implement asynchronous or delayed commit.
|
||||
//
|
||||
impl ToastStore {
|
||||
pub fn new(path: &Path) -> Result<ToastStore> {
|
||||
Ok(ToastStore {
|
||||
db: Storage::open(
|
||||
&path.join("pageserver.db"),
|
||||
StorageConfig {
|
||||
cache_size: CACHE_SIZE,
|
||||
nosync: false,
|
||||
},
|
||||
)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn put(&self, key: Key, value: Value) -> Result<()> {
|
||||
let mut tx = self.db.start_transaction();
|
||||
self.tx_remove(&mut tx, &key)?;
|
||||
let value_len = value.len();
|
||||
let mut key = key;
|
||||
if value_len >= TOAST_SEGMENT_SIZE {
|
||||
let compressed_data = lz4_flex::compress_prepend_size(&value);
|
||||
let compressed_data_len = compressed_data.len();
|
||||
let mut offs: usize = 0;
|
||||
let mut segno = 0u16;
|
||||
let n_segments =
|
||||
((compressed_data_len + TOAST_SEGMENT_SIZE - 1) / TOAST_SEGMENT_SIZE) as u16;
|
||||
assert!(n_segments != 0);
|
||||
key.extend_from_slice(&n_segments.to_be_bytes());
|
||||
key.extend_from_slice(&[0u8; 2]);
|
||||
let key_len = key.len();
|
||||
while offs + TOAST_SEGMENT_SIZE < compressed_data_len {
|
||||
key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
|
||||
tx.put(
|
||||
&key,
|
||||
&compressed_data[offs..offs + TOAST_SEGMENT_SIZE].to_vec(),
|
||||
)?;
|
||||
offs += TOAST_SEGMENT_SIZE;
|
||||
segno += 1;
|
||||
}
|
||||
key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
|
||||
tx.put(&key, &compressed_data[offs..].to_vec())?;
|
||||
} else {
|
||||
key.extend_from_slice(&[0u8; 4]);
|
||||
tx.put(&key, &value)?;
|
||||
}
|
||||
tx.subcommit()?;
|
||||
//tx.delay();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn commit(&self) -> Result<()> {
|
||||
let tx = self.db.start_transaction();
|
||||
tx.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn take_snapshot(&self) -> ToastSnapshot<'_> {
|
||||
ToastSnapshot {
|
||||
//tx: self.db.read_only_transaction(),
|
||||
tx: self.db.take_snapshot(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&self, key: Key) -> Result<()> {
|
||||
let mut tx = self.db.start_transaction();
|
||||
self.tx_remove(&mut tx, &key)?;
|
||||
tx.subcommit()?;
|
||||
//tx.delay();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn tx_remove(&self, tx: &mut Transaction, key: &[u8]) -> Result<()> {
|
||||
let mut min_key = key.to_vec();
|
||||
let mut max_key = key.to_vec();
|
||||
min_key.extend_from_slice(&[0u8; 4]);
|
||||
max_key.extend_from_slice(&[0xFFu8; 4]);
|
||||
let mut iter = tx.range(&min_key..&max_key);
|
||||
if let Some(entry) = iter.next() {
|
||||
let mut key = entry?.0;
|
||||
let key_len = key.len();
|
||||
let n_segments = u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
|
||||
if n_segments != 0 {
|
||||
// TOAST
|
||||
for i in 0..n_segments {
|
||||
key[key_len - 2..].copy_from_slice(&i.to_be_bytes());
|
||||
tx.remove(&key)?;
|
||||
}
|
||||
} else {
|
||||
tx.remove(&key)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.db.get_database_info().db_used
|
||||
}
|
||||
}
|
||||
619
pageserver/src/virtual_file.rs
Normal file
@@ -0,0 +1,619 @@
|
||||
//!
|
||||
//! VirtualFile is like a normal File, but it's not bound directly to
|
||||
//! a file descriptor. Instead, the file is opened when it's read from,
|
||||
//! and if too many files are open globally in the system, least-recently
|
||||
//! used ones are closed.
|
||||
//!
|
||||
//! To track which files have been recently used, we use the clock algorithm
|
||||
//! with a 'recently_used' flag on each slot.
|
||||
//!
|
||||
//! This is similar to PostgreSQL's virtual file descriptor facility in
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
/// the underlying file is closed if the system is low on file descriptors,
|
||||
/// and re-opened when it's accessed again.
|
||||
///
|
||||
/// Like with std::fs::File, multiple threads can read/write the file concurrently,
|
||||
/// holding just a shared reference to the same VirtualFile, using the read_at() / write_at()
|
||||
/// functions from the FileExt trait. But the functions from the Read/Write/Seek traits
|
||||
/// require a mutable reference, because they modify the "current position".
|
||||
///
|
||||
/// Each VirtualFile has a physical file descriptor in the global OPEN_FILES array, at the
|
||||
/// slot that 'handle' points to, if the underlying file is currently open. If it's not
|
||||
/// currently open, the 'handle' can still point to the slot where it was last kept. The
|
||||
/// 'tag' field is used to detect whether the handle still is valid or not.
|
||||
///
|
||||
pub struct VirtualFile {
|
||||
/// Lazy handle to the global file descriptor cache. The slot that this points to
|
||||
/// might contain our File, or it may be empty, or it may contain a File that
|
||||
/// belongs to a different VirtualFile.
|
||||
handle: RwLock<SlotHandle>,
|
||||
|
||||
/// Current file position
|
||||
pos: u64,
|
||||
|
||||
/// File path and options to use to open it.
|
||||
///
|
||||
/// Note: this only contains the options needed to re-open it. For example,
|
||||
/// if a new file is created, we only pass the create flag when it's initially
|
||||
/// opened, in the VirtualFile::create() function, and strip the flag before
|
||||
/// storing it here.
|
||||
pub path: PathBuf,
|
||||
open_options: OpenOptions,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Clone, Copy)]
|
||||
struct SlotHandle {
|
||||
/// Index into OPEN_FILES.slots
|
||||
index: usize,
|
||||
|
||||
/// Value of 'tag' in the slot. If slot's tag doesn't match, then the slot has
|
||||
/// been recycled and no longer contains the FD for this virtual file.
|
||||
tag: u64,
|
||||
}
|
||||
|
||||
/// OPEN_FILES is the global array that holds the physical file descriptors that
|
||||
/// are currently open. Each slot in the array is protected by a separate lock,
|
||||
/// so that different files can be accessed independently. The lock must be held
|
||||
/// in write mode to replace the slot with a different file, but a read mode
|
||||
/// is enough to operate on the file, whether you're reading or writing to it.
|
||||
///
|
||||
/// OPEN_FILES starts in uninitialized state, and it's initialized by
|
||||
/// the virtual_file::init() function. It must be called exactly once at page
|
||||
/// server startup.
|
||||
static OPEN_FILES: OnceCell<OpenFiles> = OnceCell::new();
|
||||
|
||||
struct OpenFiles {
|
||||
slots: &'static [Slot],
|
||||
|
||||
/// clock arm for the clock algorithm
|
||||
next: AtomicUsize,
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
inner: RwLock<SlotInner>,
|
||||
|
||||
/// has this file been used since last clock sweep?
|
||||
recently_used: AtomicBool,
|
||||
}
|
||||
|
||||
struct SlotInner {
|
||||
/// Counter that's incremented every time a different file is stored here.
|
||||
/// To avoid the ABA problem.
|
||||
tag: u64,
|
||||
|
||||
/// the underlying file
|
||||
file: Option<File>,
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
/// Find a slot to use, evicting an existing file descriptor if needed.
|
||||
///
|
||||
/// On return, we hold a lock on the slot, its 'tag' has been updated, and
/// recently_used has been set. It's all ready for reuse.
|
||||
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
|
||||
//
|
||||
// Run the clock algorithm to find a slot to replace.
|
||||
//
|
||||
let num_slots = self.slots.len();
|
||||
let mut retries = 0;
|
||||
let mut slot;
|
||||
let mut slot_guard;
|
||||
let index;
|
||||
loop {
|
||||
let next = self.next.fetch_add(1, Ordering::AcqRel) % num_slots;
|
||||
slot = &self.slots[next];
|
||||
|
||||
// If the recently_used flag on this slot is set, continue the clock
|
||||
// sweep. Otherwise try to use this slot. If we cannot acquire the
|
||||
// lock, also continue the clock sweep.
|
||||
//
|
||||
// We only continue in this manner for a while, though. If we loop
|
||||
// through the array twice without finding a victim, just pick the
|
||||
// next slot and wait until we can reuse it. This way, we avoid
|
||||
// spinning in the extreme case that all the slots are busy with an
|
||||
// I/O operation.
|
||||
if retries < num_slots * 2 {
|
||||
if !slot.recently_used.swap(false, Ordering::Release) {
|
||||
if let Ok(guard) = slot.inner.try_write() {
|
||||
slot_guard = guard;
|
||||
index = next;
|
||||
break;
|
||||
}
|
||||
}
|
||||
retries += 1;
|
||||
} else {
|
||||
slot_guard = slot.inner.write().unwrap();
|
||||
index = next;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// We now have the victim slot locked. If it was in use previously, close the
|
||||
// old file.
|
||||
//
|
||||
if let Some(old_file) = slot_guard.file.take() {
|
||||
drop(old_file);
|
||||
}
|
||||
|
||||
// Prepare the slot for reuse and return it
|
||||
slot_guard.tag += 1;
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
(
|
||||
SlotHandle {
|
||||
index,
|
||||
tag: slot_guard.tag,
|
||||
},
|
||||
slot_guard,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl VirtualFile {
|
||||
/// Open a file in read-only mode. Like File::open.
|
||||
pub fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(path, OpenOptions::new().read(true))
|
||||
}
|
||||
|
||||
/// Create a new file for writing. If the file exists, it will be truncated.
|
||||
/// Like File::create.
|
||||
pub fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(
|
||||
path,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)
|
||||
}
|
||||
|
||||
/// Open a file with given options.
|
||||
///
|
||||
/// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
|
||||
/// they will be applied also when the file is subsequently re-opened, not only
|
||||
/// on the first time. Make sure that's sane!
|
||||
pub fn open_with_options(
|
||||
path: &Path,
|
||||
open_options: &OpenOptions,
|
||||
) -> Result<VirtualFile, std::io::Error> {
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
|
||||
|
||||
let file = open_options.open(path)?;
|
||||
|
||||
// Strip all options other than read and write.
|
||||
//
|
||||
// It would perhaps be nicer to check just for the read and write flags
|
||||
// explicitly, but OpenOptions doesn't contain any functions to read flags,
|
||||
// only to set them.
|
||||
let mut reopen_options = open_options.clone();
|
||||
reopen_options.create(false);
|
||||
reopen_options.create_new(false);
|
||||
reopen_options.truncate(false);
|
||||
|
||||
let vfile = VirtualFile {
|
||||
handle: RwLock::new(handle),
|
||||
pos: 0,
|
||||
path: path.to_path_buf(),
|
||||
open_options: reopen_options,
|
||||
};
|
||||
|
||||
slot_guard.file.replace(file);
|
||||
|
||||
Ok(vfile)
|
||||
}
|
||||
|
||||
/// Call File::sync_all() on the underlying File.
|
||||
pub fn sync_all(&self) -> Result<(), Error> {
|
||||
self.with_file(|file| file.sync_all())?
|
||||
}
|
||||
|
||||
/// Helper function that looks up the underlying File for this VirtualFile,
|
||||
/// opening it and evicting some other File if necessary. It calls 'func'
|
||||
/// with the physical File.
|
||||
fn with_file<F, R>(&self, mut func: F) -> Result<R, Error>
|
||||
where
|
||||
F: FnMut(&File) -> R,
|
||||
{
|
||||
let open_files = get_open_files();
|
||||
|
||||
let mut handle_guard = {
|
||||
// Read the cached slot handle, and see if the slot that it points to still
|
||||
// contains our File.
|
||||
//
|
||||
// We only need to hold the handle lock while we read the current handle. If
|
||||
// another thread closes the file and recycles the slot for a different file,
|
||||
// we will notice that the handle we read is no longer valid and retry.
|
||||
let mut handle = *self.handle.read().unwrap();
|
||||
loop {
|
||||
// Check if the slot contains our File
|
||||
{
|
||||
let slot = &open_files.slots[handle.index];
|
||||
let slot_guard = slot.inner.read().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
if let Some(file) = &slot_guard.file {
|
||||
// Found a cached file descriptor.
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
return Ok(func(file));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The slot didn't contain our File. We will have to open it ourselves,
|
||||
// but before that, grab a write lock on handle in the VirtualFile, so
|
||||
// that no other thread will try to concurrently open the same file.
|
||||
let handle_guard = self.handle.write().unwrap();
|
||||
|
||||
// If another thread changed the handle while we were not holding the lock,
|
||||
// then the handle might now be valid again. Loop back to retry.
|
||||
if *handle_guard != handle {
|
||||
handle = *handle_guard;
|
||||
continue;
|
||||
}
|
||||
break handle_guard;
|
||||
}
|
||||
};
|
||||
|
||||
// We need to open the file ourselves. The handle in the VirtualFile is
|
||||
// now locked in write-mode. Find a free slot to put it in.
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot();
|
||||
|
||||
// Open the physical file
|
||||
let file = self.open_options.open(&self.path)?;
|
||||
|
||||
// Perform the requested operation on it
|
||||
//
|
||||
// TODO: We could downgrade the locks to read mode before calling
|
||||
// 'func', to allow a little bit more concurrency, but the standard
|
||||
// library RwLock doesn't allow downgrading without releasing the lock,
|
||||
// and that doesn't seem worth the trouble. (parking_lot RwLock would
|
||||
// allow it)
|
||||
let result = func(&file);
|
||||
|
||||
// Store the File in the slot and update the handle in the VirtualFile
|
||||
// to point to it.
|
||||
slot_guard.file.replace(file);
|
||||
|
||||
*handle_guard = handle;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
/// If a VirtualFile is dropped, close the underlying file if it was open.
|
||||
fn drop(&mut self) {
|
||||
let handle = self.handle.get_mut().unwrap();
|
||||
|
||||
// We could check with a read-lock first, to avoid waiting on an
|
||||
// unrelated I/O.
|
||||
let slot = &get_open_files().slots[handle.index];
|
||||
let mut slot_guard = slot.inner.write().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
slot.recently_used.store(false, Ordering::Relaxed);
|
||||
slot_guard.file.take();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for VirtualFile {
|
||||
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.read_at(buf, pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl Write for VirtualFile {
|
||||
fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.write_at(buf, pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> Result<(), std::io::Error> {
|
||||
// flush is a no-op for File (at least on unix), so we don't need to do
|
||||
// anything here either.
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for VirtualFile {
|
||||
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(offset) => {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(offset) => {
|
||||
self.pos = self.with_file(|mut file| file.seek(SeekFrom::End(offset)))??
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
if pos < 0 {
|
||||
return Err(Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
"offset would be negative",
|
||||
));
|
||||
}
|
||||
if pos > u64::MAX as i128 {
|
||||
return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
|
||||
}
|
||||
self.pos = pos as u64;
|
||||
}
|
||||
}
|
||||
Ok(self.pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl FileExt for VirtualFile {
|
||||
fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
self.with_file(|file| file.read_at(buf, offset))?
|
||||
}
|
||||
|
||||
fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
self.with_file(|file| file.write_at(buf, offset))?
|
||||
}
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
fn new(num_slots: usize) -> OpenFiles {
|
||||
let mut slots = Box::new(Vec::with_capacity(num_slots));
|
||||
for _ in 0..num_slots {
|
||||
let slot = Slot {
|
||||
recently_used: AtomicBool::new(false),
|
||||
inner: RwLock::new(SlotInner { tag: 0, file: None }),
|
||||
};
|
||||
slots.push(slot);
|
||||
}
|
||||
|
||||
OpenFiles {
|
||||
next: AtomicUsize::new(0),
|
||||
slots: Box::leak(slots),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Initialize the virtual file module. This must be called once at page
|
||||
/// server startup.
|
||||
///
|
||||
pub fn init(num_slots: usize) {
|
||||
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
|
||||
panic!("virtual_file::init called twice");
|
||||
}
|
||||
}
|
||||
|
||||
const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
|
||||
|
||||
// Get a handle to the global slots array.
|
||||
fn get_open_files() -> &'static OpenFiles {
|
||||
//
|
||||
// In unit tests, page server startup doesn't happen and no one calls
|
||||
// virtual_file::init(). Initialize it here, with a small array.
|
||||
//
|
||||
// This applies to the virtual file tests below, but all other unit
|
||||
// tests too, so the virtual file facility is always usable in
|
||||
// unit tests.
|
||||
//
|
||||
if cfg!(test) {
|
||||
OPEN_FILES.get_or_init(|| OpenFiles::new(TEST_MAX_FILE_DESCRIPTORS))
|
||||
} else {
|
||||
OPEN_FILES.get().expect("virtual_file::init not called yet")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use rand::Rng;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
// into a string
|
||||
fn read_string<FD>(vfile: &mut FD) -> Result<String, Error>
|
||||
where
|
||||
FD: Read,
|
||||
{
|
||||
let mut buf = String::new();
|
||||
vfile.read_to_string(&mut buf)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
// Helper function to slurp a portion of a file into a string
|
||||
fn read_string_at<FD>(vfile: &mut FD, pos: u64, len: usize) -> Result<String, Error>
|
||||
where
|
||||
FD: FileExt,
|
||||
{
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0);
|
||||
vfile.read_exact_at(&mut buf, pos)?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_virtual_files() -> Result<(), Error> {
|
||||
// The real work is done in the test_files() helper function. This
|
||||
// allows us to run the same set of tests against a native File, and
|
||||
// VirtualFile. We trust the native Files and wouldn't need to test them,
|
||||
// but this allows us to verify that the operations return the same
|
||||
// results with VirtualFiles as with native Files. (Except that with
|
||||
// native files, you will run out of file descriptors if the ulimit
|
||||
// is low enough.)
|
||||
test_files("virtual_files", |path, open_options| {
|
||||
VirtualFile::open_with_options(path, open_options)
|
||||
})
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_physical_files() -> Result<(), Error> {
|
||||
test_files("physical_files", |path, open_options| {
|
||||
open_options.open(path)
|
||||
})
|
||||
}
|
||||
|
||||
fn test_files<OF, FD>(testname: &str, openfunc: OF) -> Result<(), Error>
|
||||
where
|
||||
FD: Read + Write + Seek + FileExt,
|
||||
OF: Fn(&Path, &OpenOptions) -> Result<FD, std::io::Error>,
|
||||
{
|
||||
let testdir = crate::PageServerConf::test_repo_dir(testname);
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
let path_a = testdir.join("file_a");
|
||||
let mut file_a = openfunc(
|
||||
&path_a,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)?;
|
||||
file_a.write_all(b"foobar")?;
|
||||
|
||||
// cannot read from a file opened in write-only mode
|
||||
assert!(read_string(&mut file_a).is_err());
|
||||
|
||||
// Close the file and re-open for reading
|
||||
let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
assert!(file_a.write(b"bar").is_err());
|
||||
|
||||
// Try simple read
|
||||
assert_eq!("foobar", read_string(&mut file_a)?);
|
||||
|
||||
// It's positioned at the EOF now.
|
||||
assert_eq!("", read_string(&mut file_a)?);
|
||||
|
||||
// Test seeks.
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::End(-2))?, 4);
|
||||
assert_eq!("ar", read_string(&mut file_a)?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(2))?, 3);
|
||||
assert_eq!("bar", read_string(&mut file_a)?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(-5))?, 1);
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
// Test erroneous seeks to before byte 0
|
||||
assert!(file_a.seek(SeekFrom::End(-7)).is_err());
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert!(file_a.seek(SeekFrom::Current(-2)).is_err());
|
||||
|
||||
// the erroneous seek should have left the position unchanged
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
// Create another test file, and try FileExt functions on it.
|
||||
let path_b = testdir.join("file_b");
|
||||
let mut file_b = openfunc(
|
||||
&path_b,
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true),
|
||||
)?;
|
||||
file_b.write_all_at(b"BAR", 3)?;
|
||||
file_b.write_all_at(b"FOO", 0)?;
|
||||
|
||||
assert_eq!(read_string_at(&mut file_b, 2, 3)?, "OBA");
|
||||
|
||||
// Open a lot of files, enough to cause some evictions. (Or to be precise,
|
||||
// open the same file many times. The effect is the same.)
|
||||
//
|
||||
// leave file_a positioned at offset 1 before we start
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
|
||||
let mut vfiles = Vec::new();
|
||||
for _ in 0..100 {
|
||||
let mut vfile = openfunc(&path_b, OpenOptions::new().read(true))?;
|
||||
assert_eq!("FOOBAR", read_string(&mut vfile)?);
|
||||
vfiles.push(vfile);
|
||||
}
|
||||
|
||||
// make sure we opened enough files to definitely cause evictions.
|
||||
assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
|
||||
|
||||
// The underlying file descriptor for 'file_a' should be closed now. Try to read
|
||||
// from it again. We left the file positioned at offset 1 above.
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
// Check that all the other FDs still work too. Use them in random order for
|
||||
// good measure.
|
||||
vfiles.as_mut_slice().shuffle(&mut thread_rng());
|
||||
for vfile in vfiles.iter_mut() {
|
||||
assert_eq!("OOBAR", read_string_at(vfile, 1, 5)?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test using VirtualFiles from many threads concurrently. This tests both using
|
||||
/// a lot of VirtualFiles concurrently, causing evictions, and also using the same
|
||||
/// VirtualFile from multiple threads concurrently.
|
||||
#[test]
|
||||
fn test_vfile_concurrency() -> Result<(), Error> {
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const VIRTUAL_FILES: usize = 100;
|
||||
const THREADS: usize = 100;
|
||||
const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
|
||||
|
||||
let testdir = crate::PageServerConf::test_repo_dir("vfile_concurrency");
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
// Create a test file.
|
||||
let test_file_path = testdir.join("concurrency_test_file");
|
||||
{
|
||||
let file = File::create(&test_file_path)?;
|
||||
file.write_all_at(&SAMPLE, 0)?;
|
||||
}
|
||||
|
||||
// Open the file many times.
|
||||
let mut files = Vec::new();
|
||||
for _ in 0..VIRTUAL_FILES {
|
||||
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))?;
|
||||
files.push(f);
|
||||
}
|
||||
let files = Arc::new(files);
|
||||
|
||||
// Launch many threads, and use the virtual files concurrently in random order.
|
||||
let mut threads = Vec::new();
|
||||
for threadno in 0..THREADS {
|
||||
let builder =
|
||||
thread::Builder::new().name(format!("test_vfile_concurrency thread {}", threadno));
|
||||
|
||||
let files = files.clone();
|
||||
let thread = builder
|
||||
.spawn(move || {
|
||||
let mut buf = [0u8; SIZE];
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 1..1000 {
|
||||
let f = &files[rng.gen_range(0..files.len())];
|
||||
f.read_exact_at(&mut buf, 0).unwrap();
|
||||
assert!(buf == SAMPLE);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(thread);
|
||||
}
|
||||
|
||||
for thread in threads {
|
||||
thread.join().unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
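// Small usage sketch of the shared-reference read path described in the
// VirtualFile doc comment above. The function name and the 8 KB page size are
// illustrative assumptions; read_exact_at() is the FileExt provided method
// built on the read_at() implemented above.
#[allow(dead_code)]
fn example_read_page(file: &VirtualFile, blkno: u64) -> Result<[u8; 8192], std::io::Error> {
    let mut buf = [0u8; 8192];
    // read_exact_at() takes &self, so many threads can read through one
    // VirtualFile concurrently without holding a &mut reference.
    file.read_exact_at(&mut buf, blkno * 8192)?;
    Ok(buf)
}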
|
||||
@@ -8,13 +8,16 @@
|
||||
use crate::relish::*;
|
||||
use crate::restore_local_repo;
|
||||
use crate::tenant_mgr;
|
||||
use crate::waldecoder::*;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::tenant_threads;
|
||||
use crate::walrecord::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{bail, Error, Result};
|
||||
use anyhow::{bail, Context, Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
use postgres::replication::ReplicationIter;
|
||||
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::waldecoder::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
@@ -38,6 +41,7 @@ use zenith_utils::zid::ZTimelineId;
|
||||
struct WalReceiverEntry {
|
||||
wal_producer_connstr: String,
|
||||
wal_receiver_handle: Option<JoinHandle<()>>,
|
||||
tenantid: ZTenantId,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -65,6 +69,23 @@ pub fn stop_wal_receiver(timelineid: ZTimelineId) {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn drop_wal_receiver(timelineid: ZTimelineId, tenantid: ZTenantId) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
receivers.remove(&timelineid);
|
||||
|
||||
// Check if it was the last walreceiver of the tenant.
|
||||
// TODO now we store one WalReceiverEntry per timeline,
|
||||
// so this iterator looks a bit strange.
|
||||
for (_timelineid, entry) in receivers.iter() {
|
||||
if entry.tenantid == tenantid {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// When last walreceiver of the tenant is gone, change state to Idle
|
||||
tenant_mgr::set_tenant_state(tenantid, TenantState::Idle).unwrap();
|
||||
}
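// An equivalent, more direct form of the membership scan above, sketched as a
// hypothetical helper (the name is illustrative and nothing calls it; it only
// makes the "last walreceiver of the tenant" check explicit):
#[allow(dead_code)]
fn tenant_has_receivers(
    receivers: &std::collections::HashMap<ZTimelineId, WalReceiverEntry>,
    tenantid: ZTenantId,
) -> bool {
    receivers.values().any(|entry| entry.tenantid == tenantid)
}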
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's already running about a change in the connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -90,8 +111,13 @@ pub fn launch_wal_receiver(
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
wal_receiver_handle: Some(wal_receiver_handle),
|
||||
tenantid,
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Update tenant state and start tenant threads, if they are not running yet.
|
||||
tenant_mgr::set_tenant_state(tenantid, TenantState::Active).unwrap();
|
||||
tenant_threads::start_tenant_threads(conf, tenantid);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -114,11 +140,15 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
info!("WAL receiver thread started");
|
||||
|
||||
let mut retry_count = 10;
|
||||
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
// TODO How long should we retry in case of losing connection?
|
||||
// Should we retry at all or we can wait for the next callmemaybe request?
|
||||
//
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
while !tenant_mgr::shutdown_requested() && retry_count > 0 {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
@@ -129,10 +159,20 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
retry_count -= 1;
|
||||
sleep(Duration::from_secs(1));
|
||||
} else {
|
||||
info!(
|
||||
"walreceiver disconnected tenant {}, timelineid {}",
|
||||
tenantid, timelineid
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
debug!("WAL streaming shut down");
|
||||
info!("WAL streaming shut down");
|
||||
// Drop it from list of active WAL_RECEIVERS
|
||||
// so that the next callmemaybe request launches a new thread
|
||||
drop_wal_receiver(timelineid, tenantid);
|
||||
}
|
||||
|
||||
fn walreceiver_main(
|
||||
@@ -165,7 +205,13 @@ fn walreceiver_main(
|
||||
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
|
||||
let mut caught_up = false;
|
||||
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
let timeline =
|
||||
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid).with_context(|| {
|
||||
format!(
|
||||
"Can not start the walrecever for a remote tenant {}, timeline {}",
|
||||
tenantid, timelineid,
|
||||
)
|
||||
})?;
|
||||
|
||||
//
|
||||
// Start streaming the WAL, from where we left off previously.
|
||||
@@ -282,14 +328,26 @@ fn walreceiver_main(
|
||||
};
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(u64::from(last_lsn));
|
||||
|
||||
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
|
||||
let write_lsn = last_lsn;
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::from(0);
|
||||
// This value doesn't guarantee data durability, but it's ok.
|
||||
// In setup with WAL service, pageserver durability is guaranteed by safekeepers.
|
||||
// In setup without WAL service, we just don't care.
|
||||
let flush_lsn = write_lsn;
|
||||
// `disk_consistent_lsn` is the LSN at which page server guarantees persistence of all received data
|
||||
// Depending on the setup, we receive WAL directly from the Compute Node or
|
||||
// from a WAL service.
|
||||
//
|
||||
// Senders use the feedback to determine if we are caught up:
|
||||
// - Safekeepers are free to remove WAL preceding `apply_lsn`,
|
||||
// as it will never be requested by this page server.
|
||||
// - Compute Node uses 'apply_lsn' to calculate a lag for back pressure mechanism
|
||||
// (delay WAL inserts to avoid lagging pageserver responses and WAL overflow).
|
||||
let apply_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
|
||||
let ts = SystemTime::now();
|
||||
const NO_REPLY: u8 = 0;
|
||||
|
||||
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
}
|
||||
|
||||
@@ -298,6 +356,7 @@ fn walreceiver_main(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -1,220 +1,15 @@
|
||||
//!
|
||||
//! WAL decoder. For each WAL record, it decodes the record to figure out which data blocks
|
||||
//! the record affects, so that they can be stored in repository.
|
||||
//! Functions for parsing WAL records.
|
||||
//!
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::XLogLongPageHeaderData;
|
||||
use postgres_ffi::XLogPageHeaderData;
|
||||
use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD};
|
||||
use postgres_ffi::XLogRecord;
|
||||
use postgres_ffi::{BlockNumber, OffsetNumber};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
|
||||
use std::cmp::min;
|
||||
use thiserror::Error;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use tracing::*;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: Lsn,
|
||||
|
||||
startlsn: Lsn, // LSN where this record starts
|
||||
contlen: u32,
|
||||
padlen: u32,
|
||||
|
||||
inputbuf: BytesMut,
|
||||
|
||||
recordbuf: BytesMut,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug, Clone)]
|
||||
#[error("{msg} at {lsn}")]
|
||||
pub struct WalDecodeError {
|
||||
msg: String,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
//
|
||||
// WalRecordStream is a Stream that returns a stream of WAL records
|
||||
// FIXME: This isn't a proper rust stream
|
||||
//
|
||||
impl WalStreamDecoder {
|
||||
pub fn new(lsn: Lsn) -> WalStreamDecoder {
|
||||
WalStreamDecoder {
|
||||
lsn,
|
||||
|
||||
startlsn: Lsn(0),
|
||||
contlen: 0,
|
||||
padlen: 0,
|
||||
|
||||
inputbuf: BytesMut::new(),
|
||||
recordbuf: BytesMut::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// The latest LSN position fed to the decoder.
|
||||
pub fn available(&self) -> Lsn {
|
||||
self.lsn + self.inputbuf.remaining() as u64
|
||||
}
|
||||
|
||||
pub fn feed_bytes(&mut self, buf: &[u8]) {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
|
||||
/// Attempt to decode another WAL record from the input that has been fed to the
|
||||
/// decoder so far.
|
||||
///
|
||||
/// Returns one of the following:
|
||||
/// Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself
|
||||
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
|
||||
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
let recordbuf;
|
||||
|
||||
// Run state machine that validates page headers, and reassembles records
|
||||
// that cross page boundaries.
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.std.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog segment header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
|
||||
continue;
|
||||
} else if self.lsn.block_offset() == 0 {
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog page header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
|
||||
continue;
|
||||
} else if self.padlen > 0 {
|
||||
if self.inputbuf.remaining() < self.padlen as usize {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// skip padding
|
||||
self.inputbuf.advance(self.padlen as usize);
|
||||
self.lsn += self.padlen as u64;
|
||||
self.padlen = 0;
|
||||
} else if self.contlen == 0 {
|
||||
assert!(self.recordbuf.is_empty());
|
||||
|
||||
// need to have at least the xl_tot_len field
|
||||
if self.inputbuf.remaining() < 4 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// peek xl_tot_len at the beginning of the record.
|
||||
// FIXME: assumes little-endian
|
||||
self.startlsn = self.lsn;
|
||||
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
|
||||
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
|
||||
return Err(WalDecodeError {
|
||||
msg: format!("invalid xl_tot_len {}", xl_tot_len),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// Fast path for the common case that the whole record fits on the page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
|
||||
// Take the record from the 'inputbuf', and validate it.
|
||||
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
|
||||
self.lsn += xl_tot_len as u64;
|
||||
break;
|
||||
} else {
|
||||
// Need to assemble the record from pieces. Remember the size of the
|
||||
// record, and loop back. On next iteration, we will reach the 'else'
|
||||
// branch below, and copy the part of the record that was on this page
|
||||
// to 'recordbuf'. Subsequent iterations will skip page headers, and
|
||||
// append the continuations from the next pages to 'recordbuf'.
|
||||
self.recordbuf.reserve(xl_tot_len as usize);
|
||||
self.contlen = xl_tot_len;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// we're continuing a record, possibly from previous page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
|
||||
// read the rest of the record, or as much as fits on this page.
|
||||
let n = min(self.contlen, pageleft) as usize;
|
||||
|
||||
if self.inputbuf.remaining() < n {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
self.recordbuf.put(self.inputbuf.split_to(n));
|
||||
self.lsn += n as u64;
|
||||
self.contlen -= n as u32;
|
||||
|
||||
if self.contlen == 0 {
|
||||
// The record is now complete.
|
||||
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// We now have a record in the 'recordbuf' local variable.
|
||||
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
|
||||
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
|
||||
if crc != xlogrec.xl_crc {
|
||||
return Err(WalDecodeError {
|
||||
msg: "WAL record crc mismatch".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
}
|
||||
|
||||
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
|
||||
// and WalReceiver integration. Since this code is used both for WalReceiver and
|
||||
// initial WAL import let's force alignment right here.
|
||||
let result = (self.lsn.align(), recordbuf);
|
||||
Ok(Some(result))
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
/// DecodedBkpBlock represents per-page data contained in a WAL record.
|
||||
#[derive(Default)]
|
||||
pub struct DecodedBkpBlock {
|
||||
/* Is this block ref in use? */
|
||||
@@ -32,9 +32,6 @@ use std::os::unix::io::AsRawFd;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::mpsc;
|
||||
use std::sync::mpsc::{Receiver, Sender, SyncSender};
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
@@ -46,8 +43,8 @@ use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder::XlMultiXactCreate;
|
||||
use crate::waldecoder::XlXactParsedRecord;
|
||||
use crate::walrecord::XlMultiXactCreate;
|
||||
use crate::walrecord::XlXactParsedRecord;
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
|
||||
@@ -56,10 +53,6 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::XLogRecord;
|
||||
|
||||
const N_CHANNELS: usize = 16;
|
||||
const CHANNEL_SIZE: usize = 1024 * 1024;
|
||||
type ChannelId = usize;
|
||||
|
||||
///
|
||||
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
|
||||
///
|
||||
@@ -137,22 +130,16 @@ lazy_static! {
|
||||
|
||||
///
|
||||
/// This is the real implementation that uses a Postgres process to
|
||||
/// perform WAL replay. It multiplexes requests from multiple threads
|
||||
/// using the `sender` channel and sends them to the postgres wal-redo process
|
||||
/// pipe by a separate thread. Responses are returned through a set of `receivers`
|
||||
/// channels, used in a round-robin manner. Each receiver is protected by a mutex
|
||||
/// to prevent its use by more than one thread at a time.
|
||||
/// In the future, we might want to launch a pool of processes to allow concurrent
|
||||
/// replay of multiple records.
|
||||
/// perform WAL replay. Only one thread can use the process at a time,
|
||||
/// that is controlled by the Mutex. In the future, we might want to
|
||||
/// launch a pool of processes to allow concurrent replay of multiple
|
||||
/// records.
|
||||
///
|
||||
pub struct PostgresRedoManager {
|
||||
// multiplexer pipe: use sync_channel to allow sharing the sender among multiple threads
|
||||
// and limit size of buffer
|
||||
sender: SyncSender<(ChannelId, Vec<u8>)>,
|
||||
// set of receiver channels
|
||||
receivers: Vec<Mutex<Receiver<Bytes>>>,
|
||||
// atomically incremented counter for choosing the receiver
|
||||
round_robin: AtomicUsize,
|
||||
tenantid: ZTenantId,
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
process: Mutex<Option<PostgresRedoProcess>>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -202,6 +189,9 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let start_time;
|
||||
let end_time;
|
||||
|
||||
let request = WalRedoRequest {
|
||||
rel,
|
||||
blknum,
|
||||
@@ -209,14 +199,39 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
base_img,
|
||||
records,
|
||||
};
|
||||
let start_time = Instant::now();
|
||||
let result = if request.can_apply_in_zenith() {
|
||||
self.handle_apply_request_zenith(&request)
|
||||
|
||||
start_time = Instant::now();
|
||||
let result;
|
||||
|
||||
if request.can_apply_in_zenith() {
|
||||
result = self.handle_apply_request_zenith(&request);
|
||||
|
||||
end_time = Instant::now();
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
|
||||
} else {
|
||||
self.handle_apply_request_postgres(&request)
|
||||
};
|
||||
let end_time = Instant::now();
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
|
||||
let mut process_guard = self.process.lock().unwrap();
|
||||
let lock_time = Instant::now();
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
if process_guard.is_none() {
|
||||
let p = PostgresRedoProcess::launch(self.conf, &self.tenantid)?;
|
||||
*process_guard = Some(p);
|
||||
}
|
||||
let process = process_guard.as_mut().unwrap();
|
||||
|
||||
result = self.handle_apply_request_postgres(process, &request);
|
||||
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
end_time = Instant::now();
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
|
||||
|
||||
// If something went wrong, don't try to reuse the process. Kill it, and
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
let process = process_guard.take().unwrap();
|
||||
process.kill();
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
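The rework above replaces the old channel multiplexing with a single redo process behind a mutex: launch it lazily on first use, time the lock wait and the redo separately, and drop the process on error so the next request spawns a fresh one. Below is a minimal sketch of that pattern only; `Worker` and its methods are hypothetical stand-ins, not the real `PostgresRedoProcess` API.

use std::sync::Mutex;

// Hedged sketch of the lazy-launch / kill-on-error pattern described above.
struct Worker;

impl Worker {
    fn launch() -> std::io::Result<Worker> {
        Ok(Worker) // stand-in for spawning the child process
    }
    fn handle(&mut self, _request: &[u8]) -> std::io::Result<Vec<u8>> {
        Ok(Vec::new()) // stand-in for one request/response round trip
    }
    fn kill(self) {} // stand-in for killing the child process
}

struct Manager {
    worker: Mutex<Option<Worker>>,
}

impl Manager {
    fn request(&self, req: &[u8]) -> std::io::Result<Vec<u8>> {
        // Only one caller can use the worker at a time.
        let mut guard = self.worker.lock().unwrap();

        // Launch lazily on first use.
        if guard.is_none() {
            *guard = Some(Worker::launch()?);
        }

        let result = guard.as_mut().unwrap().handle(req);

        // Don't reuse the worker after a failure: kill it so that the
        // next request launches a new one.
        if result.is_err() {
            guard.take().unwrap().kill();
        }
        result
    }
}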
@@ -226,66 +241,13 @@ impl PostgresRedoManager {
|
||||
///
|
||||
/// Create a new PostgresRedoManager.
|
||||
///
|
||||
pub fn new(conf: &PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
|
||||
let (tx, rx): (
|
||||
SyncSender<(ChannelId, Vec<u8>)>,
|
||||
Receiver<(ChannelId, Vec<u8>)>,
|
||||
) = mpsc::sync_channel(CHANNEL_SIZE);
|
||||
let mut senders: Vec<Sender<Bytes>> = Vec::with_capacity(N_CHANNELS);
|
||||
let mut receivers: Vec<Mutex<Receiver<Bytes>>> = Vec::with_capacity(N_CHANNELS);
|
||||
for _ in 0..N_CHANNELS {
|
||||
let (tx, rx) = mpsc::channel();
|
||||
senders.push(tx);
|
||||
receivers.push(Mutex::new(rx));
|
||||
pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
|
||||
// The actual process is launched lazily, on first request.
|
||||
PostgresRedoManager {
|
||||
tenantid,
|
||||
conf,
|
||||
process: Mutex::new(None),
|
||||
}
|
||||
if let Ok(mut proc) = PostgresRedoProcess::launch(conf, &tenantid) {
|
||||
let _proxy = std::thread::spawn(move || loop {
|
||||
let (id, data) = rx.recv().unwrap();
|
||||
match proc.apply_wal_records(data) {
|
||||
Ok(page) => senders[id as usize].send(page).unwrap(),
|
||||
Err(err) => {
|
||||
info!("wal-redo failed with error {:?}", err);
|
||||
proc.kill();
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
PostgresRedoManager {
|
||||
sender: tx,
|
||||
receivers,
|
||||
round_robin: AtomicUsize::new(0),
|
||||
}
|
||||
} else {
|
||||
panic!("Failed to launch wal-redo postgres");
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_wal_records(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, WALRecord)],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
let mut writebuf: Vec<u8> = Vec::new();
|
||||
build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
build_push_page_msg(tag, &img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
build_apply_record_msg(*lsn, &rec.rec, &mut writebuf);
|
||||
}
|
||||
build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let id = self.round_robin.fetch_add(1, Ordering::Relaxed) % N_CHANNELS;
|
||||
let rx = self.receivers[id].lock().unwrap();
|
||||
self.sender.send((id, writebuf)).unwrap();
|
||||
Ok(rx.recv().unwrap())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -293,6 +255,7 @@ impl PostgresRedoManager {
|
||||
///
|
||||
fn handle_apply_request_postgres(
|
||||
&self,
|
||||
process: &mut PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let blknum = request.blknum;
|
||||
@@ -308,7 +271,7 @@ impl PostgresRedoManager {
|
||||
if let RelishTag::Relation(rel) = request.rel {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
apply_result = self.apply_wal_records(buf_tag, base_img, records);
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records);
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
@@ -594,7 +557,32 @@ impl PostgresRedoProcess {
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
fn apply_wal_records(&mut self, writebuf: Vec<u8>) -> Result<Bytes, std::io::Error> {
|
||||
fn apply_wal_records(
|
||||
&mut self,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, WALRecord)],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
let mut writebuf: Vec<u8> = Vec::new();
|
||||
build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
build_push_page_msg(tag, &img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
build_apply_record_msg(*lsn, &rec.rec, &mut writebuf);
|
||||
}
|
||||
build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
// The input is now in 'writebuf'. Do a blind write first, writing as much as
|
||||
// we can, before calling poll(). That skips one call to poll() if the stdin is
|
||||
// already available for writing, which it almost certainly is because the
|
||||
// process is idle.
|
||||
let mut nwrite = self.stdin.write(&writebuf)?;
|
||||
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
|
||||
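The "blind write first" comment above is a design note: try to push the whole request into the child's stdin before falling back to poll(2). A rough illustration of that idea on a non-blocking writer follows; it is only a sketch, not the code used here.

use std::io::{self, Write};

// Hedged sketch: write as much as the pipe accepts right now; the caller only
// needs to poll for writability if something is left over.
fn write_greedily<W: Write>(writer: &mut W, buf: &[u8]) -> io::Result<usize> {
    let mut nwritten = 0;
    while nwritten < buf.len() {
        match writer.write(&buf[nwritten..]) {
            Ok(0) => break,
            Ok(n) => nwritten += n,
            // The pipe is full; stop and let the caller wait for POLLOUT.
            Err(e) if e.kind() == io::ErrorKind::WouldBlock => break,
            Err(e) => return Err(e),
        }
    }
    Ok(nwritten)
}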
@@ -13,6 +13,7 @@ pub mod controlfile_utils;
|
||||
pub mod nonrelfile_utils;
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod waldecoder;
|
||||
pub mod xlog_utils;
|
||||
|
||||
// See TransactionIdIsNormal in transam.h
|
||||
|
||||
219
postgres_ffi/src/waldecoder.rs
Normal file
@@ -0,0 +1,219 @@
|
||||
//!
|
||||
//! Basic WAL stream decoding.
|
||||
//!
|
||||
//! This understands the WAL page and record format, enough to figure out where the WAL record
|
||||
//! boundaries are, and to reassemble WAL records that cross page boundaries.
|
||||
//!
|
||||
//! This functionality is needed by both the pageserver and the walkeepers. The pageserver needs
|
||||
//! to look deeper into the WAL records to also understand which blocks they modify, the code
|
||||
//! for that is in pageserver/src/walrecord.rs
|
||||
//!
|
||||
use super::pg_constants;
|
||||
use super::xlog_utils::*;
|
||||
use super::XLogLongPageHeaderData;
|
||||
use super::XLogPageHeaderData;
|
||||
use super::XLogRecord;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use thiserror::Error;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: Lsn,
|
||||
|
||||
startlsn: Lsn, // LSN where this record starts
|
||||
contlen: u32,
|
||||
padlen: u32,
|
||||
|
||||
inputbuf: BytesMut,
|
||||
|
||||
/// buffer used to reassemble records that cross page boundaries.
|
||||
recordbuf: BytesMut,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug, Clone)]
|
||||
#[error("{msg} at {lsn}")]
|
||||
pub struct WalDecodeError {
|
||||
msg: String,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
//
|
||||
// WalRecordStream is a Stream that returns a stream of WAL records
|
||||
// FIXME: This isn't a proper rust stream
|
||||
//
|
||||
impl WalStreamDecoder {
|
||||
pub fn new(lsn: Lsn) -> WalStreamDecoder {
|
||||
WalStreamDecoder {
|
||||
lsn,
|
||||
|
||||
startlsn: Lsn(0),
|
||||
contlen: 0,
|
||||
padlen: 0,
|
||||
|
||||
inputbuf: BytesMut::new(),
|
||||
recordbuf: BytesMut::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// The latest LSN position fed to the decoder.
|
||||
pub fn available(&self) -> Lsn {
|
||||
self.lsn + self.inputbuf.remaining() as u64
|
||||
}
|
||||
|
||||
pub fn feed_bytes(&mut self, buf: &[u8]) {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
|
||||
/// Attempt to decode another WAL record from the input that has been fed to the
|
||||
/// decoder so far.
|
||||
///
|
||||
/// Returns one of the following:
|
||||
/// Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself
|
||||
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
|
||||
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
let recordbuf;
|
||||
|
||||
// Run state machine that validates page headers, and reassembles records
|
||||
// that cross page boundaries.
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.std.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog segment header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
|
||||
continue;
|
||||
} else if self.lsn.block_offset() == 0 {
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
|
||||
|
||||
if hdr.xlp_pageaddr != self.lsn.0 {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog page header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
|
||||
continue;
|
||||
} else if self.padlen > 0 {
|
||||
if self.inputbuf.remaining() < self.padlen as usize {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// skip padding
|
||||
self.inputbuf.advance(self.padlen as usize);
|
||||
self.lsn += self.padlen as u64;
|
||||
self.padlen = 0;
|
||||
} else if self.contlen == 0 {
|
||||
assert!(self.recordbuf.is_empty());
|
||||
|
||||
// need to have at least the xl_tot_len field
|
||||
if self.inputbuf.remaining() < 4 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// peek xl_tot_len at the beginning of the record.
|
||||
// FIXME: assumes little-endian
|
||||
self.startlsn = self.lsn;
|
||||
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
|
||||
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
|
||||
return Err(WalDecodeError {
|
||||
msg: format!("invalid xl_tot_len {}", xl_tot_len),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// Fast path for the common case that the whole record fits on the page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
|
||||
// Take the record from the 'inputbuf', and validate it.
|
||||
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
|
||||
self.lsn += xl_tot_len as u64;
|
||||
break;
|
||||
} else {
|
||||
// Need to assemble the record from pieces. Remember the size of the
|
||||
// record, and loop back. On next iteration, we will reach the 'else'
|
||||
// branch below, and copy the part of the record that was on this page
|
||||
// to 'recordbuf'. Subsequent iterations will skip page headers, and
|
||||
// append the continuations from the next pages to 'recordbuf'.
|
||||
self.recordbuf.reserve(xl_tot_len as usize);
|
||||
self.contlen = xl_tot_len;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// we're continuing a record, possibly from previous page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
|
||||
// read the rest of the record, or as much as fits on this page.
|
||||
let n = min(self.contlen, pageleft) as usize;
|
||||
|
||||
if self.inputbuf.remaining() < n {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
self.recordbuf.put(self.inputbuf.split_to(n));
|
||||
self.lsn += n as u64;
|
||||
self.contlen -= n as u32;
|
||||
|
||||
if self.contlen == 0 {
|
||||
// The record is now complete.
|
||||
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// We now have a record in the 'recordbuf' local variable.
|
||||
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
|
||||
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
|
||||
if crc != xlogrec.xl_crc {
|
||||
return Err(WalDecodeError {
|
||||
msg: "WAL record crc mismatch".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
}
|
||||
|
||||
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
|
||||
// and WalReceiver integration. Since this code is used both for WalReceiver and
|
||||
// initial WAL import let's force alignment right here.
|
||||
let result = (self.lsn.align(), recordbuf);
|
||||
Ok(Some(result))
|
||||
}
|
||||
}
|
||||
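A short usage sketch of the `feed_bytes`/`poll_decode` contract documented above; the chunk of bytes here stands in for data arriving from the WAL stream, and the module path follows the `pub mod waldecoder` added to the postgres_ffi crate root in the lib.rs hunk above.

use bytes::Bytes;
use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder};
use zenith_utils::lsn::Lsn;

// Hedged sketch: pull all complete records out of one chunk of WAL bytes.
fn decode_chunk(start_lsn: Lsn, wal_bytes: &[u8]) -> Result<Vec<(Lsn, Bytes)>, WalDecodeError> {
    let mut decoder = WalStreamDecoder::new(start_lsn);
    decoder.feed_bytes(wal_bytes);

    let mut records = Vec::new();
    // Ok(None) means "need more input": stop here and feed the next chunk
    // into the same decoder later.
    while let Some((end_lsn, rec)) = decoder.poll_decode()? {
        records.push((end_lsn, rec));
    }
    Ok(records)
}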
@@ -43,6 +43,9 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
|
||||
#[allow(clippy::identity_op)]
|
||||
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
|
||||
|
||||
// PG timeline is always 1, changing it doesn't have useful meaning in Zenith.
|
||||
pub const PG_TLI: u32 = 1;
|
||||
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type TimeLineID = u32;
|
||||
pub type TimestampTz = i64;
|
||||
@@ -184,8 +187,13 @@ fn find_end_of_wal_segment(
|
||||
let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
|
||||
if xl_tot_len == 0 {
|
||||
info!(
|
||||
"find_end_of_wal_segment reached zeros at {:?}",
|
||||
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size))
|
||||
"find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}",
|
||||
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
|
||||
Lsn(XLogSegNoOffsetToRecPtr(
|
||||
segno,
|
||||
last_valid_rec_pos as u32,
|
||||
wal_seg_size
|
||||
))
|
||||
);
|
||||
break; // zeros, reached the end
|
||||
}
|
||||
@@ -300,12 +308,17 @@ pub fn find_end_of_wal(
|
||||
high_segno,
|
||||
);
|
||||
}
|
||||
let start_offset = if start_lsn.segment_number(wal_seg_size) == high_segno {
|
||||
start_lsn.segment_offset(wal_seg_size)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
high_offs = find_end_of_wal_segment(
|
||||
data_dir,
|
||||
high_segno,
|
||||
high_tli,
|
||||
wal_seg_size,
|
||||
start_lsn.segment_offset(wal_seg_size),
|
||||
start_offset,
|
||||
)?;
|
||||
}
|
||||
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
|
||||
@@ -421,7 +434,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
|
||||
XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: pg_constants::XLP_LONG_HEADER,
|
||||
xlp_tli: 1, // FIXME: always use Postgres timeline 1
|
||||
xlp_tli: PG_TLI,
|
||||
xlp_pageaddr: pageaddr,
|
||||
xlp_rem_len: 0,
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
|
||||
@@ -37,20 +37,27 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
|
||||
return cmd
|
||||
|
||||
|
||||
def yapf(fix_inplace: bool) -> str:
|
||||
cmd = "pipenv run yapf --recursive"
|
||||
if fix_inplace:
|
||||
cmd += " --in-place"
|
||||
else:
|
||||
cmd += " --diff"
|
||||
return cmd
|
||||
|
||||
|
||||
def mypy() -> str:
|
||||
return "pipenv run mypy"
|
||||
|
||||
|
||||
def get_commit_files() -> List[str]:
|
||||
files = subprocess.check_output(
|
||||
"git diff --cached --name-only --diff-filter=ACM".split()
|
||||
)
|
||||
files = subprocess.check_output("git diff --cached --name-only --diff-filter=ACM".split())
|
||||
return files.decode().splitlines()
|
||||
|
||||
|
||||
def check(
|
||||
name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False
|
||||
):
|
||||
def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False):
|
||||
print(f"Checking: {name} ", end="")
|
||||
applicable_files = list(
|
||||
filter(lambda fname: fname.strip().endswith(suffix), changed_files)
|
||||
)
|
||||
applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files))
|
||||
if not applicable_files:
|
||||
print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color))
|
||||
return
|
||||
@@ -59,7 +66,14 @@ def check(
|
||||
res = subprocess.run(cmd.split(), capture_output=True)
|
||||
if res.returncode != 0:
|
||||
print(colorify("[FAILED]", Color.RED, no_color))
|
||||
print("Please inspect the output below and run make fmt to fix automatically\n")
|
||||
if name == "mypy":
|
||||
print("Please inspect the output below and fix type mismatches.")
|
||||
else:
|
||||
print("Please inspect the output below and run make fmt to fix automatically.")
|
||||
if suffix == ".py":
|
||||
print("If the output is empty, ensure that you've installed Python tooling by\n"
|
||||
"running 'pipenv install --dev' in the current directory (no root needed)")
|
||||
print()
|
||||
print(res.stdout.decode())
|
||||
exit(1)
|
||||
|
||||
@@ -68,12 +82,11 @@ def check(
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--fix-inplace", action="store_true", help="apply fixes inplace"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-color", action="store_true", help="disable colored output", default=not sys.stdout.isatty()
|
||||
)
|
||||
parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace")
|
||||
parser.add_argument("--no-color",
|
||||
action="store_true",
|
||||
help="disable colored output",
|
||||
default=not sys.stdout.isatty())
|
||||
args = parser.parse_args()
|
||||
|
||||
files = get_commit_files()
|
||||
@@ -87,3 +100,17 @@ if __name__ == "__main__":
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
check(
|
||||
name="yapf",
|
||||
suffix=".py",
|
||||
cmd=yapf(fix_inplace=args.fix_inplace),
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
check(
|
||||
name="mypy",
|
||||
suffix=".py",
|
||||
cmd=mypy(),
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
|
||||
@@ -18,6 +18,6 @@ tokio = "1.11"
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
clap = "2.33.0"
|
||||
rustls = "0.19.1"
|
||||
reqwest = { version = "0.11", features = ["blocking", "json"] }
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
@@ -1,67 +1,139 @@
|
||||
use anyhow::{bail, Context, Result};
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::net::{SocketAddr, ToSocketAddrs};
|
||||
|
||||
pub struct CPlaneApi {
|
||||
auth_endpoint: &'static str,
|
||||
}
|
||||
use crate::state::ProxyWaiters;
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
pub dbname: String,
|
||||
pub user: String,
|
||||
pub password: String,
|
||||
pub password: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(untagged)]
|
||||
enum ProxyAuthResponse {
|
||||
Ready { conn_info: DatabaseInfo },
|
||||
Error { error: String },
|
||||
NotReady { ready: bool }, // TODO: get rid of `ready`
|
||||
}
|
||||
|
||||
impl DatabaseInfo {
|
||||
pub fn socket_addr(&self) -> Result<SocketAddr> {
|
||||
pub fn socket_addr(&self) -> anyhow::Result<SocketAddr> {
|
||||
let host_port = format!("{}:{}", self.host, self.port);
|
||||
host_port
|
||||
.to_socket_addrs()
|
||||
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
|
||||
}
|
||||
|
||||
pub fn conn_string(&self) -> String {
|
||||
format!(
|
||||
"dbname={} user={} password={}",
|
||||
self.dbname, self.user, self.password
|
||||
)
|
||||
.ok_or_else(|| anyhow!("cannot resolve at least one SocketAddr"))
|
||||
}
|
||||
}
|
||||
|
||||
impl CPlaneApi {
|
||||
pub fn new(auth_endpoint: &'static str) -> CPlaneApi {
|
||||
CPlaneApi { auth_endpoint }
|
||||
}
|
||||
impl From<DatabaseInfo> for tokio_postgres::Config {
|
||||
fn from(db_info: DatabaseInfo) -> Self {
|
||||
let mut config = tokio_postgres::Config::new();
|
||||
|
||||
config
|
||||
.host(&db_info.host)
|
||||
.port(db_info.port)
|
||||
.dbname(&db_info.dbname)
|
||||
.user(&db_info.user);
|
||||
|
||||
if let Some(password) = db_info.password {
|
||||
config.password(password);
|
||||
}
|
||||
|
||||
config
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CPlaneApi<'a> {
|
||||
auth_endpoint: &'a str,
|
||||
waiters: &'a ProxyWaiters,
|
||||
}
|
||||
|
||||
impl<'a> CPlaneApi<'a> {
|
||||
pub fn new(auth_endpoint: &'a str, waiters: &'a ProxyWaiters) -> Self {
|
||||
Self {
|
||||
auth_endpoint,
|
||||
waiters,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CPlaneApi<'_> {
|
||||
pub fn authenticate_proxy_request(
|
||||
&self,
|
||||
user: &str,
|
||||
database: &str,
|
||||
md5_response: &[u8],
|
||||
salt: &[u8; 4],
|
||||
) -> Result<DatabaseInfo> {
|
||||
psql_session_id: &str,
|
||||
) -> anyhow::Result<DatabaseInfo> {
|
||||
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
|
||||
url.query_pairs_mut()
|
||||
.append_pair("login", user)
|
||||
.append_pair("database", database)
|
||||
.append_pair("md5response", std::str::from_utf8(md5_response)?)
|
||||
.append_pair("salt", &hex::encode(salt));
|
||||
.append_pair("salt", &hex::encode(salt))
|
||||
.append_pair("psql_session_id", psql_session_id);
|
||||
|
||||
println!("cplane request: {}", url.as_str());
|
||||
let waiter = self.waiters.register(psql_session_id.to_owned());
|
||||
|
||||
println!("cplane request: {}", url);
|
||||
let resp = reqwest::blocking::get(url)?;
|
||||
if !resp.status().is_success() {
|
||||
bail!("Auth failed: {}", resp.status())
|
||||
}
|
||||
|
||||
if resp.status().is_success() {
|
||||
let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got conn info: #{:?}", conn_info);
|
||||
Ok(conn_info)
|
||||
} else {
|
||||
bail!("Auth failed")
|
||||
let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got auth info: #{:?}", auth_info);
|
||||
|
||||
use ProxyAuthResponse::*;
|
||||
match auth_info {
|
||||
Ready { conn_info } => Ok(conn_info),
|
||||
Error { error } => bail!(error),
|
||||
NotReady { .. } => waiter.wait()?.map_err(|e| anyhow!(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
fn test_proxy_auth_response() {
|
||||
// Ready
|
||||
let auth: ProxyAuthResponse = serde_json::from_value(json!({
|
||||
"ready": true,
|
||||
"conn_info": DatabaseInfo::default(),
|
||||
}))
|
||||
.unwrap();
|
||||
assert!(matches!(
|
||||
auth,
|
||||
ProxyAuthResponse::Ready {
|
||||
conn_info: DatabaseInfo { .. }
|
||||
}
|
||||
));
|
||||
|
||||
// Error
|
||||
let auth: ProxyAuthResponse = serde_json::from_value(json!({
|
||||
"ready": false,
|
||||
"error": "too bad, so sad",
|
||||
}))
|
||||
.unwrap();
|
||||
assert!(matches!(auth, ProxyAuthResponse::Error { .. }));
|
||||
|
||||
// NotReady
|
||||
let auth: ProxyAuthResponse = serde_json::from_value(json!({
|
||||
"ready": false,
|
||||
}))
|
||||
.unwrap();
|
||||
assert!(matches!(auth, ProxyAuthResponse::NotReady { .. }));
|
||||
}
|
||||
}
|
||||
|
||||
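The `NotReady` branch above parks the request on a per-session waiter that the mgmt handler resolves later via `waiters.notify` (see the mgmt.rs hunk further down). The actual `waiters` module is not included in this diff; the sketch below only illustrates that register/wait/notify shape, modeled on the `Mutex<HashMap<String, mpsc::Sender<..>>>` it replaces in main.rs, with invented names.

use std::collections::HashMap;
use std::sync::{mpsc, Mutex};

// Hedged sketch of a register/wait/notify table; illustrative only.
pub struct Waiters<T>(Mutex<HashMap<String, mpsc::Sender<T>>>);

pub struct Waiter<'a, T> {
    key: String,
    rx: mpsc::Receiver<T>,
    owner: &'a Waiters<T>,
}

impl<T> Waiters<T> {
    pub fn new() -> Self {
        Waiters(Mutex::new(HashMap::new()))
    }

    // Proxy thread: register before redirecting the user to the console.
    pub fn register(&self, key: String) -> Waiter<'_, T> {
        let (tx, rx) = mpsc::channel();
        self.0.lock().unwrap().insert(key.clone(), tx);
        Waiter { key, rx, owner: self }
    }

    // Mgmt thread: deliver the console's answer to the waiting session.
    pub fn notify(&self, key: &str, value: T) -> Result<(), String> {
        let guard = self.0.lock().unwrap();
        let tx = guard.get(key).ok_or("unknown psql_session_id")?;
        tx.send(value).map_err(|_| "waiter is gone".to_string())
    }
}

impl<T> Waiter<'_, T> {
    // Blocks until the mgmt side calls notify for this key.
    pub fn wait(self) -> Result<T, mpsc::RecvError> {
        let res = self.rx.recv();
        self.owner.0.lock().unwrap().remove(&self.key);
        res
    }
}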
@@ -5,78 +5,21 @@
|
||||
/// (control plane API in our case) and can create new databases and accounts
|
||||
/// in somewhat transparent manner (again via communication with control plane API).
|
||||
///
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::{SocketAddr, TcpListener},
|
||||
sync::{mpsc, Arc, Mutex},
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
|
||||
use cplane_api::DatabaseInfo;
|
||||
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
|
||||
use anyhow::bail;
|
||||
use clap::{App, Arg};
|
||||
use state::{ProxyConfig, ProxyState};
|
||||
use std::thread;
|
||||
use zenith_utils::{tcp_listener, GIT_VERSION};
|
||||
|
||||
mod cplane_api;
|
||||
mod mgmt;
|
||||
mod proxy;
|
||||
|
||||
pub struct ProxyConf {
|
||||
/// main entrypoint for users to connect to
|
||||
pub proxy_address: SocketAddr,
|
||||
|
||||
/// http management endpoint. Upon user account creation control plane
|
||||
/// will notify us here, so that we can 'unfreeze' user session.
|
||||
pub mgmt_address: SocketAddr,
|
||||
|
||||
/// send unauthenticated users to this URI
|
||||
pub redirect_uri: String,
|
||||
|
||||
/// control plane address where we would check auth.
|
||||
pub auth_endpoint: String,
|
||||
|
||||
pub ssl_config: Option<Arc<ServerConfig>>,
|
||||
}
|
||||
|
||||
pub struct ProxyState {
|
||||
pub conf: ProxyConf,
|
||||
pub waiters: Mutex<HashMap<String, mpsc::Sender<anyhow::Result<DatabaseInfo>>>>,
|
||||
}
|
||||
|
||||
fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result<Option<Arc<ServerConfig>>> {
|
||||
let (key_path, cert_path) = match (
|
||||
arg_matches.value_of("ssl-key"),
|
||||
arg_matches.value_of("ssl-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => (key_path, cert_path),
|
||||
(None, None) => return Ok(None),
|
||||
_ => bail!("either both or neither ssl-key and ssl-cert must be specified"),
|
||||
};
|
||||
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("SSL key file")?;
|
||||
let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS keys"))?;
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().unwrap()
|
||||
};
|
||||
|
||||
let cert_chain = {
|
||||
let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?;
|
||||
pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS certificates"))?
|
||||
};
|
||||
|
||||
let mut config = ServerConfig::new(NoClientAuth::new());
|
||||
config.set_single_cert(cert_chain, key)?;
|
||||
config.versions = vec![ProtocolVersion::TLSv1_3];
|
||||
|
||||
Ok(Some(Arc::new(config)))
|
||||
}
|
||||
mod state;
|
||||
mod waiters;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("Zenith proxy/router")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("proxy")
|
||||
.short("p")
|
||||
@@ -125,38 +68,47 @@ fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let conf = ProxyConf {
|
||||
let ssl_config = match (
|
||||
arg_matches.value_of("ssl-key"),
|
||||
arg_matches.value_of("ssl-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => {
|
||||
Some(crate::state::configure_ssl(key_path, cert_path)?)
|
||||
}
|
||||
(None, None) => None,
|
||||
_ => bail!("either both or neither ssl-key and ssl-cert must be specified"),
|
||||
};
|
||||
|
||||
let config = ProxyConfig {
|
||||
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
|
||||
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
|
||||
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
|
||||
auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
|
||||
ssl_config: configure_ssl(&arg_matches)?,
|
||||
ssl_config,
|
||||
};
|
||||
let state = ProxyState {
|
||||
conf,
|
||||
waiters: Mutex::new(HashMap::new()),
|
||||
};
|
||||
let state: &'static ProxyState = Box::leak(Box::new(state));
|
||||
let state: &ProxyState = Box::leak(Box::new(ProxyState::new(config)));
|
||||
|
||||
println!("Version: {}", GIT_VERSION);
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
println!("Starting proxy on {}", state.conf.proxy_address);
|
||||
let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;
|
||||
let pageserver_listener = tcp_listener::bind(state.conf.proxy_address)?;
|
||||
|
||||
println!("Starting mgmt on {}", state.conf.mgmt_address);
|
||||
let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
|
||||
let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?;
|
||||
|
||||
let threads = vec![
|
||||
let threads = [
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
thread::Builder::new()
|
||||
.name("Proxy thread".into())
|
||||
.name("Listener thread".into())
|
||||
.spawn(move || proxy::thread_main(state, pageserver_listener))?,
|
||||
thread::Builder::new()
|
||||
.name("Mgmt thread".into())
|
||||
.spawn(move || mgmt::thread_main(state, mgmt_listener))?,
|
||||
];
|
||||
|
||||
for t in threads.into_iter() {
|
||||
for t in threads {
|
||||
t.join().unwrap()?;
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::{
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use serde::Deserialize;
|
||||
use zenith_utils::{
|
||||
@@ -25,22 +24,23 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow:
|
||||
socket.set_nodelay(true).unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = mgmt_conn_main(state, socket) {
|
||||
if let Err(err) = handle_connection(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
fn handle_connection(state: &ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let mut conn_handler = MgmtHandler { state };
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
struct MgmtHandler {
|
||||
state: &'static ProxyState,
|
||||
struct MgmtHandler<'a> {
|
||||
state: &'a ProxyState,
|
||||
}
|
||||
|
||||
/// Serialized examples:
|
||||
// {
|
||||
// "session_id": "71d6d03e6d93d99a",
|
||||
@@ -64,18 +64,18 @@ struct MgmtHandler {
|
||||
// // to test manually by sending a query to mgmt interface:
|
||||
// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
|
||||
#[derive(Deserialize)]
|
||||
pub struct PsqlSessionResponse {
|
||||
struct PsqlSessionResponse {
|
||||
session_id: String,
|
||||
result: PsqlSessionResult,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub enum PsqlSessionResult {
|
||||
enum PsqlSessionResult {
|
||||
Success(DatabaseInfo),
|
||||
Failure(String),
|
||||
}
|
||||
|
||||
impl postgres_backend::Handler for MgmtHandler {
|
||||
impl postgres_backend::Handler for MgmtHandler<'_> {
|
||||
fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
@@ -96,32 +96,26 @@ fn try_process_query(
|
||||
query_string: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
let query_string = query_from_cstring(query_string);
|
||||
|
||||
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
|
||||
|
||||
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
|
||||
|
||||
let waiters = mgmt.state.waiters.lock().unwrap();
|
||||
|
||||
let sender = waiters
|
||||
.get(&resp.session_id)
|
||||
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
|
||||
|
||||
match resp.result {
|
||||
PsqlSessionResult::Success(db_info) => {
|
||||
sender.send(Ok(db_info))?;
|
||||
use PsqlSessionResult::*;
|
||||
let msg = match resp.result {
|
||||
Success(db_info) => Ok(db_info),
|
||||
Failure(message) => Err(message),
|
||||
};
|
||||
|
||||
match mgmt.state.waiters.notify(&resp.session_id, msg) {
|
||||
Ok(()) => {
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.flush()?;
|
||||
Ok(())
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
}
|
||||
|
||||
PsqlSessionResult::Failure(message) => {
|
||||
sender.send(Err(anyhow::Error::msg(message.clone())))?;
|
||||
|
||||
bail!("psql session request failed: {}", message)
|
||||
Err(e) => {
|
||||
pgb.write_message(&BeMessage::ErrorResponse(e.to_string()))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
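The comment block above gives a concrete serialized example and even a manual psql test for this mgmt payload. As a quick illustration (not part of the diff), that example deserializes into the externally tagged `Success` variant as follows; the test module name is invented.

#[cfg(test)]
mod mgmt_payload_sketch {
    use super::*;

    // Hedged sketch: parse the serialized example from the comment above.
    #[test]
    fn parse_success_payload() {
        let raw = r#"{"session_id":"4f10dde522e14739",
                      "result":{"Success":{"host":"127.0.0.1","port":5432,
                                           "dbname":"stas","user":"stas","password":"stas"}}}"#;
        let resp: PsqlSessionResponse = serde_json::from_str(raw).unwrap();
        assert_eq!(resp.session_id, "4f10dde522e14739");
        assert!(matches!(resp.result, PsqlSessionResult::Success(_)));
    }
}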
@@ -1,18 +1,12 @@
|
||||
use crate::cplane_api::CPlaneApi;
|
||||
use crate::cplane_api::DatabaseInfo;
|
||||
use crate::cplane_api::{CPlaneApi, DatabaseInfo};
|
||||
use crate::ProxyState;
|
||||
|
||||
use anyhow::bail;
|
||||
use anyhow::{anyhow, bail};
|
||||
use std::net::TcpStream;
|
||||
use std::{io, thread};
|
||||
use tokio_postgres::NoTls;
|
||||
|
||||
use rand::Rng;
|
||||
use std::io::Write;
|
||||
use std::{io, sync::mpsc::channel, thread};
|
||||
use zenith_utils::postgres_backend::Stream;
|
||||
use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
|
||||
use zenith_utils::pq_proto::*;
|
||||
use zenith_utils::postgres_backend::{self, PostgresBackend, ProtoState, Stream};
|
||||
use zenith_utils::pq_proto::{BeMessage as Be, FeMessage as Fe, *};
|
||||
use zenith_utils::sock_split::{ReadStream, WriteStream};
|
||||
use zenith_utils::{postgres_backend, pq_proto::BeMessage};
|
||||
|
||||
///
|
||||
/// Main proxy listener loop.
|
||||
@@ -28,273 +22,259 @@ pub fn thread_main(
|
||||
println!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = proxy_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
});
|
||||
thread::Builder::new()
|
||||
.name("Proxy thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = proxy_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
// XXX: clean up fields
|
||||
// TODO: clean up fields
|
||||
struct ProxyConnection {
|
||||
state: &'static ProxyState,
|
||||
|
||||
cplane: CPlaneApi,
|
||||
|
||||
user: String,
|
||||
database: String,
|
||||
|
||||
pgb: PostgresBackend,
|
||||
md5_salt: [u8; 4],
|
||||
|
||||
psql_session_id: String,
|
||||
pgb: PostgresBackend,
|
||||
}
|
||||
|
||||
pub fn proxy_conn_main(
|
||||
state: &'static ProxyState,
|
||||
socket: std::net::TcpStream,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut conn = ProxyConnection {
|
||||
pub fn proxy_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let conn = ProxyConnection {
|
||||
state,
|
||||
cplane: CPlaneApi::new(&state.conf.auth_endpoint),
|
||||
user: "".into(),
|
||||
database: "".into(),
|
||||
psql_session_id: hex::encode(rand::random::<[u8; 8]>()),
|
||||
pgb: PostgresBackend::new(
|
||||
socket,
|
||||
postgres_backend::AuthType::MD5,
|
||||
state.conf.ssl_config.clone(),
|
||||
false,
|
||||
)?,
|
||||
md5_salt: [0u8; 4],
|
||||
psql_session_id: "".into(),
|
||||
};
|
||||
|
||||
// Check StartupMessage
|
||||
// This will set conn.existing_user and we can decide on next actions
|
||||
conn.handle_startup()?;
|
||||
let (client, server) = conn.handle_client()?;
|
||||
|
||||
// both scenarios here should end up producing a database connection string
|
||||
let db_info = if conn.is_existing_user() {
|
||||
conn.handle_existing_user()?
|
||||
} else {
|
||||
conn.handle_new_user()?
|
||||
let server = zenith_utils::sock_split::BidiStream::from_tcp(server);
|
||||
|
||||
let client = match client {
|
||||
Stream::Bidirectional(bidi_stream) => bidi_stream,
|
||||
_ => panic!("invalid stream type"),
|
||||
};
|
||||
|
||||
// XXX: move that inside handle_new_user/handle_existing_user to be able to
|
||||
// report wrong connection error.
|
||||
proxy_pass(conn.pgb, db_info)
|
||||
proxy(client.split(), server.split())
|
||||
}
|
||||
|
||||
impl ProxyConnection {
|
||||
fn is_existing_user(&self) -> bool {
|
||||
self.user.ends_with("@zenith")
|
||||
fn handle_client(mut self) -> anyhow::Result<(Stream, TcpStream)> {
|
||||
let mut authenticate = || {
|
||||
let (username, dbname) = self.handle_startup()?;
|
||||
|
||||
// Both scenarios here should end up producing database credentials
|
||||
if username.ends_with("@zenith") {
|
||||
self.handle_existing_user(&username, &dbname)
|
||||
} else {
|
||||
self.handle_new_user()
|
||||
}
|
||||
};
|
||||
|
||||
let conn = match authenticate() {
|
||||
Ok(db_info) => connect_to_db(db_info),
|
||||
Err(e) => {
|
||||
// Report the error to the client
|
||||
self.pgb.write_message(&Be::ErrorResponse(e.to_string()))?;
|
||||
bail!("failed to handle client: {:?}", e);
|
||||
}
|
||||
};
|
||||
|
||||
// We'll get rid of this once migration to async is complete
|
||||
let (pg_version, db_stream) = {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let (pg_version, stream) = runtime.block_on(conn)?;
|
||||
let stream = stream.into_std()?;
|
||||
stream.set_nonblocking(false)?;
|
||||
|
||||
(pg_version, stream)
|
||||
};
|
||||
|
||||
// Let the client send new requests
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus(
|
||||
BeParameterStatusMessage::ServerVersion(&pg_version),
|
||||
))?
|
||||
.write_message(&Be::ReadyForQuery)?;
|
||||
|
||||
Ok((self.pgb.into_stream(), db_stream))
|
||||
}
|
||||
|
||||
fn handle_startup(&mut self) -> anyhow::Result<()> {
|
||||
fn handle_startup(&mut self) -> anyhow::Result<(String, String)> {
|
||||
let have_tls = self.pgb.tls_config.is_some();
|
||||
let mut encrypted = false;
|
||||
|
||||
loop {
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
match msg {
|
||||
Some(FeMessage::StartupMessage(m)) => {
|
||||
println!("got startup message {:?}", m);
|
||||
let mut msg = match self.pgb.read_message()? {
|
||||
Some(Fe::StartupMessage(msg)) => msg,
|
||||
None => bail!("connection is lost"),
|
||||
bad => bail!("unexpected message type: {:?}", bad),
|
||||
};
|
||||
println!("got message: {:?}", msg);
|
||||
|
||||
match m.kind {
|
||||
StartupRequestCode::NegotiateGss => {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::EncryptionResponse(false))?;
|
||||
}
|
||||
StartupRequestCode::NegotiateSsl => {
|
||||
println!("SSL requested");
|
||||
if self.pgb.tls_config.is_some() {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::EncryptionResponse(true))?;
|
||||
self.pgb.start_tls()?;
|
||||
encrypted = true;
|
||||
} else {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::EncryptionResponse(false))?;
|
||||
}
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
if self.state.conf.ssl_config.is_some() && !encrypted {
|
||||
self.pgb.write_message(&BeMessage::ErrorResponse(
|
||||
"must connect with TLS".to_string(),
|
||||
))?;
|
||||
bail!("client did not connect with TLS");
|
||||
}
|
||||
self.user = m
|
||||
.params
|
||||
.get("user")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("user is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
self.database = m
|
||||
.params
|
||||
.get("database")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("database is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
|
||||
break;
|
||||
}
|
||||
StartupRequestCode::Cancel => break,
|
||||
match msg.kind {
|
||||
StartupRequestCode::NegotiateGss => {
|
||||
self.pgb.write_message(&Be::EncryptionResponse(false))?;
|
||||
}
|
||||
StartupRequestCode::NegotiateSsl => {
|
||||
self.pgb.write_message(&Be::EncryptionResponse(have_tls))?;
|
||||
if have_tls {
|
||||
self.pgb.start_tls()?;
|
||||
encrypted = true;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
bail!("connection closed")
|
||||
}
|
||||
unexpected => {
|
||||
bail!("unexpected message type : {:?}", unexpected)
|
||||
StartupRequestCode::Normal => {
|
||||
if have_tls && !encrypted {
|
||||
bail!("must connect with TLS");
|
||||
}
|
||||
|
||||
let mut get_param = |key| {
|
||||
msg.params
|
||||
.remove(key)
|
||||
.ok_or_else(|| anyhow!("{} is missing in startup packet", key))
|
||||
};
|
||||
|
||||
return Ok((get_param("user")?, get_param("database")?));
|
||||
}
|
||||
// TODO: implement proper stmt cancellation
|
||||
StartupRequestCode::Cancel => bail!("query cancellation is not supported"),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
// ask password
|
||||
rand::thread_rng().fill(&mut self.md5_salt);
|
||||
fn handle_existing_user(&mut self, user: &str, db: &str) -> anyhow::Result<DatabaseInfo> {
|
||||
let md5_salt = rand::random::<[u8; 4]>();
|
||||
|
||||
// Ask password
|
||||
self.pgb
|
||||
.write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
|
||||
.write_message(&Be::AuthenticationMD5Password(&md5_salt))?;
|
||||
self.pgb.state = ProtoState::Authentication; // XXX
|
||||
|
||||
// check password
|
||||
println!("handle_existing_user");
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
if let Some(FeMessage::PasswordMessage(m)) = msg {
|
||||
println!("got password message '{:?}'", m);
|
||||
// Check password
|
||||
let msg = match self.pgb.read_message()? {
|
||||
Some(Fe::PasswordMessage(msg)) => msg,
|
||||
None => bail!("connection is lost"),
|
||||
bad => bail!("unexpected message type: {:?}", bad),
|
||||
};
|
||||
println!("got message: {:?}", msg);
|
||||
|
||||
assert!(self.is_existing_user());
|
||||
let (_trailing_null, md5_response) = msg
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow!("unexpected password message"))?;
|
||||
|
||||
let (_trailing_null, md5_response) = m
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
|
||||
let cplane = CPlaneApi::new(&self.state.conf.auth_endpoint, &self.state.waiters);
|
||||
let db_info = cplane.authenticate_proxy_request(
|
||||
user,
|
||||
db,
|
||||
md5_response,
|
||||
&md5_salt,
|
||||
&self.psql_session_id,
|
||||
)?;
|
||||
|
||||
match self.cplane.authenticate_proxy_request(
|
||||
self.user.as_str(),
|
||||
self.database.as_str(),
|
||||
md5_response,
|
||||
&self.md5_salt,
|
||||
) {
|
||||
Err(e) => {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
|
||||
self.pgb
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
|
||||
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
Ok(conn_info) => {
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(conn_info)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bail!("protocol violation");
|
||||
}
|
||||
Ok(db_info)
|
||||
}
|
||||
|
||||
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
let mut psql_session_id_buf = [0u8; 8];
|
||||
rand::thread_rng().fill(&mut psql_session_id_buf);
|
||||
self.psql_session_id = hex::encode(psql_session_id_buf);
|
||||
let greeting = hello_message(&self.state.conf.redirect_uri, &self.psql_session_id);
|
||||
|
||||
let hello_message = format!("☀️ Welcome to Zenith!
|
||||
|
||||
To proceed with database creation, open the following link:
|
||||
|
||||
{redirect_uri}{sess_id}
|
||||
|
||||
It needs to be done once and we will send you '.pgpass' file, which will allow you to access or create
|
||||
databases without opening the browser.
|
||||
|
||||
", redirect_uri = self.state.conf.redirect_uri, sess_id = self.psql_session_id);
|
||||
// First, register this session
|
||||
let waiter = self.state.waiters.register(self.psql_session_id.clone());
|
||||
|
||||
// Give user a URL to spawn a new database
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb
|
||||
.write_message(&BeMessage::NoticeResponse(hello_message))?;
|
||||
|
||||
// await for database creation
|
||||
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
|
||||
let _ = self
|
||||
.state
|
||||
.waiters
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(self.psql_session_id.clone(), tx);
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&BeParameterStatusMessage::encoding())?
|
||||
.write_message(&Be::NoticeResponse(greeting))?;
|
||||
|
||||
// Wait for web console response
|
||||
// XXX: respond with error to client
|
||||
let dbinfo = rx.recv()??;
|
||||
let db_info = waiter.wait()?.map_err(|e| anyhow!(e))?;
|
||||
|
||||
self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
|
||||
"Connecting to database.".to_string(),
|
||||
))?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?;
|
||||
|
||||
Ok(dbinfo)
|
||||
Ok(db_info)
|
||||
}
|
||||
}
|
||||
|
||||
fn hello_message(redirect_uri: &str, session_id: &str) -> String {
|
||||
format!(
|
||||
concat![
|
||||
"☀️ Welcome to Zenith!\n",
|
||||
"To proceed with database creation, open the following link:\n\n",
|
||||
" {redirect_uri}{session_id}\n\n",
|
||||
"It needs to be done once and we will send you '.pgpass' file,\n",
|
||||
"which will allow you to access or create ",
|
||||
"databases without opening your web browser."
|
||||
],
|
||||
redirect_uri = redirect_uri,
|
||||
session_id = session_id,
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
|
||||
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
|
||||
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<(String, tokio::net::TcpStream)> {
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
|
||||
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
|
||||
let _ = config.connect_raw(&mut socket, NoTls).await?;
|
||||
Ok(socket)
|
||||
let config = tokio_postgres::Config::from(db_info);
|
||||
let (client, conn) = config.connect_raw(&mut socket, NoTls).await?;
|
||||
|
||||
let query = client.query_one("select current_setting('server_version')", &[]);
|
||||
|
||||
tokio::pin!(query, conn);
|
||||
|
||||
let version = tokio::select!(
|
||||
x = query => x?.try_get(0)?,
|
||||
_ = conn => bail!("connection closed too early"),
|
||||
);
|
||||
|
||||
Ok((version, socket))
|
||||
}
|
||||
|
||||
/// Concurrently proxy both directions of the client and server connections
|
||||
fn proxy(
|
||||
client_read: ReadStream,
|
||||
client_write: WriteStream,
|
||||
server_read: ReadStream,
|
||||
server_write: WriteStream,
|
||||
(client_read, client_write): (ReadStream, WriteStream),
|
||||
(server_read, server_write): (ReadStream, WriteStream),
|
||||
) -> anyhow::Result<()> {
|
||||
fn do_proxy(mut reader: ReadStream, mut writer: WriteStream) -> io::Result<()> {
|
||||
std::io::copy(&mut reader, &mut writer)?;
|
||||
writer.flush()?;
|
||||
writer.shutdown(std::net::Shutdown::Both)
|
||||
fn do_proxy(mut reader: impl io::Read, mut writer: WriteStream) -> io::Result<u64> {
|
||||
/// FlushWriter will make sure that every message is sent as soon as possible
|
||||
struct FlushWriter<W>(W);
|
||||
|
||||
impl<W: io::Write> io::Write for FlushWriter<W> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
// `std::io::copy` is guaranteed to exit if we return an error,
|
||||
// so we can afford to lose `res` in case `flush` fails
|
||||
let res = self.0.write(buf);
|
||||
if res.is_ok() {
|
||||
self.flush()?;
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.0.flush()
|
||||
}
|
||||
}
|
||||
|
||||
let res = std::io::copy(&mut reader, &mut FlushWriter(&mut writer));
|
||||
writer.shutdown(std::net::Shutdown::Both)?;
|
||||
res
|
||||
}
|
||||
|
||||
let client_to_server_jh = thread::spawn(move || do_proxy(client_read, server_write));
|
||||
|
||||
let res1 = do_proxy(server_read, client_write);
|
||||
let res2 = client_to_server_jh.join().unwrap();
|
||||
res1?;
|
||||
res2?;
|
||||
do_proxy(server_read, client_write)?;
|
||||
client_to_server_jh.join().unwrap()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Proxy a client connection to a postgres database
|
||||
fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let db_stream = runtime.block_on(connect_to_db(db_info))?;
|
||||
let db_stream = db_stream.into_std()?;
|
||||
db_stream.set_nonblocking(false)?;
|
||||
|
||||
let db_stream = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
|
||||
let (db_read, db_write) = db_stream.split();
|
||||
|
||||
let stream = match pgb.into_stream() {
|
||||
Stream::Bidirectional(bidi_stream) => bidi_stream,
|
||||
_ => bail!("invalid stream"),
|
||||
};
|
||||
|
||||
let (client_read, client_write) = stream.split();
|
||||
proxy(client_read, client_write, db_read, db_write)
|
||||
}
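The `proxy`/`do_proxy` hunks above interleave the old and new variants of the same forwarding loop, so the control flow is hard to follow in this view. As a minimal sketch of the pattern both variants implement, here is the same idea written against plain std::net::TcpStream; the helper names `pump` and `proxy_pair` are illustrative and not taken from the repository, while `FlushWriter` mirrors the struct shown in the diff. Each direction is a blocking std::io::copy through a writer that flushes after every write, with one direction running on a spawned thread.

use std::io::{self, Read, Write};
use std::net::{Shutdown, TcpStream};
use std::thread;

/// Mirrors the FlushWriter in the diff: flush after every write so that each
/// message is forwarded to the peer as soon as possible.
struct FlushWriter<W: Write>(W);

impl<W: Write> Write for FlushWriter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let n = self.0.write(buf)?;
        self.0.flush()?;
        Ok(n)
    }

    fn flush(&mut self) -> io::Result<()> {
        self.0.flush()
    }
}

// Hypothetical helper: copy one direction, then shut the socket down so the
// other side sees EOF.
fn pump(mut reader: impl Read, writer: TcpStream) -> io::Result<u64> {
    let copied = io::copy(&mut reader, &mut FlushWriter(&writer));
    writer.shutdown(Shutdown::Both)?;
    copied
}

// Hypothetical helper: one direction on a spawned thread, the other on the
// current thread, mirroring the structure of `proxy` above.
fn proxy_pair(client: TcpStream, server: TcpStream) -> io::Result<()> {
    let (client_r, server_w) = (client.try_clone()?, server.try_clone()?);
    let client_to_server = thread::spawn(move || pump(client_r, server_w));
    pump(server, client)?;
    client_to_server.join().expect("proxy thread panicked")?;
    Ok(())
}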
|
||||
|
||||
62  proxy/src/state.rs  Normal file
@@ -0,0 +1,62 @@
use crate::cplane_api::DatabaseInfo;
use anyhow::{anyhow, ensure, Context};
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
use std::net::SocketAddr;
use std::sync::Arc;

pub type SslConfig = Arc<ServerConfig>;

pub struct ProxyConfig {
    /// main entrypoint for users to connect to
    pub proxy_address: SocketAddr,

    /// http management endpoint. Upon user account creation control plane
    /// will notify us here, so that we can 'unfreeze' user session.
    pub mgmt_address: SocketAddr,

    /// send unauthenticated users to this URI
    pub redirect_uri: String,

    /// control plane address where we would check auth.
    pub auth_endpoint: String,

    pub ssl_config: Option<SslConfig>,
}

pub type ProxyWaiters = crate::waiters::Waiters<Result<DatabaseInfo, String>>;

pub struct ProxyState {
    pub conf: ProxyConfig,
    pub waiters: ProxyWaiters,
}

impl ProxyState {
    pub fn new(conf: ProxyConfig) -> Self {
        Self {
            conf,
            waiters: ProxyWaiters::default(),
        }
    }
}

pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result<SslConfig> {
    let key = {
        let key_bytes = std::fs::read(key_path).context("SSL key file")?;
        let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
            .map_err(|_| anyhow!("couldn't read TLS keys"))?;
        ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
        keys.pop().unwrap()
    };

    let cert_chain = {
        let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?;
        pemfile::certs(&mut &cert_chain_bytes[..])
            .map_err(|_| anyhow!("couldn't read TLS certificates"))?
    };

    let mut config = ServerConfig::new(NoClientAuth::new());
    config.set_single_cert(cert_chain, key)?;
    config.versions = vec![ProtocolVersion::TLSv1_3];

    Ok(config.into())
}
|
||||
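For orientation, a sketch of how the pieces in state.rs could be wired together at startup. The addresses, URLs, and file paths below are placeholders, and the surrounding function is an assumption rather than code from the repository; it only exercises ProxyConfig, ProxyState::new, and configure_ssl as declared above.

fn build_state() -> anyhow::Result<ProxyState> {
    let conf = ProxyConfig {
        // Placeholder addresses and endpoints, not the real defaults.
        proxy_address: "127.0.0.1:4432".parse()?,
        mgmt_address: "127.0.0.1:7000".parse()?,
        redirect_uri: "https://console.example.com/psql_session/".into(),
        auth_endpoint: "https://console.example.com/authenticate_proxy_request".into(),
        // TLS is optional; enable it only when key/cert files are configured.
        ssl_config: Some(configure_ssl("server.key", "server.crt")?),
    };
    Ok(ProxyState::new(conf))
}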
58  proxy/src/waiters.rs  Normal file
@@ -0,0 +1,58 @@
use anyhow::{anyhow, Context};
use std::collections::HashMap;
use std::sync::{mpsc, Mutex};

pub struct Waiters<T>(pub(self) Mutex<HashMap<String, mpsc::Sender<T>>>);

impl<T> Default for Waiters<T> {
    fn default() -> Self {
        Waiters(Default::default())
    }
}

impl<T> Waiters<T> {
    pub fn register(&self, key: String) -> Waiter<T> {
        let (tx, rx) = mpsc::channel();

        // TODO: use `try_insert` (unstable)
        let prev = self.0.lock().unwrap().insert(key.clone(), tx);
        assert!(matches!(prev, None)); // assert_matches! is nightly-only

        Waiter {
            receiver: rx,
            registry: self,
            key,
        }
    }

    pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()>
    where
        T: Send + Sync + 'static,
    {
        let tx = self
            .0
            .lock()
            .unwrap()
            .remove(key)
            .ok_or_else(|| anyhow!("key {} not found", key))?;
        tx.send(value).context("channel hangup")
    }
}

pub struct Waiter<'a, T> {
    receiver: mpsc::Receiver<T>,
    registry: &'a Waiters<T>,
    key: String,
}

impl<T> Waiter<'_, T> {
    pub fn wait(self) -> anyhow::Result<T> {
        self.receiver.recv().context("channel hangup")
    }
}

impl<T> Drop for Waiter<'_, T> {
    fn drop(&mut self) {
        self.registry.0.lock().unwrap().remove(&self.key);
    }
}
|
||||
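A hedged usage sketch of the Waiters/Waiter registry above, using only the API shown in the diff: the connection handler registers under a session id and blocks in wait(), while the management endpoint later calls notify() with the result for that session. Here both steps run on one thread for brevity, which works because the value is buffered in the underlying mpsc channel; the session id and payload are made up for the example.

fn waiters_example() -> anyhow::Result<()> {
    let waiters: Waiters<Result<String, String>> = Waiters::default();

    // "Connection handler" side: register first, so a later notify() finds the key.
    let waiter = waiters.register("session-42".into());

    // "Mgmt endpoint" side (normally another thread handling the console callback):
    // deliver the result for that session and consume the registration.
    waiters.notify("session-42", Ok("database info".into()))?;

    // Blocks until notify() fires; a dropped sender turns into an error.
    let db_info = waiter.wait()?.map_err(anyhow::Error::msg)?;
    println!("got: {}", db_info);
    Ok(())
}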
@@ -1,4 +1,9 @@
[pytest]
addopts =
    -m 'not remote_cluster'
markers =
    remote_cluster
minversion = 6.0
log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
log_date_format = %Y-%m-%d %H:%M:%S
log_cli = true
|
||||
510  scripts/coverage  Executable file
@@ -0,0 +1,510 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Here's a good link in case you're interested in learning more
|
||||
# about current deficiencies of rust code coverage story:
|
||||
# https://github.com/rust-lang/rust/issues?q=is%3Aissue+is%3Aopen+instrument-coverage+label%3AA-code-coverage
|
||||
#
|
||||
# Also a couple of inspirational tools which I deliberately ended up not using:
|
||||
# * https://github.com/mozilla/grcov
|
||||
# * https://github.com/taiki-e/cargo-llvm-cov
|
||||
# * https://github.com/llvm/llvm-project/tree/main/llvm/test/tools/llvm-cov
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from textwrap import dedent
|
||||
from typing import Any, Iterable, List, Optional
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def intersperse(sep: Any, iterable: Iterable[Any]):
|
||||
fst = True
|
||||
for item in iterable:
|
||||
if not fst:
|
||||
yield sep
|
||||
fst = False
|
||||
yield item
|
||||
|
||||
|
||||
def find_demangler(demangler=None):
|
||||
known_tools = ['c++filt', 'rustfilt', 'llvm-cxxfilt']
|
||||
|
||||
if demangler:
|
||||
# Explicit argument has precedence over `known_tools`
|
||||
demanglers = [demangler]
|
||||
else:
|
||||
demanglers = known_tools
|
||||
|
||||
for demangler in demanglers:
|
||||
if shutil.which(demangler):
|
||||
return demangler
|
||||
|
||||
raise Exception(' '.join([
|
||||
'Failed to find symbol demangler.',
|
||||
'Please install it or provide another tool',
|
||||
f"(e.g. {', '.join(known_tools)})",
|
||||
]))
|
||||
|
||||
|
||||
class Cargo:
|
||||
def __init__(self, cwd: Path):
|
||||
self.cwd = cwd
|
||||
self.target_dir = Path(os.environ.get('CARGO_TARGET_DIR', cwd / 'target')).resolve()
|
||||
self._rustlib_dir = None
|
||||
|
||||
@property
|
||||
def rustlib_dir(self):
|
||||
if not self._rustlib_dir:
|
||||
cmd = [
|
||||
'cargo',
|
||||
'-Zunstable-options',
|
||||
'rustc',
|
||||
'--print=target-libdir',
|
||||
]
|
||||
self._rustlib_dir = Path(subprocess.check_output(cmd, cwd=self.cwd, text=True)).parent
|
||||
|
||||
return self._rustlib_dir
|
||||
|
||||
def binaries(self, profile: str) -> List[str]:
|
||||
executables = []
|
||||
|
||||
# This will emit json messages containing test binaries names
|
||||
cmd = [
|
||||
'cargo',
|
||||
'test',
|
||||
'--no-run',
|
||||
'--message-format=json',
|
||||
]
|
||||
env = dict(os.environ, PROFILE=profile)
|
||||
output = subprocess.check_output(cmd, cwd=self.cwd, env=env, text=True)
|
||||
|
||||
for line in output.splitlines(keepends=False):
|
||||
meta = json.loads(line)
|
||||
exe = meta.get('executable')
|
||||
if exe:
|
||||
executables.append(exe)
|
||||
|
||||
# Metadata contains crate names, which can be used
|
||||
# to recover names of executables, e.g. `pageserver`
|
||||
cmd = [
|
||||
'cargo',
|
||||
'metadata',
|
||||
'--format-version=1',
|
||||
'--no-deps',
|
||||
]
|
||||
meta = json.loads(subprocess.check_output(cmd, cwd=self.cwd))
|
||||
|
||||
for pkg in meta.get('packages', []):
|
||||
for target in pkg.get('targets', []):
|
||||
if 'bin' in target['kind']:
|
||||
exe = self.target_dir / profile / target['name']
|
||||
if exe.exists():
|
||||
executables.append(str(exe))
|
||||
|
||||
return executables
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLVM:
|
||||
cargo: Cargo
|
||||
|
||||
def resolve_tool(self, name: str) -> str:
|
||||
exe = self.cargo.rustlib_dir / 'bin' / name
|
||||
if exe.exists():
|
||||
return str(exe)
|
||||
|
||||
if not shutil.which(name):
|
||||
# Show a user-friendly warning
|
||||
raise Exception(' '.join([
|
||||
f"It appears that you don't have `{name}` installed.",
|
||||
"Please execute `rustup component add llvm-tools-preview`,",
|
||||
"or install it via your package manager of choice.",
|
||||
"LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
|
||||
]))
|
||||
|
||||
return name
|
||||
|
||||
def profdata(self, input_dir: Path, output_profdata: Path):
|
||||
profraws = [f for f in input_dir.iterdir() if f.suffix == '.profraw']
|
||||
if not profraws:
|
||||
raise Exception(f'No profraw files found at {input_dir}')
|
||||
|
||||
with open(input_dir / 'profraw.list', 'w') as input_files:
|
||||
profraw_mtime = 0
|
||||
for profraw in profraws:
|
||||
profraw_mtime = max(profraw_mtime, profraw.stat().st_mtime_ns)
|
||||
print(profraw, file=input_files)
|
||||
input_files.flush()
|
||||
|
||||
try:
|
||||
profdata_mtime = output_profdata.stat().st_mtime_ns
|
||||
except FileNotFoundError:
|
||||
profdata_mtime = 0
|
||||
|
||||
# An obvious make-ish optimization
|
||||
if profraw_mtime >= profdata_mtime:
|
||||
subprocess.check_call([
|
||||
self.resolve_tool('llvm-profdata'),
|
||||
'merge',
|
||||
'-sparse',
|
||||
f'-input-files={input_files.name}',
|
||||
f'-output={output_profdata}',
|
||||
])
|
||||
|
||||
def _cov(self,
|
||||
*extras,
|
||||
subcommand: str,
|
||||
profdata: Path,
|
||||
objects: List[str],
|
||||
sources: List[str],
|
||||
demangler: Optional[str] = None) -> None:
|
||||
|
||||
cwd = self.cargo.cwd
|
||||
objects = list(intersperse('-object', objects))
|
||||
extras = list(extras)
|
||||
|
||||
# For some reason `rustc` produces relative paths to src files,
|
||||
# so we force it to cut the $PWD prefix.
|
||||
# see: https://github.com/rust-lang/rust/issues/34701#issuecomment-739809584
|
||||
if sources:
|
||||
extras.append(f'-path-equivalence=.,{cwd.resolve()}')
|
||||
|
||||
if demangler:
|
||||
extras.append(f'-Xdemangler={demangler}')
|
||||
|
||||
cmd = [
|
||||
self.resolve_tool('llvm-cov'),
|
||||
subcommand, # '-dump-collected-paths', # classified debug flag
|
||||
'-instr-profile',
|
||||
str(profdata),
|
||||
*extras,
|
||||
*objects,
|
||||
*sources,
|
||||
]
|
||||
subprocess.check_call(cmd, cwd=cwd)
|
||||
|
||||
def cov_report(self, **kwargs) -> None:
|
||||
self._cov(subcommand='report', **kwargs)
|
||||
|
||||
def cov_export(self, *, kind: str, **kwargs) -> None:
|
||||
extras = [f'-format={kind}']
|
||||
self._cov(subcommand='export', *extras, **kwargs)
|
||||
|
||||
def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None:
|
||||
extras = [f'-format={kind}']
|
||||
if output_dir:
|
||||
extras.append(f'-output-dir={output_dir}')
|
||||
|
||||
self._cov(subcommand='show', *extras, **kwargs)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Report(ABC):
|
||||
""" Common properties of a coverage report """
|
||||
|
||||
llvm: LLVM
|
||||
demangler: str
|
||||
profdata: Path
|
||||
objects: List[str]
|
||||
sources: List[str]
|
||||
|
||||
def _common_kwargs(self):
|
||||
return dict(profdata=self.profdata,
|
||||
objects=self.objects,
|
||||
sources=self.sources,
|
||||
demangler=self.demangler)
|
||||
|
||||
@abstractmethod
|
||||
def generate(self):
|
||||
pass
|
||||
|
||||
def open(self):
|
||||
# Do nothing by default
|
||||
pass
|
||||
|
||||
|
||||
class SummaryReport(Report):
|
||||
def generate(self):
|
||||
self.llvm.cov_report(**self._common_kwargs())
|
||||
|
||||
|
||||
class TextReport(Report):
|
||||
def generate(self):
|
||||
self.llvm.cov_show(kind='text', **self._common_kwargs())
|
||||
|
||||
|
||||
class LcovReport(Report):
|
||||
def generate(self):
|
||||
self.llvm.cov_export(kind='lcov', **self._common_kwargs())
|
||||
|
||||
|
||||
@dataclass
|
||||
class HtmlReport(Report):
|
||||
output_dir: Path
|
||||
|
||||
def generate(self):
|
||||
self.llvm.cov_show(kind='html', output_dir=self.output_dir, **self._common_kwargs())
|
||||
print(f'HTML report is located at `{self.output_dir}`')
|
||||
|
||||
def open(self):
|
||||
tool = dict(linux='xdg-open', darwin='open').get(sys.platform)
|
||||
if not tool:
|
||||
raise Exception(f'Unknown platform {sys.platform}')
|
||||
|
||||
subprocess.check_call([tool, self.output_dir / 'index.html'],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL)
|
||||
|
||||
|
||||
@dataclass
|
||||
class GithubPagesReport(HtmlReport):
|
||||
output_dir: Path
|
||||
commit_url: str
|
||||
|
||||
def generate(self):
|
||||
def index_path(path):
|
||||
return path / 'index.html'
|
||||
|
||||
common = self._common_kwargs()
|
||||
# Provide default sources if there are none
|
||||
common.setdefault('sources', ['.'])
|
||||
|
||||
self.llvm.cov_show(kind='html', output_dir=self.output_dir, **common)
|
||||
shutil.copy(index_path(self.output_dir), self.output_dir / 'local.html')
|
||||
|
||||
with TemporaryDirectory() as tmp:
|
||||
output_dir = Path(tmp)
|
||||
args = dict(common, sources=[])
|
||||
self.llvm.cov_show(kind='html', output_dir=output_dir, **args)
|
||||
shutil.copy(index_path(output_dir), self.output_dir / 'all.html')
|
||||
|
||||
with open(index_path(self.output_dir), 'w') as index:
|
||||
commit_sha = self.commit_url.rsplit('/', maxsplit=1)[-1][:10]
|
||||
|
||||
html = f"""
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Coverage ({commit_sha})</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>
|
||||
Coverage report for commit
|
||||
<a href="{self.commit_url}">
|
||||
{commit_sha}
|
||||
</a>
|
||||
</h1>
|
||||
|
||||
<p>
|
||||
<a href="./local.html">
|
||||
<b>Show only local sources</b>
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
<a href="./all.html">
|
||||
Show all sources (including dependencies)
|
||||
</a>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
index.write(dedent(html))
|
||||
|
||||
print(f'HTML report is located at `{self.output_dir}`')
|
||||
|
||||
|
||||
class State:
|
||||
def __init__(self, cwd: Path, top_dir: Optional[Path], profraw_prefix: Optional[str]):
|
||||
# Use hostname by default
|
||||
profraw_prefix = profraw_prefix or '%h'
|
||||
|
||||
self.cwd = cwd
|
||||
self.cargo = Cargo(self.cwd)
|
||||
self.llvm = LLVM(self.cargo)
|
||||
|
||||
self.top_dir = top_dir or self.cargo.target_dir / 'coverage'
|
||||
self.report_dir = self.top_dir / 'report'
|
||||
|
||||
# Directory for raw coverage data emitted by executables
|
||||
self.profraw_dir = self.top_dir / 'profraw'
|
||||
self.profraw_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Aggregated coverage data
|
||||
self.profdata_file = self.top_dir / 'coverage.profdata'
|
||||
|
||||
# Dump all coverage data files into a dedicated directory.
|
||||
# Each filename is parameterized by PID & executable's signature.
|
||||
os.environ['LLVM_PROFILE_FILE'] = str(self.profraw_dir /
|
||||
f'cov-{profraw_prefix}-%p-%m.profraw')
|
||||
|
||||
os.environ['RUSTFLAGS'] = ' '.join([
|
||||
os.environ.get('RUSTFLAGS', ''),
|
||||
# Enable LLVM's source-based coverage
|
||||
# see: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html
|
||||
# see: https://blog.rust-lang.org/inside-rust/2020/11/12/source-based-code-coverage.html
|
||||
'-Zinstrument-coverage',
|
||||
# Link every bit of code to prevent "holes" in coverage report
|
||||
# see: https://doc.rust-lang.org/rustc/codegen-options/index.html#link-dead-code
|
||||
'-Clink-dead-code',
|
||||
# Some of the paths that `rustc` embeds into binaries are absolute, others are relative.
|
||||
# The point is, we can't have both, because depending on `-path-equivalence`, `llvm-cov`
|
||||
# either will cripple absolute paths or won't be able to show relative paths at all.
|
||||
# There's no way to turn relative paths into absolute, so we strip $PWD prefix.
|
||||
# Only source files of deps (e.g. `$HOME/.cargo`) will keep their absolute paths,
|
||||
# but we won't include them in report by default (but see `--all`).
|
||||
f'--remap-path-prefix {self.cwd}=',
|
||||
])
|
||||
|
||||
# XXX: God, have mercy on our souls...
|
||||
# see: https://github.com/rust-lang/rust/pull/90132
|
||||
os.environ['RUSTC_BOOTSTRAP'] = '1'
|
||||
|
||||
def do_run(self, args):
|
||||
subprocess.check_call([*args.command, *args.args])
|
||||
|
||||
def do_report(self, args):
|
||||
if args.all and args.sources:
|
||||
raise Exception('--all should not be used with sources')
|
||||
|
||||
# see man for `llvm-cov show [sources]`
|
||||
if args.all:
|
||||
sources = []
|
||||
elif not args.sources:
|
||||
sources = ['.']
|
||||
else:
|
||||
sources = args.sources
|
||||
|
||||
print('* Merging profraw files')
|
||||
self.llvm.profdata(self.profraw_dir, self.profdata_file)
|
||||
|
||||
objects = []
|
||||
if args.input_objects:
|
||||
print('* Collecting object files using --input-objects')
|
||||
with open(args.input_objects) as f:
|
||||
objects.extend(f.read().splitlines(keepends=False))
|
||||
if args.cargo_objects == 'true' or (args.cargo_objects == 'auto'
|
||||
and not args.input_objects):
|
||||
print('* Collecting object files using cargo')
|
||||
objects.extend(self.cargo.binaries(args.profile))
|
||||
|
||||
params = dict(llvm=self.llvm,
|
||||
demangler=find_demangler(args.demangler),
|
||||
profdata=self.profdata_file,
|
||||
objects=objects,
|
||||
sources=sources)
|
||||
|
||||
formats = {
|
||||
'html':
|
||||
lambda: HtmlReport(**params, output_dir=self.report_dir),
|
||||
'text':
|
||||
lambda: TextReport(**params),
|
||||
'lcov':
|
||||
lambda: LcovReport(**params),
|
||||
'summary':
|
||||
lambda: SummaryReport(**params),
|
||||
'github':
|
||||
lambda: GithubPagesReport(
|
||||
**params, output_dir=self.report_dir, commit_url=args.commit_url),
|
||||
}
|
||||
|
||||
report_factory = formats.get(args.format)
if not report_factory:
    raise Exception(f'Format `{args.format}` is not supported')
report = report_factory()
|
||||
|
||||
print(f'* Rendering coverage report ({args.format})')
|
||||
report.generate()
|
||||
|
||||
if args.open:
|
||||
print('* Opening the report')
|
||||
report.open()
|
||||
|
||||
def do_clean(self, args):
|
||||
# Wipe everything if no filters have been provided
|
||||
if not (args.report or args.prof):
|
||||
shutil.rmtree(self.top_dir, ignore_errors=True)
|
||||
else:
|
||||
if args.report:
|
||||
shutil.rmtree(self.report_dir, ignore_errors=True)
|
||||
if args.prof:
|
||||
self.profdata_file.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def main():
|
||||
app = sys.argv[0]
|
||||
example = f"""
|
||||
prerequisites:
|
||||
# alternatively, install a system package for `llvm-tools`
|
||||
rustup component add llvm-tools-preview
|
||||
|
||||
self-contained example:
|
||||
{app} run make
|
||||
{app} run pipenv run pytest test_runner
|
||||
{app} run cargo test
|
||||
{app} report --open
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Coverage report builder',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog=example)
|
||||
parser.add_argument('--dir', type=Path, help='output directory')
|
||||
parser.add_argument('--profraw-prefix', metavar='STRING', type=str)
|
||||
|
||||
commands = parser.add_subparsers(title='commands', dest='subparser_name')
|
||||
|
||||
p_run = commands.add_parser('run', help='run a command with magic env')
|
||||
p_run.add_argument('command', nargs=1)
|
||||
p_run.add_argument('args', nargs=argparse.REMAINDER)
|
||||
|
||||
p_report = commands.add_parser('report', help='generate a coverage report')
|
||||
p_report.add_argument('--profile',
|
||||
default='debug',
|
||||
choices=('debug', 'release'),
|
||||
help='cargo build profile')
|
||||
p_report.add_argument('--format',
|
||||
default='html',
|
||||
choices=('html', 'text', 'summary', 'lcov', 'github'),
|
||||
help='report format')
|
||||
p_report.add_argument('--input-objects',
|
||||
metavar='FILE',
|
||||
type=Path,
|
||||
help='file containing list of binaries')
|
||||
p_report.add_argument('--cargo-objects',
|
||||
default='auto',
|
||||
choices=('auto', 'true', 'false'),
|
||||
help='use cargo for auto discovery of binaries')
|
||||
p_report.add_argument('--commit-url', type=str, help='required for --format=github')
|
||||
p_report.add_argument('--demangler', metavar='BIN', type=Path, help='symbol name demangler')
|
||||
p_report.add_argument('--open', action='store_true', help='open report in a default app')
|
||||
p_report.add_argument('--all', action='store_true', help='show everything, e.g. deps')
|
||||
p_report.add_argument('sources', nargs='*', type=Path, help='source file or directory')
|
||||
|
||||
p_clean = commands.add_parser('clean', help='wipe coverage artifacts')
|
||||
p_clean.add_argument('--report', action='store_true', help='pick generated report')
|
||||
p_clean.add_argument('--prof', action='store_true', help='pick *.profdata & *.profraw')
|
||||
|
||||
args = parser.parse_args()
|
||||
state = State(cwd=Path.cwd(), top_dir=args.dir, profraw_prefix=args.profraw_prefix)
|
||||
|
||||
commands = {
|
||||
'run': state.do_run,
|
||||
'report': state.do_report,
|
||||
'clean': state.do_clean,
|
||||
}
|
||||
|
||||
action = commands.get(args.subparser_name)
|
||||
if action:
|
||||
action(args)
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
27  scripts/generate_and_push_perf_report.sh  Executable file
@@ -0,0 +1,27 @@
#!/bin/bash

# this is a shortcut script to avoid duplication in CI

set -eux -o pipefail

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

git clone https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git
cd zenith-perf-data
mkdir -p reports/
mkdir -p data/$REPORT_TO

cp $REPORT_FROM/* data/$REPORT_TO

echo "Generating report"
pipenv run python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html
echo "Uploading perf result"
git add data reports
git \
    -c "user.name=vipvap" \
    -c "user.email=vipvap@zenith.tech" \
    commit \
    --author="vipvap <vipvap@zenith.tech>" \
    -m "add performance test result for $GITHUB_SHA zenith revision"

git push https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git master
|
||||
207  scripts/generate_perf_report_page.py  Executable file
@@ -0,0 +1,207 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
import json
|
||||
from typing import Any, Dict, List, Optional, Tuple, cast
|
||||
from jinja2 import Template
|
||||
|
||||
# skip 'input' columns. They are included in the header and would just bloat the table
|
||||
EXCLUDE_COLUMNS = frozenset({
|
||||
'scale',
|
||||
'duration',
|
||||
'number_of_clients',
|
||||
'number_of_threads',
|
||||
'init_start_timestamp',
|
||||
'init_end_timestamp',
|
||||
'run_start_timestamp',
|
||||
'run_end_timestamp',
|
||||
})
|
||||
|
||||
KEY_EXCLUDE_FIELDS = frozenset({
|
||||
'init_start_timestamp',
|
||||
'init_end_timestamp',
|
||||
'run_start_timestamp',
|
||||
'run_end_timestamp',
|
||||
})
|
||||
NEGATIVE_COLOR = 'negative'
|
||||
POSITIVE_COLOR = 'positive'
|
||||
|
||||
|
||||
@dataclass
|
||||
class SuitRun:
|
||||
revision: str
|
||||
values: Dict[str, Any]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SuitRuns:
|
||||
platform: str
|
||||
suit: str
|
||||
common_columns: List[Tuple[str, str]]
|
||||
value_columns: List[str]
|
||||
runs: List[SuitRun]
|
||||
|
||||
|
||||
@dataclass
|
||||
class RowValue:
|
||||
value: str
|
||||
color: str
|
||||
ratio: str
|
||||
|
||||
|
||||
def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]:
|
||||
value_columns = []
|
||||
common_columns = []
|
||||
for item in values:
|
||||
if item['name'] in KEY_EXCLUDE_FIELDS:
|
||||
continue
|
||||
if item['report'] != 'test_param':
|
||||
value_columns.append(cast(str, item['name']))
|
||||
else:
|
||||
common_columns.append((cast(str, item['name']), cast(str, item['value'])))
|
||||
value_columns.sort()
|
||||
common_columns.sort(key=lambda x: x[0]) # sort by name
|
||||
return common_columns, value_columns
|
||||
|
||||
|
||||
def format_ratio(ratio: float, report: str) -> Tuple[str, str]:
|
||||
color = ''
|
||||
sign = '+' if ratio > 0 else ''
|
||||
if abs(ratio) < 0.05:
|
||||
return f' ({sign}{ratio:.2f})', color
|
||||
|
||||
if report not in {'test_param', 'higher_is_better', 'lower_is_better'}:
|
||||
raise ValueError(f'Unknown report type: {report}')
|
||||
|
||||
if report == 'test_param':
|
||||
return f'{ratio:.2f}', color
|
||||
|
||||
if ratio > 0:
|
||||
if report == 'higher_is_better':
|
||||
color = POSITIVE_COLOR
|
||||
elif report == 'lower_is_better':
|
||||
color = NEGATIVE_COLOR
|
||||
elif ratio < 0:
|
||||
if report == 'higher_is_better':
|
||||
color = NEGATIVE_COLOR
|
||||
elif report == 'lower_is_better':
|
||||
color = POSITIVE_COLOR
|
||||
|
||||
return f' ({sign}{ratio:.2f})', color
|
||||
|
||||
|
||||
def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]:
|
||||
for item in suit_run.values['data']:
|
||||
if item['name'] == name:
|
||||
return cast(Dict[str, Any], item)
|
||||
return None
|
||||
|
||||
|
||||
def get_row_values(columns: List[str], run_result: SuitRun,
|
||||
prev_result: Optional[SuitRun]) -> List[RowValue]:
|
||||
row_values = []
|
||||
for column in columns:
|
||||
current_value = extract_value(column, run_result)
|
||||
if current_value is None:
|
||||
# should never happen
|
||||
raise ValueError(f'{column} not found in {run_result.values}')
|
||||
|
||||
value = current_value["value"]
|
||||
if isinstance(value, float):
|
||||
value = f'{value:.2f}'
|
||||
|
||||
if prev_result is None:
|
||||
row_values.append(RowValue(value, '', ''))
|
||||
continue
|
||||
|
||||
prev_value = extract_value(column, prev_result)
|
||||
if prev_value is None:
|
||||
# This might happen when a new metric is added and there is no value for it in the previous run.
# TODO: add proper handling when this actually happens.
|
||||
raise ValueError(f'{column} not found in previous result')
|
||||
ratio = float(value) / float(prev_value['value']) - 1
|
||||
ratio_display, color = format_ratio(ratio, current_value['report'])
|
||||
row_values.append(RowValue(value, color, ratio_display))
|
||||
return row_values
|
||||
|
||||
|
||||
@dataclass
|
||||
class SuiteRunTableRow:
|
||||
revision: str
|
||||
values: List[RowValue]
|
||||
|
||||
|
||||
def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]:
|
||||
rows = []
|
||||
prev_run = None
|
||||
for run in runs:
|
||||
rows.append(
|
||||
SuiteRunTableRow(revision=run.revision,
|
||||
values=get_row_values(value_columns, run, prev_run)))
|
||||
prev_run = run
|
||||
|
||||
return rows
|
||||
|
||||
|
||||
def main(args: argparse.Namespace) -> None:
|
||||
input_dir = Path(args.input_dir)
|
||||
grouped_runs: Dict[str, SuitRuns] = {}
|
||||
# We have files of the form <ctr>_<rev>.json.
# Put them into the hashmap so that runs of the same configuration
# (scale, duration, etc.) are grouped together, ordered by counter.
|
||||
for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split('_')[0])):
|
||||
run_data = json.loads(item.read_text())
|
||||
revision = run_data['revision']
|
||||
|
||||
for suit_result in run_data['result']:
|
||||
key = "{}{}".format(run_data['platform'], suit_result['suit'])
|
||||
# pack total duration as a synthetic value
|
||||
total_duration = suit_result['total_duration']
|
||||
suit_result['data'].append({
|
||||
'name': 'total_duration',
|
||||
'value': total_duration,
|
||||
'unit': 's',
|
||||
'report': 'lower_is_better',
|
||||
})
|
||||
common_columns, value_columns = get_columns(suit_result['data'])
|
||||
|
||||
grouped_runs.setdefault(
|
||||
key,
|
||||
SuitRuns(
|
||||
platform=run_data['platform'],
|
||||
suit=suit_result['suit'],
|
||||
common_columns=common_columns,
|
||||
value_columns=value_columns,
|
||||
runs=[],
|
||||
),
|
||||
)
|
||||
|
||||
grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result))
|
||||
context = {}
|
||||
for result in grouped_runs.values():
|
||||
suit = result.suit
|
||||
context[suit] = {
|
||||
'common_columns': result.common_columns,
|
||||
'value_columns': result.value_columns,
|
||||
'platform': result.platform,
|
||||
# reverse the order so newest results are on top of the table
|
||||
'rows': reversed(prepare_rows_from_runs(result.value_columns, result.runs)),
|
||||
}
|
||||
|
||||
template = Template((Path(__file__).parent / 'perf_report_template.html').read_text())
|
||||
|
||||
Path(args.out).write_text(template.render(context=context))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--input-dir',
|
||||
dest='input_dir',
|
||||
required=True,
|
||||
help='Directory with jsons generated by the test suite',
|
||||
)
|
||||
parser.add_argument('--out', required=True, help='Output html file path')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
136  scripts/git-upload  Executable file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from contextlib import contextmanager
|
||||
from tempfile import TemporaryDirectory
|
||||
from pathlib import Path
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def absolute_path(path):
|
||||
return Path(path).resolve()
|
||||
|
||||
|
||||
def relative_path(path):
|
||||
path = Path(path)
|
||||
if path.is_absolute():
|
||||
raise Exception(f'path `{path}` must be relative!')
|
||||
return path
|
||||
|
||||
|
||||
@contextmanager
|
||||
def chdir(cwd: Path):
|
||||
old = os.getcwd()
|
||||
os.chdir(cwd)
|
||||
try:
|
||||
yield cwd
|
||||
finally:
|
||||
os.chdir(old)
|
||||
|
||||
|
||||
def run(cmd, *args, **kwargs):
|
||||
print('$', ' '.join(cmd))
|
||||
subprocess.check_call(cmd, *args, **kwargs)
|
||||
|
||||
|
||||
class GitRepo:
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
self.cwd = TemporaryDirectory()
|
||||
|
||||
subprocess.check_call([
|
||||
'git',
|
||||
'clone',
|
||||
str(url),
|
||||
self.cwd.name,
|
||||
])
|
||||
|
||||
def is_dirty(self):
|
||||
res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip()
|
||||
return bool(res)
|
||||
|
||||
def update(self, message, action, branch=None):
|
||||
with chdir(self.cwd.name):
|
||||
if not branch:
|
||||
cmd = ['git', 'branch', '--show-current']
|
||||
branch = subprocess.check_output(cmd, text=True).strip()
|
||||
|
||||
# Run action in repo's directory
|
||||
action()
|
||||
|
||||
run(['git', 'add', '.'])
|
||||
|
||||
if not self.is_dirty():
|
||||
print('No changes detected, quitting')
|
||||
return
|
||||
|
||||
run([
|
||||
'git',
|
||||
'-c',
|
||||
'user.name=vipvap',
|
||||
'-c',
|
||||
'user.email=vipvap@zenith.tech',
|
||||
'commit',
|
||||
'--author="vipvap <vipvap@zenith.tech>"',
|
||||
f'--message={message}',
|
||||
])
|
||||
|
||||
for _ in range(5):
|
||||
try:
|
||||
run(['git', 'fetch', 'origin', branch])
|
||||
run(['git', 'rebase', f'origin/{branch}'])
|
||||
run(['git', 'push', 'origin', branch])
|
||||
return
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f'failed to update branch `{branch}`: {e}', file=sys.stderr)
|
||||
|
||||
raise Exception(f'failed to update branch `{branch}`')
|
||||
|
||||
|
||||
def do_copy(args):
|
||||
src = args.src
|
||||
dst = args.dst
|
||||
|
||||
if args.forbid_overwrite and dst.exists():
|
||||
raise FileExistsError(f"File exists: '{dst}'")
|
||||
|
||||
if src.is_dir():
|
||||
shutil.rmtree(dst, ignore_errors=True)
|
||||
shutil.copytree(src, dst)
|
||||
else:
|
||||
shutil.copy(src, dst)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Git upload tool')
|
||||
parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url')
|
||||
parser.add_argument('--message', type=str, metavar='TEXT', help='commit message')
|
||||
|
||||
commands = parser.add_subparsers(title='commands', dest='subparser_name')
|
||||
|
||||
p_copy = commands.add_parser('copy', help='copy file into the repo')
|
||||
p_copy.add_argument('src', type=absolute_path, help='source path')
|
||||
p_copy.add_argument('dst', type=relative_path, help='relative dest path')
|
||||
p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
commands = {
|
||||
'copy': do_copy,
|
||||
}
|
||||
|
||||
action = commands.get(args.subparser_name)
|
||||
if action:
|
||||
message = args.message or 'update'
|
||||
GitRepo(args.repo).update(message, lambda: action(args))
|
||||
else:
|
||||
parser.print_usage()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
52  scripts/perf_report_template.html  Normal file
@@ -0,0 +1,52 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
|
||||
<body>
|
||||
<style>
|
||||
table,
|
||||
th,
|
||||
td {
|
||||
border: 1px solid black;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
.positive {
|
||||
background-color: rgba(0, 255, 0, 0.8)
|
||||
}
|
||||
|
||||
.negative {
|
||||
background-color: rgba(255, 0, 0, 0.65)
|
||||
}
|
||||
</style>
|
||||
|
||||
<h2>Zenith Performance Tests</h2>
|
||||
|
||||
{% for suit_name, suit_data in context.items() %}
|
||||
<h3>Runs for {{ suit_name }} </h3>
|
||||
<b>platform:</b> {{ suit_data.platform }}<br>
|
||||
{% for common_column_name, common_column_value in suit_data.common_columns %}
|
||||
<b>{{ common_column_name }}</b>: {{ common_column_value }}<br>
|
||||
{% endfor %}
|
||||
<br>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>revision</th>
|
||||
{% for column_name in suit_data.value_columns %}
|
||||
<th>{{ column_name }}</th>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
{% for row in suit_data.rows %}
|
||||
<tr>
|
||||
<td><a href=https://github.com/zenithdb/zenith/commit/{{ row.revision }}>{{ row.revision[:6] }}</a></td>
|
||||
{% for column_value in row.values %}
|
||||
<td class="{{ column_value.color }}">{{ column_value.value }}{{column_value.ratio}}</td>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
{% endfor %}
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
||||
@@ -13,6 +13,8 @@ column_limit = 100
split_all_top_level_comma_separated_values = true

[mypy]
# mypy uses regex
exclude = ^vendor/
# some tests don't typecheck when this flag is set
check_untyped_defs = false

@@ -22,7 +24,11 @@ disallow_untyped_decorators = false
disallow_untyped_defs = false
strict = true

[mypy-psycopg2.*]
[mypy-asyncpg.*]
# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
ignore_missing_imports = true

[mypy-cached_property.*]
ignore_missing_imports = true

[mypy-pytest.*]
|
||||
@@ -1,25 +0,0 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"

[packages]
pytest = ">=6.0.0"
psycopg2 = "*"
typing-extensions = "*"
pyjwt = {extras = ["crypto"], version = "*"}
requests = "*"
pytest-xdist = "*"
asyncpg = "*"
cached-property = "*"

[dev-packages]
flake8 = "*"
mypy = "*"
# Behavior may change slightly between versions. These are run continuously,
# so we pin exact versions to avoid surprising breaks. Update if comfortable.
yapf = "==0.31.0"

[requires]
# we need at least 3.6, but pipenv doesn't allow to say this directly
python_version = "3"
|
||||
390  test_runner/Pipfile.lock  generated
@@ -1,390 +0,0 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "3645ae8d2dcf55bd2a54963c44cfeedf577f3b289d1077365214a80a7f36e643"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"asyncpg": {
|
||||
"hashes": [
|
||||
"sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
|
||||
"sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
|
||||
"sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
|
||||
"sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
|
||||
"sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
|
||||
"sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
|
||||
"sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
|
||||
"sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
|
||||
"sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
|
||||
"sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
|
||||
"sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
|
||||
"sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
|
||||
"sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.24.0"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
|
||||
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"cached-property": {
|
||||
"hashes": [
|
||||
"sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
|
||||
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.5.2"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
|
||||
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
|
||||
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
|
||||
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
|
||||
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
|
||||
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
|
||||
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
|
||||
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
|
||||
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
|
||||
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
|
||||
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
|
||||
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
|
||||
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
|
||||
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
|
||||
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
|
||||
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
|
||||
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
|
||||
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
|
||||
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
|
||||
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
|
||||
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
|
||||
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
|
||||
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
|
||||
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
|
||||
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
|
||||
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
|
||||
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
|
||||
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
|
||||
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
|
||||
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
|
||||
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
|
||||
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
|
||||
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
|
||||
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
|
||||
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
|
||||
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
|
||||
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
|
||||
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
|
||||
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
|
||||
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
|
||||
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
|
||||
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
|
||||
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
|
||||
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
|
||||
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
|
||||
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
|
||||
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
|
||||
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
|
||||
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
|
||||
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
|
||||
],
|
||||
"version": "==1.15.0"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
|
||||
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.7"
|
||||
},
|
||||
"cryptography": {
|
||||
"hashes": [
|
||||
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
|
||||
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
|
||||
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
|
||||
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
|
||||
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
|
||||
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
|
||||
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
|
||||
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
|
||||
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
|
||||
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
|
||||
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
|
||||
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
|
||||
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
|
||||
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
|
||||
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
|
||||
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
|
||||
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
|
||||
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
|
||||
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
|
||||
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
|
||||
],
|
||||
"version": "==35.0.0"
|
||||
},
|
||||
"execnet": {
|
||||
"hashes": [
|
||||
"sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
|
||||
"sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.9.0"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7",
|
||||
"sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==21.0"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"psycopg2": {
|
||||
"hashes": [
|
||||
"sha256:079d97fc22de90da1d370c90583659a9f9a6ee4007355f5825e5f1c70dffc1fa",
|
||||
"sha256:2087013c159a73e09713294a44d0c8008204d06326006b7f652bef5ace66eebb",
|
||||
"sha256:2c992196719fadda59f72d44603ee1a2fdcc67de097eea38d41c7ad9ad246e62",
|
||||
"sha256:7640e1e4d72444ef012e275e7b53204d7fab341fb22bc76057ede22fe6860b25",
|
||||
"sha256:7f91312f065df517187134cce8e395ab37f5b601a42446bdc0f0d51773621854",
|
||||
"sha256:830c8e8dddab6b6716a4bf73a09910c7954a92f40cf1d1e702fb93c8a919cc56",
|
||||
"sha256:89409d369f4882c47f7ea20c42c5046879ce22c1e4ea20ef3b00a4dfc0a7f188",
|
||||
"sha256:bf35a25f1aaa8a3781195595577fcbb59934856ee46b4f252f56ad12b8043bcf",
|
||||
"sha256:de5303a6f1d0a7a34b9d40e4d3bef684ccc44a49bbe3eb85e3c0bffb4a131b7c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
|
||||
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.10.0"
|
||||
},
|
||||
"pycparser": {
|
||||
"hashes": [
|
||||
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
|
||||
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.20"
|
||||
},
|
||||
"pyjwt": {
|
||||
"extras": [
|
||||
"crypto"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
|
||||
"sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
|
||||
],
|
||||
"index": "pypi",
|
"version": "==2.3.0"
},
"pyparsing": {
"hashes": [
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.4.7"
},
"pytest": {
"hashes": [
"sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
"sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
],
"index": "pypi",
"version": "==6.2.5"
},
"pytest-forked": {
"hashes": [
"sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
"sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.3.0"
},
"pytest-xdist": {
"hashes": [
"sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
"sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
],
"index": "pypi",
"version": "==2.4.0"
},
"requests": {
"hashes": [
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
],
"index": "pypi",
"version": "==2.26.0"
},
"toml": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.2"
},
"typing-extensions": {
"hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
],
"index": "pypi",
"version": "==3.10.0.2"
},
"urllib3": {
"hashes": [
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.7"
}
},
"develop": {
"flake8": {
"hashes": [
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
"sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
],
"index": "pypi",
"version": "==4.0.1"
},
"mccabe": {
"hashes": [
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
],
"version": "==0.6.1"
},
"mypy": {
"hashes": [
"sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
"sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
"sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
"sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
"sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
"sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
"sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
"sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
"sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
"sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
"sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
"sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
"sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
"sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
"sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
"sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
"sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
"sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
"sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
"sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
"sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
"sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
"sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
],
"index": "pypi",
"version": "==0.910"
},
"mypy-extensions": {
"hashes": [
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
],
"version": "==0.4.3"
},
"pycodestyle": {
"hashes": [
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
"sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==2.8.0"
},
"pyflakes": {
"hashes": [
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
"sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.4.0"
},
"toml": {
"hashes": [
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.10.2"
},
"typing-extensions": {
"hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
],
"index": "pypi",
"version": "==3.10.0.2"
},
"yapf": {
"hashes": [
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
"sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
],
"index": "pypi",
"version": "==0.31.0"
}
}
}
@@ -3,18 +3,13 @@
This directory contains integration tests.

Prerequisites:
- Python 3.6 or later
- Dependencies: install them via `pipenv install`. Note that Debian/Ubuntu
packages are stale, as it commonly happens, so manual installation is not
recommended.
Run `pipenv shell` to activate the venv or use `pipenv run` to run a single
command in the venv, e.g. `pipenv run pytest`.
- Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
- Zenith and Postgres binaries
- See the root README.md for build directions
- See the root [README.md](/README.md) for build directions
- Tests can be run from the git tree; or see the environment variables
below to run from other directories.
- The zenith git repo, including the postgres submodule
(for some tests, e.g. pg_regress)
(for some tests, e.g. `pg_regress`)

### Test Organization

@@ -35,15 +30,15 @@ be stored under a directory `test_output`.

You can run all the tests with:

`pytest`
`pipenv run pytest`

If you want to run all the tests in a particular file:

`pytest test_pgbench.py`
`pipenv run pytest test_pgbench.py`

If you want to run all tests that have the string "bench" in their names:

`pytest -k bench`
`pipenv run pytest -k bench`

Useful environment variables:

@@ -62,46 +57,51 @@ Exit after the first test failure:
`pytest -x ...`
(there are many more pytest options; run `pytest -h` to see them.)

### Writing a test

### Building new tests
Every test needs a Zenith Environment, or ZenithEnv to operate in. A Zenith Environment
is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and
compute Postgres nodes. The connections between them can be configured to use JWT
authentication tokens, and some other configuration options can be tweaked too.

The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env`
fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
or make other destructive changes in that environment. Also don't assume that
there are no tenants or branches or data in the cluster. For convenience, there is a
branch called `empty`, though. The convention is to create a test-specific branch of
that and load any test data there, instead of the 'main' branch.
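For illustration only, a minimal sketch of a test following that convention might look like the snippet below. The test and table names are made up; the fixture import, `env.zenith_cli(...)` and `env.postgres.create_start(...)` calls mirror the tests shown further down in this diff.

```python
from fixtures.zenith_fixtures import ZenithEnv

pytest_plugins = ("fixtures.zenith_fixtures")


def test_example_sketch(zenith_simple_env: ZenithEnv):
    env = zenith_simple_env
    # Branch off 'empty' so the shared environment is not disturbed.
    env.zenith_cli(["branch", "test_example_sketch", "empty"])
    pg = env.postgres.create_start('test_example_sketch')

    cur = pg.connect().cursor()
    cur.execute('SELECT 1')
    assert cur.fetchone() == (1, )
```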

Essentially, this means that each time you see a fixture named as an input parameter, the function with that name will be run and passed as a parameter to the function.

So this code:
For more complicated cases, you can build a custom Zenith Environment, with the `zenith_env`
fixture:

```python
def test_something(zenith_cli, pg_bin):
    pass
def test_foobar(zenith_env_builder: ZenithEnvBuilder):
    # Prescribe the environment.
    # We want to have 3 safekeeper nodes, and use JWT authentication in the
    # connections to the page server
    zenith_env_builder.num_safekeepers = 3
    zenith_env_builder.set_pageserver_auth(True)

    # Now create the environment. This initializes the repository, and starts
    # up the page server and the safekeepers
    env = zenith_env_builder.init()

    # Run the test
    ...
```

... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html

Fixtures can't be imported using the normal python syntax. Instead, use this:
At the end of a test, all the nodes in the environment are automatically stopped, so you
don't need to worry about cleaning up. Logs and test data are preserved for analysis,
in a directory under `../test_output/<testname>`

```python
pytest_plugins = ("fixtures.something")
```
### Before submitting a patch
Ensure that you pass all [obligatory checks](/docs/sourcetree.md#obligatory-checks).

That will make all the fixtures in the `fixtures/something.py` file available.

Anything that's likely to be used in multiple tests should be built into a fixture.

Note that fixtures can clean up after themselves if they use the `yield` syntax.
Cleanup will happen even if the test fails (raises an unhandled exception).
Python destructors, e.g. `__del__()` aren't recommended for cleanup.
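As an aside, a throwaway `yield`-style fixture (the name and temp-directory logic here are purely illustrative, not part of this repository's fixtures) could look like this; note the explicit `Iterator` return type, in line with the type-hint advice further below:

```python
import shutil
import tempfile
from typing import Iterator

import pytest


@pytest.fixture
def scratch_dir() -> Iterator[str]:
    # Everything before the yield is setup...
    path = tempfile.mkdtemp()
    yield path
    # ...and everything after it is cleanup, which runs even if the test raised.
    shutil.rmtree(path, ignore_errors=True)
```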

### Code quality

We force code formatting via yapf:

1. Install `yapf` and other tools (`flake8`, `mypy`) with `pipenv install --dev`.
1. Reformat all your code by running `pipenv run yapf -ri .` in the `test_runner/` directory.

Before submitting a patch, please consider:
Also consider:

* Writing a couple of docstrings to clarify the reasoning behind a new test.
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
* Adding more type hints to your code to avoid `Any`, especially:
* For fixture parameters, they are not automatically deduced.
* For function arguments and return values.

@@ -2,18 +2,21 @@ from contextlib import closing
from typing import Iterator
from uuid import uuid4
import psycopg2
from fixtures.zenith_fixtures import PortDistributor, Postgres, ZenithCli, ZenithPageserver, PgBin
from fixtures.zenith_fixtures import ZenithEnvBuilder
import pytest

pytest_plugins = ("fixtures.zenith_fixtures")


def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
ps = pageserver_auth_enabled
def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.pageserver_auth_enabled = True
env = zenith_env_builder.init()

tenant_token = ps.auth_keys.generate_tenant_token(ps.initial_tenant)
invalid_tenant_token = ps.auth_keys.generate_tenant_token(uuid4().hex)
management_token = ps.auth_keys.generate_management_token()
ps = env.pageserver

tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant)
invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex)
management_token = env.auth_keys.generate_management_token()

# this does not invoke auth check and only decodes jwt and checks it for validity
# check both tokens
@@ -21,13 +24,13 @@ def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
ps.safe_psql("status", password=management_token)

# tenant can create branches
ps.safe_psql(f"branch_create {ps.initial_tenant} new1 main", password=tenant_token)
ps.safe_psql(f"branch_create {env.initial_tenant} new1 main", password=tenant_token)
# console can create branches for tenant
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=management_token)
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=management_token)

# fail to create branch using token with different tenantid
with pytest.raises(psycopg2.DatabaseError, match='Tenant id mismatch. Permission denied'):
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=invalid_tenant_token)
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=invalid_tenant_token)

# create tenant using management token
ps.safe_psql(f"tenant_create {uuid4().hex}", password=management_token)
@@ -40,40 +43,22 @@ def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):


@pytest.mark.parametrize('with_wal_acceptors', [False, True])
def test_compute_auth_to_pageserver(
zenith_cli: ZenithCli,
wa_factory,
pageserver_auth_enabled: ZenithPageserver,
repo_dir: str,
with_wal_acceptors: bool,
pg_bin: PgBin,
port_distributor: PortDistributor,
):
ps = pageserver_auth_enabled
# since we are in progress of refactoring protocols between compute safekeeper and page server
# use hardcoded management token in safekeeper
management_token = ps.auth_keys.generate_management_token()
def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
zenith_env_builder.pageserver_auth_enabled = True
if with_wal_acceptors:
zenith_env_builder.num_safekeepers = 3
env = zenith_env_builder.init()

branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}"
zenith_cli.run(["branch", branch, "empty"])
if with_wal_acceptors:
wa_factory.start_n_new(3, management_token)
env.zenith_cli(["branch", branch, "main"])

with Postgres(
zenith_cli=zenith_cli,
repo_dir=repo_dir,
pg_bin=pg_bin,
tenant_id=ps.initial_tenant,
port=port_distributor.get_port(),
).create_start(
branch,
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
) as pg:
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute('CREATE TABLE t(key int primary key, value text)')
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT sum(key) FROM t')
assert cur.fetchone() == (5000050000, )
pg = env.postgres.create_start(branch)

with closing(pg.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute('CREATE TABLE t(key int primary key, value text)')
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute('SELECT sum(key) FROM t')
assert cur.fetchone() == (5000050000, )

@@ -1,6 +1,11 @@
import subprocess
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from contextlib import closing

import psycopg2.extras
import pytest
from fixtures.log_helper import log
from fixtures.utils import print_gc_result
from fixtures.zenith_fixtures import ZenithEnv

pytest_plugins = ("fixtures.zenith_fixtures")

@@ -8,18 +13,27 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Create a couple of branches off the main branch, at a historical point in time.
#
def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_branch_behind(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Branch at the point where only 100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind", "empty"])
env.zenith_cli(["branch", "test_branch_behind", "empty"])

pgmain = postgres.create_start('test_branch_behind')
pgmain = env.postgres.create_start('test_branch_behind')
log.info("postgres is running on 'test_branch_behind' branch")

main_pg_conn = pgmain.connect()
main_cur = main_pg_conn.cursor()

main_cur.execute("SHOW zenith.zenith_timeline")
timeline = main_cur.fetchone()[0]

# Create table, and insert the first 100 rows
main_cur.execute('CREATE TABLE foo (t text)')

# keep some early lsn to test branch creation on out of date lsn
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
gced_lsn = main_cur.fetchone()[0]

main_cur.execute('''
INSERT INTO foo
SELECT 'long string to consume some space' || g
@@ -40,7 +54,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
log.info(f'LSN after 200100 rows: {lsn_b}')

# Branch at the point where only 100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
env.zenith_cli(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])

# Insert many more rows. This generates enough WAL to fill a few segments.
main_cur.execute('''
@@ -55,10 +69,10 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
log.info(f'LSN after 400100 rows: {lsn_c}')

# Branch at the point where only 200100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
env.zenith_cli(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])

pg_hundred = postgres.create_start("test_branch_behind_hundred")
pg_more = postgres.create_start("test_branch_behind_more")
pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
pg_more = env.postgres.create_start("test_branch_behind_more")

# On the 'hundred' branch, we should see only 100 rows
hundred_pg_conn = pg_hundred.connect()
@@ -79,17 +93,34 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
# Check bad lsn's for branching

# branch at segment boundary
zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = postgres.create_start("test_branch_segment_boundary")
env.zenith_cli(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
pg = env.postgres.create_start("test_branch_segment_boundary")
cur = pg.connect().cursor()
cur.execute('SELECT 1')
assert cur.fetchone() == (1, )

# branch at pre-initdb lsn
#
# FIXME: This works currently, but probably shouldn't be allowed
try:
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
# FIXME: assert false, "branch with invalid LSN should have failed"
except subprocess.CalledProcessError:
log.info("Branch creation with pre-initdb LSN failed (as expected)")
with pytest.raises(Exception, match="invalid branch start lsn"):
env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])

# check that we cannot create branch based on garbage collected data
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
# call gc to advance latest_gc_cutoff_lsn
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
row = pscur.fetchone()
print_gc_result(row)

with pytest.raises(Exception, match="invalid branch start lsn"):
# this gced_lsn is pretty random, so if gc is disabled this wouldn't fail
env.zenith_cli(["branch", "test_branch_create_fail", f"test_branch_behind@{gced_lsn}"])

# check that after gc everything is still there
hundred_cur.execute('SELECT count(*) FROM foo')
assert hundred_cur.fetchone() == (100, )

more_cur.execute('SELECT count(*) FROM foo')
assert more_cur.fetchone() == (200100, )

main_cur.execute('SELECT count(*) FROM foo')
assert main_cur.fetchone() == (400100, )

@@ -3,7 +3,7 @@ import os

from contextlib import closing

from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures")
@@ -12,9 +12,10 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test compute node start after clog truncation
#
def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_clog_truncate(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_clog_truncate", "empty"])
env.zenith_cli(["branch", "test_clog_truncate", "empty"])

# set aggressive autovacuum to make sure that truncation will happen
config = [
@@ -27,7 +28,7 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
'autovacuum_freeze_max_age=100000'
]

pg = postgres.create_start('test_clog_truncate', config_lines=config)
pg = env.postgres.create_start('test_clog_truncate', config_lines=config)
log.info('postgres is running on test_clog_truncate branch')

# Install extension containing function needed for test
@@ -64,10 +65,10 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg

# create new branch after clog truncation and start a compute node on it
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
zenith_cli.run(
env.zenith_cli(
["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])

pg2 = postgres.create_start('test_clog_truncate_new')
pg2 = env.postgres.create_start('test_clog_truncate_new')
log.info('postgres is running on test_clog_truncate_new branch')

# check that new node doesn't contain truncated segment

@@ -1,6 +1,6 @@
from contextlib import closing

from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,12 +9,13 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test starting Postgres with custom options
#
def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
def test_config(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_config", "empty"])
env.zenith_cli(["branch", "test_config", "empty"])

# change config
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
log.info('postgres is running on test_config branch')

with closing(pg.connect()) as conn:

@@ -2,7 +2,7 @@ import os
import pathlib

from contextlib import closing
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures")
@@ -11,15 +11,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test CREATE DATABASE when there have been relmapper changes
#
def test_createdb(
zenith_cli: ZenithCli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
):
zenith_cli.run(["branch", "test_createdb", "empty"])
def test_createdb(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_createdb", "empty"])

pg = postgres.create_start('test_createdb')
pg = env.postgres.create_start('test_createdb')
log.info("postgres is running on 'test_createdb' branch")

with closing(pg.connect()) as conn:
@@ -33,9 +29,9 @@ def test_createdb(
lsn = cur.fetchone()[0]

# Create a branch
zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])
env.zenith_cli(["branch", "test_createdb2", "test_createdb@" + lsn])

pg2 = postgres.create_start('test_createdb2')
pg2 = env.postgres.create_start('test_createdb2')

# Test that you can connect to the new database on both branches
for db in (pg, pg2):
@@ -45,16 +41,11 @@ def test_createdb(
#
# Test DROP DATABASE
#
def test_dropdb(
zenith_cli: ZenithCli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
test_output_dir,
):
zenith_cli.run(["branch", "test_dropdb", "empty"])
def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
env.zenith_cli(["branch", "test_dropdb", "empty"])

pg = postgres.create_start('test_dropdb')
pg = env.postgres.create_start('test_dropdb')
log.info("postgres is running on 'test_dropdb' branch")

with closing(pg.connect()) as conn:
@@ -77,26 +68,28 @@ def test_dropdb(
lsn_after_drop = cur.fetchone()[0]

# Create two branches before and after database drop.
zenith_cli.run(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
pg_before = postgres.create_start('test_before_dropdb')
env.zenith_cli(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
pg_before = env.postgres.create_start('test_before_dropdb')

zenith_cli.run(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
pg_after = postgres.create_start('test_after_dropdb')
env.zenith_cli(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
pg_after = env.postgres.create_start('test_after_dropdb')

# Test that database exists on the branch before drop
pg_before.connect(dbname='foodb').close()

# Test that database subdir exists on the branch before drop
assert pg_before.pgdata_dir
dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
log.info(dbpath)

assert os.path.isdir(dbpath) == True

# Test that database subdir doesn't exist on the branch after drop
assert pg_after.pgdata_dir
dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
log.info(dbpath)

assert os.path.isdir(dbpath) == False

# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
check_restored_datadir_content(test_output_dir, env, pg)

@@ -1,6 +1,6 @@
from contextlib import closing

from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures")
@@ -9,10 +9,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test CREATE USER to check shared catalog restore
#
def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
zenith_cli.run(["branch", "test_createuser", "empty"])
def test_createuser(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli(["branch", "test_createuser", "empty"])

pg = postgres.create_start('test_createuser')
pg = env.postgres.create_start('test_createuser')
log.info("postgres is running on 'test_createuser' branch")

with closing(pg.connect()) as conn:
@@ -26,9 +27,9 @@ def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: Postgres
lsn = cur.fetchone()[0]

# Create a branch
zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])
env.zenith_cli(["branch", "test_createuser2", "test_createuser@" + lsn])

pg2 = postgres.create_start('test_createuser2')
pg2 = env.postgres.create_start('test_createuser2')

# Test that you can connect to new branch as a new user
assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]

@@ -1,4 +1,4 @@
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures")
@@ -10,15 +10,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# it only checks next_multixact_id field in restored pg_control,
# since we don't have functions to check multixact internals.
#
def test_multixact(pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin,
zenith_cli,
base_dir,
test_output_dir):
def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_multixact", "empty"])
pg = postgres.create_start('test_multixact')
env.zenith_cli(["branch", "test_multixact", "empty"])
pg = env.postgres.create_start('test_multixact')

log.info("postgres is running on 'test_multixact' branch")
pg_conn = pg.connect()
@@ -57,8 +53,8 @@ def test_multixact(pageserver: ZenithPageserver,
assert int(next_multixact_id) > int(next_multixact_id_old)

# Branch at this point
zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
pg_new = postgres.create_start('test_multixact_new')
env.zenith_cli(["branch", "test_multixact_new", "test_multixact@" + lsn])
pg_new = env.postgres.create_start('test_multixact_new')

log.info("postgres is running on 'test_multixact_new' branch")
pg_new_conn = pg_new.connect()
@@ -71,4 +67,4 @@ def test_multixact(pageserver: ZenithPageserver,
assert next_multixact_id_new == next_multixact_id

# Check that we restore the content of the datadir correctly
check_restored_datadir_content(zenith_cli, test_output_dir, pg_new, pageserver.service_port.pg)
check_restored_datadir_content(test_output_dir, env, pg_new)

@@ -1,6 +1,6 @@
from contextlib import closing

from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures")
@@ -16,13 +16,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
# just a hint that the page hasn't been modified since that LSN, and the page
# server should return the latest page version regardless of the LSN.
#
def test_old_request_lsn(zenith_cli,
pageserver: ZenithPageserver,
postgres: PostgresFactory,
pg_bin):
def test_old_request_lsn(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
pg = postgres.create_start('test_old_request_lsn')
env.zenith_cli(["branch", "test_old_request_lsn", "empty"])
pg = env.postgres.create_start('test_old_request_lsn')
log.info('postgres is running on test_old_request_lsn branch')

pg_conn = pg.connect()
@@ -32,7 +30,7 @@ def test_old_request_lsn(zenith_cli,
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]

psconn = pageserver.connect()
psconn = env.pageserver.connect()
pscur = psconn.cursor()

# Create table, and insert some rows. Make it big enough that it doesn't fit in
@@ -59,7 +57,7 @@ def test_old_request_lsn(zenith_cli,
# Make a lot of updates on a single row, generating a lot of WAL. Trigger
# garbage collections so that the page server will remove old page versions.
for i in range(10):
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
for j in range(100):
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')


@@ -3,25 +3,28 @@ from uuid import uuid4
import pytest
import psycopg2
import requests
from fixtures.zenith_fixtures import ZenithPageserver, ZenithPageserverHttpClient
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
from typing import cast

pytest_plugins = ("fixtures.zenith_fixtures")


def test_status_psql(pageserver):
assert pageserver.safe_psql('status') == [
def test_status_psql(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
assert env.pageserver.safe_psql('status') == [
('hello world', ),
]


def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
def test_branch_list_psql(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
# Create a branch for us
zenith_cli.run(["branch", "test_branch_list_main", "empty"])
env.zenith_cli(["branch", "test_branch_list_main", "empty"])

conn = pageserver.connect()
conn = env.pageserver.connect()
cur = conn.cursor()

cur.execute(f'branch_list {pageserver.initial_tenant}')
cur.execute(f'branch_list {env.initial_tenant}')
branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
branches = [x for x in branches if x['name'].startswith('test_branch_list')]
@@ -34,10 +37,10 @@ def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
assert 'ancestor_lsn' in branches[0]

# Create another branch, and start Postgres on it
zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
env.zenith_cli(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
env.zenith_cli(['pg', 'create', 'test_branch_list_experimental'])

cur.execute(f'branch_list {pageserver.initial_tenant}')
cur.execute(f'branch_list {env.initial_tenant}')
new_branches = json.loads(cur.fetchone()[0])
# Filter out branches created by other tests
new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
@@ -53,19 +56,23 @@ def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
conn.close()


def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
res = zenith_cli.run(["tenant", "list"])
res.check_returncode()
tenants = res.stdout.splitlines()
assert tenants == [pageserver.initial_tenant]
def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
# don't use zenith_simple_env, because there might be other tenants there,
# left over from other tests.
env = zenith_env_builder.init()

conn = pageserver.connect()
res = env.zenith_cli(["tenant", "list"])
res.check_returncode()
tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
assert tenants == [env.initial_tenant]

conn = env.pageserver.connect()
cur = conn.cursor()

# check same tenant cannot be created twice
with pytest.raises(psycopg2.DatabaseError,
match=f'tenant {pageserver.initial_tenant} already exists'):
cur.execute(f'tenant_create {pageserver.initial_tenant}')
match=f'repo for {env.initial_tenant} already exists'):
cur.execute(f'tenant_create {env.initial_tenant}')

# create one more tenant
tenant1 = uuid4().hex
@@ -74,20 +81,20 @@ def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
cur.execute('tenant_list')

# compare tenants list
new_tenants = sorted(json.loads(cur.fetchone()[0]))
assert sorted([pageserver.initial_tenant, tenant1]) == new_tenants
new_tenants = sorted(map(lambda t: cast(str, t['id']), json.loads(cur.fetchone()[0])))
assert sorted([env.initial_tenant, tenant1]) == new_tenants


def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
client.check_status()

# check initial tenant is there
assert initial_tenant in set(client.tenant_list())
assert initial_tenant in {t['id'] for t in client.tenant_list()}

# create new tenant and check it is also there
tenant_id = uuid4()
client.tenant_create(tenant_id)
assert tenant_id.hex in set(client.tenant_list())
assert tenant_id.hex in {t['id'] for t in client.tenant_list()}

# create branch
branch_name = uuid4().hex
@@ -97,12 +104,17 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
assert branch_name in {b['name'] for b in client.branch_list(tenant_id)}


def test_pageserver_http_api_client(pageserver: ZenithPageserver):
client = pageserver.http_client()
check_client(client, pageserver.initial_tenant)
def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
client = env.pageserver.http_client()
check_client(client, env.initial_tenant)


def test_pageserver_http_api_client_auth_enabled(pageserver_auth_enabled: ZenithPageserver):
client = pageserver_auth_enabled.http_client(
auth_token=pageserver_auth_enabled.auth_keys.generate_management_token())
check_client(client, pageserver_auth_enabled.initial_tenant)
def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.pageserver_auth_enabled = True
env = zenith_env_builder.init()

management_token = env.auth_keys.generate_management_token()

client = env.pageserver.http_client(auth_token=management_token)
check_client(client, env.initial_tenant)
Some files were not shown because too many files have changed in this diff.