Change test params

Refactor keyspace code
Have separate classes for the KeySpace, a partitioning of the KeySpace (KeyPartitioning), and a builder object used to construct the KeySpace. Previously, KeyPartitioning did all those things, and it was a bit confusing.
2026-05-15 12:10:37 +00:00 · 2022-03-11 14:03:14 -05:00 · 2022-03-11 16:24:13 +02:00 · 2022-03-11 09:47:09 +02:00 · 2022-03-11 00:53:46 +02:00 · 2022-03-10 23:35:24 +02:00
231 changed files with 28825 additions and 14427 deletions
--- a/.circleci/ansible/ansible.cfg
+++ b/.circleci/ansible/ansible.cfg
@@ -0,0 +1,10 @@
+[defaults]
+# Settings for the deploy playbook run from .circleci/ansible.
+localhost_warning = False
+host_key_checking = False
+timeout = 30
+# ssh_args points the ssh client at the per-host options in ./ansible.ssh.cfg.
+[ssh_connection]
+ssh_args   = -F ./ansible.ssh.cfg
+scp_if_ssh = True
+pipelining = True
--- a/.circleci/ansible/ansible.ssh.cfg
+++ b/.circleci/ansible/ansible.ssh.cfg
@@ -0,0 +1,11 @@
+Host tele.zenith.tech
+    User admin
+    Port 3023
+    StrictHostKeyChecking no
+    UserKnownHostsFile /dev/null
+# Every host except tele.zenith.tech is reached through it via ProxyJump.
+Host * !tele.zenith.tech
+    User admin
+    StrictHostKeyChecking no
+    UserKnownHostsFile /dev/null
+    ProxyJump tele.zenith.tech
--- a/.circleci/ansible/deploy.yaml
+++ b/.circleci/ansible/deploy.yaml
@@ -0,0 +1,174 @@
+- name: Upload Zenith binaries
+  hosts: pageservers:safekeepers
+  gather_facts: false
+  remote_user: admin
+  vars:
+    force_deploy: false
+
+  tasks:
+    # Resolve the version to deploy (local file) and the deployed version (remote file).
+    - name: get latest version of Zenith binaries
+      ignore_errors: true
+      register: current_version_file
+      set_fact:
+        current_version: "{{ lookup('file', '.zenith_current_version') | trim }}"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: set zero value for current_version
+      when: current_version_file is failed
+      set_fact:
+        current_version: "0"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: get deployed version from content of remote file
+      ignore_errors: true
+      ansible.builtin.slurp:
+        src: /usr/local/.zenith_current_version
+      register: remote_version_file
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: decode remote file content
+      when: remote_version_file is succeeded
+      set_fact:
+        remote_version: "{{ remote_version_file['content'] | b64decode | trim }}"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: set zero value for remote_version
+      when: remote_version_file is failed
+      set_fact:
+        remote_version: "0"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: inform about versions
+      debug: msg="Version to deploy - {{ current_version }}, version on storage node - {{ remote_version }}"
+      tags:
+      - pageserver
+      - safekeeper
+
+    # Versions are strings: cast to int so 10 > 9, and coerce an -e force_deploy=... string to bool.
+    - name: upload and extract Zenith binaries to /usr/local
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.unarchive:
+        owner: root
+        group: root
+        src: zenith_install.tar.gz
+        dest: /usr/local
+      become: true
+      tags:
+      - pageserver
+      - safekeeper
+      - binaries
+      - putbinaries
+
+- name: Deploy pageserver
+  hosts: pageservers
+  gather_facts: false
+  remote_user: admin
+  vars:
+    force_deploy: false
+  # Each step runs only for a newer build (numeric compare, not string) or when force_deploy is set.
+  tasks:
+    - name: init pageserver
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      shell:
+        cmd: sudo -u pageserver /usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
+      args:
+        creates: "/storage/pageserver/data/tenants"
+      environment:
+        ZENITH_REPO_DIR: "/storage/pageserver/data"
+        LD_LIBRARY_PATH: "/usr/local/lib"
+      become: true
+      tags:
+      - pageserver
+
+    - name: upload systemd service definition
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.template:
+        src: systemd/pageserver.service
+        dest: /etc/systemd/system/pageserver.service
+        owner: root
+        group: root
+        mode: '0644'
+      become: true
+      tags:
+      - pageserver
+
+    - name: start systemd service
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.systemd:
+        daemon_reload: true
+        name: pageserver
+        enabled: true
+        state: restarted
+      become: true
+      tags:
+      - pageserver
+
+    - name: post version to console
+      when: ((current_version | int) > (remote_version | int) or (force_deploy | bool)) and console_mgmt_base_url is defined
+      shell:
+        cmd: |
+          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+          curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
+      tags:
+      - pageserver
+
+- name: Deploy safekeeper
+  hosts: safekeepers
+  gather_facts: false
+  remote_user: admin
+  vars:
+    force_deploy: false
+  # Each step runs only for a newer build (numeric compare, not string) or when force_deploy is set.
+  tasks:
+
+    # in the future safekeepers should discover pageservers by themselves,
+    # but currently they use the first pageserver listed in the inventory
+    - name: set first pageserver var for safekeepers
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      set_fact:
+        first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}"
+      tags:
+      - safekeeper
+
+    - name: upload systemd service definition
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.template:
+        src: systemd/safekeeper.service
+        dest: /etc/systemd/system/safekeeper.service
+        owner: root
+        group: root
+        mode: '0644'
+      become: true
+      tags:
+      - safekeeper
+
+    - name: start systemd service
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.systemd:
+        daemon_reload: true
+        name: safekeeper
+        enabled: true
+        state: restarted
+      become: true
+      tags:
+      - safekeeper
+
+    - name: post version to console
+      when: ((current_version | int) > (remote_version | int) or (force_deploy | bool)) and console_mgmt_base_url is defined
+      shell:
+        cmd: |
+          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+          curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
+      tags:
+      - safekeeper
--- a/.circleci/ansible/get_binaries.sh
+++ b/.circleci/ansible/get_binaries.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+set -e
+
+RELEASE=${RELEASE:-false}
+
+# look at docker hub for latest tag fo zenith docker image
+if [ "${RELEASE}" = "true" ]; then
+    echo "search latest relase tag"
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
+    if [ -z "${VERSION}" ]; then
+        echo "no any docker tags found, exiting..."
+        exit 1
+    else
+        TAG="release-${VERSION}"
+    fi
+else
+    echo "search latest dev tag"
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep -v release | tail -1)
+    if [ -z "${VERSION}" ]; then
+        echo "no any docker tags found, exiting..."
+        exit 1
+    else
+        TAG="${VERSION}"
+    fi
+fi
+
+echo "found ${VERSION}"
+
+# do initial cleanup
+rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz .zenith_current_version
+mkdir zenith_install
+
+# retrive binaries from docker image
+echo "getting binaries from docker image"
+docker pull --quiet zenithdb/zenith:${TAG}
+ID=$(docker create zenithdb/zenith:${TAG})
+docker cp ${ID}:/data/postgres_install.tar.gz .
+tar -xzf postgres_install.tar.gz -C zenith_install
+docker cp ${ID}:/usr/local/bin/pageserver zenith_install/bin/
+docker cp ${ID}:/usr/local/bin/safekeeper zenith_install/bin/
+docker cp ${ID}:/usr/local/bin/proxy zenith_install/bin/
+docker cp ${ID}:/usr/local/bin/postgres zenith_install/bin/
+docker rm -vf ${ID}
+
+# store version to file (for ansible playbooks) and create binaries tarball
+echo ${VERSION} > zenith_install/.zenith_current_version
+echo ${VERSION} > .zenith_current_version
+tar -czf zenith_install.tar.gz -C zenith_install .
+
+# do final cleaup
+rm -rf zenith_install postgres_install.tar.gz
--- a/.circleci/ansible/production.hosts
+++ b/.circleci/ansible/production.hosts
@@ -0,0 +1,7 @@
+[pageservers]
+zenith-1-ps-1
+
+[safekeepers]
+zenith-1-sk-1
+zenith-1-sk-2
+zenith-1-sk-3
--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
@@ -0,0 +1,7 @@
+[pageservers]
+zenith-us-stage-ps-1
+
+[safekeepers]
+zenith-us-stage-sk-1
+zenith-us-stage-sk-2
+zenith-us-stage-sk-3
--- a/.circleci/ansible/systemd/pageserver.service
+++ b/.circleci/ansible/systemd/pageserver.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Zenith pageserver
+After=network.target auditd.service
+# ZENITH_REPO_DIR below matches the -D data directory and the value used by the init task in deploy.yaml.
+[Service]
+Type=simple
+User=pageserver
+Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver/data LD_LIBRARY_PATH=/usr/local/lib
+ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data
+ExecReload=/bin/kill -HUP $MAINPID
+KillMode=mixed
+KillSignal=SIGINT
+Restart=on-failure
+TimeoutSec=10
+LimitNOFILE=30000000
+
+[Install]
+WantedBy=multi-user.target
--- a/.circleci/ansible/systemd/safekeeper.service
+++ b/.circleci/ansible/systemd/safekeeper.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Zenith safekeeper
+After=network.target auditd.service
+# Rendered as a template by deploy.yaml, which supplies inventory_hostname and first_pageserver.
+[Service]
+Type=simple
+User=safekeeper
+Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data
+ExecReload=/bin/kill -HUP $MAINPID
+KillMode=mixed
+KillSignal=SIGINT
+Restart=on-failure
+TimeoutSec=10
+LimitNOFILE=30000000
+
+[Install]
+WantedBy=multi-user.target
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,29 +1,28 @@
 version: 2.1

 executors:
-  zenith-build-executor:
+  zenith-xlarge-executor:
    resource_class: xlarge
    docker:
-      - image: cimg/rust:1.55.0
-  zenith-python-executor:
+      # NB: when changed, do not forget to update rust image tag in all Dockerfiles
+      - image: zimg/rust:1.56
+  zenith-executor:
    docker:
-      - image: cimg/python:3.7.10  # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CirlceCI
+      - image: zimg/rust:1.56

 jobs:
  check-codestyle-rust:
-    executor: zenith-build-executor
+    executor: zenith-xlarge-executor
    steps:
      - checkout
-
      - run:
          name: rustfmt
          when: always
-          command: |
-            cargo fmt --all -- --check
+          command: cargo fmt --all -- --check

  # A job to build postgres
  build-postgres:
-    executor: zenith-build-executor
+    executor: zenith-xlarge-executor
    parameters:
      build_type:
        type: enum
@@ -38,8 +37,7 @@ jobs:
        # Note this works even though the submodule hasn't been checkout out yet.
      - run:
          name: Get postgres cache key
-          command: |
-            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
+          command: git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres

      - restore_cache:
          name: Restore postgres cache
@@ -47,15 +45,6 @@ jobs:
            # Restore ONLY if the rev key matches exactly
            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}

-        # FIXME We could cache our own docker container, instead of installing packages every time.
-      - run:
-          name: apt install dependencies
-          command: |
-            if [ ! -e tmp_install/bin/postgres ]; then
-              sudo apt update
-              sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
-            fi
-
        # Build postgres if the restore_cache didn't find a build.
        # `make` can't figure out whether the cache is valid, since
        # it only compares file timestamps.
@@ -65,7 +54,8 @@ jobs:
            if [ ! -e tmp_install/bin/postgres ]; then
              # "depth 1" saves some time by not cloning the whole repo
              git submodule update --init --depth 1
-              make postgres -j8
+              # bail out on any warnings
+              COPT='-Werror' mold -run make postgres -j$(nproc)
            fi

      - save_cache:
@@ -76,18 +66,14 @@ jobs:

  # A job to build zenith rust code
  build-zenith:
-    executor: zenith-build-executor
+    executor: zenith-xlarge-executor
    parameters:
      build_type:
        type: enum
        enum: ["debug", "release"]
+    environment:
+      BUILD_TYPE: << parameters.build_type >>
    steps:
-      - run:
-          name: apt install dependencies
-          command: |
-            sudo apt update
-            sudo apt install libssl-dev clang
-
        # Checkout the git repo (without submodules)
      - checkout

@@ -116,16 +102,17 @@ jobs:
      - run:
          name: Rust build << parameters.build_type >>
          command: |
-            export CARGO_INCREMENTAL=0
-            BUILD_TYPE="<< parameters.build_type >>"
            if [[ $BUILD_TYPE == "debug" ]]; then
-              echo "Build in debug mode"
-              cargo build --bins --tests
+              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
+              CARGO_FLAGS=
            elif [[ $BUILD_TYPE == "release" ]]; then
-              echo "Build in release mode"
-              cargo build --release --bins --tests
+              cov_prefix=()
+              CARGO_FLAGS=--release
            fi

+            export CARGO_INCREMENTAL=0
+            "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests
+
      - save_cache:
          name: Save rust cache
          key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
@@ -138,68 +125,115 @@ jobs:
        # has to run separately from cargo fmt section
        # since needs to run with dependencies
      - run:
-          name: clippy
+          name: cargo clippy
          command: |
-            ./run_clippy.sh
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              cov_prefix=()
+            fi
+
+            "${cov_prefix[@]}" ./run_clippy.sh

        # Run rust unit tests
-      - run: cargo test
+      - run:
+          name: cargo test
+          command: |
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              cov_prefix=()
+            fi
+
+            "${cov_prefix[@]}" cargo test

        # Install the rust binaries, for use by test jobs
-        # `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
-        # FIXME: this is a really silly way to install; maybe we should just output
-        # a tarball as an artifact? Or a .deb package?
      - run:
-          name: cargo install
+          name: Install rust binaries
          command: |
-            export CARGO_INCREMENTAL=0
-            BUILD_TYPE="<< parameters.build_type >>"
            if [[ $BUILD_TYPE == "debug" ]]; then
-              echo "Install debug mode"
-              CARGO_FLAGS="--debug"
+              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
            elif [[ $BUILD_TYPE == "release" ]]; then
-              echo "Install release mode"
-              # The default is release mode; there is no --release flag.
-              CARGO_FLAGS=""
+              cov_prefix=()
+            fi
+
+            binaries=$(
+              "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
+              jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
+            )
+
+            test_exe_paths=$(
+              "${cov_prefix[@]}" cargo test --message-format=json --no-run |
+              jq -r '.executable | select(. != null)'
+            )
+
+            mkdir -p /tmp/zenith/bin
+            mkdir -p /tmp/zenith/test_bin
+            mkdir -p /tmp/zenith/etc
+
+            # Install target binaries
+            for bin in $binaries; do
+              SRC=target/$BUILD_TYPE/$bin
+              DST=/tmp/zenith/bin/$bin
+              cp $SRC $DST
+              echo $DST >> /tmp/zenith/etc/binaries.list
+            done
+
+            # Install test executables (for code coverage)
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              for bin in $test_exe_paths; do
+                SRC=$bin
+                DST=/tmp/zenith/test_bin/$(basename $bin)
+                cp $SRC $DST
+                echo $DST >> /tmp/zenith/etc/binaries.list
+              done
            fi
-            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
-            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
-            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith

        # Install the postgres binaries, for use by test jobs
-        # FIXME: this is a silly way to do "install"; maybe just output a standard
-        # postgres package, whatever the favored form is (tarball? .deb package?)
-        # Note that pg_regress needs some build artifacts that probably aren't
-        # in the usual package...?
      - run:
-          name: postgres install
+          name: Install postgres binaries
          command: |
            cp -a tmp_install /tmp/zenith/pg_install

-        # Save the rust output binaries for other jobs in this workflow.
+      - run:
+          name: Merge coverage data
+          command: |
+            # This will speed up workspace uploads
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
+            fi
+
+        # Save the rust binaries and coverage data for other jobs in this workflow.
      - persist_to_workspace:
          root: /tmp/zenith
          paths:
            - "*"

  check-codestyle-python:
-    executor: zenith-python-executor
+    executor: zenith-executor
    steps:
      - checkout
+      - restore_cache:
+          keys:
+            - v1-python-deps-{{ checksum "poetry.lock" }}
      - run:
          name: Install deps
-          command: pipenv --python 3.7 install --dev
+          command: ./scripts/pysync
+      - save_cache:
+          key: v1-python-deps-{{ checksum "poetry.lock" }}
+          paths:
+            - /home/circleci/.cache/pypoetry/virtualenvs
      - run:
          name: Run yapf to ensure code format
          when: always
-          command: pipenv run yapf --recursive --diff .
+          command: poetry run yapf --recursive --diff .
      - run:
          name: Run mypy to check types
          when: always
-          command: pipenv run mypy .
+          command: poetry run mypy .

  run-pytest:
-    executor: zenith-python-executor
+    executor: zenith-executor
    parameters:
      # pytest args to specify the tests to run.
      #
@@ -225,6 +259,11 @@ jobs:
      run_in_parallel:
        type: boolean
        default: true
+      save_perf_report:
+        type: boolean
+        default: false
+    environment:
+      BUILD_TYPE: << parameters.build_type >>
    steps:
      - attach_workspace:
          at: /tmp/zenith
@@ -233,23 +272,35 @@ jobs:
          condition: << parameters.needs_postgres_source >>
          steps:
            - run: git submodule update --init --depth 1
+      - restore_cache:
+          keys:
+            - v1-python-deps-{{ checksum "poetry.lock" }}
      - run:
          name: Install deps
-          command: pipenv --python 3.7 install
+          command: ./scripts/pysync
+      - save_cache:
+          key: v1-python-deps-{{ checksum "poetry.lock" }}
+          paths:
+            - /home/circleci/.cache/pypoetry/virtualenvs
      - run:
          name: Run pytest
-          working_directory: test_runner
          # pytest doesn't output test logs in real time, so CI job may fail with
          # `Too long with no output` error, if a test is running for a long time.
-          # In that case, tests should have internal timeouts that are less than 
+          # In that case, tests should have internal timeouts that are less than
          # no_output_timeout, specified here.
          no_output_timeout: 10m
          environment:
            - ZENITH_BIN: /tmp/zenith/bin
            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
            - TEST_OUTPUT: /tmp/test_output
+            # this variable will be embedded in perf test report
+            # and is needed to distinguish different environments
+            - PLATFORM: zenith-local-ci
          command: |
-            TEST_SELECTION="<< parameters.test_selection >>"
+            PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
+            rm -rf $PERF_REPORT_DIR
+
+            TEST_SELECTION="test_runner/<< parameters.test_selection >>"
            EXTRA_PARAMS="<< parameters.extra_params >>"
            if [ -z "$TEST_SELECTION" ]; then
              echo "test_selection must be set"
@@ -257,7 +308,22 @@ jobs:
            fi
            if << parameters.run_in_parallel >>; then
              EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
-            fi;
+            fi
+            if << parameters.save_perf_report >>; then
+              if [[ $CIRCLE_BRANCH == "main" ]]; then
+                mkdir -p "$PERF_REPORT_DIR"
+                EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
+              fi
+            fi
+
+            export GITHUB_SHA=$CIRCLE_SHA1
+
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              cov_prefix=()
+            fi
+
            # Run the tests.
            #
            # The junit.xml file allows CircleCI to display more fine-grained test information
@@ -268,7 +334,20 @@ jobs:
            # -n4 uses four processes to run tests via pytest-xdist
            # -s is not used to prevent pytest from capturing output, because tests are running
            # in parallel and logs are mixed between different tests
-            pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -m "not remote_cluster" -rA $TEST_SELECTION $EXTRA_PARAMS
+            "${cov_prefix[@]}" ./scripts/pytest \
+              --junitxml=$TEST_OUTPUT/junit.xml \
+              --tb=short \
+              --verbose \
+              -m "not remote_cluster" \
+              -rA $TEST_SELECTION $EXTRA_PARAMS
+
+            if << parameters.save_perf_report >>; then
+              if [[ $CIRCLE_BRANCH == "main" ]]; then
+                export REPORT_FROM="$PERF_REPORT_DIR"
+                export REPORT_TO=local
+                scripts/generate_and_push_perf_report.sh
+              fi
+            fi
      - run:
          # CircleCI artifacts are preserved one file at a time, so skipping
          # this step isn't a good idea. If you want to extract the
@@ -284,6 +363,66 @@ jobs:
      # The store_test_results step tells CircleCI where to find the junit.xml file.
      - store_test_results:
          path: /tmp/test_output
+      - run:
+          name: Merge coverage data
+          command: |
+            # This will speed up workspace uploads
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
+            fi
+      # Save coverage data (if any)
+      - persist_to_workspace:
+          root: /tmp/zenith
+          paths:
+            - "*"
+
+  coverage-report:
+    executor: zenith-xlarge-executor
+    steps:
+      - attach_workspace:
+          at: /tmp/zenith
+      - checkout
+      - restore_cache:
+          name: Restore rust cache
+          keys:
+            # Require an exact match. While an out of date cache might speed up the build,
+            # there's no way to clean out old packages, so the cache grows every time something
+            # changes.
+            - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
+      - run:
+          name: Build coverage report
+          command: |
+            COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
+
+            scripts/coverage \
+              --dir=/tmp/zenith/coverage report \
+              --input-objects=/tmp/zenith/etc/binaries.list \
+              --commit-url=$COMMIT_URL \
+              --format=github
+      - run:
+          name: Upload coverage report
+          command: |
+            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
+            REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1
+            COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
+
+            scripts/git-upload \
+              --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \
+              --message="Add code coverage for $COMMIT_URL" \
+              copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
+
+            # Add link to the coverage report to the commit
+            curl -f -X POST \
+            https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
+            -H "Accept: application/vnd.github.v3+json" \
+            --user "$CI_ACCESS_TOKEN" \
+            --data \
+              "{
+                \"state\": \"success\",
+                \"context\": \"zenith-coverage\",
+                \"description\": \"Coverage report is ready\",
+                \"target_url\": \"$REPORT_URL\"
+              }"

  # Build zenithdb/zenith:latest image and push it to Docker hub
  docker-image:
@@ -300,7 +439,197 @@ jobs:
          name: Build and push Docker image
          command: |
            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
-            docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
+            DOCKER_TAG=$(git log --oneline|wc -l)
+            docker build \
+              --pull \
+              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
+              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
+              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
+              --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest .
+            docker push zenithdb/zenith:${DOCKER_TAG}
+            docker push zenithdb/zenith:latest
+
+  # Build zenithdb/compute-node:latest image and push it to Docker hub
+  docker-image-compute:
+    docker:
+      - image: cimg/base:2021.04
+    steps:
+      - checkout
+      - setup_remote_docker:
+          docker_layer_caching: true
+      # Build zenithdb/compute-tools:latest image and push it to Docker hub
+      # TODO: this should probably also use versioned tag, not just :latest.
+      # XXX: but should it? We build and use it only locally now.
+      - run:
+          name: Build and push compute-tools Docker image
+          command: |
+            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
+            docker build -t zenithdb/compute-tools:latest -f Dockerfile.compute-tools .
+            docker push zenithdb/compute-tools:latest
+      - run:
+          name: Init postgres submodule
+          command: git submodule update --init --depth 1
+      - run:
+          name: Build and push compute-node Docker image
+          command: |
+            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
+            DOCKER_TAG=$(git log --oneline|wc -l)
+            docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres
+            docker push zenithdb/compute-node:${DOCKER_TAG}
+            docker push zenithdb/compute-node:latest
+
+  # Build production zenithdb/zenith:release image and push it to Docker hub
+  docker-image-release:
+    docker:
+      - image: cimg/base:2021.04
+    steps:
+      - checkout
+      - setup_remote_docker:
+          docker_layer_caching: true
+      - run:
+          name: Init postgres submodule
+          command: git submodule update --init --depth 1
+      - run:
+          name: Build and push Docker image
+          command: |
+            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
+            DOCKER_TAG="release-$(git log --oneline|wc -l)"
+            docker build \
+              --pull \
+              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
+              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
+              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
+              --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release .
+            docker push zenithdb/zenith:${DOCKER_TAG}
+            docker push zenithdb/zenith:release
+
+  # Build production zenithdb/compute-node:release image and push it to Docker hub
+  docker-image-compute-release:
+    docker:
+      - image: cimg/base:2021.04
+    steps:
+      - checkout
+      - setup_remote_docker:
+          docker_layer_caching: true
+      # Build zenithdb/compute-tools:release image and push it to Docker hub
+      # TODO: this should probably also use versioned tag, not just :release.
+      # XXX: but should it? We build and use it only locally now.
+      - run:
+          name: Build and push compute-tools Docker image
+          command: |
+            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
+            docker build -t zenithdb/compute-tools:release -f Dockerfile.compute-tools .
+            docker push zenithdb/compute-tools:release
+      - run:
+          name: Init postgres submodule
+          command: git submodule update --init --depth 1
+      - run:
+          name: Build and push compute-node Docker image
+          command: |
+            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
+            DOCKER_TAG="release-$(git log --oneline|wc -l)"
+            docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres
+            docker push zenithdb/compute-node:${DOCKER_TAG}
+            docker push zenithdb/compute-node:release
+
+  deploy-staging:
+    docker:
+      - image: cimg/python:3.10
+    steps:
+      - checkout
+      - setup_remote_docker
+      - run:
+          name: Setup ansible
+          command: |
+            pip install --progress-bar off --user ansible boto3
+      - run:
+          name: Redeploy
+          command: |
+            cd "$(pwd)/.circleci/ansible"
+
+            ./get_binaries.sh
+
+            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
+            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+            chmod 0600 ssh-key
+            ssh-add ssh-key
+            rm -f ssh-key ssh-key-cert.pub
+
+            ansible-playbook deploy.yaml -i staging.hosts
+            rm -f zenith_install.tar.gz .zenith_current_version
+
+  deploy-staging-proxy:
+    docker:
+      - image: cimg/base:2021.04
+    environment:
+      KUBECONFIG: .kubeconfig
+    steps:
+      - checkout
+      - run:
+          name: Store kubeconfig file
+          command: |
+            echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
+            chmod 0600 ${KUBECONFIG}
+      - run:
+          name: Setup helm v3
+          command: |
+            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+            helm repo add zenithdb https://zenithdb.github.io/helm-charts
+      - run:
+          name: Re-deploy proxy
+          command: |
+            DOCKER_TAG=$(git log --oneline|wc -l)
+            helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+
+
+  deploy-release:
+    docker:
+      - image: cimg/python:3.10
+    steps:
+      - checkout
+      - setup_remote_docker
+      - run:
+          name: Setup ansible
+          command: |
+            pip install --progress-bar off --user ansible boto3
+      - run:
+          name: Redeploy
+          command: |
+            cd "$(pwd)/.circleci/ansible"
+
+            RELEASE=true ./get_binaries.sh
+
+            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
+            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+            chmod 0600 ssh-key
+            ssh-add ssh-key
+            rm -f ssh-key ssh-key-cert.pub
+
+            ansible-playbook deploy.yaml -i production.hosts -e console_mgmt_base_url=http://console-release.local
+            rm -f zenith_install.tar.gz .zenith_current_version
+
+  deploy-release-proxy:
+    docker:
+      - image: cimg/base:2021.04
+    environment:
+      KUBECONFIG: .kubeconfig
+    steps:
+      - checkout
+      - run:
+          name: Store kubeconfig file
+          command: |
+            echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
+            chmod 0600 ${KUBECONFIG}
+      - run:
+          name: Setup helm v3
+          command: |
+            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+            helm repo add zenithdb https://zenithdb.github.io/helm-charts
+      - run:
+          name: Re-deploy proxy
+          command: |
+            DOCKER_TAG="release-$(git log --oneline|wc -l)"
+            helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait

  # Trigger a new remote CI job
  remote-ci-trigger:
@@ -365,6 +694,7 @@ workflows:
            - build-postgres-<< matrix.build_type >>
      - run-pytest:
          name: pg_regress-tests-<< matrix.build_type >>
+          context: PERF_TEST_RESULT_CONNSTR
          matrix:
            parameters:
              build_type: ["debug", "release"]
@@ -382,11 +712,19 @@ workflows:
            - build-zenith-<< matrix.build_type >>
      - run-pytest:
          name: benchmarks
+          context: PERF_TEST_RESULT_CONNSTR
          build_type: release
          test_selection: performance
          run_in_parallel: false
+          save_perf_report: true
          requires:
            - build-zenith-release
+      - coverage-report:
+          # Context passes credentials for gh api
+          context: CI_ACCESS_TOKEN
+          requires:
+            # TODO: consider adding more
+            - other-tests-debug
      - docker-image:
          # Context gives an ability to login
          context: Docker Hub
@@ -398,6 +736,76 @@ workflows:
          requires:
            - pg_regress-tests-release
            - other-tests-release
+      - docker-image-compute:
+          # Context gives an ability to login
+          context: Docker Hub
+          # Build image only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - pg_regress-tests-release
+            - other-tests-release
+      - deploy-staging:
+          # Context gives an ability to login
+          context: Docker Hub
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+      - deploy-staging-proxy:
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+
+      - docker-image-release:
+          # Context gives an ability to login
+          context: Docker Hub
+          # Build image only for commits to release
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - pg_regress-tests-release
+            - other-tests-release
+      - docker-image-compute-release:
+          # Context gives an ability to login
+          context: Docker Hub
+          # Build image only for commits to release
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - pg_regress-tests-release
+            - other-tests-release
+      - deploy-release:
+          # Context gives an ability to login
+          context: Docker Hub
+          # deploy only for commits to release
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - docker-image-release
+      - deploy-release-proxy:
+          # deploy only for commits to release
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - docker-image-release
      - remote-ci-trigger:
          # Context passes credentials for gh api
          context: CI_ACCESS_TOKEN
--- a/.circleci/helm-values/production.proxy.yaml
+++ b/.circleci/helm-values/production.proxy.yaml
@@ -0,0 +1,35 @@
+# Helm chart values for zenith-proxy.
+# This is a YAML-formatted file.
+
+settings:
+  authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/"
+  uri: "https://console.zenith.tech/psql_session/"
+
+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy
+  zenith_env: production
+  zenith_region: us-west-2
+  zenith_region_slug: oregon
+
+service:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
+    external-dns.alpha.kubernetes.io/hostname: proxy-release.local
+  type: LoadBalancer
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: start.zenith.tech
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/staging.proxy.yaml
+++ b/.circleci/helm-values/staging.proxy.yaml
@@ -0,0 +1,27 @@
+# Helm chart values for zenith-proxy.
+# This is a YAML-formatted file.
+
+settings:
+  authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/"
+  uri: "https://console.stage.zenith.tech/psql_session/"
+
+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy
+  zenith_env: staging
+  zenith_region: us-east-1
+  zenith_region_slug: virginia
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: start.stage.zenith.tech
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -3,7 +3,7 @@ name: benchmarking
 on:
  # uncomment to run on push for debugging your PR
  # push:
-  #   branches: [ mybranch ]
+  #   branches: [ your branch ]
  schedule:
    # * is a special character in YAML so you have to quote this string
    #          ┌───────────── minute (0 - 59)
@@ -15,9 +15,6 @@ on:

  workflow_dispatch: # adds ability to run this manually

-env:
-  BASE_URL: "https://console.zenith.tech"
-
 jobs:
  bench:
    # this workflow runs on self hosteed runner
@@ -35,32 +32,24 @@ jobs:
    - name: Checkout zenith repo
      uses: actions/checkout@v2

-    - name: Checkout zenith-perf-data repo
-      uses: actions/checkout@v2
-      with:
-        repository: zenithdb/zenith-perf-data
-        token: ${{ secrets.VIP_VAP_ACCESS_TOKEN }}
-        ref: master
-        path: zenith-perf-data
-
    # actions/setup-python@v2 is not working correctly on self-hosted runners
    # see https://github.com/actions/setup-python/issues/162
    # and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
    # so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs.
-    # there is Python 3.7.10 already installed on the machine so use it to install pipenv and then use pipenv's virtuealenvs
-    - name: Install pipenv & deps
+    # there is Python 3.7.10 already installed on the machine so use it to install poetry and then use poetry's virtualenvs
+    - name: Install poetry & deps
      run: |
-        python3 -m pip install --upgrade pipenv wheel
-        # since pip/pipenv caches are reused there shouldn't be any troubles with install every time
-        pipenv install
+        python3 -m pip install --upgrade poetry wheel
+        # since pip/poetry caches are reused there shouldn't be any troubles with install every time
+        ./scripts/pysync

    - name: Show versions
      run: |
        echo Python
        python3 --version
-        pipenv run python3 --version
+        poetry run python3 --version
        echo Pipenv
-        pipenv --version
+        poetry --version
        echo Pgbench
        $PG_BIN/pgbench --version

@@ -70,25 +59,14 @@ jobs:
    # So use pre created cluster. It needs to be started manually, but stop is automatic after 5 minutes of inactivity
    - name: Setup cluster
      env:
-        BENCHMARK_CONSOLE_USER_PASSWORD: "${{ secrets.BENCHMARK_CONSOLE_USER_PASSWORD }}"
-        BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
-        BENCHMARK_CLUSTER_ID: "${{ secrets.BENCHMARK_CLUSTER_ID }}"
+        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
      shell: bash
      run: |
        set -e

        echo "Starting cluster"
-        CLUSTER=$(curl -s --fail --show-error -X POST $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID/start \
-            -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
-        echo $CLUSTER | python -m json.tool
-
-        echo "Waiting for cluster to become ready"
-        sleep 10
-
-        echo "CLUSTER_ID=$BENCHMARK_CLUSTER_ID" >> $GITHUB_ENV
-        CLUSTER=$(curl -s --fail --show-error -X GET $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID.json \
-            -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
-        echo $CLUSTER | python -m json.tool
+        # wake up the cluster
+        $PG_BIN/psql $BENCHMARK_CONNSTR -c "SELECT 1"

    - name: Run benchmark
      # pgbench is installed system wide from official repo
@@ -108,18 +86,18 @@ jobs:
        TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000"
        TEST_PG_BENCH_SCALES_MATRIX: "10,15"
        PLATFORM: "zenith-staging"
-        BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
        REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
      run: |
-        mkdir -p zenith-perf-data/data/staging
-        pipenv run pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir zenith-perf-data/data/staging
+        # just to be sure that no data was cached on self hosted runner
+        # since it might generate duplicates when calling ingest_perf_test_result.py
+        rm -rf perf-report-staging
+        mkdir -p perf-report-staging
+        ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging

    - name: Submit result
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
      run: |
-        cd zenith-perf-data
-        git add data
-        git commit --author="vipvap <vipvap@zenith.tech>" -m "add performance test result for $GITHUB_SHA zenith revision"
-        git push https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git master
+        REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -64,10 +64,11 @@ jobs:
            target
          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}

+      # Use `env CARGO_INCREMENTAL=0` to mitigate https://github.com/rust-lang/rust/issues/91696 for rustc 1.57.0
      - name: Run cargo build
        run: |
-          cargo build --workspace --bins --examples --tests
+          env CARGO_INCREMENTAL=0 cargo build --workspace --bins --examples --tests

      - name: Run cargo test
        run: |
-          cargo test -- --nocapture --test-threads=1
+          env CARGO_INCREMENTAL=0 cargo test -- --nocapture --test-threads=1
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,7 @@ test_output/
 .vscode
 /.zenith
 /integration_tests/.zenith
+
+# Coverage
+*.profraw
+*.profdata
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,6 @@
 [workspace]
 members = [
+    "compute_tools",
    "control_plane",
    "pageserver",
    "postgres_ffi",
@@ -15,3 +16,8 @@ members = [
 # This is useful for profiling and, to some extent, debug.
 # Besides, debug info should not affect the performance.
 debug = true
+
+# This is only needed for proxy's tests
+# TODO: we should probably fork tokio-postgres-rustls instead
+[patch.crates-io]
+tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
--- a/86
+++ b/86
@@ -1,62 +1,62 @@
+# Build Postgres
 #
-# Docker image for console integration testing.
-#
+#FROM zimg/rust:1.56 AS pg-build
+FROM zenithdb/build:buster-20220309 AS pg-build
+WORKDIR /pg
+
+USER root
+
+COPY vendor/postgres vendor/postgres
+COPY Makefile Makefile

-#
-# Build Postgres separately --- this layer will be rebuilt only if one of
-# mentioned paths will get any changes.
-#
-FROM zenithdb/build:buster AS pg-build
-WORKDIR /zenith
-COPY ./vendor/postgres vendor/postgres
-COPY ./Makefile Makefile
 ENV BUILD_TYPE release
-RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
-RUN rm -rf postgres_install/build
+RUN set -e \
+    && make -j $(nproc) -s postgres \
+    && rm -rf tmp_install/build \
+    && tar -C tmp_install -czf /postgres_install.tar.gz .

-#
 # Build zenith binaries
 #
-# TODO: build cargo deps as separate layer. We used cargo-chef before but that was
-# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
-#
-FROM zenithdb/build:buster AS build
+#FROM zimg/rust:1.56 AS build
+FROM zenithdb/build:buster-20220309 AS build
+ARG GIT_VERSION=local

-ARG GIT_VERSION
-RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi
-
-WORKDIR /zenith
-COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
+ARG CACHEPOT_BUCKET=zenith-rust-cachepot
+ARG AWS_ACCESS_KEY_ID
+ARG AWS_SECRET_ACCESS_KEY
+#ENV RUSTC_WRAPPER cachepot
+ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot

+COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
 COPY . .
-RUN GIT_VERSION=$GIT_VERSION cargo build --release

+RUN cargo build --release
+
+# Build final image
 #
-# Copy binaries to resulting image.
-#
-FROM debian:buster-slim
+FROM debian:bullseye-slim
 WORKDIR /data

-RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \
-    mkdir zenith_install
+RUN set -e \
+    && apt-get update \
+    && apt-get install -y \
+        libreadline-dev \
+        libseccomp-dev \
+        openssl \
+        ca-certificates \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
+    && useradd -d /data zenith \
+    && chown -R zenith:zenith /data
+
+COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy      /usr/local/bin
+
+COPY --from=pg-build /pg/tmp_install/         /usr/local/
+COPY --from=pg-build /postgres_install.tar.gz /data/

-COPY --from=build /zenith/target/release/pageserver /usr/local/bin
-COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
-COPY --from=build /zenith/target/release/proxy /usr/local/bin
-COPY --from=pg-build /zenith/tmp_install postgres_install
 COPY docker-entrypoint.sh /docker-entrypoint.sh

-# Remove build artifacts (~ 500 MB)
-RUN rm -rf postgres_install/build && \
-    # 'Install' Postgres binaries locally
-    cp -r postgres_install/* /usr/local/ && \
-    # Prepare an archive of Postgres binaries (should be around 11 MB)
-    # and keep it inside container for an ease of deploy pipeline.
-    cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \
-    rm -rf postgres_install
-
-RUN useradd -d /data zenith && chown -R zenith:zenith /data
-
 VOLUME ["/data"]
 USER zenith
 EXPOSE 6400
--- a/Dockerfile.build
+++ b/Dockerfile.build
@@ -1,15 +1,23 @@
-#
-# Image with all the required dependencies to build https://github.com/zenithdb/zenith
-# and Postgres from https://github.com/zenithdb/postgres
-# Also includes some rust development and build tools.
-#
-FROM rust:slim-buster
-WORKDIR /zenith
+FROM rust:1.56.1-slim-buster
+WORKDIR /home/circleci/project

-# Install postgres and zenith build dependencies
-# clang is for rocksdb
-RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-                                          libseccomp-dev pkg-config libssl-dev clang
+RUN set -e \
+    && apt-get update \
+    && apt-get -yq install \
+        automake \
+        libtool \
+        build-essential \
+        bison \
+        flex \
+        libreadline-dev \
+        zlib1g-dev \
+        libxml2-dev \
+        libseccomp-dev \
+        pkg-config \
+        libssl-dev \
+        clang

-# Install rust tools
-RUN rustup component add clippy && cargo install cargo-audit
+RUN set -e \
+    && rustup component add clippy \
+    && cargo install cargo-audit \
+    && cargo install --git https://github.com/paritytech/cachepot
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -0,0 +1,14 @@
+# First transient image to build compute_tools binaries
+# NB: keep in sync with rust image version in .circleci/config.yml
+FROM rust:1.56.1-slim-buster AS rust-build
+
+WORKDIR /zenith
+
+COPY . .
+
+RUN cargo build -p compute_tools --release
+
+# Final image that only has one binary
+FROM debian:buster-slim
+
+COPY --from=rust-build /zenith/target/release/zenith_ctl /usr/local/bin/zenith_ctl
--- a/30
+++ b/30
@@ -1,30 +0,0 @@
-[[source]]
-url = "https://pypi.python.org/simple"
-verify_ssl = true
-name = "pypi"
-
-[packages]
-pytest = ">=6.0.0"
-typing-extensions = "*"
-pyjwt = {extras = ["crypto"], version = "*"}
-requests = "*"
-pytest-xdist = "*"
-asyncpg = "*"
-cached-property = "*"
-psycopg2-binary = "*"
-jinja2 = "*"
-
-[dev-packages]
-# Behavior may change slightly between versions. These are run continuously,
-# so we pin exact versions to avoid suprising breaks. Update if comfortable.
-yapf = "==0.31.0"
-mypy = "==0.910"
-# Non-pinned packages follow.
-pipenv = "*"
-flake8 = "*"
-types-requests = "*"
-types-psycopg2 = "*"
-
-[requires]
-# we need at least 3.7, but pipenv doesn't allow to say this directly
-python_version = "3"
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,652 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.python.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "asyncpg": {
-            "hashes": [
-                "sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
-                "sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
-                "sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
-                "sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
-                "sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
-                "sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
-                "sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
-                "sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
-                "sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
-                "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
-                "sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
-                "sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
-                "sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
-            ],
-            "index": "pypi",
-            "version": "==0.24.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
-                "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==21.2.0"
-        },
-        "cached-property": {
-            "hashes": [
-                "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
-                "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
-            ],
-            "index": "pypi",
-            "version": "==1.5.2"
-        },
-        "certifi": {
-            "hashes": [
-                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
-                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
-            ],
-            "version": "==2021.10.8"
-        },
-        "cffi": {
-            "hashes": [
-                "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
-                "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
-                "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
-                "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
-                "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
-                "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
-                "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
-                "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
-                "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
-                "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
-                "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
-                "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
-                "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
-                "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
-                "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
-                "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
-                "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
-                "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
-                "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
-                "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
-                "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
-                "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
-                "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
-                "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
-                "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
-                "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
-                "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
-                "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
-                "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
-                "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
-                "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
-                "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
-                "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
-                "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
-                "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
-                "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
-                "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
-                "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
-                "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
-                "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
-                "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
-                "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
-                "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
-                "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
-                "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
-                "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
-                "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
-                "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
-                "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
-                "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
-            ],
-            "version": "==1.15.0"
-        },
-        "charset-normalizer": {
-            "hashes": [
-                "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
-                "sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==2.0.7"
-        },
-        "cryptography": {
-            "hashes": [
-                "sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
-                "sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
-                "sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
-                "sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
-                "sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
-                "sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
-                "sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
-                "sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
-                "sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
-                "sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
-                "sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
-                "sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
-                "sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
-                "sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
-                "sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
-                "sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
-                "sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
-                "sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
-                "sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
-                "sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
-            ],
-            "version": "==35.0.0"
-        },
-        "execnet": {
-            "hashes": [
-                "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
-                "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==1.9.0"
-        },
-        "idna": {
-            "hashes": [
-                "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
-                "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==3.3"
-        },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
-                "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==4.8.1"
-        },
-        "iniconfig": {
-            "hashes": [
-                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
-                "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
-            ],
-            "version": "==1.1.1"
-        },
-        "jinja2": {
-            "hashes": [
-                "sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
-                "sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
-            ],
-            "index": "pypi",
-            "version": "==3.0.2"
-        },
-        "markupsafe": {
-            "hashes": [
-                "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
-                "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64",
-                "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b",
-                "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194",
-                "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567",
-                "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff",
-                "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724",
-                "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74",
-                "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646",
-                "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35",
-                "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6",
-                "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a",
-                "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6",
-                "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad",
-                "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26",
-                "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38",
-                "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac",
-                "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7",
-                "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6",
-                "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047",
-                "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75",
-                "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f",
-                "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b",
-                "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135",
-                "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8",
-                "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a",
-                "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a",
-                "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1",
-                "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9",
-                "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864",
-                "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914",
-                "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee",
-                "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f",
-                "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18",
-                "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8",
-                "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2",
-                "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d",
-                "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b",
-                "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b",
-                "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86",
-                "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6",
-                "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f",
-                "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb",
-                "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833",
-                "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28",
-                "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e",
-                "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415",
-                "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902",
-                "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f",
-                "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d",
-                "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9",
-                "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d",
-                "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145",
-                "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066",
-                "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c",
-                "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1",
-                "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a",
-                "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207",
-                "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f",
-                "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53",
-                "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd",
-                "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134",
-                "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85",
-                "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9",
-                "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5",
-                "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94",
-                "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509",
-                "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51",
-                "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==2.0.1"
-        },
-        "packaging": {
-            "hashes": [
-                "sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966",
-                "sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==21.2"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
-                "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==1.0.0"
-        },
-        "psycopg2-binary": {
-            "hashes": [
-                "sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975",
-                "sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd",
-                "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616",
-                "sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2",
-                "sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90",
-                "sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a",
-                "sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e",
-                "sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d",
-                "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f",
-                "sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed",
-                "sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a",
-                "sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140",
-                "sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32",
-                "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759",
-                "sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31",
-                "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e",
-                "sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a",
-                "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c",
-                "sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917",
-                "sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf",
-                "sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7",
-                "sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0",
-                "sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72",
-                "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698",
-                "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773",
-                "sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68",
-                "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76",
-                "sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4",
-                "sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f",
-                "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a",
-                "sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34",
-                "sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce",
-                "sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a",
-                "sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e"
-            ],
-            "index": "pypi",
-            "version": "==2.9.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
-                "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.10.0"
-        },
-        "pycparser": {
-            "hashes": [
-                "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
-                "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.20"
-        },
-        "pyjwt": {
-            "extras": [
-                "crypto"
-            ],
-            "hashes": [
-                "sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
-                "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
-            ],
-            "index": "pypi",
-            "version": "==2.3.0"
-        },
-        "pyparsing": {
-            "hashes": [
-                "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
-                "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.4.7"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
-                "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
-            ],
-            "index": "pypi",
-            "version": "==6.2.5"
-        },
-        "pytest-forked": {
-            "hashes": [
-                "sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
-                "sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==1.3.0"
-        },
-        "pytest-xdist": {
-            "hashes": [
-                "sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
-                "sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
-            ],
-            "index": "pypi",
-            "version": "==2.4.0"
-        },
-        "requests": {
-            "hashes": [
-                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
-                "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
-            ],
-            "index": "pypi",
-            "version": "==2.26.0"
-        },
-        "toml": {
-            "hashes": [
-                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
-                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.10.2"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
-                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
-                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
-            ],
-            "index": "pypi",
-            "version": "==3.10.0.2"
-        },
-        "urllib3": {
-            "hashes": [
-                "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
-                "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
-            "version": "==1.26.7"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
-                "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.6.0"
-        }
-    },
-    "develop": {
-        "backports.entry-points-selectable": {
-            "hashes": [
-                "sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
-                "sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
-            ],
-            "markers": "python_version >= '2.7'",
-            "version": "==1.1.0"
-        },
-        "certifi": {
-            "hashes": [
-                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
-                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
-            ],
-            "version": "==2021.10.8"
-        },
-        "distlib": {
-            "hashes": [
-                "sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
-                "sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
-            ],
-            "version": "==0.3.3"
-        },
-        "filelock": {
-            "hashes": [
-                "sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
-                "sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.3.2"
-        },
-        "flake8": {
-            "hashes": [
-                "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
-                "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
-            ],
-            "index": "pypi",
-            "version": "==4.0.1"
-        },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
-                "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==4.8.1"
-        },
-        "mccabe": {
-            "hashes": [
-                "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
-                "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
-            ],
-            "version": "==0.6.1"
-        },
-        "mypy": {
-            "hashes": [
-                "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
-                "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
-                "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
-                "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
-                "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
-                "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
-                "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
-                "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
-                "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
-                "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
-                "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
-                "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
-                "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
-                "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
-                "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
-                "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
-                "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
-                "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
-                "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
-                "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
-                "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
-                "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
-                "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
-            ],
-            "index": "pypi",
-            "version": "==0.910"
-        },
-        "mypy-extensions": {
-            "hashes": [
-                "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
-                "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
-            ],
-            "version": "==0.4.3"
-        },
-        "pipenv": {
-            "hashes": [
-                "sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
-                "sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
-            ],
-            "index": "pypi",
-            "version": "==2021.5.29"
-        },
-        "platformdirs": {
-            "hashes": [
-                "sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
-                "sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==2.4.0"
-        },
-        "pycodestyle": {
-            "hashes": [
-                "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
-                "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==2.8.0"
-        },
-        "pyflakes": {
-            "hashes": [
-                "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
-                "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.4.0"
-        },
-        "six": {
-            "hashes": [
-                "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
-                "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.16.0"
-        },
-        "toml": {
-            "hashes": [
-                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
-                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.10.2"
-        },
-        "typed-ast": {
-            "hashes": [
-                "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
-                "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
-                "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
-                "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
-                "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
-                "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
-                "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
-                "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
-                "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
-                "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
-                "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
-                "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
-                "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
-                "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
-                "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
-                "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
-                "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
-                "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
-                "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
-                "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
-                "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
-                "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
-                "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
-                "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
-                "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
-                "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
-                "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
-                "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
-                "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
-                "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==1.4.3"
-        },
-        "types-psycopg2": {
-            "hashes": [
-                "sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
-                "sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
-            ],
-            "index": "pypi",
-            "version": "==2.9.1"
-        },
-        "types-requests": {
-            "hashes": [
-                "sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
-                "sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
-            ],
-            "index": "pypi",
-            "version": "==2.25.11"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
-                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
-                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
-            ],
-            "index": "pypi",
-            "version": "==3.10.0.2"
-        },
-        "virtualenv": {
-            "hashes": [
-                "sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814",
-                "sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==20.10.0"
-        },
-        "virtualenv-clone": {
-            "hashes": [
-                "sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
-                "sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.5.7"
-        },
-        "yapf": {
-            "hashes": [
-                "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
-                "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
-            ],
-            "index": "pypi",
-            "version": "==0.31.0"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
-                "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.6.0"
-        }
-    }
-}
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 # Zenith

-Zenith substitutes PostgreSQL storage layer and redistributes data across a cluster of nodes
+Zenith is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.

 ## Architecture overview

-A Zenith installation consists of Compute nodes and Storage engine.
+A Zenith installation consists of compute nodes and the Zenith storage engine.

-Compute nodes are stateless PostgreSQL nodes, backed by zenith storage.
+Compute nodes are stateless PostgreSQL nodes, backed by the Zenith storage engine.

 Zenith storage engine consists of two major components:
 - Pageserver. Scalable storage backend for compute nodes.
@@ -28,12 +28,12 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec
 libssl-dev clang pkg-config libpq-dev
 ```

-[Rust] 1.55 or later is also required.
+[Rust] 1.56.1 or later is also required.

 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.

 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.7 or higher), and install python3 packages using `pipenv install` in the project directory.
+Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.

 2. Build zenith and patched postgres
 ```sh
@@ -128,8 +128,7 @@ INSERT 0 1
 ```sh
 git clone --recursive https://github.com/zenithdb/zenith.git
 make # builds also postgres and installs it to ./tmp_install
-cd test_runner
-pipenv run pytest
+./scripts/pytest
 ```

 ## Documentation
--- a/compute_tools/.dockerignore
+++ b/compute_tools/.dockerignore
@@ -0,0 +1 @@
+target
--- a/compute_tools/.gitignore
+++ b/compute_tools/.gitignore
@@ -0,0 +1 @@
+target
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "compute_tools"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+libc = "0.2"
+anyhow = "1.0"
+chrono = "0.4"
+clap = "3.0"
+env_logger = "0.9"
+hyper = { version = "0.14", features = ["full"] }
+log = { version = "0.4", features = ["std", "serde"] }
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+regex = "1"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1"
+tar = "0.4"
+tokio = { version = "1", features = ["macros", "rt", "rt-multi-thread"] }
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -0,0 +1,81 @@
+# Compute node tools
+
+Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
+`ExecStart` option. It will handle all the `zenith` specifics during compute node
+initialization:
+- `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
+- Every start is a fresh start, so the data directory is removed and
+  initialized again on each run.
+- Next it will put configuration files into the `PGDATA` directory.
+- Sync safekeepers and get commit LSN.
+- Get `basebackup` from pageserver using the LSN returned by the previous step.
+- Try to start `postgres` and wait until it is ready to accept connections.
+- Check and alter/drop/create roles and databases.
+- Hang waiting on the `postmaster` process to exit.
+
+Also `zenith_ctl` spawns two separate service threads:
+- `compute-monitor` checks the last Postgres activity timestamp and saves it
+  into the shared `ComputeState`;
+- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
+  last activity requests.
+
+Usage example:
+```sh
+zenith_ctl -D /var/db/postgres/compute \
+           -C 'postgresql://zenith_admin@localhost/postgres' \
+           -S /var/db/postgres/specs/current.json \
+           -b /usr/local/bin/postgres
+```
+
+## Tests
+
+Cargo formatter:
+```sh
+cargo fmt
+```
+
+Run tests:
+```sh
+cargo test
+```
+
+Clippy linter:
+```sh
+cargo clippy --all --all-targets -- -Dwarnings -Drust-2018-idioms
+```
+
+## Cross-platform compilation
+
+Imagine that you are on macOS (x86) and you want a Linux GNU (`x86_64-unknown-linux-gnu` platform in `rust` terminology) executable.
+
+### Using docker
+
+You can use a throw-away Docker container ([rustlang/rust](https://hub.docker.com/r/rustlang/rust/) image) for doing that:
+```sh
+docker run --rm \
+    -v $(pwd):/compute_tools \
+    -w /compute_tools \
+    -t rustlang/rust:nightly cargo build --release --target=x86_64-unknown-linux-gnu
+```
+or, as a one-liner:
+```sh
+docker run --rm -v $(pwd):/compute_tools -w /compute_tools -t rust:latest cargo build --release --target=x86_64-unknown-linux-gnu
+```
+
+### Using rust native cross-compilation
+
+Another way is to add `x86_64-unknown-linux-gnu` target on your host system:
+```sh
+rustup target add x86_64-unknown-linux-gnu
+```
+
+Install macOS cross-compiler toolchain:
+```sh
+brew tap SergioBenitez/osxct
+brew install x86_64-unknown-linux-gnu
+```
+
+And finally run `cargo build`:
+```sh
+CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=x86_64-unknown-linux-gnu-gcc cargo build --target=x86_64-unknown-linux-gnu --release
+```
--- a/compute_tools/rustfmt.toml
+++ b/compute_tools/rustfmt.toml
@@ -0,0 +1 @@
+max_width = 100
--- a/compute_tools/src/bin/zenith_ctl.rs
+++ b/compute_tools/src/bin/zenith_ctl.rs
@@ -0,0 +1,249 @@
+//!
+//! Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
+//! `ExecStart` option. It will handle all the `zenith` specifics during compute node
+//! initialization:
+//! - `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
+//! - Every start is a fresh start, so the data directory is removed and
+//!   initialized again on each run.
+//! - Next it will put configuration files into the `PGDATA` directory.
+//! - Sync safekeepers and get commit LSN.
+//! - Get `basebackup` from pageserver using the LSN returned on the previous step.
+//! - Try to start `postgres` and wait until it is ready to accept connections.
+//! - Check and alter/drop/create roles and databases.
+//! - Hang waiting on the `postmaster` process to exit.
+//!
+//! Also `zenith_ctl` spawns two separate service threads:
+//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
+//!   into the shared `ComputeState`;
+//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
+//!   last activity requests.
+//!
+//! Usage example:
+//! ```sh
+//! zenith_ctl -D /var/db/postgres/compute \
+//!            -C 'postgresql://zenith_admin@localhost/postgres' \
+//!            -S /var/db/postgres/specs/current.json \
+//!            -b /usr/local/bin/postgres
+//! ```
+//!
+use std::fs::File;
+use std::panic;
+use std::path::Path;
+use std::process::{exit, Command, ExitStatus};
+use std::sync::{Arc, RwLock};
+
+use anyhow::{Context, Result};
+use chrono::Utc;
+use clap::Arg;
+use log::info;
+use postgres::{Client, NoTls};
+
+use compute_tools::config;
+use compute_tools::http_api::launch_http_server;
+use compute_tools::logger::*;
+use compute_tools::monitor::launch_monitor;
+use compute_tools::params::*;
+use compute_tools::pg_helpers::*;
+use compute_tools::spec::*;
+use compute_tools::zenith::*;
+
+/// Do all the preparations like PGDATA directory creation, configuration,
+/// safekeepers sync, basebackup, etc.
+///
+/// The pageserver connection string, tenant id and timeline id are looked up in
+/// the cluster settings of the spec; the function panics if any of them is
+/// missing (or if the spec carries no operation UUID). All other failures are
+/// returned as errors.
+///
+/// The shared state is held under a read lock for the whole duration of the call.
+fn prepare_pgdata(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
+    let state = state.read().unwrap();
+    let spec = &state.spec;
+    let pgdata_path = Path::new(&state.pgdata);
+    let pageserver_connstr = spec
+        .cluster
+        .settings
+        .find("zenith.page_server_connstring")
+        .expect("pageserver connstr should be provided");
+    let tenant = spec
+        .cluster
+        .settings
+        .find("zenith.zenith_tenant")
+        .expect("tenant id should be provided");
+    // NB: this is the timeline lookup, so report the timeline (not the tenant)
+    // as the missing setting.
+    let timeline = spec
+        .cluster
+        .settings
+        .find("zenith.zenith_timeline")
+        .expect("timeline id should be provided");
+
+    info!(
+        "starting cluster #{}, operation #{}",
+        spec.cluster.cluster_id,
+        spec.operation_uuid.as_ref().unwrap()
+    );
+
+    // Remove/create an empty pgdata directory and put configuration there.
+    create_pgdata(&state.pgdata)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+
+    info!("starting safekeepers syncing");
+    let lsn = sync_safekeepers(&state.pgdata, &state.pgbin)
+        .with_context(|| "failed to sync safekeepers")?;
+    info!("safekeepers synced at LSN {}", lsn);
+
+    info!(
+        "getting basebackup@{} from pageserver {}",
+        lsn, pageserver_connstr
+    );
+    get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context(
+        || {
+            format!(
+                "failed to get basebackup@{} from pageserver {}",
+                lsn, pageserver_connstr
+            )
+        },
+    )?;
+
+    // Update pg_hba.conf received with basebackup.
+    update_pg_hba(pgdata_path)?;
+
+    Ok(())
+}
+
+/// Spawn Postgres as a child process, wait until it accepts connections and
+/// apply the roles/databases described in the spec. Once that is done the
+/// shared state is marked as ready and the call blocks until the postmaster
+/// process exits, returning its exit status.
+fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
+    // Everything that only needs to read the shared state lives in this scope,
+    // so the read lock is released as soon as the scope ends.
+    let mut postmaster = {
+        let shared = state.read().unwrap();
+        let pgdata_path = Path::new(&shared.pgdata);
+
+        // Run postgres as a child process.
+        let child = Command::new(&shared.pgbin)
+            .args(&["-D", &shared.pgdata])
+            .spawn()
+            .expect("cannot start postgres process");
+
+        // Fall back to the default Postgres port if the spec does not set one.
+        let port = match shared.spec.cluster.settings.find("port") {
+            Some(port) => port,
+            None => "5432".to_string(),
+        };
+        wait_for_postgres(&port, pgdata_path)?;
+
+        let mut client = Client::connect(&shared.connstr, NoTls)?;
+        handle_roles(&shared.spec, &mut client)?;
+        handle_databases(&shared.spec, &mut client)?;
+
+        // 'Close' connection
+        drop(client);
+
+        info!(
+            "finished configuration of cluster #{}",
+            shared.spec.cluster.cluster_id
+        );
+
+        child
+    };
+
+    // Take the write lock only for the duration of this statement, so that the
+    // HTTP API is able to serve requests while we are blocked waiting on Postgres.
+    state.write().unwrap().ready = true;
+
+    // Wait for child postgres process basically forever. In this state Ctrl+C
+    // will be propagated to postgres and it will be shut down as well.
+    let exit_status = postmaster.wait().expect("failed to wait on postgres");
+
+    Ok(exit_status)
+}
+
+/// Entry point of `zenith_ctl`: parse CLI arguments, load the cluster spec
+/// (from `--spec` JSON or from the `--spec-path` file), launch the service
+/// threads, prepare PGDATA and finally run Postgres, blocking until it exits.
+///
+/// The process exits with the exit code of the `postgres` child process
+/// (or with `1` if the child did not report an exit code).
+fn main() -> Result<()> {
+    // TODO: re-use `zenith_utils::logging` later
+    init_logger(DEFAULT_LOG_LEVEL)?;
+
+    // Env variable is set by `cargo`
+    let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
+    let matches = clap::App::new("zenith_ctl")
+        .version(version.unwrap_or("unknown"))
+        .arg(
+            Arg::new("connstr")
+                .short('C')
+                .long("connstr")
+                .value_name("DATABASE_URL")
+                .required(true),
+        )
+        .arg(
+            Arg::new("pgdata")
+                .short('D')
+                .long("pgdata")
+                .value_name("DATADIR")
+                .required(true),
+        )
+        .arg(
+            Arg::new("pgbin")
+                .short('b')
+                .long("pgbin")
+                .value_name("POSTGRES_PATH"),
+        )
+        .arg(
+            Arg::new("spec")
+                .short('s')
+                .long("spec")
+                .value_name("SPEC_JSON"),
+        )
+        .arg(
+            Arg::new("spec-path")
+                .short('S')
+                .long("spec-path")
+                .value_name("SPEC_PATH"),
+        )
+        .get_matches();
+
+    let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
+    let connstr = matches
+        .value_of("connstr")
+        .expect("Postgres connection string is required");
+    let spec = matches.value_of("spec");
+    let spec_path = matches.value_of("spec-path");
+
+    // Try to use just 'postgres' if no path is provided
+    let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
+
+    let spec: ClusterSpec = match spec {
+        // First, try to get cluster spec from the cli argument
+        Some(json) => serde_json::from_str(json)?,
+        None => {
+            // Second, try to read it from the file if path is provided
+            if let Some(sp) = spec_path {
+                let path = Path::new(sp);
+                let file = File::open(path)?;
+                serde_json::from_reader(file)?
+            } else {
+                panic!("cluster spec should be provided via --spec or --spec-path argument");
+            }
+        }
+    };
+
+    let compute_state = ComputeState {
+        connstr: connstr.to_string(),
+        pgdata: pgdata.to_string(),
+        pgbin: pgbin.to_string(),
+        spec,
+        ready: false,
+        last_active: Utc::now(),
+    };
+    let compute_state = Arc::new(RwLock::new(compute_state));
+
+    // Launch service threads first, so that we are able to serve availability
+    // requests while configuration is still in progress.
+    let _threads = vec![
+        launch_http_server(&compute_state).expect("cannot launch http endpoint thread"),
+        launch_monitor(&compute_state).expect("cannot launch compute monitor thread"),
+    ];
+
+    prepare_pgdata(&compute_state)?;
+
+    // Run compute (Postgres) and hang waiting on it. Panic if any error happens,
+    // it will help us to trigger unwind and kill postmaster as well.
+    match run_compute(&compute_state) {
+        // Propagate the exit code of postgres. `code()` is `None` when there is
+        // no exit code to report (e.g. the child was terminated by a signal);
+        // treat that as a failure.
+        Ok(ec) => exit(ec.code().unwrap_or(1)),
+        Err(error) => panic!("cannot start compute node, error: {}", error),
+    }
+}
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -0,0 +1,51 @@
+use std::fs::{File, OpenOptions};
+use std::io;
+use std::io::prelude::*;
+use std::path::Path;
+
+use anyhow::Result;
+
+use crate::pg_helpers::PgOptionsSerialize;
+use crate::zenith::ClusterSpec;
+
+/// Make sure that the text file at `path` contains `line` as one of its lines,
+/// creating the file when it is missing.
+///
+/// Returns `Ok(false)` when the line is already there (the file is left
+/// untouched) and `Ok(true)` when the line had to be written.
+pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
+    let mut options = OpenOptions::new();
+    options.read(true).write(true).create(true).append(false);
+    let mut file = options.open(path)?;
+
+    // Scan the existing content; remember whether there was any line at all,
+    // because only then a separating newline is needed before the new line.
+    let mut has_content = false;
+    for existing in io::BufReader::new(&file).lines() {
+        if existing? == line {
+            return Ok(false);
+        }
+        has_content = true;
+    }
+
+    // The scan above went through the whole file, so this write lands after
+    // the existing content.
+    let separator = if has_content { "\n" } else { "" };
+    write!(file, "{}{}", separator, line)?;
+
+    Ok(true)
+}
+
+/// Write the cluster settings from `spec` into the configuration file at `path`.
+/// The file is created if needed and any previous content is discarded.
+pub fn write_postgres_conf(path: &Path, spec: &ClusterSpec) -> Result<()> {
+    // `File::create()` truncates an already existing file.
+    let mut conf_file = File::create(path)?;
+    let settings = spec.cluster.settings.as_pg_settings();
+
+    write_zenith_managed_block(&mut conf_file, &settings)
+}
+
+// Write `buf` into `file`, surrounded by the "Managed by Zenith" begin/end
+// marker comments. Each of the three parts is terminated by a newline.
+fn write_zenith_managed_block(file: &mut File, buf: &str) -> Result<()> {
+    let parts = ["# Managed by Zenith: begin", buf, "# Managed by Zenith: end"];
+
+    for part in parts.iter() {
+        writeln!(file, "{}", part)?;
+    }
+
+    Ok(())
+}
--- a/compute_tools/src/http_api.rs
+++ b/compute_tools/src/http_api.rs
@@ -0,0 +1,73 @@
+use std::convert::Infallible;
+use std::net::SocketAddr;
+use std::sync::{Arc, RwLock};
+use std::thread;
+
+use anyhow::Result;
+use hyper::service::{make_service_fn, service_fn};
+use hyper::{Body, Method, Request, Response, Server, StatusCode};
+use log::{error, info};
+
+use crate::zenith::*;
+
+// Service function to handle all available routes:
+// - `GET /last_activity` returns the last Postgres activity timestamp (RFC3339, plain text);
+// - `GET /ready` returns `true` / `false` depending on the `ready` flag of the state;
+// - anything else gets a `404 Not Found` response.
+// The shared state is only read here; a poisoned lock makes the handler panic.
+fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Response<Body> {
+    match (req.method(), req.uri().path()) {
+        // Timestamp of the last Postgres activity in the plain text.
+        (&Method::GET, "/last_activity") => {
+            // Log the path that is actually served, so the log can be matched
+            // against the requests.
+            info!("serving /last_activity GET request");
+            let state = state.read().unwrap();
+
+            // Use RFC3339 format for consistency.
+            Response::new(Body::from(state.last_active.to_rfc3339()))
+        }
+
+        // Has compute setup process finished? -> true/false
+        (&Method::GET, "/ready") => {
+            info!("serving /ready GET request");
+            let state = state.read().unwrap();
+            Response::new(Body::from(format!("{}", state.ready)))
+        }
+
+        // Return the `404 Not Found` for any other routes.
+        _ => {
+            let mut not_found = Response::new(Body::from("404 Not Found"));
+            *not_found.status_mut() = StatusCode::NOT_FOUND;
+            not_found
+        }
+    }
+}
+
+// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
+// The server listens on all IPv4 interfaces, port 3080, and dispatches every
+// request to `routes()`. A server error is only logged; the function then returns.
+// It is called as a plain (non-async) function from the `http-endpoint` thread,
+// see `launch_http_server()`.
+#[tokio::main]
+async fn serve(state: Arc<RwLock<ComputeState>>) {
+    let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
+
+    // The state handle is cloned once per connection and then once more per
+    // request, so every `routes()` call gets its own `Arc`.
+    let make_service = make_service_fn(move |_conn| {
+        let state = state.clone();
+        async move {
+            Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
+                let state = state.clone();
+                // `routes()` always produces a response, hence `Infallible`.
+                async move { Ok::<_, Infallible>(routes(req, state)) }
+            }))
+        }
+    });
+
+    info!("starting HTTP server on {}", addr);
+
+    let server = Server::bind(&addr).serve(make_service);
+
+    // Run this server forever
+    if let Err(e) = server.await {
+        error!("server error: {}", e);
+    }
+}
+
+/// Spawn the `http-endpoint` thread running the Hyper HTTP API server and
+/// return its `JoinHandle`. Fails if the thread cannot be spawned.
+pub fn launch_http_server(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
+    // The thread gets its own handle to the shared state.
+    let shared = Arc::clone(state);
+    let builder = thread::Builder::new().name("http-endpoint".into());
+    let handle = builder.spawn(move || serve(shared))?;
+
+    Ok(handle)
+}
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -0,0 +1,13 @@
+//!
+//! Various tools and helpers to handle cluster / compute node (Postgres)
+//! configuration.
+//!
+pub mod config;
+pub mod http_api;
+#[macro_use]
+pub mod logger;
+pub mod monitor;
+pub mod params;
+pub mod pg_helpers;
+pub mod spec;
+pub mod zenith;
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -0,0 +1,43 @@
+use std::io::Write;
+
+use anyhow::Result;
+use chrono::Utc;
+use env_logger::{Builder, Env};
+
+/// `println!` that only prints when the `Info` log level is enabled.
+/// Accepts the same arguments as `println!`.
+///
+/// `log_enabled!` and `Level` are referenced unqualified, so both have to be
+/// in scope at the call site.
+macro_rules! info_println {
+    ($($tts:tt)*) => {
+        if log_enabled!(Level::Info) {
+            println!($($tts)*);
+        }
+    }
+}
+
+/// `print!` that only prints when the `Info` log level is enabled.
+/// Accepts the same arguments as `print!`.
+///
+/// `log_enabled!` and `Level` are referenced unqualified, so both have to be
+/// in scope at the call site.
+macro_rules! info_print {
+    ($($tts:tt)*) => {
+        if log_enabled!(Level::Info) {
+            print!($($tts)*);
+        }
+    }
+}
+
+/// Set up the global `env_logger` logger. The log filter is taken from the
+/// `RUST_LOG` environment variable, with `default_level` as the fallback.
+pub fn init_logger(default_level: &str) -> Result<()> {
+    let mut builder = Builder::from_env(Env::default().filter_or("RUST_LOG", default_level));
+
+    // Every record is rendered as `<UTC timestamp> [<thread name>] <LEVEL>: <message>`.
+    builder.format(|out, rec| {
+        let current = std::thread::current();
+        // Threads without a name are reported as "main".
+        let thread_name = current.name().unwrap_or("main");
+        let timestamp = Utc::now().format("%Y-%m-%d %H:%M:%S%.3f %Z");
+
+        writeln!(out, "{} [{}] {}: {}", timestamp, thread_name, rec.level(), rec.args())
+    });
+    builder.init();
+
+    Ok(())
+}
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -0,0 +1,109 @@
+use std::sync::{Arc, RwLock};
+use std::{thread, time};
+
+use anyhow::Result;
+use chrono::{DateTime, Utc};
+use log::{debug, info};
+use postgres::{Client, NoTls};
+
+use crate::zenith::ComputeState;
+
+const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
+
+// Spin in a loop and figure out the last activity time in the Postgres.
+// Then update it in the shared state. This function never returns and never
+// reports an error: connection and query failures are retried on the next
+// iteration (every `MONITOR_CHECK_INTERVAL` milliseconds).
+// XXX: the only expected panic is at `RwLock` unwrap().
+// NOTE(review): `b.get(..)` into a `String` below presumably panics as well if
+// `state` or `state_change` is NULL — confirm against the `postgres` crate docs.
+fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
+    // Suppose that `connstr` doesn't change
+    let connstr = state.read().unwrap().connstr.clone();
+    // Define `client` outside of the loop to reuse existing connection if it's active.
+    // NB: this is a `Result`, a failed connection attempt is handled inside the loop.
+    let mut client = Client::connect(&connstr, NoTls);
+    let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
+
+    info!("watching Postgres activity at {}", connstr);
+
+    loop {
+        // Should be outside of the write lock to allow others to read while we sleep.
+        thread::sleep(timeout);
+
+        match &mut client {
+            Ok(cli) => {
+                if cli.is_closed() {
+                    info!("connection to postgres closed, trying to reconnect");
+
+                    // Connection is closed, reconnect and try again.
+                    client = Client::connect(&connstr, NoTls);
+                    continue;
+                }
+
+                // Get all running client backends except ourself, use RFC3339 DateTime format.
+                // NOTE(review): the format appends a literal `Z`, which assumes that
+                // `state_change` is rendered in UTC — confirm the session time zone.
+                let backends = cli
+                    .query(
+                        "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
+                         FROM pg_stat_activity
+                         WHERE backend_type = 'client backend'
+                            AND pid != pg_backend_pid()
+                            AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors?
+                        &[],
+                    );
+                // Start from the currently known value, so a failed query changes nothing.
+                let mut last_active = state.read().unwrap().last_active;
+
+                // A failed query is silently skipped here and retried on the next iteration.
+                if let Ok(backs) = backends {
+                    let mut idle_backs: Vec<DateTime<Utc>> = vec![];
+
+                    for b in backs.into_iter() {
+                        let state: String = b.get("state");
+                        let change: String = b.get("state_change");
+
+                        if state == "idle" {
+                            let change = DateTime::parse_from_rfc3339(&change);
+                            match change {
+                                Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
+                                Err(e) => {
+                                    // An unparsable timestamp only skips this backend.
+                                    info!("cannot parse backend state_change DateTime: {}", e);
+                                    continue;
+                                }
+                            }
+                        } else {
+                            // Found non-idle backend, so the last activity is NOW.
+                            // Save it and exit the for loop. Also clear the idle backend
+                            // `state_change` timestamps array as it doesn't matter now.
+                            last_active = Utc::now();
+                            idle_backs.clear();
+                            break;
+                        }
+                    }
+
+                    // Sort idle backend `state_change` timestamps. The last one corresponds
+                    // to the last activity.
+                    idle_backs.sort();
+                    if let Some(last) = idle_backs.last() {
+                        last_active = *last;
+                    }
+                }
+
+                // Update the last activity in the shared state if we got a more recent one.
+                // The write lock is taken on every iteration and released at the end of
+                // this match arm.
+                let mut state = state.write().unwrap();
+                if last_active > state.last_active {
+                    state.last_active = last_active;
+                    debug!("set the last compute activity time to: {}", last_active);
+                }
+            }
+            Err(e) => {
+                info!("cannot connect to postgres: {}, retrying", e);
+
+                // Establish a new connection and try again.
+                client = Client::connect(&connstr, NoTls);
+            }
+        }
+    }
+}
+
+/// Spawn the `compute-monitor` thread that watches Postgres activity and
+/// return its `JoinHandle`. Fails if the thread cannot be spawned.
+pub fn launch_monitor(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
+    // The thread gets its own handle to the shared state.
+    let shared = Arc::clone(state);
+    let builder = thread::Builder::new().name("compute-monitor".into());
+    let handle = builder.spawn(move || watch_compute_activity(&shared))?;
+
+    Ok(handle)
+}
--- a/compute_tools/src/params.rs
+++ b/compute_tools/src/params.rs
@@ -0,0 +1,3 @@
+// Log level that `zenith_ctl` passes to `init_logger()`.
+pub const DEFAULT_LOG_LEVEL: &str = "info";
+// NOTE(review): no user of this constant is visible in the reviewed sources — confirm it is needed.
+pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres";
+// `pg_hba.conf` line that `update_pg_hba()` adds (if missing) to allow external connections:
+// md5 authentication for all databases and users from any IPv4 address.
+pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -0,0 +1,264 @@
+use std::net::{SocketAddr, TcpStream};
+use std::os::unix::fs::PermissionsExt;
+use std::path::Path;
+use std::process::Command;
+use std::str::FromStr;
+use std::{fs, thread, time};
+
+use anyhow::{bail, Result};
+use postgres::{Client, Transaction};
+use serde::Deserialize;
+
+const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds
+
+/// Rust representation of Postgres role info with only those fields
+/// that matter for us.
+#[derive(Clone, Deserialize)]
+pub struct Role {
+    /// Role name (`rolname` when loaded by `get_existing_roles()`).
+    pub name: PgIdent,
+    /// Password hash, if any. `Role::to_pg_options()` emits it as
+    /// `PASSWORD 'md5<value>'` and emits `PASSWORD NULL` when it is `None`.
+    pub encrypted_password: Option<String>,
+    /// Generic role options; not used by `Role::to_pg_options()` so far.
+    pub options: GenericOptions,
+}
+
+/// Rust representation of Postgres database info with only those fields
+/// that matter for us.
+#[derive(Clone, Deserialize)]
+pub struct Database {
+    /// Database name (`datname` when loaded by `get_existing_dbs()`).
+    pub name: PgIdent,
+    /// Name of the owner role; quoted as an identifier by `Database::to_pg_options()`.
+    pub owner: PgIdent,
+    /// Generic database options, serialized by `Database::to_pg_options()`
+    /// in front of the `OWNER` clause.
+    pub options: GenericOptions,
+}
+
+/// Common type representing both SQL statement params with or without value,
+/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
+/// options like `wal_level = logical`.
+#[derive(Clone, Deserialize)]
+pub struct GenericOption {
+    /// Option / parameter name, always emitted as is.
+    pub name: String,
+    /// Optional value; when it is `None` only the name is emitted.
+    pub value: Option<String>,
+    /// Value type: `"string"` values are wrapped in single quotes when
+    /// serialized, values of any other type are emitted without quotes.
+    pub vartype: String,
+}
+
+/// Optional collection of `GenericOption`'s. Type alias allows us to
+/// declare a `trait` on it.
+pub type GenericOptions = Option<Vec<GenericOption>>;
+
+impl GenericOption {
+    /// Wrap a string value in single quotes, doubling every embedded single
+    /// quote. Doubling is the escaping understood both by SQL string literals
+    /// and by `postgresql.conf`, so a value containing `'` can neither break
+    /// the generated statement / config line nor inject anything into it.
+    fn quote_string_value(val: &str) -> String {
+        format!("'{}'", val.replace('\'', "''"))
+    }
+
+    /// Represent `GenericOption` as SQL statement parameter: `name 'value'`
+    /// for `"string"` values, `name value` for values of any other type and
+    /// just `name` if there is no value.
+    pub fn to_pg_option(&self) -> String {
+        match (&self.value, self.vartype.as_str()) {
+            (Some(val), "string") => {
+                format!("{} {}", self.name, Self::quote_string_value(val))
+            }
+            (Some(val), _) => format!("{} {}", self.name, val),
+            (None, _) => self.name.to_owned(),
+        }
+    }
+
+    /// Represent `GenericOption` as configuration option: `name = 'value'`
+    /// for `"string"` values, `name = value` for values of any other type and
+    /// just `name` if there is no value.
+    pub fn to_pg_setting(&self) -> String {
+        match (&self.value, self.vartype.as_str()) {
+            (Some(val), "string") => {
+                format!("{} = {}", self.name, Self::quote_string_value(val))
+            }
+            (Some(val), _) => format!("{} = {}", self.name, val),
+            (None, _) => self.name.to_owned(),
+        }
+    }
+}
+
+/// Serialization of an option collection into the textual forms Postgres expects.
+pub trait PgOptionsSerialize {
+    fn as_pg_options(&self) -> String;
+    fn as_pg_settings(&self) -> String;
+}
+
+impl PgOptionsSerialize for GenericOptions {
+    /// Render every option as an SQL statement argument and join the results
+    /// with a single space. A missing collection yields an empty string.
+    fn as_pg_options(&self) -> String {
+        match self {
+            Some(ops) => {
+                let parts: Vec<String> = ops.iter().map(GenericOption::to_pg_option).collect();
+                parts.join(" ")
+            }
+            None => String::new(),
+        }
+    }
+
+    /// Render every option as a `postgresql.conf` line and join the results
+    /// with newlines. A missing collection yields an empty string.
+    fn as_pg_settings(&self) -> String {
+        match self {
+            Some(ops) => {
+                let parts: Vec<String> = ops.iter().map(GenericOption::to_pg_setting).collect();
+                parts.join("\n")
+            }
+            None => String::new(),
+        }
+    }
+}
+
+/// Lookup of a single option inside an option collection.
+pub trait GenericOptionsSearch {
+    fn find(&self, name: &str) -> Option<String>;
+}
+
+impl GenericOptionsSearch for GenericOptions {
+    /// Return a copy of the value of the first option called `name`.
+    /// The result is `None` when there is no collection, no such option,
+    /// or the option has no value.
+    fn find(&self, name: &str) -> Option<String> {
+        let options = self.as_ref()?;
+        let matched = options.iter().find(|candidate| candidate.name == name)?;
+
+        matched.value.clone()
+    }
+}
+
+impl Role {
+    /// Build the argument string for a role statement: `LOGIN` followed by
+    /// either the md5 password clause or `PASSWORD NULL`.
+    pub fn to_pg_options(&self) -> String {
+        // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in Rails.
+        // Generic `options` are not used for roles yet. Once they are, append
+        // `self.options.as_pg_options()` here.
+        let password_clause = match &self.encrypted_password {
+            Some(hash) => format!(" PASSWORD 'md5{}'", hash),
+            None => " PASSWORD NULL".to_string(),
+        };
+
+        let mut args = String::from("LOGIN");
+        args += &password_clause;
+        args
+    }
+}
+
+impl Database {
+    /// Build the argument string for a database statement: the generic options
+    /// followed by the `OWNER` clause with the quoted owner name.
+    /// NB: `TEMPLATE` is an identifier as well, but only `template0` and
+    /// `template1` are needed so far, so it is emitted unquoted. Proper quoting
+    /// may be required in the future.
+    pub fn to_pg_options(&self) -> String {
+        let owner_clause = format!(" OWNER {}", &self.owner.quote());
+
+        self.options.as_pg_options() + &owner_clause
+    }
+}
+
+/// Postgres identifier (DB / role name), represented as a plain string.
+pub type PgIdent = String;
+
+/// Quoting of strings that are put into Postgres SQL queries. For now it is
+/// implemented for identifiers only, literals may follow later.
+pub trait PgQuote {
+    fn quote(&self) -> String;
+}
+
+impl PgQuote for PgIdent {
+    /// Simplified counterpart of Postgres `quote_ident()`: the result is always
+    /// wrapped in `""` and every `"` inside is doubled. Not idempotent, quoting
+    /// an already quoted string escapes it once more.
+    fn quote(&self) -> String {
+        let mut quoted = String::with_capacity(self.len() + 2);
+
+        quoted.push('"');
+        for ch in self.chars() {
+            if ch == '"' {
+                // Double the embedded quote.
+                quoted.push('"');
+            }
+            quoted.push(ch);
+        }
+        quoted.push('"');
+
+        quoted
+    }
+}
+
+/// Load name and password of every role found in `pg_catalog.pg_authid`,
+/// using the given transaction. `options` of the returned roles is always `None`.
+pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
+    let rows = xact.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?;
+    let mut roles = Vec::with_capacity(rows.len());
+
+    for row in &rows {
+        roles.push(Role {
+            name: row.get("rolname"),
+            encrypted_password: row.get("rolpassword"),
+            options: None,
+        });
+    }
+
+    Ok(roles)
+}
+
+/// Load name and owner of every database found in `pg_catalog.pg_database`.
+/// `options` of the returned databases is always `None`.
+pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
+    let rows = client.query(
+            "SELECT datname, datdba::regrole::text as owner
+               FROM pg_catalog.pg_database;",
+        &[],
+    )?;
+    let mut databases = Vec::with_capacity(rows.len());
+
+    for row in &rows {
+        databases.push(Database {
+            name: row.get("datname"),
+            owner: row.get("owner"),
+            options: None,
+        });
+    }
+
+    Ok(databases)
+}
+
+/// Wait for Postgres to become ready to accept connections:
+/// - state should be `ready` in the `pgdata/postmaster.pid`
+/// - and we should be able to connect to `127.0.0.1:<port>`
+///
+/// Returns an error if `port` does not form a valid socket address, if the
+/// `tail` helper cannot be run or prints non-UTF-8 output, or if Postgres is
+/// not ready after `POSTGRES_WAIT_TIMEOUT` milliseconds of waiting.
+pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
+    let pid_path = pgdata.join("postmaster.pid");
+    let pause = time::Duration::from_millis(100);
+    // Keep the time accounting in sync with the actual pause length.
+    let pause_ms = pause.as_millis() as u64;
+    let mut slept: u64 = 0; // ms
+
+    let timeout = time::Duration::from_millis(200);
+    // The port comes from the cluster spec, so report a bad value as an error
+    // instead of panicking.
+    let addr = match SocketAddr::from_str(&format!("127.0.0.1:{}", port)) {
+        Ok(addr) => addr,
+        Err(e) => bail!("invalid Postgres port '{}': {}", port, e),
+    };
+
+    loop {
+        // Sleep POSTGRES_WAIT_TIMEOUT at max (a bit longer actually if consider a TCP timeout,
+        // but postgres starts listening almost immediately, even if it is not really
+        // ready to accept connections).
+        if slept >= POSTGRES_WAIT_TIMEOUT {
+            bail!("timed out while waiting for Postgres to start");
+        }
+
+        if pid_path.exists() {
+            // XXX: dumb and the simplest way to get the last line in a text file
+            // TODO: better use `.lines().last()` later
+            // The path is passed as is (no lossy / panicking `str` conversion).
+            let stdout = Command::new("tail")
+                .arg("-n1")
+                .arg(&pid_path)
+                .output()?
+                .stdout;
+            let status = String::from_utf8(stdout)?;
+            let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
+
+            // Now Postgres is ready to accept connections
+            if status.trim() == "ready" && can_connect {
+                break;
+            }
+        }
+
+        thread::sleep(pause);
+        slept += pause_ms;
+    }
+
+    Ok(())
+}
+
+/// Recreate the `pgdata` directory from scratch: drop whatever is there,
+/// create an empty directory and restrict its mode to `0700`.
+pub fn create_pgdata(pgdata: &str) -> Result<()> {
+    // A removal failure is ignored on purpose: most likely the directory simply
+    // does not exist ('No such file or directory (os error 2)'). Anything more
+    // serious makes `create_dir()` below fail anyway.
+    fs::remove_dir_all(pgdata).ok();
+
+    fs::create_dir(pgdata)?;
+    let owner_only = fs::Permissions::from_mode(0o700);
+    fs::set_permissions(pgdata, owner_only)?;
+
+    Ok(())
+}
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -0,0 +1,246 @@
+use std::path::Path;
+
+use anyhow::Result;
+use log::{info, log_enabled, warn, Level};
+use postgres::Client;
+
+use crate::config;
+use crate::params::PG_HBA_ALL_MD5;
+use crate::pg_helpers::*;
+use crate::zenith::ClusterSpec;
+
+/// Apply the file-based part of the cluster specification:
+/// - write the spec into `postgresql.conf` via `config::write_postgres_conf()`;
+/// - make sure `pg_hba.conf` allows external connections.
+pub fn handle_configuration(spec: &ClusterSpec, pgdata_path: &Path) -> Result<()> {
+    // `postgresql.conf` is no longer shipped inside `basebackup`, so the whole
+    // config is always written out here.
+    let postgresql_conf = pgdata_path.join("postgresql.conf");
+    config::write_postgres_conf(&postgresql_conf, spec)?;
+
+    update_pg_hba(pgdata_path)
+}
+
+/// Make sure `pg_hba.conf` contains the `PG_HBA_ALL_MD5` line, logging
+/// whether the file had to be changed.
+pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
+    // XXX: consider making it a part of spec.json
+    info!("checking pg_hba.conf");
+
+    let hba_file = pgdata_path.join("pg_hba.conf");
+    let changed = config::line_in_file(&hba_file, PG_HBA_ALL_MD5)?;
+
+    let outcome = if changed {
+        "updated pg_hba.conf to allow external connections"
+    } else {
+        "pg_hba.conf is up-to-date"
+    };
+    info!("{}", outcome);
+
+    Ok(())
+}
+
+/// Given a cluster spec and a Postgres client, handle roles creation,
+/// deletion and update. All statements run inside a single transaction that
+/// is committed at the end.
+///
+/// The order of operations is:
+/// 1. apply `delete_role` / `rename_role` delta operations (other actions are ignored here);
+/// 2. re-read the roles from Postgres;
+/// 3. create every role from `spec.cluster.roles` that does not exist yet and
+///    update those whose password differs from the spec.
+///
+/// Returns an error if any query fails or if a `rename_role` operation
+/// has no `new_name`.
+pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
+    let mut xact = client.transaction()?;
+    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
+
+    // Print a list of existing Postgres roles (only in debug mode)
+    info!("postgres roles:");
+    for r in &existing_roles {
+        info_println!(
+            "{} - {}:{}",
+            " ".repeat(27 + 5),
+            r.name,
+            if r.encrypted_password.is_some() {
+                "[FILTERED]"
+            } else {
+                "(null)"
+            }
+        );
+    }
+
+    // Process delta operations first
+    if let Some(ops) = &spec.delta_operations {
+        info!("processing delta operations on roles");
+        for op in ops {
+            match op.action.as_ref() {
+                // We do not check either role exists or not,
+                // Postgres will take care of it for us
+                "delete_role" => {
+                    let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
+
+                    warn!("deleting role '{}'", &op.name);
+                    xact.execute(query.as_str(), &[])?;
+                }
+                // Renaming role drops its password, since role name is
+                // used as a salt there.  It is important that this role
+                // is recorded with a new `name` in the `roles` list.
+                // Follow up roles update will set the new password.
+                "rename_role" => {
+                    // A rename without a target name is a malformed spec: report
+                    // it as an error (rolling back the transaction) rather than panic.
+                    let new_name = op.new_name.as_ref().ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "rename_role operation for role '{}' has no new_name",
+                            op.name
+                        )
+                    })?;
+
+                    // XXX: with a limited number of roles it is fine, but consider making it a HashMap
+                    if existing_roles.iter().any(|r| r.name == op.name) {
+                        let query: String = format!(
+                            "ALTER ROLE {} RENAME TO {}",
+                            op.name.quote(),
+                            new_name.quote()
+                        );
+
+                        warn!("renaming role '{}' to '{}'", op.name, new_name);
+                        xact.execute(query.as_str(), &[])?;
+                    }
+                }
+                _ => {}
+            }
+        }
+    }
+
+    // Refresh Postgres roles info to handle possible roles renaming
+    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
+
+    info!("cluster spec roles:");
+    for role in &spec.cluster.roles {
+        let name = &role.name;
+
+        info_print!(
+            "{} - {}:{}",
+            " ".repeat(27 + 5),
+            name,
+            if role.encrypted_password.is_some() {
+                "[FILTERED]"
+            } else {
+                "(null)"
+            }
+        );
+
+        // XXX: with a limited number of roles it is fine, but consider making it a HashMap
+        let pg_role = existing_roles.iter().find(|r| r.name == *name);
+
+        if let Some(r) = pg_role {
+            let update_role = match (&r.encrypted_password, &role.encrypted_password) {
+                // No password on either side: nothing to do.
+                (None, None) => false,
+                // Postgres stores the hash with an 'md5' prefix, the spec without it.
+                // `strip_prefix` never panics on short or non-ASCII values, and a
+                // stored password without the 'md5' prefix always counts as changed.
+                (Some(pg_pwd), Some(spec_pwd)) => {
+                    pg_pwd.strip_prefix("md5") != Some(spec_pwd.as_str())
+                }
+                // Password was added or removed.
+                _ => true,
+            };
+
+            if update_role {
+                let mut query: String = format!("ALTER ROLE {} ", name.quote());
+                info_print!(" -> update");
+
+                query.push_str(&role.to_pg_options());
+                xact.execute(query.as_str(), &[])?;
+            }
+        } else {
+            info!("role name {}", &name);
+            let mut query: String = format!("CREATE ROLE {} ", name.quote());
+            info!("role create query {}", &query);
+            info_print!(" -> create");
+
+            query.push_str(&role.to_pg_options());
+            xact.execute(query.as_str(), &[])?;
+        }
+
+        info_print!("\n");
+    }
+
+    xact.commit()?;
+
+    Ok(())
+}
+
+/// It follows mostly the same logic as `handle_roles()`, except that we
+/// do not use an explicit transaction block, since major database operations
+/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
+/// atomicity should be enough here due to the order of operations and various checks,
+/// which together provide us idempotency.
+///
+/// The order of operations is:
+/// 1. apply `delete_db` / `rename_db` delta operations (other actions are ignored here);
+/// 2. re-read the databases from Postgres;
+/// 3. create every database from `spec.cluster.databases` that does not exist
+///    yet and change the owner of those whose owner differs from the spec.
+///
+/// Returns an error if any query fails or if a `rename_db` operation
+/// has no `new_name`.
+pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
+    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+
+    // Print a list of existing Postgres databases (only in debug mode)
+    info!("postgres databases:");
+    for r in &existing_dbs {
+        info_println!("{} - {}:{}", " ".repeat(27 + 5), r.name, r.owner);
+    }
+
+    // Process delta operations first
+    if let Some(ops) = &spec.delta_operations {
+        info!("processing delta operations on databases");
+        for op in ops {
+            match op.action.as_ref() {
+                // We do not check either DB exists or not,
+                // Postgres will take care of it for us
+                "delete_db" => {
+                    let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.quote());
+
+                    warn!("deleting database '{}'", &op.name);
+                    client.execute(query.as_str(), &[])?;
+                }
+                "rename_db" => {
+                    // A rename without a target name is a malformed spec: report
+                    // it as an error rather than panic.
+                    let new_name = op.new_name.as_ref().ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "rename_db operation for database '{}' has no new_name",
+                            op.name
+                        )
+                    })?;
+
+                    // XXX: with a limited number of databases it is fine, but consider making it a HashMap
+                    if existing_dbs.iter().any(|r| r.name == op.name) {
+                        let query: String = format!(
+                            "ALTER DATABASE {} RENAME TO {}",
+                            op.name.quote(),
+                            new_name.quote()
+                        );
+
+                        warn!("renaming database '{}' to '{}'", op.name, new_name);
+                        client.execute(query.as_str(), &[])?;
+                    }
+                }
+                _ => {}
+            }
+        }
+    }
+
+    // Refresh Postgres databases info to handle possible renames
+    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+
+    info!("cluster spec databases:");
+    for db in &spec.cluster.databases {
+        let name = &db.name;
+
+        info_print!("{} - {}:{}", " ".repeat(27 + 5), db.name, db.owner);
+
+        // XXX: with a limited number of databases it is fine, but consider making it a HashMap
+        let pg_db = existing_dbs.iter().find(|r| r.name == *name);
+
+        if let Some(r) = pg_db {
+            // XXX: db owner name is returned as quoted string from Postgres,
+            // when quoting is needed. Quote the spec owner the same way before
+            // comparing, so that an unchanged owner is not re-applied.
+            let new_owner = if r.owner.starts_with('"') {
+                db.owner.quote()
+            } else {
+                db.owner.clone()
+            };
+
+            if new_owner != r.owner {
+                let query: String = format!(
+                    "ALTER DATABASE {} OWNER TO {}",
+                    name.quote(),
+                    db.owner.quote()
+                );
+                info_print!(" -> update");
+
+                client.execute(query.as_str(), &[])?;
+            }
+        } else {
+            let mut query: String = format!("CREATE DATABASE {} ", name.quote());
+            info_print!(" -> create");
+
+            query.push_str(&db.to_pg_options());
+            client.execute(query.as_str(), &[])?;
+        }
+
+        info_print!("\n");
+    }
+
+    Ok(())
+}
--- a/compute_tools/src/zenith.rs
+++ b/compute_tools/src/zenith.rs
@@ -0,0 +1,109 @@
+use std::process::{Command, Stdio};
+
+use anyhow::Result;
+use chrono::{DateTime, Utc};
+use postgres::{Client, NoTls};
+use serde::Deserialize;
+
+use crate::pg_helpers::*;
+
+/// Compute node state shared across several `zenith_ctl` threads.
+/// Should be used under `RwLock` to allow HTTP API server to serve
+/// status requests, while configuration is in progress.
+pub struct ComputeState {
+    /// Connection string.
+    /// NOTE(review): presumably for the local Postgres instance; confirm
+    /// against the code that constructs this struct.
+    pub connstr: String,
+    /// Postgres data directory.
+    /// NOTE(review): presumably the value passed as `pgdata` to
+    /// `get_basebackup()` / `sync_safekeepers()`; confirm against callers.
+    pub pgdata: String,
+    /// Postgres binary.
+    /// NOTE(review): presumably the value passed as `pgbin` to
+    /// `sync_safekeepers()`; confirm against callers.
+    pub pgbin: String,
+    /// Cluster specification, see `ClusterSpec`.
+    pub spec: ClusterSpec,
+    /// Compute setup process has finished
+    pub ready: bool,
+    /// Timestamp of the last Postgres activity
+    pub last_active: DateTime<Utc>,
+}
+
+/// Cluster spec or configuration represented as an optional number of
+/// delta operations + final cluster state description.
+///
+/// Deserialized from JSON, see `tests/cluster_spec.json` for an example.
+#[derive(Clone, Deserialize)]
+pub struct ClusterSpec {
+    /// Version of the spec format (`1.0` in the test spec).
+    pub format_version: f32,
+    /// Timestamp kept as an unparsed string
+    /// (e.g. `2021-05-23T18:25:43.511Z` in the test spec).
+    pub timestamp: String,
+    /// Optional operation identifier (a UUID string in the test spec).
+    pub operation_uuid: Option<String>,
+    /// Expected cluster state at the end of transition process.
+    pub cluster: Cluster,
+    /// Optional list of operations that `spec::handle_roles()` and
+    /// `spec::handle_databases()` apply before reconciling with `cluster`.
+    pub delta_operations: Option<Vec<DeltaOp>>,
+}
+
+/// Cluster state seen from the perspective of the external tools
+/// like Rails web console.
+#[derive(Clone, Deserialize)]
+pub struct Cluster {
+    /// Cluster identifier (e.g. `test-cluster-42` in the test spec).
+    pub cluster_id: String,
+    /// Cluster name (e.g. `Zenith Test` in the test spec).
+    pub name: String,
+    /// Optional state string (e.g. `restarted` in the test spec).
+    /// NOTE(review): the set of valid values is not visible here.
+    pub state: Option<String>,
+    /// Roles that `spec::handle_roles()` creates or updates in Postgres.
+    pub roles: Vec<Role>,
+    /// Databases that `spec::handle_databases()` creates or updates in Postgres.
+    pub databases: Vec<Database>,
+    /// Generic list of settings; the tests serialize it with `as_pg_settings()`
+    /// into `name = value` lines.
+    pub settings: GenericOptions,
+}
+
+/// Single cluster state changing operation that could not be represented as
+/// a static `Cluster` structure. For example:
+/// - DROP DATABASE
+/// - DROP ROLE
+/// - ALTER ROLE name RENAME TO new_name
+/// - ALTER DATABASE name RENAME TO new_name
+#[derive(Clone, Deserialize)]
+pub struct DeltaOp {
+    /// Kind of operation. `spec.rs` recognizes `delete_role`, `rename_role`,
+    /// `delete_db` and `rename_db`; any other value is silently skipped there.
+    pub action: String,
+    /// Name of the role or database the operation applies to.
+    pub name: PgIdent,
+    /// Target name, read only by the rename actions (which require it);
+    /// the delete actions do not look at it.
+    pub new_name: Option<PgIdent>,
+}
+
+/// Fetch a basebackup over the libpq connection described by `connstr` and
+/// unpack the received tar archive into the `pgdata` directory.
+///
+/// When `lsn` is `"0/0"` (first start of the compute) the `basebackup`
+/// command is sent without an LSN; otherwise `lsn` is appended to it.
+pub fn get_basebackup(
+    pgdata: &str,
+    connstr: &str,
+    tenant: &str,
+    timeline: &str,
+    lsn: &str,
+) -> Result<()> {
+    let command = if lsn == "0/0" {
+        // First start of the compute
+        format!("basebackup {} {}", tenant, timeline)
+    } else {
+        format!("basebackup {} {} {}", tenant, timeline, lsn)
+    };
+
+    let mut pageserver = Client::connect(connstr, NoTls)?;
+    let tar_stream = pageserver.copy_out(command.as_str())?;
+
+    // Unpack the archive straight from the COPY OUT stream.
+    tar::Archive::new(tar_stream).unpack(&pgdata)?;
+
+    Ok(())
+}
+
+/// Run `postgres` in a special mode with `--sync-safekeepers` argument
+/// and return the reported LSN back to the caller.
+///
+/// `pgbin` is the executable to run and `pgdata` is passed to it through the
+/// `PGDATA` environment variable. The returned LSN is the trimmed stdout of
+/// the process.
+///
+/// Returns an error if the process cannot be started or waited for, exits
+/// with a non-zero status, or prints non-UTF-8 data to stdout.
+pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result<String> {
+    let sync_handle = Command::new(&pgbin)
+        .args(&["--sync-safekeepers"])
+        .env("PGDATA", &pgdata) // we cannot use -D in this mode
+        .stdout(Stdio::piped())
+        .spawn()
+        // Propagate the failure instead of panicking: the caller already
+        // handles a `Result`.
+        .map_err(|e| {
+            anyhow::Error::new(e).context("postgres --sync-safekeepers failed to start")
+        })?;
+
+    // `postgres --sync-safekeepers` will print all log output to stderr and
+    // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
+    // redirected to the caller output.
+    let sync_output = sync_handle
+        .wait_with_output()
+        .map_err(|e| anyhow::Error::new(e).context("postgres --sync-safekeepers failed"))?;
+    if !sync_output.status.success() {
+        anyhow::bail!(
+            "postgres --sync-safekeepers exited with non-zero status: {}",
+            sync_output.status,
+        );
+    }
+
+    let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
+
+    Ok(lsn)
+}
--- a/compute_tools/tests/cluster_spec.json
+++ b/compute_tools/tests/cluster_spec.json
@@ -0,0 +1,205 @@
+{
+    "format_version": 1.0,
+
+    "timestamp": "2021-05-23T18:25:43.511Z",
+    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
+
+    "cluster": {
+        "cluster_id": "test-cluster-42",
+        "name": "Zenith Test",
+        "state": "restarted",
+        "roles": [
+            {
+                "name": "postgres",
+                "encrypted_password": "6b1d16b78004bbd51fa06af9eda75972",
+                "options": null
+            },
+            {
+                "name": "alexk",
+                "encrypted_password": null,
+                "options": null
+            },
+            {
+                "name": "zenith \"new\"",
+                "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972",
+                "options": null
+            },
+            {
+                "name": "zen",
+                "encrypted_password": "9b1d16b78004bbd51fa06af9eda75972"
+            },
+            {
+                "name": "\"name\";\\n select 1;",
+                "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
+            },
+            {
+                "name": "MyRole",
+                "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
+            }
+        ],
+        "databases": [
+            {
+                "name": "DB2",
+                "owner": "alexk",
+                "options": [
+                    {
+                        "name": "LC_COLLATE",
+                        "value": "C",
+                        "vartype": "string"
+                    },
+                    {
+                        "name": "LC_CTYPE",
+                        "value": "C",
+                        "vartype": "string"
+                    },
+                    {
+                        "name": "TEMPLATE",
+                        "value": "template0",
+                        "vartype": "enum"
+                    }
+                ]
+            },
+            {
+                "name": "zenith",
+                "owner": "MyRole"
+            },
+            {
+                "name": "zen",
+                "owner": "zen"
+            }
+        ],
+        "settings": [
+            {
+                "name": "fsync",
+                "value": "off",
+                "vartype": "bool"
+            },
+            {
+                "name": "wal_level",
+                "value": "replica",
+                "vartype": "enum"
+            },
+            {
+                "name": "hot_standby",
+                "value": "on",
+                "vartype": "bool"
+            },
+            {
+                "name": "wal_acceptors",
+                "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
+                "vartype": "string"
+            },
+            {
+                "name": "wal_log_hints",
+                "value": "on",
+                "vartype": "bool"
+            },
+            {
+                "name": "log_connections",
+                "value": "on",
+                "vartype": "bool"
+            },
+            {
+                "name": "shared_buffers",
+                "value": "32768",
+                "vartype": "integer"
+            },
+            {
+                "name": "port",
+                "value": "55432",
+                "vartype": "integer"
+            },
+            {
+                "name": "max_connections",
+                "value": "100",
+                "vartype": "integer"
+            },
+            {
+                "name": "max_wal_senders",
+                "value": "10",
+                "vartype": "integer"
+            },
+            {
+                "name": "listen_addresses",
+                "value": "0.0.0.0",
+                "vartype": "string"
+            },
+            {
+                "name": "wal_sender_timeout",
+                "value": "0",
+                "vartype": "integer"
+            },
+            {
+                "name": "password_encryption",
+                "value": "md5",
+                "vartype": "enum"
+            },
+            {
+                "name": "maintenance_work_mem",
+                "value": "65536",
+                "vartype": "integer"
+            },
+            {
+                "name": "max_parallel_workers",
+                "value": "8",
+                "vartype": "integer"
+            },
+            {
+                "name": "max_worker_processes",
+                "value": "8",
+                "vartype": "integer"
+            },
+            {
+                "name": "zenith.zenith_tenant",
+                "value": "b0554b632bd4d547a63b86c3630317e8",
+                "vartype": "string"
+            },
+            {
+                "name": "max_replication_slots",
+                "value": "10",
+                "vartype": "integer"
+            },
+            {
+                "name": "zenith.zenith_timeline",
+                "value": "2414a61ffc94e428f14b5758fe308e13",
+                "vartype": "string"
+            },
+            {
+                "name": "shared_preload_libraries",
+                "value": "zenith",
+                "vartype": "string"
+            },
+            {
+                "name": "synchronous_standby_names",
+                "value": "walproposer",
+                "vartype": "string"
+            },
+            {
+                "name": "zenith.page_server_connstring",
+                "value": "host=127.0.0.1 port=6400",
+                "vartype": "string"
+            }
+        ]
+    },
+
+    "delta_operations": [
+        {
+            "action": "delete_db",
+            "name": "zenith_test"
+        },
+        {
+            "action": "rename_db",
+            "name": "DB",
+            "new_name": "DB2"
+        },
+        {
+            "action": "delete_role",
+            "name": "zenith2"
+        },
+        {
+            "action": "rename_role",
+            "name": "zenith new",
+            "new_name": "zenith \"new\""
+        }
+    ]
+}
--- a/compute_tools/tests/config_test.rs
+++ b/compute_tools/tests/config_test.rs
@@ -0,0 +1,48 @@
+#[cfg(test)]
+mod config_tests {
+
+    use std::fs;
+    use std::path::Path;
+
+    use compute_tools::config::*;
+
+    /// Create (or overwrite) `path` with exactly `content`.
+    fn write_test_file(path: &Path, content: &str) {
+        fs::write(path, content).unwrap();
+    }
+
+    /// Assert that the file at `path` contains exactly `expected_content`.
+    fn check_file_content(path: &Path, expected_content: &str) {
+        let actual = fs::read_to_string(path).unwrap();
+        assert_eq!(actual, expected_content);
+    }
+
+    #[test]
+    fn test_line_in_file() {
+        let initial = "line1\nline2.1\t line2.2\nline3";
+        let existing_file = Path::new("./tests/tmp/config_test.txt");
+        write_test_file(existing_file, initial);
+
+        // A line that is already present: nothing is reported as changed
+        // and the file is left untouched.
+        assert!(!line_in_file(existing_file, "line2.1\t line2.2").unwrap());
+        check_file_content(existing_file, initial);
+
+        // A missing line: reported as changed and appended at the end.
+        assert!(line_in_file(existing_file, "line4").unwrap());
+        check_file_content(existing_file, "line1\nline2.1\t line2.2\nline3\nline4");
+
+        fs::remove_file(existing_file).unwrap();
+
+        // A file that does not exist yet: it ends up containing just the line.
+        let fresh_file = Path::new("./tests/tmp/new_config_test.txt");
+        assert!(line_in_file(fresh_file, "line4").unwrap());
+        check_file_content(fresh_file, "line4");
+
+        fs::remove_file(fresh_file).unwrap();
+    }
+}
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -0,0 +1,41 @@
+#[cfg(test)]
+mod pg_helpers_tests {
+
+    use std::fs::File;
+
+    use compute_tools::pg_helpers::*;
+    use compute_tools::zenith::ClusterSpec;
+
+    /// Load and deserialize the shared test cluster spec.
+    fn load_test_spec() -> ClusterSpec {
+        let spec_file = File::open("tests/cluster_spec.json").unwrap();
+        serde_json::from_reader(spec_file).unwrap()
+    }
+
+    /// The first database and the first role of the spec are rendered into
+    /// the expected option strings.
+    #[test]
+    fn params_serialize() {
+        let spec = load_test_spec();
+
+        let first_db = spec.cluster.databases.first().unwrap();
+        assert_eq!(
+            first_db.to_pg_options(),
+            "LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0 OWNER \"alexk\""
+        );
+
+        let first_role = spec.cluster.roles.first().unwrap();
+        assert_eq!(
+            first_role.to_pg_options(),
+            "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
+        );
+    }
+
+    /// Cluster settings are rendered as `name = value` lines, in spec order.
+    #[test]
+    fn settings_serialize() {
+        let spec = load_test_spec();
+        let rendered = spec.cluster.settings.as_pg_settings();
+
+        assert_eq!(
+            rendered,
+            "fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nzenith.zenith_tenant = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nzenith.zenith_timeline = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'zenith'\nsynchronous_standby_names = 'walproposer'\nzenith.page_server_connstring = 'host=127.0.0.1 port=6400'"
+        );
+    }
+
+    /// Embedded double quotes are doubled and the identifier is wrapped in quotes.
+    #[test]
+    fn quote_ident() {
+        let tricky: PgIdent = PgIdent::from("\"name\";\\n select 1;");
+        let quoted = tricky.quote();
+
+        assert_eq!(quoted, "\"\"\"name\"\";\\n select 1;\"");
+    }
+}
--- a/compute_tools/tests/tmp/.gitignore
+++ b/compute_tools/tests/tmp/.gitignore
@@ -0,0 +1 @@
+**/*
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -1,30 +1,22 @@
 [package]
 name = "control_plane"
 version = "0.1.0"
-authors = ["Stas Kelvich <stas@zenith.tech>"]
-edition = "2018"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+edition = "2021"

 [dependencies]
-rand = "0.8.3"
 tar = "0.4.33"
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
 serde = { version = "1.0", features = ["derive"] }
-serde_json = "1"
 toml = "0.5"
 lazy_static = "1.4"
 regex = "1"
 anyhow = "1.0"
 thiserror = "1"
-bytes = "1.0.1"
 nix = "0.23"
 url = "2.2.2"
-hex = { version = "0.4.3", features = ["serde"] }
-reqwest = { version = "0.11", features = ["blocking", "json"] }
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }

 pageserver = { path = "../pageserver" }
 walkeeper = { path = "../walkeeper" }
-postgres_ffi = { path = "../postgres_ffi" }
 zenith_utils = { path = "../zenith_utils" }
 workspace_hack = { path = "../workspace_hack" }
--- a/control_plane/safekeepers.conf
+++ b/control_plane/safekeepers.conf
@@ -1,20 +1,20 @@
 # Page server and three safekeepers.
 [pageserver]
-pg_port = 64000
-http_port = 9898
+listen_pg_addr = '127.0.0.1:64000'
+listen_http_addr = '127.0.0.1:9898'
 auth_type = 'Trust'

 [[safekeepers]]
-name = 'sk1'
+id = 1
 pg_port = 5454
 http_port = 7676

 [[safekeepers]]
-name = 'sk2'
+id = 2
 pg_port = 5455
 http_port = 7677

 [[safekeepers]]
-name = 'sk3'
+id = 3
 pg_port = 5456
 http_port = 7678
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -1,11 +1,11 @@
 # Minimal zenith environment with one safekeeper. This is equivalent to the built-in
 # defaults that you get with no --config
 [pageserver]
-pg_port = 64000
-http_port = 9898
+listen_pg_addr = '127.0.0.1:64000'
+listen_http_addr = '127.0.0.1:9898'
 auth_type = 'Trust'

 [[safekeepers]]
-name = 'single'
+id = 1
 pg_port = 5454
 http_port = 7676
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -82,15 +82,11 @@ impl ComputeControlPlane {
        let mut strings = s.split('@');
        let name = strings.next().unwrap();

-        let lsn: Option<Lsn>;
-        if let Some(lsnstr) = strings.next() {
-            lsn = Some(
-                Lsn::from_str(lsnstr)
-                    .with_context(|| "invalid LSN in point-in-time specification")?,
-            );
-        } else {
-            lsn = None
-        }
+        let lsn = strings
+            .next()
+            .map(Lsn::from_str)
+            .transpose()
+            .context("invalid LSN in point-in-time specification")?;

        // Resolve the timeline ID, given the human-readable branch name
        let timeline_id = self
@@ -199,17 +195,24 @@ impl PostgresNode {
        })
    }

-    fn sync_safekeepers(&self) -> Result<Lsn> {
+    fn sync_safekeepers(&self, auth_token: &Option<String>) -> Result<Lsn> {
        let pg_path = self.env.pg_bin_dir().join("postgres");
-        let sync_handle = Command::new(pg_path)
-            .arg("--sync-safekeepers")
+        let mut cmd = Command::new(&pg_path);
+
+        cmd.arg("--sync-safekeepers")
            .env_clear()
            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
            .env("PGDATA", self.pgdata().to_str().unwrap())
            .stdout(Stdio::piped())
            // Comment this to avoid capturing stderr (useful if command hangs)
-            .stderr(Stdio::piped())
+            .stderr(Stdio::piped());
+
+        if let Some(token) = auth_token {
+            cmd.env("ZENITH_AUTH_TOKEN", token);
+        }
+
+        let sync_handle = cmd
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");

@@ -246,16 +249,16 @@ impl PostgresNode {
        let mut client = self
            .pageserver
            .page_server_psql_client()
-            .with_context(|| "connecting to page server failed")?;
+            .context("connecting to page server failed")?;

        let copyreader = client
            .copy_out(sql.as_str())
-            .with_context(|| "page server 'basebackup' command failed")?;
+            .context("page server 'basebackup' command failed")?;

        // Read the archive directly from the `CopyOutReader`
        tar::Archive::new(copyreader)
            .unpack(&self.pgdata())
-            .with_context(|| "extracting base backup failed")?;
+            .context("extracting base backup failed")?;

        Ok(())
    }
@@ -289,8 +292,10 @@ impl PostgresNode {
        conf.append("shared_buffers", "1MB");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
-        conf.append("wal_sender_timeout", "0");
        conf.append("wal_level", "replica");
+        // wal_sender_timeout is the maximum time to wait for WAL replication.
+        // It also defines how often the walreceiver will send a feedback message to the wal sender.
+        conf.append("wal_sender_timeout", "5s");
        conf.append("listen_addresses", &self.address.ip().to_string());
        conf.append("port", &self.address.port().to_string());

@@ -315,8 +320,11 @@ impl PostgresNode {
            } else {
                ""
            };
-
-            format!("host={} port={} password={}", host, port, password)
+            // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
+            // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN
+            // We parse this string and build it back with token from env var, and for simplicity rebuild
+            // uses only needed variables namely host, port, user, password.
+            format!("postgresql://no_user:{}@{}:{}", password, host, port)
        };
        conf.append("shared_preload_libraries", "zenith");
        conf.append_line("");
@@ -326,7 +334,24 @@ impl PostgresNode {
        if let Some(lsn) = self.lsn {
            conf.append("recovery_target_lsn", &lsn.to_string());
        }
+
        conf.append_line("");
+        // Configure backpressure
+        // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
+        //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
+        //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
+        //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
+        //   updated pages are not requested from pageserver.
+        // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
+        //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
+        //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
+        //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
+        // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
+        //   To be able to restore database in case of pageserver node crash, safekeeper should not
+        //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
+        //   (if they are not able to upload WAL to S3).
+        conf.append("max_replication_write_lag", "500MB");
+        conf.append("max_replication_flush_lag", "10GB");

        if !self.env.safekeepers.is_empty() {
            // Configure the node to connect to the safekeepers
@@ -341,6 +366,11 @@ impl PostgresNode {
                .join(",");
            conf.append("wal_acceptors", &wal_acceptors);
        } else {
+            // We only use setup without safekeepers for tests,
+            // and don't care about data durability on pageserver,
+            // so set more relaxed synchronous_commit.
+            conf.append("synchronous_commit", "remote_write");
+
            // Configure the node to stream WAL directly to the pageserver
            // This isn't really a supported configuration, but can be useful for
            // testing.
@@ -354,7 +384,7 @@ impl PostgresNode {
        Ok(())
    }

-    fn load_basebackup(&self) -> Result<()> {
+    fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
        let backup_lsn = if let Some(lsn) = self.lsn {
            Some(lsn)
        } else if self.uses_wal_proposer {
@@ -362,7 +392,7 @@ impl PostgresNode {
            // latest data from the pageserver. That is a bit clumsy but whole bootstrap
            // procedure evolves quite actively right now, so let's think about it again
            // when things would be more stable (TODO).
-            let lsn = self.sync_safekeepers()?;
+            let lsn = self.sync_safekeepers(auth_token)?;
            if lsn == Lsn(0) {
                None
            } else {
@@ -413,11 +443,10 @@ impl PostgresNode {
        .env_clear()
        .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
        .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
-
        if let Some(token) = auth_token {
            cmd.env("ZENITH_AUTH_TOKEN", token);
        }
-        let pg_ctl = cmd.status().with_context(|| "pg_ctl failed")?;
+        let pg_ctl = cmd.status().context("pg_ctl failed")?;

        if !pg_ctl.success() {
            anyhow::bail!("pg_ctl failed");
@@ -447,7 +476,7 @@ impl PostgresNode {
        fs::write(&postgresql_conf_path, postgresql_conf)?;

        // 3. Load basebackup
-        self.load_basebackup()?;
+        self.load_basebackup(auth_token)?;

        if self.lsn.is_some() {
            File::create(self.pgdata().join("standby.signal"))?;
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -9,6 +9,7 @@
 use anyhow::{anyhow, bail, Context, Result};
 use std::fs;
 use std::path::Path;
+use std::process::Command;

 pub mod compute;
 pub mod local_env;
@@ -31,3 +32,19 @@ pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
    }
    Ok(pid)
 }
+
+fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
+    let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
+
+    let var = "LLVM_PROFILE_FILE";
+    if let Some(val) = std::env::var_os(var) {
+        cmd.env(var, val);
+    }
+
+    const RUST_LOG_KEY: &str = "RUST_LOG";
+    if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
+        cmd.env(RUST_LOG_KEY, rust_log_value)
+    } else {
+        cmd
+    }
+}
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -1,10 +1,9 @@
-//
-// This module is responsible for locating and loading paths in a local setup.
-//
-// Now it also provides init method which acts like a stub for proper installation
-// script which will use local paths.
-//
-use anyhow::{Context, Result};
+//! This module is responsible for locating and loading paths in a local setup.
+//!
+//! Now it also provides init method which acts like a stub for proper installation
+//! script which will use local paths.
+
+use anyhow::{bail, Context};
 use serde::{Deserialize, Serialize};
 use std::env;
 use std::fmt::Write;
@@ -13,7 +12,9 @@ use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
 use zenith_utils::postgres_backend::AuthType;
-use zenith_utils::zid::ZTenantId;
+use zenith_utils::zid::{HexZTenantId, ZNodeId, ZTenantId};
+
+use crate::safekeeper::SafekeeperNode;

 //
 // This data structures represents zenith CLI config
@@ -46,9 +47,8 @@ pub struct LocalEnv {

    // Default tenant ID to use with the 'zenith' command line utility, when
    // --tenantid is not explicitly specified.
-    #[serde(with = "opt_tenantid_serde")]
    #[serde(default)]
-    pub default_tenantid: Option<ZTenantId>,
+    pub default_tenantid: Option<HexZTenantId>,

    // used to issue tokens during e.g pg start
    #[serde(default)]
@@ -63,9 +63,11 @@ pub struct LocalEnv {
 #[derive(Serialize, Deserialize, Clone, Debug)]
 #[serde(default)]
 pub struct PageServerConf {
+    // node id
+    pub id: ZNodeId,
    // Pageserver connection settings
-    pub pg_port: u16,
-    pub http_port: u16,
+    pub listen_pg_addr: String,
+    pub listen_http_addr: String,

    // used to determine which auth type is used
    pub auth_type: AuthType,
@@ -77,10 +79,11 @@ pub struct PageServerConf {
 impl Default for PageServerConf {
    fn default() -> Self {
        Self {
-            pg_port: 0,
-            http_port: 0,
+            id: ZNodeId(0),
+            listen_pg_addr: String::new(),
+            listen_http_addr: String::new(),
            auth_type: AuthType::Trust,
-            auth_token: "".to_string(),
+            auth_token: String::new(),
        }
    }
 }
@@ -88,7 +91,7 @@ impl Default for PageServerConf {
 #[derive(Serialize, Deserialize, Clone, Debug)]
 #[serde(default)]
 pub struct SafekeeperConf {
-    pub name: String,
+    pub id: ZNodeId,
    pub pg_port: u16,
    pub http_port: u16,
    pub sync: bool,
@@ -97,7 +100,7 @@ pub struct SafekeeperConf {
 impl Default for SafekeeperConf {
    fn default() -> Self {
        Self {
-            name: "".to_string(),
+            id: ZNodeId(0),
            pg_port: 0,
            http_port: 0,
            sync: true,
@@ -114,11 +117,11 @@ impl LocalEnv {
        self.pg_distrib_dir.join("lib")
    }

-    pub fn pageserver_bin(&self) -> Result<PathBuf> {
+    pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
        Ok(self.zenith_distrib_dir.join("pageserver"))
    }

-    pub fn safekeeper_bin(&self) -> Result<PathBuf> {
+    pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
        Ok(self.zenith_distrib_dir.join("safekeeper"))
    }

@@ -137,15 +140,15 @@ impl LocalEnv {
        self.base_data_dir.clone()
    }

-    pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf {
-        self.base_data_dir.join("safekeepers").join(node_name)
+    pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf {
+        self.base_data_dir.join("safekeepers").join(data_dir_name)
    }

    /// Create a LocalEnv from a config file.
    ///
    /// Unlike 'load_config', this function fills in any defaults that are missing
    /// from the config file.
-    pub fn create_config(toml: &str) -> Result<LocalEnv> {
+    pub fn create_config(toml: &str) -> anyhow::Result<Self> {
        let mut env: LocalEnv = toml::from_str(toml)?;

        // Find postgres binaries.
@@ -159,7 +162,7 @@ impl LocalEnv {
            }
        }
        if !env.pg_distrib_dir.join("bin/postgres").exists() {
-            anyhow::bail!(
+            bail!(
                "Can't find postgres binary at {}",
                env.pg_distrib_dir.display()
            );
@@ -169,16 +172,19 @@ impl LocalEnv {
        if env.zenith_distrib_dir == Path::new("") {
            env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
        }
-        if !env.zenith_distrib_dir.join("pageserver").exists() {
-            anyhow::bail!("Can't find pageserver binary.");
-        }
-        if !env.zenith_distrib_dir.join("safekeeper").exists() {
-            anyhow::bail!("Can't find safekeeper binary.");
+        for binary in ["pageserver", "safekeeper"] {
+            if !env.zenith_distrib_dir.join(binary).exists() {
+                bail!(
+                    "Can't find binary '{}' in zenith distrib dir '{}'",
+                    binary,
+                    env.zenith_distrib_dir.display()
+                );
+            }
        }

        // If no initial tenant ID was given, generate it.
        if env.default_tenantid.is_none() {
-            env.default_tenantid = Some(ZTenantId::generate());
+            env.default_tenantid = Some(HexZTenantId::from(ZTenantId::generate()));
        }

        env.base_data_dir = base_path();
@@ -187,11 +193,11 @@ impl LocalEnv {
    }

    /// Locate and load config
-    pub fn load_config() -> Result<LocalEnv> {
+    pub fn load_config() -> anyhow::Result<Self> {
        let repopath = base_path();

        if !repopath.exists() {
-            anyhow::bail!(
+            bail!(
                "Zenith config is not found in {}. You need to run 'zenith init' first",
                repopath.to_str().unwrap()
            );
@@ -209,7 +215,7 @@ impl LocalEnv {
    }

    // this function is used only for testing purposes in CLI e g generate tokens during init
-    pub fn generate_auth_token(&self, claims: &Claims) -> Result<String> {
+    pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
        let private_key_path = if self.private_key_path.is_absolute() {
            self.private_key_path.to_path_buf()
        } else {
@@ -223,14 +229,14 @@ impl LocalEnv {
    //
    // Initialize a new Zenith repository
    //
-    pub fn init(&mut self) -> Result<()> {
+    pub fn init(&mut self) -> anyhow::Result<()> {
        // check if config already exists
        let base_path = &self.base_data_dir;
        if base_path == Path::new("") {
-            anyhow::bail!("repository base path is missing");
+            bail!("repository base path is missing");
        }
        if base_path.exists() {
-            anyhow::bail!(
+            bail!(
                "directory '{}' already exists. Perhaps already initialized?",
                base_path.to_str().unwrap()
            );
@@ -249,14 +255,14 @@ impl LocalEnv {
                .arg("2048")
                .stdout(Stdio::null())
                .output()
-                .with_context(|| "failed to generate auth private key")?;
+                .context("failed to generate auth private key")?;
            if !keygen_output.status.success() {
-                anyhow::bail!(
+                bail!(
                    "openssl failed: '{}'",
                    String::from_utf8_lossy(&keygen_output.stderr)
                );
            }
-            self.private_key_path = Path::new("auth_private_key.pem").to_path_buf();
+            self.private_key_path = PathBuf::from("auth_private_key.pem");

            let public_key_path = base_path.join("auth_public_key.pem");
            // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
@@ -268,9 +274,9 @@ impl LocalEnv {
                .args(&["-out", public_key_path.to_str().unwrap()])
                .stdout(Stdio::null())
                .output()
-                .with_context(|| "failed to generate auth private key")?;
+                .context("failed to generate auth private key")?;
            if !keygen_output.status.success() {
-                anyhow::bail!(
+                bail!(
                    "openssl failed: '{}'",
                    String::from_utf8_lossy(&keygen_output.stderr)
                );
@@ -282,8 +288,8 @@ impl LocalEnv {

        fs::create_dir_all(self.pg_data_dirs_path())?;

-        for safekeeper in self.safekeepers.iter() {
-            fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
+        for safekeeper in &self.safekeepers {
+            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
        }

        let mut conf_content = String::new();
@@ -325,30 +331,3 @@ fn base_path() -> PathBuf {
        None => ".zenith".into(),
    }
 }
-
-/// Serde routines for Option<ZTenantId>. The serialized form is a hex string.
-mod opt_tenantid_serde {
-    use serde::{Deserialize, Deserializer, Serialize, Serializer};
-    use std::str::FromStr;
-    use zenith_utils::zid::ZTenantId;
-
-    pub fn serialize<S>(tenantid: &Option<ZTenantId>, ser: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        tenantid.map(|t| t.to_string()).serialize(ser)
-    }
-
-    pub fn deserialize<'de, D>(des: D) -> Result<Option<ZTenantId>, D::Error>
-    where
-        D: Deserializer<'de>,
-    {
-        let s: Option<String> = Option::deserialize(des)?;
-        if let Some(s) = s {
-            return Ok(Some(
-                ZTenantId::from_str(&s).map_err(serde::de::Error::custom)?,
-            ));
-        }
-        Ok(None)
-    }
-}
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -4,7 +4,7 @@
 /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
 /// enough to extract a few settings we need in Zenith, assuming you don't do
 /// funny stuff like include-directives or funny escaping.
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{bail, Context, Result};
 use lazy_static::lazy_static;
 use regex::Regex;
 use std::collections::HashMap;
@@ -78,7 +78,7 @@ impl PostgresConf {
        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
    {
        self.get(field_name)
-            .ok_or_else(|| anyhow!("could not find '{}' option {}", field_name, context))?
+            .with_context(|| format!("could not find '{}' option {}", field_name, context))?
            .parse::<T>()
            .with_context(|| format!("could not parse '{}' option {}", field_name, context))
    }
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -14,14 +14,14 @@ use postgres::Config;
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
+use walkeeper::http::models::TimelineCreateRequest;
 use zenith_utils::http::error::HttpErrorBody;
-use zenith_utils::postgres_backend::AuthType;
+use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};

 use crate::local_env::{LocalEnv, SafekeeperConf};
-use crate::read_pidfile;
 use crate::storage::PageServerNode;
+use crate::{fill_rust_env_vars, read_pidfile};
 use zenith_utils::connstring::connection_address;
-use zenith_utils::connstring::connection_host_port;

 #[derive(Error, Debug)]
 pub enum SafekeeperHttpError {
@@ -63,7 +63,7 @@ impl ResponseErrorMessageExt for Response {
 //
 #[derive(Debug)]
 pub struct SafekeeperNode {
-    pub name: String,
+    pub id: ZNodeId,

    pub conf: SafekeeperConf,

@@ -79,15 +79,15 @@ impl SafekeeperNode {
    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
        let pageserver = Arc::new(PageServerNode::from_env(env));

-        println!("initializing for {} for {}", conf.name, conf.http_port);
+        println!("initializing for sk {} for {}", conf.id, conf.http_port);

        SafekeeperNode {
-            name: conf.name.clone(),
+            id: conf.id,
            conf: conf.clone(),
            pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://localhost:{}/v1", conf.http_port),
+            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
            pageserver,
        }
    }
@@ -95,13 +95,17 @@ impl SafekeeperNode {
    /// Construct libpq connection string for connecting to this safekeeper.
    fn safekeeper_connection_config(port: u16) -> Config {
        // TODO safekeeper authentication not implemented yet
-        format!("postgresql://no_user@localhost:{}/no_db", port)
+        format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
            .parse()
            .unwrap()
    }

+    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf {
+        env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
+    }
+
    pub fn datadir_path(&self) -> PathBuf {
-        self.env.safekeeper_data_dir(&self.name)
+        SafekeeperNode::datadir_path_by_id(&self.env, self.id)
    }

    pub fn pid_file(&self) -> PathBuf {
@@ -116,36 +120,20 @@ impl SafekeeperNode {
        );
        io::stdout().flush().unwrap();

-        // Configure connection to page server
-        //
-        // FIXME: We extract the host and port from the connection string instead of using
-        // the connection string directly, because the 'safekeeper' binary expects
-        // host:port format. That's a bit silly when we already have a full libpq connection
-        // string at hand.
-        let pageserver_conn = {
-            let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
-            format!("{}:{}", host, port)
-        };
+        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
+        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);

-        let listen_pg = format!("localhost:{}", self.conf.pg_port);
-        let listen_http = format!("localhost:{}", self.conf.http_port);
-
-        let mut cmd: &mut Command = &mut Command::new(self.env.safekeeper_bin()?);
-        cmd = cmd
-            .args(&["-D", self.datadir_path().to_str().unwrap()])
-            .args(&["--listen-pg", &listen_pg])
-            .args(&["--listen-http", &listen_http])
-            .args(&["--pageserver", &pageserver_conn])
-            .args(&["--recall", "1 second"])
-            .arg("--daemonize")
-            .env_clear()
-            .env("RUST_BACKTRACE", "1");
+        let mut cmd = Command::new(self.env.safekeeper_bin()?);
+        fill_rust_env_vars(
+            cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
+                .args(&["--id", self.id.to_string().as_ref()])
+                .args(&["--listen-pg", &listen_pg])
+                .args(&["--listen-http", &listen_http])
+                .args(&["--recall", "1 second"])
+                .arg("--daemonize"),
+        );
        if !self.conf.sync {
-            cmd = cmd.arg("--no-sync");
-        }
-
-        if self.env.pageserver.auth_type == AuthType::ZenithJWT {
-            cmd.env("PAGESERVER_AUTH_TOKEN", &self.env.pageserver.auth_token);
+            cmd.arg("--no-sync");
        }

        if !cmd.status()?.success() {
@@ -202,7 +190,7 @@ impl SafekeeperNode {
    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
        let pid_file = self.pid_file();
        if !pid_file.exists() {
-            println!("Safekeeper {} is already stopped", self.name);
+            println!("Safekeeper {} is already stopped", self.id);
            return Ok(());
        }
        let pid = read_pidfile(&pid_file)?;
@@ -274,4 +262,25 @@ impl SafekeeperNode {
            .error_from_body()?;
        Ok(())
    }
+
+    pub fn timeline_create(
+        &self,
+        tenant_id: ZTenantId,
+        timeline_id: ZTimelineId,
+        peer_ids: Vec<ZNodeId>,
+    ) -> Result<()> {
+        Ok(self
+            .http_request(
+                Method::POST,
+                format!("{}/{}", self.http_base_url, "timeline"),
+            )
+            .json(&TimelineCreateRequest {
+                tenant_id,
+                timeline_id,
+                peer_ids,
+            })
+            .send()?
+            .error_from_body()?
+            .json()?)
+    }
 }
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -5,7 +5,7 @@ use std::process::Command;
 use std::time::Duration;
 use std::{io, result, thread};

-use anyhow::{anyhow, bail};
+use anyhow::bail;
 use nix::errno::Errno;
 use nix::sys::signal::{kill, Signal};
 use nix::unistd::Pid;
@@ -19,7 +19,7 @@ use zenith_utils::postgres_backend::AuthType;
 use zenith_utils::zid::ZTenantId;

 use crate::local_env::LocalEnv;
-use crate::read_pidfile;
+use crate::{fill_rust_env_vars, read_pidfile};
 use pageserver::branches::BranchInfo;
 use pageserver::tenant_mgr::TenantInfo;
 use zenith_utils::connstring::connection_address;
@@ -78,62 +78,78 @@ impl PageServerNode {
            ""
        };

-        PageServerNode {
+        Self {
            pg_connection_config: Self::pageserver_connection_config(
                password,
-                env.pageserver.pg_port,
+                &env.pageserver.listen_pg_addr,
            ),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://localhost:{}/v1", env.pageserver.http_port),
+            http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
        }
    }

    /// Construct libpq connection string for connecting to the pageserver.
-    fn pageserver_connection_config(password: &str, port: u16) -> Config {
-        format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
+    fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
+        format!("postgresql://no_user:{}@{}/no_db", password, listen_addr)
            .parse()
            .unwrap()
    }

-    pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> {
+    pub fn init(
+        &self,
+        create_tenant: Option<&str>,
+        config_overrides: &[&str],
+    ) -> anyhow::Result<()> {
        let mut cmd = Command::new(self.env.pageserver_bin()?);
-        let listen_pg = format!("localhost:{}", self.env.pageserver.pg_port);
-        let listen_http = format!("localhost:{}", self.env.pageserver.http_port);
-        let mut args = vec![
-            "--init",
-            "-D",
-            self.env.base_data_dir.to_str().unwrap(),
-            "--postgres-distrib",
-            self.env.pg_distrib_dir.to_str().unwrap(),
-            "--listen-pg",
-            &listen_pg,
-            "--listen-http",
-            &listen_http,
-        ];

-        let auth_type_str = &self.env.pageserver.auth_type.to_string();
-        if self.env.pageserver.auth_type != AuthType::Trust {
-            args.extend(&["--auth-validation-public-key-path", "auth_public_key.pem"]);
+        let id = format!("id={}", self.env.pageserver.id);
+
+        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotes, etc.
+        let base_data_dir_param = self.env.base_data_dir.display().to_string();
+        let pg_distrib_dir_param =
+            format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display());
+        let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type);
+        let listen_http_addr_param = format!(
+            "listen_http_addr='{}'",
+            self.env.pageserver.listen_http_addr
+        );
+        let listen_pg_addr_param =
+            format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
+        let mut args = Vec::with_capacity(20);
+
+        args.push("--init");
+        args.extend(["-D", &base_data_dir_param]);
+        args.extend(["-c", &pg_distrib_dir_param]);
+        args.extend(["-c", &authg_type_param]);
+        args.extend(["-c", &listen_http_addr_param]);
+        args.extend(["-c", &listen_pg_addr_param]);
+        args.extend(["-c", &id]);
+
+        for config_override in config_overrides {
+            args.extend(["-c", config_override]);
+        }
+
+        if self.env.pageserver.auth_type != AuthType::Trust {
+            args.extend([
+                "-c",
+                "auth_validation_public_key_path='auth_public_key.pem'",
+            ]);
        }
-        args.extend(&["--auth-type", auth_type_str]);

        if let Some(tenantid) = create_tenant {
-            args.extend(&["--create-tenant", tenantid])
+            args.extend(["--create-tenant", tenantid])
        }

-        let status = cmd
-            .args(args)
-            .env_clear()
-            .env("RUST_BACKTRACE", "1")
+        let status = fill_rust_env_vars(cmd.args(args))
            .status()
            .expect("pageserver init failed");

-        if status.success() {
-            Ok(())
-        } else {
-            Err(anyhow!("pageserver init failed"))
+        if !status.success() {
+            bail!("pageserver init failed");
        }
+
+        Ok(())
    }

    pub fn repo_path(&self) -> PathBuf {
@@ -144,7 +160,7 @@ impl PageServerNode {
        self.repo_path().join("pageserver.pid")
    }

-    pub fn start(&self) -> anyhow::Result<()> {
+    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        print!(
            "Starting pageserver at '{}' in '{}'",
            connection_address(&self.pg_connection_config),
@@ -153,10 +169,15 @@ impl PageServerNode {
        io::stdout().flush().unwrap();

        let mut cmd = Command::new(self.env.pageserver_bin()?);
-        cmd.args(&["-D", self.repo_path().to_str().unwrap()])
-            .arg("--daemonize")
-            .env_clear()
-            .env("RUST_BACKTRACE", "1");
+
+        let repo_path = self.repo_path();
+        let mut args = vec!["-D", repo_path.to_str().unwrap()];
+
+        for config_override in config_overrides {
+            args.extend(["-c", config_override]);
+        }
+
+        fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));

        if !cmd.status()?.success() {
            bail!(
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -4,10 +4,10 @@ set -eux
 if [ "$1" = 'pageserver' ]; then
    if [ ! -d "/data/tenants" ]; then
        echo "Initializing pageserver data directory"
-        pageserver --init -D /data --postgres-distrib /usr/local
+        pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10"
    fi
    echo "Staring pageserver at 0.0.0.0:6400"
-    pageserver -l 0.0.0.0:6400 --listen-http 0.0.0.0:9898 -D /data
+    pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data
 else
    "$@"
 fi
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,32 +7,14 @@ Currently we build two main images:
 - [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
 - [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).

-And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
+And additional intermediate images:

- [zenithdb/build](https://hub.docker.com/repository/docker/zenithdb/build) — image with all the dependencies required to build Zenith and compute node images. This image is based on `rust:slim-buster`, so it also has a proper `rust` environment. Built from [/Dockerfile.build](/Dockerfile.build).
 - [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.

 ## Building pipeline

 1. Image `zenithdb/compute-tools` is re-built automatically.

-2. Image `zenithdb/build` is built manually. If you want to introduce any new compile time dependencies to Zenith or compute node you have to update this image as well, build it and push to Docker Hub.
+2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.

-Build:
-```sh
-docker build -t zenithdb/build:buster -f Dockerfile.build .
-```
-
-Login:
-```sh
-docker login
-```
-
-Push to Docker Hub:
-```sh
-docker push zenithdb/build:buster
-```
-
-3. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
-
-4. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.
+3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.
--- a/docs/glossary.md
+++ b/docs/glossary.md
@@ -2,6 +2,16 @@

 ### Authentication

+### Backpressure
+
+Backpressure is used to limit the lag between pageserver and compute node or WAL service.
+
+If compute node or WAL service run far ahead of Page Server,
+the time of serving page requests increases. This may lead to timeout errors.
+
+To tune backpressure limits use `max_replication_write_lag`, `max_replication_flush_lag` and `max_replication_apply_lag` settings.
+When the lag between the current LSN (`pg_current_wal_flush_lsn()` at the compute node) and the minimal write/flush/apply position of the replica exceeds the limit,
+backends performing writes are blocked until the replica has caught up.
 ### Base image (page image)

 ### Basebackup
@@ -76,7 +86,37 @@ The layer map tracks what layers exist for all the relishes in a timeline.
 Zenith repository implementation that keeps data in layers.
 ### LSN

+The Log Sequence Number (LSN) is a unique identifier of a WAL record in the WAL log.
+The insert position is a byte offset into the logs, increasing monotonically with each new record.
+Internally, an LSN is a 64-bit integer, representing a byte position in the write-ahead log stream.
+It is printed as two hexadecimal numbers of up to 8 digits each, separated by a slash.
+Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html)
+Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery.

+In PostgreSQL and Zenith, LSNs are used to describe certain points in WAL handling.
+
+PostgreSQL LSNs and functions to monitor them:
+* `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location.
+* `pg_current_wal_lsn()` - Returns the current write-ahead log write location.
+* `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location.
+* `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically.
+* `pg_last_wal_replay_lsn()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
+[Source: PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html)
+
+Zenith safekeeper LSNs. For more check [walkeeper/README_PROTO.md](/walkeeper/README_PROTO.md)
+* `CommitLSN`: position in WAL confirmed by a quorum of safekeepers.
+* `RestartLSN`: position in WAL confirmed by all safekeepers.
+* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
+* `VCL`: the largest LSN for which we can guarantee availability of all prior records.
+
+Zenith pageserver LSNs:
+* `last_record_lsn` - the end of last processed WAL record.
+* `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN.
+* `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash.
+TODO: use this name consistently in remote storage code. Now `disk_consistent_lsn` is used and meaning depends on the context.
+* `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created)
+
+TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs.
 ### Page (block)

 The basic structure used to store relation data. All pages are of the same size.
--- a/docs/pageserver-tenant-migration.md
+++ b/docs/pageserver-tenant-migration.md
@@ -0,0 +1,22 @@
+## Pageserver tenant migration
+
+### Overview
+
+This feature allows migrating a timeline from one pageserver to another by utilizing the remote storage capability.
+
+### Migration process
+
+Pageserver implements two new http handlers: timeline attach and timeline detach.
+Timeline migration is performed in the following way:
+1. Timeline attach is called on a target pageserver. This asks pageserver to download latest checkpoint uploaded to s3.
+2. For now it is necessary to manually initialize the replication stream via a callmemaybe call, so that the target pageserver initializes replication from the safekeeper. (It is desirable to avoid this and initialize replication directly in the attach handler, but this requires some refactoring, probably [#997](https://github.com/zenithdb/zenith/issues/997)/[#1049](https://github.com/zenithdb/zenith/issues/1049).)
+3. Replication state can be tracked via timeline detail pageserver call.
+4. The compute node should be restarted with the new pageserver connection string. The issue of multiple compute nodes for one timeline is handled at the safekeeper consensus level, so it is not a problem here. Currently the responsibility for rescheduling the compute with the updated config lies with an external coordinator (console).
+5. Timeline is detached from old pageserver. On disk data is removed.
+
+
+### Implementation details
+
+Now safekeeper needs to track which pageserver it is replicating to. This introduces complications into replication code:
+* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/zenithdb/zenith/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented).
+* We need to track which pageserver is the primary. This is needed to avoid reconnections to non-primary pageservers, because we shouldn't reconnect to them when they decide to stop their walreceiver. I.e., this can happen when there is load on the compute and we are trying to detach a timeline from the old pageserver. In this case callmemaybe will try to reconnect to it because the replication termination condition is not met (a pageserver with an active compute could never catch up to the latest LSN, so there is always some WAL tail).
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -0,0 +1,180 @@
+## Pageserver
+
+Pageserver is mainly configured via a `pageserver.toml` config file.
+If there's no such file during `init` phase of the server, it creates the file itself. Without 'init', the file is read.
+
+There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override
+the values in the config file, if any are specified for the same key and get into the final config during init phase.
+
+
+### Config example
+
+```toml
+# Initial configuration file created by 'pageserver --init'
+
+listen_pg_addr = '127.0.0.1:64000'
+listen_http_addr = '127.0.0.1:9898'
+
+checkpoint_distance = '268435456' # in bytes
+checkpoint_period = '1 s'
+
+gc_period = '100 s'
+gc_horizon = '67108864'
+
+max_file_descriptors = '100'
+
+# initial superuser role name to use when creating a new tenant
+initial_superuser_name = 'zenith_admin'
+
+# [remote_storage]
+```
+
+The config above shows default values for all basic pageserver settings.
+Pageserver uses default values for all values that are missing in the config, so it's not a hard error to leave the config blank.
+Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start.
+
+Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and
+
+* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
+
+* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
+
+### Config values
+
+All values can be passed as an argument to the pageserver binary, using the `-c` parameter and specified as a valid TOML string. All tables should be passed in the inline form.
+
+Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage={local_path='/some/local/path/'}"`
+
+Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.
+
+#### checkpoint_distance
+
+`checkpoint_distance` is the amount of incoming WAL that is held in
+the open layer, before it's flushed to local disk. It puts an upper
+bound on how much WAL needs to be re-processed after a pageserver
+crash. It is a soft limit, the pageserver can momentarily go above it,
+but it will trigger a checkpoint operation to get it back below the
+limit.
+
+`checkpoint_distance` also determines how much WAL needs to be kept
+durable in the safekeeper.  The safekeeper must have capacity to hold
+this much WAL, with some headroom, otherwise you can get stuck in a
+situation where the safekeeper is full and stops accepting new WAL,
+but the pageserver is not flushing out and releasing the space in the
+safekeeper because it hasn't reached checkpoint_distance yet.
+
+`checkpoint_distance` also controls how often the WAL is uploaded to
+S3.
+
+The unit is # of bytes.
+
+#### checkpoint_period
+
+The pageserver checks whether `checkpoint_distance` has been reached
+every `checkpoint_period` seconds. Default is 1 s, which should be
+fine.
+
+#### gc_horizon
+
+`gc_horizon` determines how much history is retained, to allow
+branching and read replicas at an older point in time. The unit is #
+of bytes of WAL. Page versions older than this are garbage collected
+away.
+
+#### gc_period
+
+Interval at which garbage collection is triggered. Default is 100 s.
+
+#### initial_superuser_name
+
+Name of the initial superuser role, passed to initdb when a new tenant
+is initialized. It doesn't affect anything after initialization.
+Note: The default is 'zenith_admin', and the console
+depends on that, so if you change it, bad things will happen.
+
+#### page_cache_size
+
+Size of the page cache, to hold materialized page versions. Unit is
+number of 8 kB blocks. The default is 8192, which means 64 MB.
+
+#### max_file_descriptors
+
+Max number of file descriptors to hold open concurrently for accessing
+layer files. This should be kept well below the process/container/OS
+limit (see `ulimit -n`), as the pageserver also needs file descriptors
+for other files and for sockets for incoming connections.
+
+#### pg_distrib_dir
+
+A directory with Postgres installation to use during pageserver activities.
+Inside that dir, a `bin/postgres` binary should be present.
+
+The default distrib dir is `./tmp_install/`.
+
+#### workdir (-D)
+
+A directory in the file system, where pageserver will store its files.
+The default is `./.zenith/`.
+
+This parameter has a special CLI alias (`-D`) and can not be overridden with regular `-c` way.
+
+##### Remote storage
+
+There's a way to automatically back up and restore some of the pageserver's data from working dir to the remote storage.
+The backup system is disabled by default and can be enabled for either of the currently available storages:
+
+###### Local FS storage
+
+Pageserver can back up and restore some of its workdir contents to another directory.
+For that, only a path to that directory needs to be specified as a parameter:
+
+```toml
+[remote_storage]
+local_path = '/some/local/path/'
+```
+
+###### S3 storage
+
+Pageserver can back up and restore some of its workdir contents to S3.
+Full set of S3 credentials is needed for that as parameters.
+Configuration example:
+
+```toml
+[remote_storage]
+# Name of the bucket to connect to
+bucket_name = 'some-sample-bucket'
+
+# Name of the region where the bucket is located at
+bucket_region = 'eu-north-1'
+
+# A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once.
+# Optional, pageserver uses entire bucket if the prefix is not specified.
+prefix_in_bucket = '/some/prefix/'
+
+# Access key to connect to the bucket ("login" part of the credentials)
+access_key_id = 'SOMEKEYAAAAASADSAH*#'
+
+# Secret access key to connect to the bucket ("password" part of the credentials)
+secret_access_key = 'SOMEsEcReTsd292v'
+```
+
+###### General remote storage configuration
+
+Pageserver allows only one remote storage to be configured at a time and errors if parameters from multiple different remote configurations are used.
+No default values are used for the remote storage configuration parameters.
+
+Besides, there are parameters common for all types of remote storage that can be configured, those have defaults:
+
+```toml
+[remote_storage]
+# Max number of concurrent connections to open for uploading to or downloading from the remote storage.
+max_concurrent_sync = 100
+
+# Max number of errors a single task can have before it's considered failed and not attempted to run anymore.
+max_sync_errors = 10
+```
+
+
+## safekeeper
+
+TODO
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -87,31 +87,29 @@ so manual installation of dependencies is not recommended.
 A single virtual environment with all dependencies is described in the single `Pipfile`.

 ### Prerequisites
- Install Python 3.7 (the minimal supported version)
-    - Later version (e.g. 3.8) is ok if you don't write Python code
-    - You can install Python 3.7 separately, e.g.:
+- Install Python 3.7 (the minimal supported version) or greater.
+    - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected.
+    - If you have some trouble with other version you can resolve it by installing Python 3.7 separately, via pyenv or via system package manager e.g.:
      ```bash
      # In Ubuntu
      sudo add-apt-repository ppa:deadsnakes/ppa
      sudo apt update
      sudo apt install python3.7
      ```
- Install `pipenv`
-    - Exact version of `pipenv` is not important, you can use Debian/Ubuntu package `pipenv`.
- Install dependencies via either
-  * `pipenv --python 3.7 install --dev` if you will write Python code, or
-  * `pipenv install` if you only want to run Python scripts and don't have Python 3.7.
+- Install `poetry`
+    - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation).
+- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7 so if you have different version some linting tools can yield different result locally vs in the CI.

-Run `pipenv shell` to activate the virtual environment.
-Alternatively, use `pipenv run` to run a single command in the venv, e.g. `pipenv run pytest`.
+Run `poetry shell` to activate the virtual environment.
+Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

 ### Obligatory checks
 We force code formatting via `yapf` and type hints via `mypy`.
 Run the following commands in the repository's root (next to `setup.cfg`):

 ```bash
-pipenv run yapf -ri .  # All code is reformatted
-pipenv run mypy .  # Ensure there are no typing errors
+poetry run yapf -ri .  # All code is reformatted
+poetry run mypy .  # Ensure there are no typing errors
 ```

 **WARNING**: do not run `mypy` from a directory other than the root of the repository.
@@ -123,17 +121,6 @@ Also consider:
 * Adding more type hints to your code to avoid `Any`.

 ### Changing dependencies
-You have to update `Pipfile.lock` if you have changed `Pipfile`:
+To add new package or change an existing one you can use `poetry add` or `poetry update` or edit `pyproject.toml` manually. Do not forget to run `poetry lock` in the latter case.

-```bash
-pipenv --python 3.7 install --dev  # Re-create venv for Python 3.7 and install recent pipenv inside
-pipenv run pipenv --version  # Should be at least 2021.5.29
-pipenv run pipenv lock  # Regenerate Pipfile.lock
-```
-
-As the minimal supported version is Python 3.7 and we use it in CI,
-you have to use a Python 3.7 environment when updating `Pipfile.lock`.
-Otherwise some back-compatibility packages will be missing.
-
-It is also important to run recent `pipenv`.
-Older versions remove markers from `Pipfile.lock`.
+More details are available in poetry's [documentation](https://python-poetry.org/docs/).
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -1,8 +1,7 @@
 [package]
 name = "pageserver"
 version = "0.1.0"
-authors = ["Stas Kelvich <stas@zenith.tech>"]
-edition = "2018"
+edition = "2021"

 [dependencies]
 bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" }
@@ -13,16 +12,18 @@ bytes = { version = "1.0.1", features = ['serde'] }
 byteorder = "1.4.3"
 futures = "0.3.13"
 hyper = "0.14"
+itertools = "0.10.3"
 lazy_static = "1.4.0"
 log = "0.4.14"
-clap = "2.33.0"
+clap = "3.0"
 daemonize = "0.4.1"
-tokio = { version = "1.11", features = ["process", "macros", "fs", "rt", "io-util"] }
-postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
-postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
-routerify = "2"
-anyhow = "1.0"
+tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
+postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
+postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
+tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
+tokio-stream = "0.1.8"
+anyhow = { version = "1.0", features = ["backtrace"] }
 crc32c = "0.6.0"
 thiserror = "1.0"
 hex = { version = "0.4.3", features = ["serde"] }
@@ -30,16 +31,21 @@ tar = "0.4.33"
 humantime = "2.1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
-toml = "0.5"
+toml_edit = { version = "0.13", features = ["easy"] }
 scopeguard = "1.1.0"
-rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
 async-trait = "0.1"
 const_format = "0.2.21"
 tracing = "0.1.27"
-signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
+tracing-futures = "0.2"
+signal-hook = "0.3.10"
 url = "2"
 nix = "0.23"
 once_cell = "1.8.0"
+crossbeam-utils = "0.8.5"
+fail = "0.5.0"
+
+rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] }
+async-compression = {version = "0.3", features = ["zstd", "tokio"]}

 postgres_ffi = { path = "../postgres_ffi" }
 zenith_metrics = { path = "../zenith_metrics" }
--- a/pageserver/README.md
+++ b/pageserver/README.md
@@ -9,7 +9,7 @@ The Page Server has a few different duties:

 S3 is the main fault-tolerant storage of all data, as there are no Page Server
 replicas. We use a separate fault-tolerant WAL service to reduce latency. It
-keeps track of WAL records which are not syncted to S3 yet.
+keeps track of WAL records which are not synced to S3 yet.

 The Page Server consists of multiple threads that operate on a shared
 repository of page versions:
@@ -129,13 +129,13 @@ There are the following implementations present:
 * local filesystem — to use in tests mainly
 * AWS S3           - to use in production

-Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs.
+Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md).

 The backup service is disabled by default and can be enabled to interact with a single remote storage.

 CLI examples:
-* Local FS: `${PAGESERVER_BIN} --relish-storage-local-path="/some/local/path/"`
-* AWS S3  : `${PAGESERVER_BIN} --relish-storage-s3-bucket="some-sample-bucket" --relish-storage-region="eu-north-1" --relish-storage-access-key="SOMEKEYAAAAASADSAH*#" --relish-storage-secret-access-key="SOMEsEcReTsd292v"`
+* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
+* AWS S3  : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"`

 For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
 For local S3 installations, refer to the their documentation for name format and credentials.
@@ -154,6 +154,7 @@ or
 [remote_storage]
 bucket_name = 'some-sample-bucket'
 bucket_region = 'eu-north-1'
+prefix_in_bucket = '/test_prefix/'
 access_key_id = 'SOMEKEYAAAAASADSAH*#'
 secret_access_key = 'SOMEsEcReTsd292v'
 ```
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,7 +10,7 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::Result;
+use anyhow::{Context, Result};
 use bytes::{BufMut, BytesMut};
 use log::*;
 use std::fmt::Write as FmtWrite;
@@ -22,6 +22,7 @@ use tar::{Builder, EntryType, Header};

 use crate::relish::*;
 use crate::repository::Timeline;
+use crate::DatadirTimelineImpl;
 use postgres_ffi::xlog_utils::*;
 use postgres_ffi::*;
 use zenith_utils::lsn::Lsn;
@@ -31,7 +32,7 @@ use zenith_utils::lsn::Lsn;
 /// used for constructing tarball.
 pub struct Basebackup<'a> {
    ar: Builder<&'a mut dyn Write>,
-    timeline: &'a Arc<dyn Timeline>,
+    timeline: &'a Arc<DatadirTimelineImpl>,
    pub lsn: Lsn,
    prev_record_lsn: Lsn,
 }
@@ -46,7 +47,7 @@ pub struct Basebackup<'a> {
 impl<'a> Basebackup<'a> {
    pub fn new(
        write: &'a mut dyn Write,
-        timeline: &'a Arc<dyn Timeline>,
+        timeline: &'a Arc<DatadirTimelineImpl>,
        req_lsn: Option<Lsn>,
    ) -> Result<Basebackup<'a>> {
        // Compute postgres doesn't have any previous WAL files, but the first
@@ -64,7 +65,7 @@ impl<'a> Basebackup<'a> {
        // prev_lsn to Lsn(0) if we cannot provide the correct value.
        let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
-            timeline.wait_lsn(req_lsn)?;
+            timeline.tline.wait_lsn(req_lsn)?;

            // If the requested point is the end of the timeline, we can
            // provide prev_lsn. (get_last_record_rlsn() might return it as
@@ -115,21 +116,24 @@ impl<'a> Basebackup<'a> {
        }

        // Gather non-relational files from object storage pages.
-        for obj in self.timeline.list_nonrels(self.lsn)? {
-            match obj {
-                RelishTag::Slru { slru, segno } => {
-                    self.add_slru_segment(slru, segno)?;
-                }
-                RelishTag::FileNodeMap { spcnode, dbnode } => {
-                    self.add_relmap_file(spcnode, dbnode)?;
-                }
-                RelishTag::TwoPhase { xid } => {
-                    self.add_twophase_file(xid)?;
-                }
-                _ => {}
+        for kind in [
+            SlruKind::Clog,
+            SlruKind::MultiXactOffsets,
+            SlruKind::MultiXactMembers,
+        ] {
+            for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
+                self.add_slru_segment(kind, segno)?;
            }
        }

+        // Create tablespace directories
+        for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
+            self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
+        }
+        for xid in self.timeline.list_twophase_files(self.lsn)? {
+            self.add_twophase_file(xid)?;
+        }
+
        // Generate pg_control and bootstrap WAL segment.
        self.add_pgcontrol_file()?;
        self.ar.finish()?;
@@ -141,27 +145,14 @@ impl<'a> Basebackup<'a> {
    // Generate SLRU segment files from repository.
    //
    fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
-        let seg_size = self
-            .timeline
-            .get_relish_size(RelishTag::Slru { slru, segno }, self.lsn)?;
-
-        if seg_size == None {
-            trace!(
-                "SLRU segment {}/{:>04X} was truncated",
-                slru.to_str(),
-                segno
-            );
-            return Ok(());
-        }
-
-        let nblocks = seg_size.unwrap();
+        let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;

        let mut slru_buf: Vec<u8> =
            Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize);
        for blknum in 0..nblocks {
-            let img =
-                self.timeline
-                    .get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?;
+            let img = self
+                .timeline
+                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
            assert!(img.len() == pg_constants::BLCKSZ as usize);

            slru_buf.extend_from_slice(&img);
@@ -176,16 +167,26 @@ impl<'a> Basebackup<'a> {
    }

    //
-    // Extract pg_filenode.map files from repository
-    // Along with them also send PG_VERSION for each database.
+    // Include database/tablespace directories.
    //
-    fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> {
-        let img = self.timeline.get_page_at_lsn(
-            RelishTag::FileNodeMap { spcnode, dbnode },
-            0,
-            self.lsn,
-        )?;
-        let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID {
+    // Each directory contains a PG_VERSION file, and the default database
+    // directories also contain pg_filenode.map files.
+    //
+    fn add_dbdir(
+        &mut self,
+        spcnode: u32,
+        dbnode: u32,
+        has_relmap_file: bool,
+    ) -> anyhow::Result<()> {
+        let relmap_img = if has_relmap_file {
+            let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
+            assert!(img.len() == 512);
+            Some(img)
+        } else {
+            None
+        };
+
+        if spcnode == pg_constants::GLOBALTABLESPACE_OID {
            let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
            let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
            self.ar.append(&header, version_bytes)?;
@@ -193,9 +194,32 @@ impl<'a> Basebackup<'a> {
            let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?;
            self.ar.append(&header, version_bytes)?;

-            String::from("global/pg_filenode.map") // filenode map for global tablespace
+            if let Some(img) = relmap_img {
+                // filenode map for global tablespace
+                let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
+                self.ar.append(&header, &img[..])?;
+            } else {
+                warn!("global/pg_filenode.map is missing");
+            }
        } else {
-            // User defined tablespaces are not supported
+            // User defined tablespaces are not supported. However, as
+            // a special case, if a tablespace/db directory is
+            // completely empty, we can leave it out altogether. This
+            // makes taking a base backup after the 'tablespace'
+            // regression test pass, because the test drops the
+            // created tablespaces after the tests.
+            //
+            // FIXME: this wouldn't be necessary, if we handled
+            // XLOG_TBLSPC_DROP records. But we probably should just
+            // throw an error on CREATE TABLESPACE in the first place.
+            if !has_relmap_file
+                && self
+                    .timeline
+                    .list_rels(spcnode, dbnode, self.lsn)?
+                    .is_empty()
+            {
+                return Ok(());
+            }
            assert!(spcnode == pg_constants::DEFAULTTABLESPACE_OID);

            // Append dir path for each database
@@ -203,16 +227,17 @@ impl<'a> Basebackup<'a> {
            let header = new_tar_header_dir(&path)?;
            self.ar.append(&header, &mut io::empty())?;

-            let dst_path = format!("base/{}/PG_VERSION", dbnode);
-            let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
-            let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
-            self.ar.append(&header, version_bytes)?;
+            if let Some(img) = relmap_img {
+                let dst_path = format!("base/{}/PG_VERSION", dbnode);
+                let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
+                let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
+                self.ar.append(&header, version_bytes)?;

-            format!("base/{}/pg_filenode.map", dbnode)
+                let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
+                let header = new_tar_header(&relmap_path, img.len() as u64)?;
+                self.ar.append(&header, &img[..])?;
+            }
        };
-        assert!(img.len() == 512);
-        let header = new_tar_header(&path, img.len() as u64)?;
-        self.ar.append(&header, &img[..])?;
        Ok(())
    }

@@ -220,9 +245,7 @@ impl<'a> Basebackup<'a> {
    // Extract twophase state files
    //
    fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
-        let img = self
-            .timeline
-            .get_page_at_lsn(RelishTag::TwoPhase { xid }, 0, self.lsn)?;
+        let img = self.timeline.get_twophase_file(xid, self.lsn)?;

        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
@@ -242,10 +265,12 @@ impl<'a> Basebackup<'a> {
    fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
        let checkpoint_bytes = self
            .timeline
-            .get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)?;
-        let pg_control_bytes =
-            self.timeline
-                .get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)?;
+            .get_checkpoint(self.lsn)
+            .context("failed to get checkpoint bytes")?;
+        let pg_control_bytes = self
+            .timeline
+            .get_control_file(self.lsn)
+            .context("failed get control bytes")?;
        let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
        let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;

@@ -265,7 +290,7 @@ impl<'a> Basebackup<'a> {
        // add zenith.signal file
        let mut zenith_signal = String::new();
        if self.prev_record_lsn == Lsn(0) {
-            if self.lsn == self.timeline.get_ancestor_lsn() {
+            if self.lsn == self.timeline.tline.get_ancestor_lsn() {
                write!(zenith_signal, "PREV LSN: none")?;
            } else {
                write!(zenith_signal, "PREV LSN: invalid")?;
--- a/pageserver/src/bin/dump_layerfile.rs
+++ b/pageserver/src/bin/dump_layerfile.rs
@@ -4,6 +4,7 @@
 use anyhow::Result;
 use clap::{App, Arg};
 use pageserver::layered_repository::dump_layerfile_from_path;
+use pageserver::virtual_file;
 use std::path::PathBuf;
 use zenith_utils::GIT_VERSION;

@@ -12,7 +13,7 @@ fn main() -> Result<()> {
        .about("Dump contents of one layer file, for debugging")
        .version(GIT_VERSION)
        .arg(
-            Arg::with_name("path")
+            Arg::new("path")
                .help("Path to file to dump")
                .required(true)
                .index(1),
@@ -21,6 +22,9 @@ fn main() -> Result<()> {

    let path = PathBuf::from(arg_matches.value_of("path").unwrap());

+    // Basic initialization of things that don't change after startup
+    virtual_file::init(10);
+
    dump_layerfile_from_path(&path)?;

    Ok(())
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -1,280 +1,25 @@
-//
-// Main entry point for the Page Server executable
-//
+//! Main entry point for the Page Server executable.

-use serde::{Deserialize, Serialize};
-use std::{
-    env,
-    path::{Path, PathBuf},
-    str::FromStr,
-    thread,
-};
+use std::{env, path::Path, str::FromStr};
 use tracing::*;
 use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};

-use anyhow::{bail, ensure, Context, Result};
-use signal_hook::consts::signal::*;
-use signal_hook::consts::TERM_SIGNALS;
-use signal_hook::flag;
-use signal_hook::iterator::exfiltrator::WithOrigin;
-use signal_hook::iterator::SignalsInfo;
-use std::process::exit;
-use std::sync::atomic::AtomicBool;
-use std::sync::Arc;
+use anyhow::{bail, Context, Result};

-use clap::{App, Arg, ArgMatches};
+use clap::{App, Arg};
 use daemonize::Daemonize;

 use pageserver::{
-    branches, defaults::*, http, page_cache, page_service, remote_storage, tenant_mgr,
-    virtual_file, PageServerConf, RemoteStorageConfig, RemoteStorageKind, S3Config, LOG_FILE_NAME,
+    branches,
+    config::{defaults::*, PageServerConf},
+    http, page_cache, page_service, remote_storage, tenant_mgr, thread_mgr,
+    thread_mgr::ThreadKind,
+    virtual_file, LOG_FILE_NAME,
 };
 use zenith_utils::http::endpoint;
 use zenith_utils::postgres_backend;
-
-use const_format::formatcp;
-
-/// String arguments that can be declared via CLI or config file
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
-struct CfgFileParams {
-    listen_pg_addr: Option<String>,
-    listen_http_addr: Option<String>,
-    checkpoint_distance: Option<String>,
-    checkpoint_period: Option<String>,
-    gc_horizon: Option<String>,
-    gc_period: Option<String>,
-    open_mem_limit: Option<String>,
-    page_cache_size: Option<String>,
-    max_file_descriptors: Option<String>,
-    pg_distrib_dir: Option<String>,
-    auth_validation_public_key_path: Option<String>,
-    auth_type: Option<String>,
-    remote_storage_max_concurrent_sync: Option<String>,
-    /////////////////////////////////
-    //// Don't put `Option<String>` and other "simple" values below.
-    ////
-    /// `Option<RemoteStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
-    /// Values in TOML cannot be defined after tables (other tables can),
-    /// and [`toml`] crate serializes all fields in the order of their appearance.
-    ////////////////////////////////
-    remote_storage: Option<RemoteStorage>,
-}
-
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
-// Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!).
-// See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples
-#[serde(untagged)]
-enum RemoteStorage {
-    Local {
-        local_path: String,
-    },
-    AwsS3 {
-        bucket_name: String,
-        bucket_region: String,
-        #[serde(skip_serializing)]
-        access_key_id: Option<String>,
-        #[serde(skip_serializing)]
-        secret_access_key: Option<String>,
-    },
-}
-
-impl CfgFileParams {
-    /// Extract string arguments from CLI
-    fn from_args(arg_matches: &ArgMatches) -> Self {
-        let get_arg = |arg_name: &str| -> Option<String> {
-            arg_matches.value_of(arg_name).map(str::to_owned)
-        };
-
-        let remote_storage = if let Some(local_path) = get_arg("relish-storage-local-path") {
-            Some(RemoteStorage::Local { local_path })
-        } else if let Some((bucket_name, bucket_region)) =
-            get_arg("relish-storage-s3-bucket").zip(get_arg("relish-storage-region"))
-        {
-            Some(RemoteStorage::AwsS3 {
-                bucket_name,
-                bucket_region,
-                access_key_id: get_arg("relish-storage-access-key"),
-                secret_access_key: get_arg("relish-storage-secret-access-key"),
-            })
-        } else {
-            None
-        };
-
-        Self {
-            listen_pg_addr: get_arg("listen-pg"),
-            listen_http_addr: get_arg("listen-http"),
-            checkpoint_distance: get_arg("checkpoint_distance"),
-            checkpoint_period: get_arg("checkpoint_period"),
-            gc_horizon: get_arg("gc_horizon"),
-            gc_period: get_arg("gc_period"),
-            open_mem_limit: get_arg("open_mem_limit"),
-            page_cache_size: get_arg("page_cache_size"),
-            max_file_descriptors: get_arg("max_file_descriptors"),
-            pg_distrib_dir: get_arg("postgres-distrib"),
-            auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
-            auth_type: get_arg("auth-type"),
-            remote_storage,
-            remote_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
-        }
-    }
-
-    /// Fill missing values in `self` with `other`
-    fn or(self, other: CfgFileParams) -> Self {
-        // TODO cleaner way to do this
-        Self {
-            listen_pg_addr: self.listen_pg_addr.or(other.listen_pg_addr),
-            listen_http_addr: self.listen_http_addr.or(other.listen_http_addr),
-            checkpoint_distance: self.checkpoint_distance.or(other.checkpoint_distance),
-            checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
-            gc_horizon: self.gc_horizon.or(other.gc_horizon),
-            gc_period: self.gc_period.or(other.gc_period),
-            open_mem_limit: self.open_mem_limit.or(other.open_mem_limit),
-            page_cache_size: self.page_cache_size.or(other.page_cache_size),
-            max_file_descriptors: self.max_file_descriptors.or(other.max_file_descriptors),
-            pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
-            auth_validation_public_key_path: self
-                .auth_validation_public_key_path
-                .or(other.auth_validation_public_key_path),
-            auth_type: self.auth_type.or(other.auth_type),
-            remote_storage: self.remote_storage.or(other.remote_storage),
-            remote_storage_max_concurrent_sync: self
-                .remote_storage_max_concurrent_sync
-                .or(other.remote_storage_max_concurrent_sync),
-        }
-    }
-
-    /// Create a PageServerConf from these string parameters
-    fn try_into_config(&self) -> Result<PageServerConf> {
-        let workdir = PathBuf::from(".");
-
-        let listen_pg_addr = match self.listen_pg_addr.as_ref() {
-            Some(addr) => addr.clone(),
-            None => DEFAULT_PG_LISTEN_ADDR.to_owned(),
-        };
-
-        let listen_http_addr = match self.listen_http_addr.as_ref() {
-            Some(addr) => addr.clone(),
-            None => DEFAULT_HTTP_LISTEN_ADDR.to_owned(),
-        };
-
-        let checkpoint_distance: u64 = match self.checkpoint_distance.as_ref() {
-            Some(checkpoint_distance_str) => checkpoint_distance_str.parse()?,
-            None => DEFAULT_CHECKPOINT_DISTANCE,
-        };
-        let checkpoint_period = match self.checkpoint_period.as_ref() {
-            Some(checkpoint_period_str) => humantime::parse_duration(checkpoint_period_str)?,
-            None => DEFAULT_CHECKPOINT_PERIOD,
-        };
-
-        let gc_horizon: u64 = match self.gc_horizon.as_ref() {
-            Some(horizon_str) => horizon_str.parse()?,
-            None => DEFAULT_GC_HORIZON,
-        };
-        let gc_period = match self.gc_period.as_ref() {
-            Some(period_str) => humantime::parse_duration(period_str)?,
-            None => DEFAULT_GC_PERIOD,
-        };
-
-        let open_mem_limit: usize = match self.open_mem_limit.as_ref() {
-            Some(open_mem_limit_str) => open_mem_limit_str.parse()?,
-            None => DEFAULT_OPEN_MEM_LIMIT,
-        };
-
-        let page_cache_size: usize = match self.page_cache_size.as_ref() {
-            Some(page_cache_size_str) => page_cache_size_str.parse()?,
-            None => DEFAULT_PAGE_CACHE_SIZE,
-        };
-
-        let max_file_descriptors: usize = match self.max_file_descriptors.as_ref() {
-            Some(max_file_descriptors_str) => max_file_descriptors_str.parse()?,
-            None => DEFAULT_MAX_FILE_DESCRIPTORS,
-        };
-
-        let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
-            Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
-            None => env::current_dir()?.join("tmp_install"),
-        };
-
-        let auth_validation_public_key_path = self
-            .auth_validation_public_key_path
-            .as_ref()
-            .map(PathBuf::from);
-
-        let auth_type = self
-            .auth_type
-            .as_ref()
-            .map_or(Ok(AuthType::Trust), |auth_type| {
-                AuthType::from_str(auth_type)
-            })?;
-
-        if !pg_distrib_dir.join("bin/postgres").exists() {
-            bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
-        }
-
-        if auth_type == AuthType::ZenithJWT {
-            ensure!(
-                auth_validation_public_key_path.is_some(),
-                "Missing auth_validation_public_key_path when auth_type is ZenithJWT"
-            );
-            let path_ref = auth_validation_public_key_path.as_ref().unwrap();
-            ensure!(
-                path_ref.exists(),
-                format!("Can't find auth_validation_public_key at {:?}", path_ref)
-            );
-        }
-
-        let max_concurrent_sync = match self.remote_storage_max_concurrent_sync.as_deref() {
-            Some(number_str) => number_str.parse()?,
-            None => DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
-        };
-        let remote_storage_config = self.remote_storage.as_ref().map(|storage_params| {
-            let storage = match storage_params.clone() {
-                RemoteStorage::Local { local_path } => {
-                    RemoteStorageKind::LocalFs(PathBuf::from(local_path))
-                }
-                RemoteStorage::AwsS3 {
-                    bucket_name,
-                    bucket_region,
-                    access_key_id,
-                    secret_access_key,
-                } => RemoteStorageKind::AwsS3(S3Config {
-                    bucket_name,
-                    bucket_region,
-                    access_key_id,
-                    secret_access_key,
-                }),
-            };
-            RemoteStorageConfig {
-                max_concurrent_sync,
-                storage,
-            }
-        });
-
-        Ok(PageServerConf {
-            daemonize: false,
-
-            listen_pg_addr,
-            listen_http_addr,
-            checkpoint_distance,
-            checkpoint_period,
-            gc_horizon,
-            gc_period,
-            open_mem_limit,
-            page_cache_size,
-            max_file_descriptors,
-
-            superuser: String::from(DEFAULT_SUPERUSER),
-
-            workdir,
-
-            pg_distrib_dir,
-
-            auth_validation_public_key_path,
-            auth_type,
-            remote_storage_config,
-        })
-    }
-}
+use zenith_utils::shutdown::exit_now;
+use zenith_utils::signals::{self, Signal};

 fn main() -> Result<()> {
    zenith_metrics::set_common_metrics_prefix("pageserver");
@@ -282,180 +27,53 @@ fn main() -> Result<()> {
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(GIT_VERSION)
        .arg(
-            Arg::with_name("listen-pg")
-                .short("l")
-                .long("listen-pg")
-                .alias("listen") // keep some compatibility
-                .takes_value(true)
-                .help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
-        )
-        .arg(
-            Arg::with_name("listen-http")
-                .long("listen-http")
-                .alias("http_endpoint") // keep some compatibility
-                .takes_value(true)
-                .help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
-        )
-        .arg(
-            Arg::with_name("daemonize")
-                .short("d")
+            Arg::new("daemonize")
+                .short('d')
                .long("daemonize")
                .takes_value(false)
                .help("Run in the background"),
        )
        .arg(
-            Arg::with_name("init")
+            Arg::new("init")
                .long("init")
                .takes_value(false)
                .help("Initialize pageserver repo"),
        )
        .arg(
-            Arg::with_name("checkpoint_distance")
-                .long("checkpoint_distance")
-                .takes_value(true)
-                .help("Distance from current LSN to perform checkpoint of in-memory layers"),
-        )
-        .arg(
-            Arg::with_name("checkpoint_period")
-                .long("checkpoint_period")
-                .takes_value(true)
-                .help("Interval between checkpoint iterations"),
-        )
-        .arg(
-            Arg::with_name("gc_horizon")
-                .long("gc_horizon")
-                .takes_value(true)
-                .help("Distance from current LSN to perform all wal records cleanup"),
-        )
-        .arg(
-            Arg::with_name("gc_period")
-                .long("gc_period")
-                .takes_value(true)
-                .help("Interval between garbage collector iterations"),
-        )
-        .arg(
-            Arg::with_name("open_mem_limit")
-                .long("open_mem_limit")
-                .takes_value(true)
-                .help("Amount of memory reserved for buffering incoming WAL"),
-        )
-        .arg(
-
-            Arg::with_name("page_cache_size")
-                .long("page_cache_size")
-                .takes_value(true)
-                .help("Number of pages in the page cache"),
-        )
-        .arg(
-            Arg::with_name("max_file_descriptors")
-                .long("max_file_descriptors")
-                .takes_value(true)
-                .help("Max number of file descriptors to keep open for files"),
-        )
-        .arg(
-            Arg::with_name("workdir")
-                .short("D")
+            Arg::new("workdir")
+                .short('D')
                .long("workdir")
                .takes_value(true)
                .help("Working directory for the pageserver"),
        )
        .arg(
-            Arg::with_name("postgres-distrib")
-                .long("postgres-distrib")
-                .takes_value(true)
-                .help("Postgres distribution directory"),
-        )
-        .arg(
-            Arg::with_name("create-tenant")
+            Arg::new("create-tenant")
                .long("create-tenant")
                .takes_value(true)
                .help("Create tenant during init")
                .requires("init"),
        )
+        // See `settings.md` for more details on the extra configuration parameters pageserver can process
        .arg(
-            Arg::with_name("auth-validation-public-key-path")
-                .long("auth-validation-public-key-path")
+            Arg::new("config-override")
+                .short('c')
                .takes_value(true)
-                .help("Path to public key used to validate jwt signature"),
-        )
-        .arg(
-            Arg::with_name("auth-type")
-                .long("auth-type")
-                .takes_value(true)
-                .help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
-        )
-        .arg(
-            Arg::with_name("relish-storage-local-path")
-                .long("relish-storage-local-path")
-                .takes_value(true)
-                .help("Path to the local directory, to be used as an external relish storage")
-                .conflicts_with_all(&[
-                    "relish-storage-s3-bucket",
-                    "relish-storage-region",
-                    "relish-storage-access-key",
-                    "relish-storage-secret-access-key",
-                ]),
-        )
-        .arg(
-            Arg::with_name("relish-storage-s3-bucket")
-                .long("relish-storage-s3-bucket")
-                .takes_value(true)
-                .help("Name of the AWS S3 bucket to use an external relish storage")
-                .requires("relish-storage-region"),
-        )
-        .arg(
-            Arg::with_name("relish-storage-region")
-                .long("relish-storage-region")
-                .takes_value(true)
-                .help("Region of the AWS S3 bucket"),
-        )
-        .arg(
-            Arg::with_name("relish-storage-access-key")
-                .long("relish-storage-access-key")
-                .takes_value(true)
-                .help("Credentials to access the AWS S3 bucket"),
-        )
-        .arg(
-            Arg::with_name("relish-storage-secret-access-key")
-                .long("relish-storage-secret-access-key")
-                .takes_value(true)
-                .help("Credentials to access the AWS S3 bucket"),
-        )
-        .arg(
-            Arg::with_name("relish-storage-max-concurrent-sync")
-                .long("relish-storage-max-concurrent-sync")
-                .takes_value(true)
-                .help("Maximum allowed concurrent synchronisations with storage"),
+                .number_of_values(1)
+                .multiple_occurrences(true)
+                .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
+                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
        )
        .get_matches();

    let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
-    let cfg_file_path = workdir
+    let workdir = workdir
        .canonicalize()
-        .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?
-        .join("pageserver.toml");
-
-    let args_params = CfgFileParams::from_args(&arg_matches);
+        .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
+    let cfg_file_path = workdir.join("pageserver.toml");

    let init = arg_matches.is_present("init");
    let create_tenant = arg_matches.value_of("create-tenant");

-    let params = if init {
-        // We're initializing the repo, so there's no config file yet
-        args_params
-    } else {
-        // Supplement the CLI arguments with the config file
-        let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)
-            .with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?;
-        let file_params: CfgFileParams = toml::from_str(&cfg_file_contents).with_context(|| {
-            format!(
-                "Failed to read '{}' as pageserver config",
-                cfg_file_path.display()
-            )
-        })?;
-        args_params.or(file_params)
-    };
-
    // Set CWD to workdir for non-daemon modes
    env::set_current_dir(&workdir).with_context(|| {
        format!(
@@ -464,20 +82,55 @@ fn main() -> Result<()> {
        )
    })?;

-    // Ensure the config is valid, even if just init-ing
-    let mut conf = params.try_into_config().with_context(|| {
-        format!(
-            "Pageserver config at '{}' is not valid",
-            cfg_file_path.display()
-        )
-    })?;
-
-    conf.daemonize = arg_matches.is_present("daemonize");
-
-    if init && conf.daemonize {
+    let daemonize = arg_matches.is_present("daemonize");
+    if init && daemonize {
        bail!("--daemonize cannot be used with --init")
    }

+    let mut toml = if init {
+        // We're initializing the repo, so there's no config file yet
+        DEFAULT_CONFIG_FILE
+            .parse::<toml_edit::Document>()
+            .expect("could not parse built-in config file")
+    } else {
+        // Load the existing config file; any `-c` overrides are applied on top of it below
+        let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)
+            .with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?;
+        cfg_file_contents
+            .parse::<toml_edit::Document>()
+            .with_context(|| {
+                format!(
+                    "Failed to read '{}' as pageserver config",
+                    cfg_file_path.display()
+                )
+            })?
+    };
+
+    // Process any extra options given with -c
+    if let Some(values) = arg_matches.values_of("config-override") {
+        for option_line in values {
+            let doc = toml_edit::Document::from_str(option_line).with_context(|| {
+                format!(
+                    "Option '{}' could not be parsed as a toml document",
+                    option_line
+                )
+            })?;
+
+            for (key, item) in doc.iter() {
+                if key == "id" {
+                    anyhow::ensure!(
+                        init,
+                        "node id can only be set during pageserver init and cannot be overridden"
+                    );
+                }
+                toml.insert(key, item.clone());
+            }
+        }
+    }
+    trace!("Resulting toml: {}", toml);
+    let conf = PageServerConf::parse_and_validate(&toml, &workdir)
+        .context("Failed to parse pageserver configuration")?;
+
    // The configuration is all set up now. Turn it into a 'static
    // that can be freely stored in structs and passed across threads
    // as a ref.
@@ -492,10 +145,7 @@ fn main() -> Result<()> {
    if init {
        branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
        // write the config file
-        let cfg_file_contents = toml::to_string_pretty(&params)
-            .context("Failed to create pageserver config contents for initialisation")?;
-        // TODO support enable-auth flag
-        std::fs::write(&cfg_file_path, cfg_file_contents).with_context(|| {
+        std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
            format!(
                "Failed to initialize pageserver config at '{}'",
                cfg_file_path.display()
@@ -503,27 +153,16 @@ fn main() -> Result<()> {
        })?;
        Ok(())
    } else {
-        start_pageserver(conf).context("Failed to start pageserver")
+        start_pageserver(conf, daemonize).context("Failed to start pageserver")
    }
 }

-fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
+fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
    // Initialize logger
-    let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
+    let log_file = logging::init(LOG_FILE_NAME, daemonize)?;

    info!("version: {}", GIT_VERSION);

-    let term_now = Arc::new(AtomicBool::new(false));
-    for sig in TERM_SIGNALS {
-        // When terminated by a second term signal, exit with exit code 1.
-        // This will do nothing the first time (because term_now is false).
-        flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
-        // But this will "arm" the above for the second time, by setting it to true.
-        // The order of registering these is important, if you put this one first, it will
-        // first arm and then terminate ‒ all in the first round.
-        flag::register(*sig, Arc::clone(&term_now))?;
-    }
-
    // TODO: Check that it looks like a valid repository before going further

    // bind sockets before daemonizing so we report errors early and do not return until we are listening
@@ -539,7 +178,8 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
    );
    let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;

-    if conf.daemonize {
+    // NB: Don't spawn any threads before daemonizing!
+    if daemonize {
        info!("daemonizing...");

        // There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
@@ -553,21 +193,22 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
            .stdout(stdout)
            .stderr(stderr);

-        match daemonize.start() {
+        // XXX: The parent process should exit abruptly right after
+        // it has spawned a child to prevent coverage machinery from
+        // dumping stats into a `profraw` file now owned by the child.
+        // Otherwise, the coverage data will be damaged.
+        match daemonize.exit_action(|| exit_now(0)).start() {
            Ok(_) => info!("Success, daemonized"),
            Err(err) => error!(%err, "could not daemonize"),
        }
    }

-    // keep join handles for spawned threads
-    // don't spawn threads before daemonizing
-    let mut join_handles = Vec::new();
+    let signals = signals::install_shutdown_handlers()?;
+    let sync_startup = remote_storage::start_local_timeline_sync(conf)
+        .context("Failed to set up local files sync with external storage")?;

-    if let Some(handle) = remote_storage::run_storage_sync_thread(conf)? {
-        join_handles.push(handle);
-    }
    // Initialize tenant manager.
-    tenant_mgr::init(conf);
+    tenant_mgr::set_timeline_states(conf, sync_startup.initial_timeline_states);

    // initialize authentication for incoming connections
    let auth = match &conf.auth_type {
@@ -582,203 +223,74 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {

    // Spawn a new thread for the http endpoint
    // bind before launching separate thread so the error reported before startup exits
-    let cloned = auth.clone();
-    let http_endpoint_thread = thread::Builder::new()
-        .name("http_endpoint_thread".into())
-        .spawn(move || {
-            let router = http::make_router(conf, cloned);
-            endpoint::serve_thread_main(router, http_listener)
-        })?;
+    let auth_cloned = auth.clone();
+    thread_mgr::spawn(
+        ThreadKind::HttpEndpointListener,
+        None,
+        None,
+        "http_endpoint_thread",
+        move || {
+            let router = http::make_router(conf, auth_cloned);
+            endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
+        },
+    )?;

-    join_handles.push(http_endpoint_thread);
-
-    // Spawn a thread to listen for connections. It will spawn further threads
+    // Spawn a thread to listen for libpq connections. It will spawn further threads
    // for each connection.
-    let page_service_thread = thread::Builder::new()
-        .name("Page Service thread".into())
-        .spawn(move || {
-            page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
-        })?;
+    thread_mgr::spawn(
+        ThreadKind::LibpqEndpointListener,
+        None,
+        None,
+        "libpq endpoint thread",
+        move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
+    )?;

-    for info in SignalsInfo::<WithOrigin>::new(TERM_SIGNALS)?.into_iter() {
-        match info.signal {
-            SIGQUIT => {
-                info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode");
-                exit(111);
-            }
-            SIGINT | SIGTERM => {
-                info!("Got SIGINT/SIGTERM. Terminate gracefully in fast shutdown mode");
-                // Terminate postgres backends
-                postgres_backend::set_pgbackend_shutdown_requested();
-                // Stop all tenants and flush their data
-                tenant_mgr::shutdown_all_tenants()?;
-                // Wait for pageservice thread to complete the job
-                page_service_thread
-                    .join()
-                    .expect("thread panicked")
-                    .expect("thread exited with an error");
-
-                // Shut down http router
-                endpoint::shutdown();
-
-                // Wait for all threads
-                for handle in join_handles.into_iter() {
-                    handle
-                        .join()
-                        .expect("thread panicked")
-                        .expect("thread exited with an error");
-                }
-                info!("Pageserver shut down successfully completed");
-                exit(0);
-            }
-            unknown_signal => {
-                debug!("Unknown signal {}", unknown_signal);
-            }
+    signals.handle(|signal| match signal {
+        Signal::Quit => {
+            info!(
+                "Got {}. Terminating in immediate shutdown mode",
+                signal.name()
+            );
+            std::process::exit(111);
        }
-    }

-    Ok(())
+        Signal::Interrupt | Signal::Terminate => {
+            info!(
+                "Got {}. Terminating gracefully in fast shutdown mode",
+                signal.name()
+            );
+            shutdown_pageserver();
+            unreachable!()
+        }
+    })
 }

-#[cfg(test)]
-mod tests {
-    use super::*;
+fn shutdown_pageserver() {
+    // Shut down the libpq endpoint thread. This prevents new connections from
+    // being accepted.
+    thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);

-    #[test]
-    fn page_server_conf_toml_serde() {
-        let params = CfgFileParams {
-            listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
-            listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
-            checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
-            checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
-            gc_horizon: Some("gc_horizon_VALUE".to_string()),
-            gc_period: Some("gc_period_VALUE".to_string()),
-            open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
-            page_cache_size: Some("page_cache_size_VALUE".to_string()),
-            max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
-            pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
-            auth_validation_public_key_path: Some(
-                "auth_validation_public_key_path_VALUE".to_string(),
-            ),
-            auth_type: Some("auth_type_VALUE".to_string()),
-            remote_storage: Some(RemoteStorage::Local {
-                local_path: "remote_storage_local_VALUE".to_string(),
-            }),
-            remote_storage_max_concurrent_sync: Some(
-                "remote_storage_max_concurrent_sync_VALUE".to_string(),
-            ),
-        };
+    // Shut down any page service threads.
+    postgres_backend::set_pgbackend_shutdown_requested();
+    thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);

-        let toml_string = toml::to_string(&params).expect("Failed to serialize correct config");
-        let toml_pretty_string =
-            toml::to_string_pretty(&params).expect("Failed to serialize correct config");
-        assert_eq!(
-            r#"listen_pg_addr = 'listen_pg_addr_VALUE'
-listen_http_addr = 'listen_http_addr_VALUE'
-checkpoint_distance = 'checkpoint_distance_VALUE'
-checkpoint_period = 'checkpoint_period_VALUE'
-gc_horizon = 'gc_horizon_VALUE'
-gc_period = 'gc_period_VALUE'
-open_mem_limit = 'open_mem_limit_VALUE'
-page_cache_size = 'page_cache_size_VALUE'
-max_file_descriptors = 'max_file_descriptors_VALUE'
-pg_distrib_dir = 'pg_distrib_dir_VALUE'
-auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
-auth_type = 'auth_type_VALUE'
-remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
+    // Shut down all the tenants. This flushes everything to disk and kills
+    // the checkpoint and GC threads.
+    tenant_mgr::shutdown_all_tenants();

-[remote_storage]
-local_path = 'remote_storage_local_VALUE'
-"#,
-            toml_pretty_string
-        );
+    // Stop syncing with remote storage.
+    //
+    // FIXME: Does this wait for the sync thread to finish syncing what's queued up?
+    // Should it?
+    thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None);

-        let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
-            .expect("Failed to deserialize the serialization result of the config");
-        let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
-            .expect("Failed to deserialize the prettified serialization result of the config");
-        assert!(
-            params_from_serialized == params,
-            "Expected the same config in the end of config -> serialize -> deserialize chain"
-        );
-        assert!(
-            params_from_serialized_pretty == params,
-            "Expected the same config in the end of config -> serialize pretty -> deserialize chain"
-        );
-    }
+    // Shut down the HTTP endpoint last, so that you can still check the server's
+    // status while it's shutting down.
+    thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None);

-    #[test]
-    fn credentials_omitted_during_serialization() {
-        let params = CfgFileParams {
-            listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
-            listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
-            checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
-            checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
-            gc_horizon: Some("gc_horizon_VALUE".to_string()),
-            gc_period: Some("gc_period_VALUE".to_string()),
-            open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
-            page_cache_size: Some("page_cache_size_VALUE".to_string()),
-            max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
-            pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
-            auth_validation_public_key_path: Some(
-                "auth_validation_public_key_path_VALUE".to_string(),
-            ),
-            auth_type: Some("auth_type_VALUE".to_string()),
-            remote_storage: Some(RemoteStorage::AwsS3 {
-                bucket_name: "bucket_name_VALUE".to_string(),
-                bucket_region: "bucket_region_VALUE".to_string(),
-                access_key_id: Some("access_key_id_VALUE".to_string()),
-                secret_access_key: Some("secret_access_key_VALUE".to_string()),
-            }),
-            remote_storage_max_concurrent_sync: Some(
-                "remote_storage_max_concurrent_sync_VALUE".to_string(),
-            ),
-        };
+    // There should be nothing left, but let's be sure
+    thread_mgr::shutdown_threads(None, None, None);

-        let toml_string = toml::to_string(&params).expect("Failed to serialize correct config");
-        let toml_pretty_string =
-            toml::to_string_pretty(&params).expect("Failed to serialize correct config");
-        assert_eq!(
-            r#"listen_pg_addr = 'listen_pg_addr_VALUE'
-listen_http_addr = 'listen_http_addr_VALUE'
-checkpoint_distance = 'checkpoint_distance_VALUE'
-checkpoint_period = 'checkpoint_period_VALUE'
-gc_horizon = 'gc_horizon_VALUE'
-gc_period = 'gc_period_VALUE'
-open_mem_limit = 'open_mem_limit_VALUE'
-page_cache_size = 'page_cache_size_VALUE'
-max_file_descriptors = 'max_file_descriptors_VALUE'
-pg_distrib_dir = 'pg_distrib_dir_VALUE'
-auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
-auth_type = 'auth_type_VALUE'
-remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
-
-[remote_storage]
-bucket_name = 'bucket_name_VALUE'
-bucket_region = 'bucket_region_VALUE'
-"#,
-            toml_pretty_string
-        );
-
-        let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
-            .expect("Failed to deserialize the serialization result of the config");
-        let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
-            .expect("Failed to deserialize the prettified serialization result of the config");
-
-        let mut expected_params = params;
-        expected_params.remote_storage = Some(RemoteStorage::AwsS3 {
-            bucket_name: "bucket_name_VALUE".to_string(),
-            bucket_region: "bucket_region_VALUE".to_string(),
-            access_key_id: None,
-            secret_access_key: None,
-        });
-        assert!(
-            params_from_serialized == expected_params,
-            "Expected the config without credentials in the end of a 'config -> serialize -> deserialize' chain"
-        );
-        assert!(
-            params_from_serialized_pretty == expected_params,
-            "Expected the config without credentials in the end of a 'config -> serialize pretty -> deserialize' chain"
-        );
-    }
+    info!("Shut down successfully completed");
+    std::process::exit(0);
 }
--- a/pageserver/src/bin/pageserver_zst.rs
+++ b/pageserver/src/bin/pageserver_zst.rs
@@ -0,0 +1,334 @@
+//! A CLI helper to deal with remote storage (S3, usually) blobs as archives.
+//! See [`compression`] for more details about the archives.
+
+use std::{collections::BTreeSet, path::Path};
+
+use anyhow::{bail, ensure, Context};
+use clap::{App, Arg};
+use pageserver::{
+    layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
+    remote_storage::compression,
+};
+use tokio::{fs, io};
+use zenith_utils::GIT_VERSION;
+
+// Name of the subcommand that lists an archive, and of the archive path
+// argument (the same argument name is reused by the extract subcommand).
+const LIST_SUBCOMMAND: &str = "list";
+const ARCHIVE_ARG_NAME: &str = "archive";
+
+// Name of the subcommand that extracts an archive, and of the optional target
+// directory argument (the same argument name is reused by the create subcommand).
+const EXTRACT_SUBCOMMAND: &str = "extract";
+const TARGET_DIRECTORY_ARG_NAME: &str = "target_directory";
+
+// Name of the subcommand that creates an archive, and of its source directory
+// argument.
+const CREATE_SUBCOMMAND: &str = "create";
+const SOURCE_DIRECTORY_ARG_NAME: &str = "source_directory";
+
+/// CLI entry point: parses the command line and dispatches to the `list`,
+/// `extract` or `create` subcommand handler.
+///
+/// Returns an error if no subcommand is given, if a required argument is
+/// missing, or if the selected handler fails.
+#[tokio::main(flavor = "current_thread")]
+async fn main() -> anyhow::Result<()> {
+    let arg_matches = App::new("pageserver zst blob [un]compressor utility")
+        .version(GIT_VERSION)
+        .subcommands(vec![
+            App::new(LIST_SUBCOMMAND)
+                .about("List the archive contents")
+                .arg(
+                    Arg::new(ARCHIVE_ARG_NAME)
+                        .required(true)
+                        .takes_value(true)
+                        .help("An archive to list the contents of"),
+                ),
+            App::new(EXTRACT_SUBCOMMAND)
+                .about("Extracts the archive into the directory")
+                .arg(
+                    Arg::new(ARCHIVE_ARG_NAME)
+                        .required(true)
+                        .takes_value(true)
+                        .help("An archive to extract"),
+                )
+                .arg(
+                    Arg::new(TARGET_DIRECTORY_ARG_NAME)
+                        .required(false)
+                        .takes_value(true)
+                        .help("A directory to extract the archive into. Optional, will use the current directory if not specified"),
+                ),
+            App::new(CREATE_SUBCOMMAND)
+                .about("Creates an archive with the contents of a directory (only the first level files are taken, metadata file has to be present in the same directory)")
+                .arg(
+                    Arg::new(SOURCE_DIRECTORY_ARG_NAME)
+                        .required(true)
+                        .takes_value(true)
+                        .help("A directory to use for creating the archive"),
+                )
+                .arg(
+                    Arg::new(TARGET_DIRECTORY_ARG_NAME)
+                        .required(false)
+                        .takes_value(true)
+                        .help("A directory to create the archive in. Optional, will use the current directory if not specified"),
+                ),
+        ])
+        .get_matches();
+
+    let subcommand_name = match arg_matches.subcommand_name() {
+        Some(name) => name,
+        None => bail!("No subcommand specified"),
+    };
+
+    let subcommand_matches = match arg_matches.subcommand_matches(subcommand_name) {
+        Some(matches) => matches,
+        None => bail!(
+            "No subcommand arguments were recognized for subcommand '{}'",
+            subcommand_name
+        ),
+    };
+
+    match subcommand_name {
+        LIST_SUBCOMMAND => {
+            // `list` does not declare the target directory argument, so it is
+            // deliberately not looked up here.
+            let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
+                Some(archive) => Path::new(archive),
+                None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
+            };
+            list_archive(archive).await
+        }
+        EXTRACT_SUBCOMMAND => {
+            let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
+                Some(archive) => Path::new(archive),
+                None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
+            };
+            extract_archive(archive, target_dir_arg(subcommand_matches)).await
+        }
+        CREATE_SUBCOMMAND => {
+            let source_dir = match subcommand_matches.value_of(SOURCE_DIRECTORY_ARG_NAME) {
+                Some(source) => Path::new(source),
+                None => bail!("No '{}' argument is specified", SOURCE_DIRECTORY_ARG_NAME),
+            };
+            create_archive(source_dir, target_dir_arg(subcommand_matches)).await
+        }
+        unknown => bail!("Unknown subcommand {}", unknown),
+    }
+}
+
+/// Returns the target directory given on the command line, falling back to
+/// the current directory (`./`) when the argument is absent.
+///
+/// Only call this with the matches of a subcommand that declares
+/// `TARGET_DIRECTORY_ARG_NAME` (`extract` and `create`).
+/// NOTE(review): clap 3 debug-asserts when an argument id that was never
+/// declared is queried — confirm against the clap version in use.
+fn target_dir_arg(subcommand_matches: &clap::ArgMatches) -> &Path {
+    Path::new(
+        subcommand_matches
+            .value_of(TARGET_DIRECTORY_ARG_NAME)
+            .unwrap_or("./"),
+    )
+}
+
+/// Prints the path and uncompressed size of every file recorded in the header
+/// of the archive at `archive`, followed by the metadata file entry.
+///
+/// Fails if the path cannot be canonicalized, is not a regular file, has no
+/// UTF-8 file name, cannot be opened, or if its header cannot be read.
+async fn list_archive(archive: &Path) -> anyhow::Result<()> {
+    let archive = archive.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the archive path '{}'",
+            archive.display()
+        )
+    })?;
+    ensure!(
+        archive.is_file(),
+        "Path '{}' is not an archive file",
+        archive.display()
+    );
+    println!("Listing an archive at path '{}'", archive.display());
+    let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
+        Some(name) => name,
+        None => bail!(
+            "Failed to get the archive name from the path '{}'",
+            archive.display()
+        ),
+    };
+
+    // Read the header straight from the file instead of loading the whole
+    // archive into memory: nothing but the header is used for the listing.
+    let mut archive_file = fs::File::open(&archive).await.with_context(|| {
+        format!(
+            "Failed to open the archive file at path '{}'",
+            archive.display()
+        )
+    })?;
+    let header = compression::read_archive_header(archive_name, &mut archive_file)
+        .await
+        .context("Failed to read the archive header")?;
+
+    let empty_path = Path::new("");
+    println!("-------------------------------");
+
+    // Column width for the path: the longest UTF-8 representable path in the
+    // header, but never narrower than the metadata file name printed last.
+    let longest_path_in_archive = header
+        .files
+        .iter()
+        .filter_map(|file| Some(file.subpath.as_path(empty_path).to_str()?.len()))
+        .max()
+        .unwrap_or_default()
+        .max(METADATA_FILE_NAME.len());
+
+    for regular_file in &header.files {
+        println!(
+            "File: {:width$} uncompressed size: {} bytes",
+            regular_file.subpath.as_path(empty_path).display(),
+            regular_file.size,
+            width = longest_path_in_archive,
+        )
+    }
+    println!(
+        "File: {:width$} uncompressed size: {} bytes",
+        METADATA_FILE_NAME,
+        header.metadata_file_size,
+        width = longest_path_in_archive,
+    );
+    println!("-------------------------------");
+
+    Ok(())
+}
+
+/// Extracts the archive at `archive` into `target_dir`.
+///
+/// The target directory is created if it does not exist and must be empty.
+/// Fails if either path cannot be canonicalized, if `archive` is not a regular
+/// file, if `target_dir` is not an empty directory, or if reading the header
+/// or uncompressing the archive fails.
+async fn extract_archive(archive: &Path, target_dir: &Path) -> anyhow::Result<()> {
+    let archive = archive.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the archive path '{}'",
+            archive.display()
+        )
+    })?;
+    ensure!(
+        archive.is_file(),
+        "Path '{}' is not an archive file",
+        archive.display()
+    );
+    let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
+        Some(name) => name,
+        None => bail!(
+            "Failed to get the archive name from the path '{}'",
+            archive.display()
+        ),
+    };
+
+    // Create the directory first: `canonicalize` below requires an existing path.
+    if !target_dir.exists() {
+        fs::create_dir_all(target_dir).await.with_context(|| {
+            format!(
+                "Failed to create the target dir at path '{}'",
+                target_dir.display()
+            )
+        })?;
+    }
+    let target_dir = target_dir.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the target dir path '{}'",
+            target_dir.display()
+        )
+    })?;
+    ensure!(
+        target_dir.is_dir(),
+        "Path '{}' is not a directory",
+        target_dir.display()
+    );
+    // Refuse to extract into a directory that already has any entry in it.
+    let mut dir_contents = fs::read_dir(&target_dir)
+        .await
+        .context("Failed to list the target directory contents")?;
+    let dir_entry = dir_contents
+        .next_entry()
+        .await
+        .context("Failed to list the target directory contents")?;
+    ensure!(
+        dir_entry.is_none(),
+        "Target directory '{}' is not empty",
+        target_dir.display()
+    );
+
+    println!(
+        "Extracting an archive at path '{}' into directory '{}'",
+        archive.display(),
+        target_dir.display()
+    );
+
+    let mut archive_file = fs::File::open(&archive).await.with_context(|| {
+        format!(
+            "Failed to open the archive file at path '{}'",
+            archive.display()
+        )
+    })?;
+    // The header is read first; the same file handle is then handed to the
+    // uncompress routine together with the parsed header.
+    let header = compression::read_archive_header(archive_name, &mut archive_file)
+        .await
+        .context("Failed to read the archive header")?;
+    compression::uncompress_with_header(&BTreeSet::new(), &target_dir, header, &mut archive_file)
+        .await
+        .context("Failed to extract the archive")
+}
+
+/// Creates an archive from the first-level files of `source_dir` and writes it
+/// into `target_dir` (created if missing).
+///
+/// The source directory must contain a parseable metadata file named
+/// `METADATA_FILE_NAME`; it is passed to the archiver separately from the
+/// regular files. Subdirectories and other non-file entries are ignored.
+/// Fails if either path is not a directory, if the metadata file is absent or
+/// malformed, or if the archive cannot be written.
+async fn create_archive(source_dir: &Path, target_dir: &Path) -> anyhow::Result<()> {
+    let source_dir = source_dir.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the source dir path '{}'",
+            source_dir.display()
+        )
+    })?;
+    ensure!(
+        source_dir.is_dir(),
+        "Path '{}' is not a directory",
+        source_dir.display()
+    );
+
+    // Create the directory first: `canonicalize` below requires an existing path.
+    if !target_dir.exists() {
+        fs::create_dir_all(target_dir).await.with_context(|| {
+            format!(
+                "Failed to create the target dir at path '{}'",
+                target_dir.display()
+            )
+        })?;
+    }
+    let target_dir = target_dir.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the target dir path '{}'",
+            target_dir.display()
+        )
+    })?;
+    ensure!(
+        target_dir.is_dir(),
+        "Path '{}' is not a directory",
+        target_dir.display()
+    );
+
+    println!(
+        "Compressing directory '{}' and creating resulting archive in directory '{}'",
+        source_dir.display(),
+        target_dir.display()
+    );
+
+    let mut metadata_file_contents = None;
+    let mut files_to_archive = Vec::new();
+
+    let mut source_dir_contents = fs::read_dir(&source_dir)
+        .await
+        .context("Failed to read the source directory contents")?;
+
+    // Split the first-level files into the metadata file (parsed right away)
+    // and the regular files that go into the archive.
+    while let Some(source_dir_entry) = source_dir_contents
+        .next_entry()
+        .await
+        .context("Failed to read a source dir entry")?
+    {
+        let entry_path = source_dir_entry.path();
+        if entry_path.is_file() {
+            if entry_path.file_name().and_then(|name| name.to_str()) == Some(METADATA_FILE_NAME) {
+                let metadata_bytes = fs::read(entry_path)
+                    .await
+                    .context("Failed to read metadata file bytes in the source dir")?;
+                metadata_file_contents = Some(
+                    TimelineMetadata::from_bytes(&metadata_bytes)
+                        .context("Failed to parse metadata file contents in the source dir")?,
+                );
+            } else {
+                files_to_archive.push(entry_path);
+            }
+        }
+    }
+
+    let metadata = match metadata_file_contents {
+        Some(metadata) => metadata,
+        None => bail!(
+            "No metadata file found in the source dir '{}', cannot create the archive",
+            source_dir.display()
+        ),
+    };
+
+    // The callback receives the archive stream and the archive name; it copies
+    // the stream into a new file under the target directory. The value
+    // returned by the archiver is not needed here.
+    let _ = compression::archive_files_as_stream(
+        &source_dir,
+        files_to_archive.iter(),
+        &metadata,
+        move |mut archive_streamer, archive_name| async move {
+            let archive_target = target_dir.join(&archive_name);
+            let mut archive_file = fs::File::create(&archive_target).await?;
+            io::copy(&mut archive_streamer, &mut archive_file).await?;
+            Ok(archive_target)
+        },
+    )
+    .await
+    .context("Failed to create an archive")?;
+
+    Ok(())
+}
--- a/pageserver/src/bin/update_metadata.rs
+++ b/pageserver/src/bin/update_metadata.rs
@@ -0,0 +1,72 @@
+//! Main entry point for the update_metadata executable
+//!
+//! A handy tool for debugging, that's all.
+use anyhow::Result;
+use clap::{App, Arg};
+use pageserver::layered_repository::metadata::TimelineMetadata;
+use std::path::PathBuf;
+use std::str::FromStr;
+use zenith_utils::lsn::Lsn;
+use zenith_utils::GIT_VERSION;
+
+/// Dumps the metadata file at the given path and, when `--disk_lsn` and/or
+/// `--prev_lsn` are supplied, rewrites it with those values replaced.
+///
+/// All other metadata fields are carried over unchanged. The file is written
+/// back only when at least one of the two options is present. Fails if the
+/// file cannot be read or parsed, if an LSN argument is malformed, or if the
+/// updated metadata cannot be serialized or written.
+fn main() -> Result<()> {
+    let arg_matches = App::new("Zenith update metadata utility")
+        .about("Dump or update metadata file")
+        .version(GIT_VERSION)
+        .arg(
+            Arg::new("path")
+                .help("Path to metadata file")
+                .required(true),
+        )
+        .arg(
+            Arg::new("disk_lsn")
+                .short('d')
+                .long("disk_lsn")
+                .takes_value(true)
+                .help("Replace disk consistent lsn"),
+        )
+        .arg(
+            Arg::new("prev_lsn")
+                .short('p')
+                .long("prev_lsn")
+                .takes_value(true)
+                .help("Previous record LSN"),
+        )
+        .get_matches();
+
+    // "path" is declared as required, so clap guarantees it is present here.
+    let path = PathBuf::from(arg_matches.value_of("path").unwrap());
+    let metadata_bytes = std::fs::read(&path)?;
+    let meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
+    println!("Current metadata:\n{:?}", &meta);
+
+    // Parse both overrides up front (disk LSN first, then previous record LSN)
+    // so that a malformed value aborts before anything is written.
+    let new_disk_lsn = arg_matches
+        .value_of("disk_lsn")
+        .map(Lsn::from_str)
+        .transpose()?;
+    let new_prev_lsn = arg_matches
+        .value_of("prev_lsn")
+        .map(Lsn::from_str)
+        .transpose()?;
+
+    if new_disk_lsn.is_some() || new_prev_lsn.is_some() {
+        // Rebuild the metadata once, substituting only the overridden fields.
+        let updated_meta = TimelineMetadata::new(
+            new_disk_lsn.unwrap_or_else(|| meta.disk_consistent_lsn()),
+            new_prev_lsn.or_else(|| meta.prev_record_lsn()),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            meta.latest_gc_cutoff_lsn(),
+            meta.initdb_lsn(),
+        );
+        let metadata_bytes = updated_meta.to_bytes()?;
+        std::fs::write(&path, &metadata_bytes)?;
+    }
+    Ok(())
+}
--- a/pageserver/src/branches.rs
+++ b/pageserver/src/branches.rs
@@ -16,16 +16,18 @@ use std::{
 };
 use tracing::*;

-use zenith_utils::crashsafe_dir;
-use zenith_utils::logging;
 use zenith_utils::lsn::Lsn;
 use zenith_utils::zid::{ZTenantId, ZTimelineId};
+use zenith_utils::{crashsafe_dir, logging};

+use crate::config::PageServerConf;
+use crate::pgdatadir_mapping::DatadirTimeline;
+use crate::repository::{Repository, Timeline};
 use crate::tenant_mgr;
 use crate::walredo::WalRedoManager;
 use crate::CheckpointConfig;
-use crate::{repository::Repository, PageServerConf};
-use crate::{restore_local_repo, LOG_FILE_NAME};
+use crate::RepositoryImpl;
+use crate::{import_datadir, LOG_FILE_NAME};

 #[derive(Serialize, Deserialize, Clone)]
 pub struct BranchInfo {
@@ -42,43 +44,40 @@ pub struct BranchInfo {
 impl BranchInfo {
    pub fn from_path<T: AsRef<Path>>(
        path: T,
-        conf: &PageServerConf,
-        tenantid: &ZTenantId,
-        repo: &Arc<dyn Repository>,
+        tenantid: ZTenantId,
        include_non_incremental_logical_size: bool,
    ) -> Result<Self> {
-        let name = path
-            .as_ref()
-            .file_name()
-            .unwrap()
-            .to_str()
-            .unwrap()
-            .to_string();
-        let timeline_id = std::fs::read_to_string(path)?.parse::<ZTimelineId>()?;
+        let path = path.as_ref();
+        let name = path.file_name().unwrap().to_string_lossy().to_string();
+        let timeline_id = std::fs::read_to_string(path)
+            .with_context(|| {
+                format!(
+                    "Failed to read branch file contents at path '{}'",
+                    path.display()
+                )
+            })?
+            .parse::<ZTimelineId>()?;

-        let timeline = repo.get_timeline(timeline_id)?;
+        let timeline = match tenant_mgr::get_timeline_for_tenant(tenantid, timeline_id) {
+            Ok(timeline) => timeline,
+            Err(err) => {
+                // FIXME: this was:
+                // bail!("Timeline {} is remote, no branches to display", timeline_id)
+                //
+                // but we cannot distinguish that from other errors now. Have
+                // get_timeline_for_tenant() return a more specific error
+                return Err(err);
+            }
+        };

-        let ancestor_path = conf.ancestor_path(&timeline_id, tenantid);
-        let mut ancestor_id: Option<String> = None;
-        let mut ancestor_lsn: Option<String> = None;
-
-        if ancestor_path.exists() {
-            let ancestor = std::fs::read_to_string(ancestor_path)?;
-            let mut strings = ancestor.split('@');
-
-            ancestor_id = Some(
-                strings
-                    .next()
-                    .with_context(|| "wrong branch ancestor point in time format")?
-                    .to_owned(),
-            );
-            ancestor_lsn = Some(
-                strings
-                    .next()
-                    .with_context(|| "wrong branch ancestor point in time format")?
-                    .to_owned(),
-            );
-        }
+        // we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
+        let (ancestor_id, ancestor_lsn) = match timeline.tline.get_ancestor_timeline_id() {
+            Some(ancestor_id) => (
+                Some(ancestor_id.to_string()),
+                Some(timeline.tline.get_ancestor_lsn().to_string()),
+            ),
+            None => (None, None),
+        };

        // non incremental size calculation can be heavy, so let it be optional
        // needed for tests to check size calculation
@@ -87,6 +86,7 @@ impl BranchInfo {
                timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
            })
            .transpose()?;
+        let current_logical_size = timeline.get_current_logical_size();

        Ok(BranchInfo {
            name,
@@ -94,7 +94,7 @@ impl BranchInfo {
            latest_valid_lsn: timeline.get_last_record_lsn(),
            ancestor_id,
            ancestor_lsn,
-            current_logical_size: timeline.get_current_logical_size(),
+            current_logical_size,
            current_logical_size_non_incremental,
        })
    }
@@ -126,7 +126,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
    if let Some(tenantid) = create_tenant {
        let tenantid = ZTenantId::from_str(tenantid)?;
        println!("initializing tenantid {}", tenantid);
-        create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?;
+        create_repo(conf, tenantid, dummy_redo_mgr).context("failed to create repo")?;
    }
    crashsafe_dir::create_dir_all(conf.tenants_path())?;

@@ -138,7 +138,7 @@ pub fn create_repo(
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
    wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
-) -> Result<Arc<dyn Repository>> {
+) -> Result<Arc<RepositoryImpl>> {
    let repo_dir = conf.tenant_path(&tenantid);
    if repo_dir.exists() {
        bail!("repo for {} already exists", tenantid)
@@ -154,21 +154,25 @@ pub fn create_repo(

    info!("created directory structure in {}", repo_dir.display());

-    let tli = create_timeline(conf, None, &tenantid)?;
+    // create a new timeline directory
+    let timeline_id = ZTimelineId::generate();
+    let timelinedir = conf.timeline_path(&timeline_id, &tenantid);

-    let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
+    crashsafe_dir::create_dir(&timelinedir)?;
+
+    let repo = crate::layered_repository::LayeredRepository::new(
        conf,
        wal_redo_manager,
        tenantid,
-        false,
-    ));
+        conf.remote_storage_config.is_some(),
+    );

    // Load data into pageserver
    // TODO To implement zenith import we need to
    //      move data loading out of create_repo()
-    bootstrap_timeline(conf, tenantid, tli, repo.as_ref())?;
+    bootstrap_timeline(conf, tenantid, timeline_id, &repo)?;

-    Ok(repo)
+    Ok(Arc::new(repo))
 }

 // Returns checkpoint LSN from controlfile
@@ -191,6 +195,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
    let initdb_output = Command::new(initdb_path)
        .args(&["-D", initdbpath.to_str().unwrap()])
        .args(&["-U", &conf.superuser])
+        .args(&["-E", "utf8"])
        .arg("--no-instructions")
        // This is only used for a temporary installation that is deleted shortly after,
        // so no need to fsync it
@@ -200,7 +205,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
        .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
        .stdout(Stdio::null())
        .output()
-        .with_context(|| "failed to execute initdb")?;
+        .context("failed to execute initdb")?;
    if !initdb_output.status.success() {
        anyhow::bail!(
            "initdb failed: '{}'",
@@ -215,11 +220,11 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
 // - run initdb to init temporary instance and get bootstrap data
 // - after initialization complete, remove the temp dir.
 //
-fn bootstrap_timeline(
+fn bootstrap_timeline<R: Repository>(
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
    tli: ZTimelineId,
-    repo: &dyn Repository,
+    repo: &R,
 ) -> Result<()> {
    let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();

@@ -233,18 +238,19 @@ fn bootstrap_timeline(

    // Import the contents of the data directory at the initial checkpoint
    // LSN, and any WAL after that.
-    let timeline = repo.create_empty_timeline(tli)?;
-    restore_local_repo::import_timeline_from_postgres_datadir(
-        &pgdata_path,
-        timeline.writer().as_ref(),
-        lsn,
-    )?;
-    timeline.checkpoint(CheckpointConfig::Forced)?;
+    // Initdb lsn will be equal to last_record_lsn which will be set after import.
+    // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
+    let timeline = repo.create_empty_timeline(tli, lsn)?;
+
+    let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline);
+
+    import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
+    page_tline.tline.checkpoint(CheckpointConfig::Forced)?;

    println!(
        "created initial timeline {} timeline.lsn {}",
        tli,
-        timeline.get_last_record_lsn()
+        page_tline.tline.get_last_record_lsn()
    );

    let data = tli.to_string();
@@ -262,8 +268,6 @@ pub(crate) fn get_branches(
    tenantid: &ZTenantId,
    include_non_incremental_logical_size: bool,
 ) -> Result<Vec<BranchInfo>> {
-    let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
-
    // Each branch has a corresponding record (text file) in the refs/branches
    // with timeline_id.
    let branches_dir = conf.branches_path(tenantid);
@@ -286,9 +290,7 @@ pub(crate) fn get_branches(
            })?;
            BranchInfo::from_path(
                dir_entry.path(),
-                conf,
-                tenantid,
-                &repo,
+                *tenantid,
                include_non_incremental_logical_size,
            )
        })
@@ -308,7 +310,10 @@ pub(crate) fn create_branch(
    }

    let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
-    let timeline = repo.get_timeline(startpoint.timelineid)?;
+    let timeline = repo
+        .get_timeline(startpoint.timelineid)?
+        .local_timeline()
+        .context("Cannot branch off the timeline that's not present locally")?;
    if startpoint.lsn == Lsn(0) {
        // Find end of WAL on the old timeline
        let end_of_wal = timeline.get_last_record_lsn();
@@ -324,33 +329,34 @@ pub(crate) fn create_branch(
        timeline.wait_lsn(startpoint.lsn)?;
    }
    startpoint.lsn = startpoint.lsn.align();
-    if timeline.get_start_lsn() > startpoint.lsn {
+    if timeline.get_ancestor_lsn() > startpoint.lsn {
+        // can we safely just branch from the ancestor instead?
        anyhow::bail!(
-            "invalid startpoint {} for the branch {}: less than timeline start {}",
+            "invalid startpoint {} for the branch {}: less than timeline ancestor lsn {:?}",
            startpoint.lsn,
            branchname,
-            timeline.get_start_lsn()
+            timeline.get_ancestor_lsn()
        );
    }

-    // create a new timeline directory for it
-    let newtli = create_timeline(conf, Some(startpoint), tenantid)?;
+    let new_timeline_id = ZTimelineId::generate();

-    // Let the Repository backend do its initialization
-    repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;
+    // Forward entire timeline creation routine to repository
+    // backend, so it can do all needed initialization
+    repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?;

    // Remember the human-readable branch name for the new timeline.
    // FIXME: there's a race condition, if you create a branch with the same
    // name concurrently.
-    let data = newtli.to_string();
+    let data = new_timeline_id.to_string();
    fs::write(conf.branch_path(branchname, tenantid), data)?;

    Ok(BranchInfo {
        name: branchname.to_string(),
-        timeline_id: newtli,
+        timeline_id: new_timeline_id,
        latest_valid_lsn: startpoint.lsn,
-        ancestor_id: None,
-        ancestor_lsn: None,
+        ancestor_id: Some(startpoint.timelineid.to_string()),
+        ancestor_lsn: Some(startpoint.lsn.to_string()),
        current_logical_size: 0,
        current_logical_size_non_incremental: Some(0),
    })
@@ -383,14 +389,11 @@ fn parse_point_in_time(
    let mut strings = s.split('@');
    let name = strings.next().unwrap();

-    let lsn: Option<Lsn>;
-    if let Some(lsnstr) = strings.next() {
-        lsn = Some(
-            Lsn::from_str(lsnstr).with_context(|| "invalid LSN in point-in-time specification")?,
-        );
-    } else {
-        lsn = None
-    }
+    let lsn = strings
+        .next()
+        .map(Lsn::from_str)
+        .transpose()
+        .context("invalid LSN in point-in-time specification")?;

    // Check if it's a tag
    if lsn.is_none() {
@@ -428,24 +431,3 @@ fn parse_point_in_time(

    bail!("could not parse point-in-time {}", s);
 }
-
-fn create_timeline(
-    conf: &PageServerConf,
-    ancestor: Option<PointInTime>,
-    tenantid: &ZTenantId,
-) -> Result<ZTimelineId> {
-    // Create initial timeline
-
-    let timelineid = ZTimelineId::generate();
-
-    let timelinedir = conf.timeline_path(&timelineid, tenantid);
-
-    fs::create_dir(&timelinedir)?;
-
-    if let Some(ancestor) = ancestor {
-        let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
-        fs::write(timelinedir.join("ancestor"), data)?;
-    }
-
-    Ok(timelineid)
-}
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -0,0 +1,898 @@
+//! Functions for handling page server configuration options
+//!
+//! Configuration options can be set in the pageserver.toml configuration
+//! file, or on the command line.
+//! See also `settings.md` for a more detailed description of every parameter.
+
+use anyhow::{bail, ensure, Context, Result};
+use toml_edit;
+use toml_edit::{Document, Item};
+use zenith_utils::postgres_backend::AuthType;
+use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
+
+use std::convert::TryInto;
+use std::env;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::path::{Path, PathBuf};
+use std::str::FromStr;
+use std::time::Duration;
+
+use crate::layered_repository::TIMELINES_SEGMENT_NAME;
+
+/// Built-in default values for the page server configuration options.
+///
+/// These are used both to pre-populate the config builder and to render
+/// the commented-out template in `DEFAULT_CONFIG_FILE`.
+pub mod defaults {
+    use const_format::formatcp;
+
+    // Listen addresses are composed at compile time from the port constants
+    // and use the loopback address 127.0.0.1.
+    pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
+    pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
+    pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
+    pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
+
+    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
+    // would be more appropriate. But a low value forces the code to be exercised more,
+    // which is good for now to trigger bugs.
+    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
+    // Durations are kept as human-readable strings so that they can be embedded
+    // verbatim into DEFAULT_CONFIG_FILE; they are parsed with humantime::parse_duration
+    // when the config builder is created.
+    pub const DEFAULT_CHECKPOINT_PERIOD: &str = "1 s";
+
+    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
+    pub const DEFAULT_GC_PERIOD: &str = "100 s";
+
+    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
+    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
+
+    pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
+    // Used when the `[remote_storage]` section omits the corresponding option.
+    pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100;
+    pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
+
+    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
+    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
+
+    ///
+    /// Default built-in configuration file.
+    ///
+    /// Every option is commented out, so the file only documents the defaults.
+    ///
+    pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
+        r###"
+# Initial configuration file created by 'pageserver --init'
+
+#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
+#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
+
+#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
+#checkpoint_period = '{DEFAULT_CHECKPOINT_PERIOD}'
+
+#gc_period = '{DEFAULT_GC_PERIOD}'
+#gc_horizon = {DEFAULT_GC_HORIZON}
+
+#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
+#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
+
+#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
+
+# initial superuser role name to use when creating a new tenant
+#initial_superuser_name = '{DEFAULT_SUPERUSER}'
+
+# [remote_storage]
+
+"###
+    );
+}
+
+/// Page server configuration, as produced by `PageServerConf::parse_and_validate`
+/// from the `pageserver.toml` contents merged with the built-in defaults.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct PageServerConf {
+    // Identifier of that particular pageserver so e.g. safekeepers
+    // can safely distinguish different pageservers.
+    // It has no built-in default: the `id` option must be present in the config.
+    pub id: ZNodeId,
+
+    /// Example (default): 127.0.0.1:64000
+    pub listen_pg_addr: String,
+    /// Example (default): 127.0.0.1:9898
+    pub listen_http_addr: String,
+
+    // Flush out an inmemory layer, if it's holding WAL older than this
+    // This puts a backstop on how much WAL needs to be re-digested if the
+    // page server crashes.
+    pub checkpoint_distance: u64,
+    pub checkpoint_period: Duration,
+
+    pub gc_horizon: u64,
+    pub gc_period: Duration,
+
+    // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
+    pub wait_lsn_timeout: Duration,
+    // How long to wait for WAL redo to complete.
+    pub wal_redo_timeout: Duration,
+
+    // Initial superuser role name to use when creating a new tenant
+    // (the `initial_superuser_name` option in the config file).
+    pub superuser: String,
+
+    pub page_cache_size: usize,
+    pub max_file_descriptors: usize,
+
+    // Repository directory, relative to current working directory.
+    // Normally, the page server changes the current working directory
+    // to the repository, and 'workdir' is always '.'. But we don't do
+    // that during unit testing, because the current directory is global
+    // to the process but different unit tests work on different
+    // repositories.
+    pub workdir: PathBuf,
+
+    // Postgres distribution directory; config validation requires
+    // `bin/postgres` to exist inside it.
+    pub pg_distrib_dir: PathBuf,
+
+    pub auth_type: AuthType,
+
+    // With `AuthType::ZenithJWT`, config validation fills this in with
+    // `<workdir>/auth_public_key.pem` when not set explicitly, and requires the file to exist.
+    pub auth_validation_public_key_path: Option<PathBuf>,
+    // Contents of the `remote_storage` option; `None` when the option is absent.
+    pub remote_storage_config: Option<RemoteStorageConfig>,
+}
+
+/// State of a single field inside a config builder: explicitly `Set` or `NotSet`.
+///
+/// A dedicated enum is used instead of `Option` to better indicate the intention
+/// and to avoid possible confusion with fields whose own type is an `Option`.
+pub enum BuilderValue<T> {
+    Set(T),
+    NotSet,
+}
+
+impl<T> BuilderValue<T> {
+    /// Turns the builder value into a `Result`: the payload when `Set`,
+    /// the supplied `err` when `NotSet`.
+    pub fn ok_or<E>(self, err: E) -> Result<T, E> {
+        if let Self::Set(value) = self {
+            Ok(value)
+        } else {
+            Err(err)
+        }
+    }
+}
+
+// Builder used to simplify config construction: it starts out from the built-in
+// defaults (see the `Default` impl), gets individual fields overridden through
+// the setter methods, and is turned into a `PageServerConf` by `build()`.
+struct PageServerConfigBuilder {
+    listen_pg_addr: BuilderValue<String>,
+
+    listen_http_addr: BuilderValue<String>,
+
+    checkpoint_distance: BuilderValue<u64>,
+    checkpoint_period: BuilderValue<Duration>,
+
+    gc_horizon: BuilderValue<u64>,
+    gc_period: BuilderValue<Duration>,
+
+    wait_lsn_timeout: BuilderValue<Duration>,
+    wal_redo_timeout: BuilderValue<Duration>,
+
+    superuser: BuilderValue<String>,
+
+    page_cache_size: BuilderValue<usize>,
+    max_file_descriptors: BuilderValue<usize>,
+
+    workdir: BuilderValue<PathBuf>,
+
+    pg_distrib_dir: BuilderValue<PathBuf>,
+
+    auth_type: BuilderValue<AuthType>,
+
+    // These two are optional in the final config as well, hence the `Option`
+    // nested inside the `BuilderValue`.
+    auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
+    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
+
+    // The only field without a built-in default: it has to be set explicitly,
+    // otherwise `build()` fails.
+    id: BuilderValue<ZNodeId>,
+}
+
+impl Default for PageServerConfigBuilder {
+    /// Creates a builder pre-populated with the built-in defaults.
+    ///
+    /// The node `id` has no default and is left `NotSet`. Panics if one of the
+    /// built-in duration strings cannot be parsed, or if the current directory
+    /// is not accessible.
+    fn default() -> Self {
+        use defaults::*;
+
+        // The fallible pieces are computed up front, in the same order as the
+        // fields they belong to.
+        let checkpoint_period = humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)
+            .expect("cannot parse default checkpoint period");
+        let gc_period =
+            humantime::parse_duration(DEFAULT_GC_PERIOD).expect("cannot parse default gc period");
+        let wait_lsn_timeout = humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
+            .expect("cannot parse default wait lsn timeout");
+        let wal_redo_timeout = humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
+            .expect("cannot parse default wal redo timeout");
+        // By default, the Postgres distribution is looked up in ./tmp_install
+        let pg_distrib_dir = env::current_dir()
+            .expect("cannot access current directory")
+            .join("tmp_install");
+
+        Self {
+            listen_pg_addr: BuilderValue::Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
+            listen_http_addr: BuilderValue::Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
+            checkpoint_distance: BuilderValue::Set(DEFAULT_CHECKPOINT_DISTANCE),
+            checkpoint_period: BuilderValue::Set(checkpoint_period),
+            gc_horizon: BuilderValue::Set(DEFAULT_GC_HORIZON),
+            gc_period: BuilderValue::Set(gc_period),
+            wait_lsn_timeout: BuilderValue::Set(wait_lsn_timeout),
+            wal_redo_timeout: BuilderValue::Set(wal_redo_timeout),
+            superuser: BuilderValue::Set(DEFAULT_SUPERUSER.to_string()),
+            page_cache_size: BuilderValue::Set(DEFAULT_PAGE_CACHE_SIZE),
+            max_file_descriptors: BuilderValue::Set(DEFAULT_MAX_FILE_DESCRIPTORS),
+            workdir: BuilderValue::Set(PathBuf::new()),
+            pg_distrib_dir: BuilderValue::Set(pg_distrib_dir),
+            auth_type: BuilderValue::Set(AuthType::Trust),
+            auth_validation_public_key_path: BuilderValue::Set(None),
+            remote_storage_config: BuilderValue::Set(None),
+            id: BuilderValue::NotSet,
+        }
+    }
+}
+
+impl PageServerConfigBuilder {
+    // Setters: each one marks the corresponding field as explicitly `Set`,
+    // overriding whatever value (default or not) the builder held before.
+
+    pub fn listen_pg_addr(&mut self, listen_pg_addr: String) {
+        self.listen_pg_addr = BuilderValue::Set(listen_pg_addr)
+    }
+
+    pub fn listen_http_addr(&mut self, listen_http_addr: String) {
+        self.listen_http_addr = BuilderValue::Set(listen_http_addr)
+    }
+
+    pub fn checkpoint_distance(&mut self, checkpoint_distance: u64) {
+        self.checkpoint_distance = BuilderValue::Set(checkpoint_distance)
+    }
+
+    pub fn checkpoint_period(&mut self, checkpoint_period: Duration) {
+        self.checkpoint_period = BuilderValue::Set(checkpoint_period)
+    }
+
+    pub fn gc_horizon(&mut self, gc_horizon: u64) {
+        self.gc_horizon = BuilderValue::Set(gc_horizon)
+    }
+
+    pub fn gc_period(&mut self, gc_period: Duration) {
+        self.gc_period = BuilderValue::Set(gc_period)
+    }
+
+    pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
+        self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
+    }
+
+    pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) {
+        self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout)
+    }
+
+    pub fn superuser(&mut self, superuser: String) {
+        self.superuser = BuilderValue::Set(superuser)
+    }
+
+    pub fn page_cache_size(&mut self, page_cache_size: usize) {
+        self.page_cache_size = BuilderValue::Set(page_cache_size)
+    }
+
+    pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) {
+        self.max_file_descriptors = BuilderValue::Set(max_file_descriptors)
+    }
+
+    pub fn workdir(&mut self, workdir: PathBuf) {
+        self.workdir = BuilderValue::Set(workdir)
+    }
+
+    pub fn pg_distrib_dir(&mut self, pg_distrib_dir: PathBuf) {
+        self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
+    }
+
+    pub fn auth_type(&mut self, auth_type: AuthType) {
+        self.auth_type = BuilderValue::Set(auth_type)
+    }
+
+    pub fn auth_validation_public_key_path(
+        &mut self,
+        auth_validation_public_key_path: Option<PathBuf>,
+    ) {
+        self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path)
+    }
+
+    pub fn remote_storage_config(&mut self, remote_storage_config: Option<RemoteStorageConfig>) {
+        self.remote_storage_config = BuilderValue::Set(remote_storage_config)
+    }
+
+    pub fn id(&mut self, node_id: ZNodeId) {
+        self.id = BuilderValue::Set(node_id)
+    }
+
+    /// Consumes the builder and produces the final `PageServerConf`.
+    ///
+    /// Fails with a `missing <field>` error for the first field (in declaration
+    /// order of the struct literal below) that is still `NotSet`.
+    pub fn build(self) -> Result<PageServerConf> {
+        // Unwraps a single builder field. The error is constructed only when the
+        // field is really missing, rather than eagerly for every field on every call.
+        fn required<T>(value: BuilderValue<T>, name: &str) -> Result<T> {
+            match value {
+                BuilderValue::Set(v) => Ok(v),
+                BuilderValue::NotSet => bail!("missing {}", name),
+            }
+        }
+
+        Ok(PageServerConf {
+            listen_pg_addr: required(self.listen_pg_addr, "listen_pg_addr")?,
+            listen_http_addr: required(self.listen_http_addr, "listen_http_addr")?,
+            checkpoint_distance: required(self.checkpoint_distance, "checkpoint_distance")?,
+            checkpoint_period: required(self.checkpoint_period, "checkpoint_period")?,
+            gc_horizon: required(self.gc_horizon, "gc_horizon")?,
+            gc_period: required(self.gc_period, "gc_period")?,
+            wait_lsn_timeout: required(self.wait_lsn_timeout, "wait_lsn_timeout")?,
+            wal_redo_timeout: required(self.wal_redo_timeout, "wal_redo_timeout")?,
+            superuser: required(self.superuser, "superuser")?,
+            page_cache_size: required(self.page_cache_size, "page_cache_size")?,
+            max_file_descriptors: required(self.max_file_descriptors, "max_file_descriptors")?,
+            workdir: required(self.workdir, "workdir")?,
+            pg_distrib_dir: required(self.pg_distrib_dir, "pg_distrib_dir")?,
+            auth_type: required(self.auth_type, "auth_type")?,
+            auth_validation_public_key_path: required(
+                self.auth_validation_public_key_path,
+                "auth_validation_public_key_path",
+            )?,
+            remote_storage_config: required(self.remote_storage_config, "remote_storage_config")?,
+            id: required(self.id, "id")?,
+        })
+    }
+}
+
+/// External backup storage configuration, enough for creating a client for that storage.
+///
+/// Parsed from the `remote_storage` option of the page server config.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct RemoteStorageConfig {
+    /// Max allowed number of concurrent sync operations between pageserver and the remote storage.
+    /// Falls back to `defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC` when not configured.
+    pub max_concurrent_sync: NonZeroUsize,
+    /// Max allowed errors before the sync task is considered failed and evicted.
+    /// Falls back to `defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS` when not configured.
+    pub max_sync_errors: NonZeroU32,
+    /// The storage connection configuration.
+    pub storage: RemoteStorageKind,
+}
+
+/// A kind of a remote storage to connect to, with its connection configuration.
+///
+/// The two kinds are mutually exclusive in the config: `local_path` selects
+/// `LocalFs`, while `bucket_name` together with `bucket_region` selects `AwsS3`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum RemoteStorageKind {
+    /// Storage based on local file system.
+    /// Specify a root folder to place all stored relish data into.
+    LocalFs(PathBuf),
+    /// AWS S3 based storage, storing all relishes into the root
+    /// of the S3 bucket from the config.
+    AwsS3(S3Config),
+}
+
+/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
+///
+/// `Debug` is implemented by hand, so that the credentials never end up in the output.
+#[derive(Clone, PartialEq, Eq)]
+pub struct S3Config {
+    /// Name of the bucket to connect to.
+    pub bucket_name: String,
+    /// The region where the bucket is located at.
+    pub bucket_region: String,
+    /// A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once.
+    pub prefix_in_bucket: Option<String>,
+    /// "Login" to use when connecting to bucket.
+    /// Can be empty for cases like AWS k8s IAM
+    /// where we can allow certain pods to connect
+    /// to the bucket directly without any credentials.
+    pub access_key_id: Option<String>,
+    /// "Password" to use when connecting to bucket.
+    pub secret_access_key: Option<String>,
+    /// A base URL to send S3 requests to.
+    /// By default, the endpoint is derived from a region name, assuming it's
+    /// an AWS S3 region name, erroring on wrong region name.
+    /// Endpoint provides a way to support other S3 flavors and their regions.
+    ///
+    /// Example: `http://127.0.0.1:5000`
+    pub endpoint: Option<String>,
+}
+
+impl std::fmt::Debug for S3Config {
+    /// Prints the bucket name, region and prefix only: `access_key_id`,
+    /// `secret_access_key` and `endpoint` are left out of the output.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut printer = f.debug_struct("S3Config");
+        printer.field("bucket_name", &self.bucket_name);
+        printer.field("bucket_region", &self.bucket_region);
+        printer.field("prefix_in_bucket", &self.prefix_in_bucket);
+        printer.finish()
+    }
+}
+
+impl PageServerConf {
+    //
+    // Repository paths, relative to workdir.
+    //
+
+    /// Directory with all tenants: `<workdir>/tenants`.
+    pub fn tenants_path(&self) -> PathBuf {
+        self.workdir.join("tenants")
+    }
+
+    /// Directory of a single tenant: `<workdir>/tenants/<tenantid>`.
+    pub fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
+        self.tenants_path().join(tenantid.to_string())
+    }
+
+    /// Directory with the tenant's tags: `<tenant dir>/refs/tags`.
+    pub fn tags_path(&self, tenantid: &ZTenantId) -> PathBuf {
+        self.tenant_path(tenantid).join("refs").join("tags")
+    }
+
+    /// Path of a single tag: `<tenant dir>/refs/tags/<tag_name>`.
+    pub fn tag_path(&self, tag_name: &str, tenantid: &ZTenantId) -> PathBuf {
+        self.tags_path(tenantid).join(tag_name)
+    }
+
+    /// Directory with the tenant's branches: `<tenant dir>/refs/branches`.
+    pub fn branches_path(&self, tenantid: &ZTenantId) -> PathBuf {
+        self.tenant_path(tenantid).join("refs").join("branches")
+    }
+
+    /// Path of a single branch: `<tenant dir>/refs/branches/<branch_name>`.
+    pub fn branch_path(&self, branch_name: &str, tenantid: &ZTenantId) -> PathBuf {
+        self.branches_path(tenantid).join(branch_name)
+    }
+
+    /// Directory with the tenant's timelines: `<tenant dir>/<TIMELINES_SEGMENT_NAME>`.
+    pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
+        self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
+    }
+
+    /// Directory of a single timeline: `<timelines dir>/<timelineid>`.
+    pub fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
+        self.timelines_path(tenantid).join(timelineid.to_string())
+    }
+
+    /// Path of the `ancestor` entry inside the timeline directory.
+    pub fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
+        self.timeline_path(timelineid, tenantid).join("ancestor")
+    }
+
+    //
+    // Postgres distribution paths
+    //
+
+    /// `bin` subdirectory of the Postgres distribution directory.
+    pub fn pg_bin_dir(&self) -> PathBuf {
+        self.pg_distrib_dir.join("bin")
+    }
+
+    /// `lib` subdirectory of the Postgres distribution directory.
+    pub fn pg_lib_dir(&self) -> PathBuf {
+        self.pg_distrib_dir.join("lib")
+    }
+
+    /// Parse a configuration file (pageserver.toml) into a PageServerConf struct,
+    /// validating the input and failing on errors.
+    ///
+    /// This leaves any options not present in the file in the built-in defaults.
+    /// The `id` option has no default and must be present. `workdir` is not read
+    /// from the file, it is always taken from the argument.
+    ///
+    /// Fails when:
+    /// - an unrecognized top-level option is encountered, or an option has a wrong type;
+    /// - a mandatory option is missing;
+    /// - `auth_type` is `ZenithJWT` and the public key file does not exist;
+    /// - `bin/postgres` cannot be found under `pg_distrib_dir`.
+    pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result<Self> {
+        let mut builder = PageServerConfigBuilder::default();
+        builder.workdir(workdir.to_owned());
+
+        for (key, item) in toml.iter() {
+            match key {
+                "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
+                "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
+                "checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?),
+                "checkpoint_period" => builder.checkpoint_period(parse_toml_duration(key, item)?),
+                "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?),
+                "gc_period" => builder.gc_period(parse_toml_duration(key, item)?),
+                "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
+                "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
+                "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
+                "page_cache_size" => {
+                    // Checked conversion: a plain `as usize` cast would silently truncate
+                    // the value on targets where usize is narrower than 64 bits.
+                    let page_cache_size: usize = parse_toml_u64(key, item)?
+                        .try_into()
+                        .with_context(|| format!("configure option {} is too large", key))?;
+                    builder.page_cache_size(page_cache_size)
+                }
+                "max_file_descriptors" => {
+                    let max_file_descriptors: usize = parse_toml_u64(key, item)?
+                        .try_into()
+                        .with_context(|| format!("configure option {} is too large", key))?;
+                    builder.max_file_descriptors(max_file_descriptors)
+                }
+                "pg_distrib_dir" => {
+                    builder.pg_distrib_dir(PathBuf::from(parse_toml_string(key, item)?))
+                }
+                "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
+                    PathBuf::from(parse_toml_string(key, item)?),
+                )),
+                "auth_type" => builder.auth_type(parse_toml_auth_type(key, item)?),
+                "remote_storage" => {
+                    builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?))
+                }
+                "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
+                _ => bail!("unrecognized pageserver option '{}'", key),
+            }
+        }
+
+        let mut conf = builder.build().context("invalid config")?;
+
+        if conf.auth_type == AuthType::ZenithJWT {
+            // JWT validation needs a public key: fall back to the well-known
+            // location in the workdir when no path was configured explicitly.
+            let auth_validation_public_key_path = conf
+                .auth_validation_public_key_path
+                .get_or_insert_with(|| workdir.join("auth_public_key.pem"));
+            ensure!(
+                auth_validation_public_key_path.exists(),
+                "Can't find auth_validation_public_key at '{}'",
+                auth_validation_public_key_path.display()
+            );
+        }
+
+        if !conf.pg_distrib_dir.join("bin/postgres").exists() {
+            bail!(
+                "Can't find postgres binary at {}",
+                conf.pg_distrib_dir.display()
+            );
+        }
+
+        Ok(conf)
+    }
+
+    /// Subroutine of `parse_and_validate()`, to parse the `[remote_storage]` table.
+    ///
+    /// Exactly one storage kind must be described: either `local_path`, or
+    /// `bucket_name` together with `bucket_region` (plus the optional
+    /// `access_key_id`, `secret_access_key`, `prefix_in_bucket` and `endpoint`).
+    /// `max_concurrent_sync` and `max_sync_errors` are optional non-zero integers
+    /// that fall back to the built-in defaults.
+    fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
+        let local_path = toml.get("local_path");
+        let bucket_name = toml.get("bucket_name");
+        let bucket_region = toml.get("bucket_region");
+
+        let max_concurrent_sync = match toml.get("max_concurrent_sync") {
+            Some(item) => {
+                // The underlying cause (not an integer, negative, too large) is kept
+                // in the error chain instead of being thrown away; the top-level
+                // message stays the same in every failure case.
+                let value: usize = parse_toml_u64("max_concurrent_sync", item)
+                    .and_then(|toml_u64| {
+                        toml_u64.try_into().with_context(|| {
+                            format!("'max_concurrent_sync' value {} is too large", toml_u64)
+                        })
+                    })
+                    .context("'max_concurrent_sync' must be a non-zero positive integer")?;
+                NonZeroUsize::new(value)
+                    .context("'max_concurrent_sync' must be a non-zero positive integer")?
+            }
+            None => {
+                NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap()
+            }
+        };
+        let max_sync_errors = match toml.get("max_sync_errors") {
+            Some(item) => {
+                let value: u32 = parse_toml_u64("max_sync_errors", item)
+                    .and_then(|toml_u64| {
+                        toml_u64.try_into().with_context(|| {
+                            format!("'max_sync_errors' value {} is too large", toml_u64)
+                        })
+                    })
+                    .context("'max_sync_errors' must be a non-zero positive integer")?;
+                NonZeroU32::new(value)
+                    .context("'max_sync_errors' must be a non-zero positive integer")?
+            }
+            None => NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap(),
+        };
+
+        let storage = match (local_path, bucket_name, bucket_region) {
+            (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
+            // The bucket name and region only make sense together
+            (_, Some(_), None) => {
+                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
+            }
+            (_, None, Some(_)) => {
+                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
+            }
+            (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
+                bucket_name: parse_toml_string("bucket_name", bucket_name)?,
+                bucket_region: parse_toml_string("bucket_region", bucket_region)?,
+                access_key_id: toml
+                    .get("access_key_id")
+                    .map(|access_key_id| parse_toml_string("access_key_id", access_key_id))
+                    .transpose()?,
+                secret_access_key: toml
+                    .get("secret_access_key")
+                    .map(|secret_access_key| {
+                        parse_toml_string("secret_access_key", secret_access_key)
+                    })
+                    .transpose()?,
+                prefix_in_bucket: toml
+                    .get("prefix_in_bucket")
+                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
+                    .transpose()?,
+                endpoint: toml
+                    .get("endpoint")
+                    .map(|endpoint| parse_toml_string("endpoint", endpoint))
+                    .transpose()?,
+            }),
+            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
+                parse_toml_string("local_path", local_path)?,
+            )),
+            (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
+        };
+
+        Ok(RemoteStorageConfig {
+            max_concurrent_sync,
+            max_sync_errors,
+            storage,
+        })
+    }
+
+    /// Test-only helper: path of the repository directory for the test with
+    /// the given name, `../tmp_check/test_<test_name>` (relative to the current directory).
+    #[cfg(test)]
+    pub fn test_repo_dir(test_name: &str) -> PathBuf {
+        PathBuf::from(format!("../tmp_check/test_{}", test_name))
+    }
+
+    /// Test-only helper: builds a config for unit tests with `repo_dir` as the
+    /// working directory, bypassing the config file parsing and the validation.
+    ///
+    /// Values are mostly the built-in defaults, except for the shorter
+    /// checkpoint / gc periods and an empty `pg_distrib_dir`.
+    #[cfg(test)]
+    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
+        PageServerConf {
+            id: ZNodeId(0),
+            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
+            checkpoint_period: Duration::from_secs(10),
+            gc_horizon: defaults::DEFAULT_GC_HORIZON,
+            gc_period: Duration::from_secs(10),
+            wait_lsn_timeout: Duration::from_secs(60),
+            wal_redo_timeout: Duration::from_secs(60),
+            page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
+            max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
+            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
+            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
+            // Same value as before, but no longer a duplicated string literal
+            superuser: defaults::DEFAULT_SUPERUSER.to_string(),
+            workdir: repo_dir,
+            pg_distrib_dir: PathBuf::new(),
+            auth_type: AuthType::Trust,
+            auth_validation_public_key_path: None,
+            remote_storage_config: None,
+        }
+    }
+}
+
+// Helper functions to parse a toml Item
+
+/// Returns an owned copy of the string held by the toml item.
+/// Fails, mentioning the option `name`, when the item is not a string.
+fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
+    item.as_str()
+        .map(str::to_string)
+        .with_context(|| format!("configure option {} is not a string", name))
+}
+
+/// Returns the integer held by the toml item as an u64.
+/// Fails, mentioning the option `name`, when the item is not an integer or is negative.
+///
+/// A toml integer is signed, so it cannot represent the full range of an u64. That's OK
+/// for our use, though.
+fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
+    let value = item
+        .as_integer()
+        .with_context(|| format!("configure option {} is not an integer", name))?;
+    ensure!(value >= 0, "configure option {} cannot be negative", name);
+    // Lossless: the value was just checked to be non-negative
+    Ok(value as u64)
+}
+
+/// Parses the human-readable duration (e.g. `'100 s'`) held by the toml item.
+///
+/// Fails when the item is not a string, or when the string is not a valid
+/// duration. Both errors mention the option `name`, so that the user can tell
+/// which line of the config is wrong; the original parse error is kept as the cause.
+fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
+    let s = item
+        .as_str()
+        .with_context(|| format!("configure option {} is not a string", name))?;
+
+    humantime::parse_duration(s).with_context(|| {
+        format!(
+            "configure option {} has an invalid duration value '{}'",
+            name, s
+        )
+    })
+}
+
+/// Parses the authentication type from the string held by the toml item.
+/// Fails when the item is not a string, or when `AuthType::from_str` rejects it.
+fn parse_toml_auth_type(name: &str, item: &Item) -> Result<AuthType> {
+    item.as_str()
+        .with_context(|| format!("configure option {} is not a string", name))
+        .and_then(AuthType::from_str)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs;
+
+    use tempfile::{tempdir, TempDir};
+
+    use super::*;
+
+    const ALL_BASE_VALUES_TOML: &str = r#"
+# Initial configuration file created by 'pageserver --init'
+
+listen_pg_addr = '127.0.0.1:64000'
+listen_http_addr = '127.0.0.1:9898'
+
+checkpoint_distance = 111 # in bytes
+checkpoint_period = '111 s'
+
+gc_period = '222 s'
+gc_horizon = 222
+
+wait_lsn_timeout = '111 s'
+wal_redo_timeout = '111 s'
+
+page_cache_size = 444
+max_file_descriptors = 333
+
+# initial superuser role name to use when creating a new tenant
+initial_superuser_name = 'zzzz'
+id = 10
+
+"#;
+
+    // A config with only the mandatory `id` and a valid `pg_distrib_dir`
+    // must yield the built-in default for every other option.
+    #[test]
+    fn parse_defaults() -> anyhow::Result<()> {
+        let tempdir = tempdir()?;
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+        // we have to create dummy paths to overcome the validation errors
+        let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display());
+        let toml = config_string.parse()?;
+
+        let parsed_config =
+            PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| {
+                panic!("Failed to parse config '{}', reason: {}", config_string, e)
+            });
+
+        assert_eq!(
+            parsed_config,
+            PageServerConf {
+                id: ZNodeId(10),
+                listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
+                listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
+                checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
+                checkpoint_period: humantime::parse_duration(defaults::DEFAULT_CHECKPOINT_PERIOD)?,
+                gc_horizon: defaults::DEFAULT_GC_HORIZON,
+                gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?,
+                wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
+                wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
+                superuser: defaults::DEFAULT_SUPERUSER.to_string(),
+                page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
+                max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
+                workdir,
+                pg_distrib_dir,
+                auth_type: AuthType::Trust,
+                auth_validation_public_key_path: None,
+                remote_storage_config: None,
+            },
+            "Correct defaults should be used when no config values are provided"
+        );
+
+        Ok(())
+    }
+
+    // Every basic (non remote storage) option set in ALL_BASE_VALUES_TOML
+    // must override the corresponding built-in default.
+    #[test]
+    fn parse_basic_config() -> anyhow::Result<()> {
+        let tempdir = tempdir()?;
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+
+        // pg_distrib_dir is appended separately, since it has to point into the temp dir
+        let config_string = format!(
+            "{}pg_distrib_dir='{}'",
+            ALL_BASE_VALUES_TOML,
+            pg_distrib_dir.display()
+        );
+        let toml = config_string.parse()?;
+
+        let parsed_config =
+            PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| {
+                panic!("Failed to parse config '{}', reason: {}", config_string, e)
+            });
+
+        assert_eq!(
+            parsed_config,
+            PageServerConf {
+                id: ZNodeId(10),
+                listen_pg_addr: "127.0.0.1:64000".to_string(),
+                listen_http_addr: "127.0.0.1:9898".to_string(),
+                checkpoint_distance: 111,
+                checkpoint_period: Duration::from_secs(111),
+                gc_horizon: 222,
+                gc_period: Duration::from_secs(222),
+                wait_lsn_timeout: Duration::from_secs(111),
+                wal_redo_timeout: Duration::from_secs(111),
+                superuser: "zzzz".to_string(),
+                page_cache_size: 444,
+                max_file_descriptors: 333,
+                workdir,
+                pg_distrib_dir,
+                auth_type: AuthType::Trust,
+                auth_validation_public_key_path: None,
+                remote_storage_config: None,
+            },
+            "Should be able to parse all basic config values correctly"
+        );
+
+        Ok(())
+    }
+
+    // The local FS remote storage must parse to the same config, no matter
+    // whether it is declared as a `[remote_storage]` table or as an inline table.
+    #[test]
+    fn parse_remote_fs_storage_config() -> anyhow::Result<()> {
+        let tempdir = tempdir()?;
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+
+        let local_storage_path = tempdir.path().join("local_remote_storage");
+
+        // Two toml spellings of the very same remote storage declaration
+        let identical_toml_declarations = &[
+            format!(
+                r#"[remote_storage]
+local_path = '{}'"#,
+                local_storage_path.display()
+            ),
+            format!(
+                "remote_storage={{local_path='{}'}}",
+                local_storage_path.display()
+            ),
+        ];
+
+        for remote_storage_config_str in identical_toml_declarations {
+            // The remote storage declaration goes last: a toml table header
+            // captures all the key-value pairs that follow it
+            let config_string = format!(
+                r#"{}
+pg_distrib_dir='{}'
+
+{}"#,
+                ALL_BASE_VALUES_TOML,
+                pg_distrib_dir.display(),
+                remote_storage_config_str,
+            );
+
+            let toml = config_string.parse()?;
+
+            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
+                .unwrap_or_else(|e| {
+                    panic!("Failed to parse config '{}', reason: {}", config_string, e)
+                })
+                .remote_storage_config
+                .expect("Should have remote storage config for the local FS");
+
+            // No sync limits are given in the config, so the defaults are expected
+            assert_eq!(
+            parsed_remote_storage_config,
+            RemoteStorageConfig {
+                max_concurrent_sync: NonZeroUsize::new(
+                    defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC
+                )
+                .unwrap(),
+                max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
+                    .unwrap(),
+                storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
+            },
+            "Remote storage config should correctly parse the local FS config and fill other storage defaults"
+        );
+        }
+        Ok(())
+    }
+
+    // The S3 remote storage, with all of its options given, must parse to the same
+    // config, no matter whether it is declared as a table or as an inline table.
+    #[test]
+    fn parse_remote_s3_storage_config() -> anyhow::Result<()> {
+        let tempdir = tempdir()?;
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+
+        let bucket_name = "some-sample-bucket".to_string();
+        let bucket_region = "eu-north-1".to_string();
+        let prefix_in_bucket = "test_prefix".to_string();
+        let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string();
+        let secret_access_key = "SOMEsEcReTsd292v".to_string();
+        let endpoint = "http://localhost:5000".to_string();
+        // Non-default sync limits, to make sure they are taken from the config
+        let max_concurrent_sync = NonZeroUsize::new(111).unwrap();
+        let max_sync_errors = NonZeroU32::new(222).unwrap();
+
+        // Two toml spellings of the very same remote storage declaration
+        let identical_toml_declarations = &[
+            format!(
+                r#"[remote_storage]
+max_concurrent_sync = {}
+max_sync_errors = {}
+bucket_name = '{}'
+bucket_region = '{}'
+prefix_in_bucket = '{}'
+access_key_id = '{}'
+secret_access_key = '{}'
+endpoint = '{}'"#,
+                max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
+            ),
+            format!(
+                "remote_storage={{max_concurrent_sync={}, max_sync_errors={}, bucket_name='{}', bucket_region='{}', prefix_in_bucket='{}', access_key_id='{}', secret_access_key='{}', endpoint='{}'}}",
+                max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
+            ),
+        ];
+
+        for remote_storage_config_str in identical_toml_declarations {
+            // The remote storage declaration goes last: a toml table header
+            // captures all the key-value pairs that follow it
+            let config_string = format!(
+                r#"{}
+pg_distrib_dir='{}'
+
+{}"#,
+                ALL_BASE_VALUES_TOML,
+                pg_distrib_dir.display(),
+                remote_storage_config_str,
+            );
+
+            let toml = config_string.parse()?;
+
+            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
+                .unwrap_or_else(|e| {
+                    panic!("Failed to parse config '{}', reason: {}", config_string, e)
+                })
+                .remote_storage_config
+                .expect("Should have remote storage config for S3");
+
+            assert_eq!(
+                parsed_remote_storage_config,
+                RemoteStorageConfig {
+                    max_concurrent_sync,
+                    max_sync_errors,
+                    storage: RemoteStorageKind::AwsS3(S3Config {
+                        bucket_name: bucket_name.clone(),
+                        bucket_region: bucket_region.clone(),
+                        access_key_id: Some(access_key_id.clone()),
+                        secret_access_key: Some(secret_access_key.clone()),
+                        prefix_in_bucket: Some(prefix_in_bucket.clone()),
+                        endpoint: Some(endpoint.clone())
+                    }),
+                },
+                "Remote storage config should correctly parse the S3 config"
+            );
+        }
+        Ok(())
+    }
+
+    /// Creates, inside of the temp dir, everything the config validation looks for:
+    /// a `workdir` directory and a `pg_distrib` directory with a fake `bin/postgres` file.
+    ///
+    /// Returns the `(workdir, pg_distrib_dir)` paths.
+    fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
+        let root = tempdir.path();
+
+        let workdir = root.join("workdir");
+        fs::create_dir_all(&workdir)?;
+
+        let pg_distrib_dir = root.join("pg_distrib");
+        fs::create_dir_all(&pg_distrib_dir)?;
+
+        let bin_dir = pg_distrib_dir.join("bin");
+        fs::create_dir_all(&bin_dir)?;
+
+        // Only the existence of the binary is checked, so any content will do
+        let fake_postgres = bin_dir.join("postgres");
+        fs::write(fake_postgres, "I'm postgres, trust me")?;
+
+        Ok((workdir, pg_distrib_dir))
+    }
+}
--- a/pageserver/src/http/models.rs
+++ b/pageserver/src/http/models.rs
@@ -1,6 +1,7 @@
 use serde::{Deserialize, Serialize};

 use crate::ZTenantId;
+use zenith_utils::zid::ZNodeId;

 #[derive(Serialize, Deserialize)]
 pub struct BranchCreateRequest {
@@ -15,3 +16,8 @@ pub struct TenantCreateRequest {
    #[serde(with = "hex")]
    pub tenant_id: ZTenantId,
 }
+
+/// JSON body returned by the status endpoint; carries the node id of the
+/// pageserver that answered.
+#[derive(Serialize)]
+pub struct StatusResponse {
+    pub id: ZNodeId,
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -17,6 +17,103 @@ paths:
            application/json:
              schema:
                type: object
+                required:
+                - id
+                properties:
+                  id:
+                    type: integer
+  # Timeline listing for a single tenant.
+  /v1/timeline/{tenant_id}:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: List tenant timelines
+      responses:
+        "200":
+          description: array of brief timeline descriptions
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  # currently, just a timeline id string, but when remote index gets to be accessed
+                  # remote/local timeline field would be added at least
+                  type: string
+        "400":
+          description: Error when no tenant id found in path
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+  # Details of one timeline of a tenant; both ids are hex strings in the path.
+  /v1/timeline/{tenant_id}/{timeline_id}:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: Get timeline info for a tenant's timeline (local or remote)
+      responses:
+        "200":
+          description: TimelineInfo
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TimelineInfo"
+        "400":
+          description: Error when no tenant id or no timeline id found in path
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
  /v1/branch/{tenant_id}:
    parameters:
      - name: tenant_id
@@ -142,9 +239,7 @@ paths:
          content:
            application/json:
              schema:
-                type: array
-                items:
-                  $ref: "#/components/schemas/BranchInfo"
+                $ref: "#/components/schemas/BranchInfo"
        "400":
          description: Malformed branch create request
          content:
@@ -278,12 +373,45 @@ components:
          format: hex
        ancestor_id:
          type: string
+          format: hex
        ancestor_lsn:
          type: string
        current_logical_size:
          type: integer
        current_logical_size_non_incremental:
          type: integer
+        latest_valid_lsn:
+          type: integer
+    TimelineInfo:
+      # Mirrors the TimelineInfo enum in routes.rs, which is serialized with an
+      # internal "type" tag. Only the fields present in both variants are
+      # required; the LSN and state fields are emitted for "Local" only.
+      type: object
+      required:
+        - type
+        - timeline_id
+        - tenant_id
+      properties:
+        type:
+          type: string
+          enum:
+            - Local
+            - Remote
+        timeline_id:
+          type: string
+          format: hex
+        tenant_id:
+          type: string
+          format: hex
+        ancestor_timeline_id:
+          type: string
+          format: hex
+        last_record_lsn:
+          type: string
+        prev_record_lsn:
+          type: string
+        # NOTE(review): start_lsn has no counterpart in the TimelineInfo enum in
+        # routes.rs - confirm whether it should be dropped from the schema.
+        start_lsn:
+          type: string
+        disk_consistent_lsn:
+          type: string
+        timeline_state:
+          type: string
+
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,10 +1,9 @@
 use std::sync::Arc;

-use anyhow::Result;
-use hyper::header;
+use anyhow::{Context, Result};
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
-use routerify::{ext::RequestExt, RouterBuilder};
+use serde::Serialize;
 use tracing::*;
 use zenith_utils::auth::JwtAuth;
 use zenith_utils::http::endpoint::attach_openapi_ui;
@@ -18,11 +17,19 @@ use zenith_utils::http::{
    request::get_request_param,
    request::parse_request_param,
 };
+use zenith_utils::http::{RequestExt, RouterBuilder};
+use zenith_utils::lsn::Lsn;
+use zenith_utils::zid::HexZTimelineId;
+use zenith_utils::zid::ZTimelineId;

 use super::models::BranchCreateRequest;
+use super::models::StatusResponse;
 use super::models::TenantCreateRequest;
 use crate::branches::BranchInfo;
-use crate::{branches, tenant_mgr, PageServerConf, ZTenantId};
+use crate::repository::RepositoryTimeline;
+use crate::repository::TimelineSyncState;
+use crate::repository::{Repository, Timeline};
+use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId};

 #[derive(Debug)]
 struct State {
@@ -59,12 +66,12 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
 }

 // healthcheck handler
-async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
-    Ok(Response::builder()
-        .status(StatusCode::OK)
-        .header(header::CONTENT_TYPE, "application/json")
-        .body(Body::from("{}"))
-        .map_err(ApiError::from_err)?)
+async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    // Reply with the node id taken from the pageserver config, as JSON.
+    let config = get_config(&request);
+    Ok(json_response(
+        StatusCode::OK,
+        StatusResponse { id: config.id },
+    )?)
 }

 async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -131,14 +138,7 @@ async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>,

    let response_data = tokio::task::spawn_blocking(move || {
        let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
-        let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-        BranchInfo::from_path(
-            path,
-            conf,
-            &tenantid,
-            &repo,
-            include_non_incremental_logical_size,
-        )
+        BranchInfo::from_path(path, tenantid, include_non_incremental_logical_size)
    })
    .await
    .map_err(ApiError::from_err)??;
@@ -146,6 +146,160 @@ async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>,
    Ok(json_response(StatusCode::OK, response_data)?)
 }

+/// Lists the timelines of a tenant that have a directory on local disk.
+///
+/// Reads the tenant's timelines directory and returns, as a JSON array of
+/// strings, the names of all subdirectories that parse as a `ZTimelineId`.
+/// Subdirectories with unparsable names are logged and left out; entries that
+/// are not directories are ignored. Fails if the directory cannot be read.
+async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let conf = get_state(&request).conf;
+    let timelines_dir = conf.timelines_path(&tenant_id);
+
+    let mut timelines_dir_contents =
+        tokio::fs::read_dir(&timelines_dir).await.with_context(|| {
+            format!(
+                "Failed to list timelines dir '{}' contents",
+                timelines_dir.display()
+            )
+        })?;
+
+    let mut local_timelines = Vec::new();
+    // next_entry() yields Ok(None) once the directory is exhausted
+    while let Some(entry) = timelines_dir_contents.next_entry().await.with_context(|| {
+        format!(
+            "Failed to list timelines dir '{}' contents",
+            timelines_dir.display()
+        )
+    })? {
+        let entry_path = entry.path();
+        let entry_type = entry.file_type().await.with_context(|| {
+            format!(
+                "Failed to get file type of timeline dirs' entry '{}'",
+                entry_path.display()
+            )
+        })?;
+
+        if entry_type.is_dir() {
+            // An unparsable directory name is only logged; it does not fail the request.
+            match entry.file_name().to_string_lossy().parse::<ZTimelineId>() {
+                Ok(timeline_id) => local_timelines.push(timeline_id.to_string()),
+                Err(e) => error!(
+                    "Failed to get parse timeline id from timeline dirs' entry '{}': {}",
+                    entry_path.display(),
+                    e
+                ),
+            }
+        }
+    }
+
+    Ok(json_response(StatusCode::OK, local_timelines)?)
+}
+
+/// Details of a single timeline, as returned by `timeline_detail_handler`.
+///
+/// Serialized with an internal `type` tag, so the JSON object carries a
+/// `"type"` field naming the variant in addition to the variant's own fields.
+#[derive(Debug, Serialize)]
+#[serde(tag = "type")]
+enum TimelineInfo {
+    /// The repository has a local timeline for this id.
+    Local {
+        #[serde(with = "hex")]
+        timeline_id: ZTimelineId,
+        #[serde(with = "hex")]
+        tenant_id: ZTenantId,
+        ancestor_timeline_id: Option<HexZTimelineId>,
+        last_record_lsn: Lsn,
+        prev_record_lsn: Lsn,
+        disk_consistent_lsn: Lsn,
+        timeline_state: Option<TimelineSyncState>,
+    },
+    /// The repository knows the timeline but has no local timeline for it;
+    /// only the ids are reported.
+    Remote {
+        #[serde(with = "hex")]
+        timeline_id: ZTimelineId,
+        #[serde(with = "hex")]
+        tenant_id: ZTenantId,
+    },
+}
+
+/// Returns a `TimelineInfo` for the timeline named by the `tenant_id` and
+/// `timeline_id` path parameters.
+///
+/// The repository lookup runs on the blocking thread pool. If the repository
+/// has a local timeline, its LSNs, ancestor and sync state are reported as
+/// `TimelineInfo::Local`; otherwise only the ids are returned as
+/// `TimelineInfo::Remote`.
+async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let response_data = tokio::task::spawn_blocking(move || {
+        let _enter =
+            info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)
+                .entered();
+        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
+        // The turbofish pins the closure's error type, which the `?` operators alone cannot.
+        Ok::<_, anyhow::Error>(match repo.get_timeline(timeline_id)?.local_timeline() {
+            None => TimelineInfo::Remote {
+                timeline_id,
+                tenant_id,
+            },
+            Some(timeline) => TimelineInfo::Local {
+                timeline_id,
+                tenant_id,
+                ancestor_timeline_id: timeline
+                    .get_ancestor_timeline_id()
+                    .map(HexZTimelineId::from),
+                disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
+                last_record_lsn: timeline.get_last_record_lsn(),
+                prev_record_lsn: timeline.get_prev_record_lsn(),
+                timeline_state: repo.get_timeline_state(timeline_id),
+            },
+        })
+    })
+    .await
+    // First `?`: the blocking task failed to join; second `?`: the closure's own error.
+    .map_err(ApiError::from_err)??;
+
+    Ok(json_response(StatusCode::OK, response_data)?)
+}
+
+/// Handles a request to attach the timeline named by the `tenant_id` and
+/// `timeline_id` path parameters.
+///
+/// Fails if the timeline is already local. For a remote timeline nothing else
+/// is done here yet (see the notes below) and `202 Accepted` is returned.
+async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let attach = move || {
+        let _enter =
+            info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id)
+                .entered();
+        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
+        match repo.get_timeline(timeline_id)? {
+            RepositoryTimeline::Local(_) => {
+                anyhow::bail!("Timeline with id {} is already local", timeline_id)
+            }
+            // FIXME (rodionov) get timeline already schedules timeline for download, and duplicate tasks can cause errors
+            //  first should be fixed in https://github.com/zenithdb/zenith/issues/997
+            // TODO (rodionov) change timeline state to awaits download (incapsulate it somewhere in the repo)
+            // TODO (rodionov) can we safely request replication on the timeline before sync is completed? (can be implemented on top of the #997)
+            RepositoryTimeline::Remote { .. } => Ok(()),
+        }
+    };
+
+    tokio::task::spawn_blocking(attach)
+        .await
+        .map_err(ApiError::from_err)??;
+
+    Ok(json_response(StatusCode::ACCEPTED, ())?)
+}
+
+/// Handles a request to detach the timeline named by the `tenant_id` and
+/// `timeline_id` path parameters, by calling `detach_timeline` on the
+/// tenant's repository from the blocking thread pool.
+async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let detach = move || {
+        let _enter =
+            info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
+                .entered();
+        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
+        repo.detach_timeline(timeline_id)
+    };
+
+    tokio::task::spawn_blocking(detach)
+        .await
+        .map_err(ApiError::from_err)??;
+
+    Ok(json_response(StatusCode::OK, ())?)
+}
+
 async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    // check for management permission
    check_permission(&request, None)?;
@@ -166,13 +320,13 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo

    let request_data: TenantCreateRequest = json_request(&mut request).await?;

-    let response_data = tokio::task::spawn_blocking(move || {
+    tokio::task::spawn_blocking(move || {
        let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
        tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
    })
    .await
    .map_err(ApiError::from_err)??;
-    Ok(json_response(StatusCode::CREATED, response_data)?)
+    Ok(json_response(StatusCode::CREATED, ())?)
 }

 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -202,6 +356,19 @@ pub fn make_router(
    router
        .data(Arc::new(State::new(conf, auth)))
        .get("/v1/status", status_handler)
+        .get("/v1/timeline/:tenant_id", timeline_list_handler)
+        .get(
+            "/v1/timeline/:tenant_id/:timeline_id",
+            timeline_detail_handler,
+        )
+        .post(
+            "/v1/timeline/:tenant_id/:timeline_id/attach",
+            timeline_attach_handler,
+        )
+        .post(
+            "/v1/timeline/:tenant_id/:timeline_id/detach",
+            timeline_detach_handler,
+        )
        .get("/v1/branch/:tenant_id", branch_list_handler)
        .get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler)
        .post("/v1/branch", branch_create_handler)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -0,0 +1,396 @@
+//!
+//! Import data and WAL from a PostgreSQL data directory and WAL segments into
+//! a zenith Timeline.
+//!
+use std::fs;
+use std::fs::File;
+use std::io::{Read, Seek, SeekFrom};
+use std::path::{Path, PathBuf};
+
+use anyhow::{bail, ensure, Context, Result};
+use bytes::Bytes;
+use tracing::*;
+
+use crate::pgdatadir_mapping::*;
+use crate::relish::*;
+use crate::repository::Repository;
+use crate::walingest::WalIngest;
+use postgres_ffi::relfile_utils::*;
+use postgres_ffi::waldecoder::*;
+use postgres_ffi::xlog_utils::*;
+use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
+use postgres_ffi::{Oid, TransactionId};
+use zenith_utils::lsn::Lsn;
+
+///
+/// Import all relation data pages from local disk into the repository.
+///
+/// This is currently only used to import a cluster freshly created by initdb.
+/// The code that deals with the checkpoint would not work right if the
+/// cluster was not shut down cleanly.
+///
+/// 'path' is the root of the PostgreSQL data directory; its 'global', 'base',
+/// 'pg_xact', 'pg_multixact', 'pg_twophase' and 'pg_wal' subdirectories are
+/// read. 'lsn' is the LSN the data is imported at, and must be equal to the
+/// checkpoint REDO pointer stored in the control file.
+///
+/// Returns an error if a file cannot be read or parsed, if 'pg_control' is
+/// missing, or if the cluster was not shut down cleanly.
+pub fn import_timeline_from_postgres_datadir<R: Repository>(
+    path: &Path,
+    tline: &mut DatadirTimeline<R>,
+    lsn: Lsn,
+) -> Result<()> {
+    let mut pg_control: Option<ControlFileData> = None;
+
+    let mut writer = tline.begin_record(lsn);
+    writer.init_empty()?;
+
+    // Scan 'global'
+    let mut relfiles: Vec<PathBuf> = Vec::new();
+    for direntry in fs::read_dir(path.join("global"))? {
+        let direntry = direntry?;
+        match direntry.file_name().to_str() {
+            None => continue,
+
+            Some("pg_control") => {
+                pg_control = Some(import_control_file(&mut writer, &direntry.path())?);
+            }
+            Some("pg_filenode.map") => {
+                import_relmap_file(
+                    &mut writer,
+                    pg_constants::GLOBALTABLESPACE_OID,
+                    0,
+                    &direntry.path(),
+                )?;
+            }
+
+            // Load any relation files into the page server (but only after the other files)
+            _ => relfiles.push(direntry.path()),
+        }
+    }
+    for relfile in relfiles {
+        import_relfile(&mut writer, &relfile, pg_constants::GLOBALTABLESPACE_OID, 0)?;
+    }
+
+    // Scan 'base'. It contains database dirs, the database OID is the filename.
+    // E.g. 'base/12345', where 12345 is the database OID.
+    for direntry in fs::read_dir(path.join("base"))? {
+        let direntry = direntry?;
+
+        //skip all temporary files
+        if direntry.file_name().to_str().unwrap() == "pgsql_tmp" {
+            continue;
+        }
+
+        let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
+
+        let mut relfiles: Vec<PathBuf> = Vec::new();
+        for direntry in fs::read_dir(direntry.path())? {
+            let direntry = direntry?;
+            match direntry.file_name().to_str() {
+                None => continue,
+
+                Some("PG_VERSION") => {
+                    //writer.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
+                }
+                Some("pg_filenode.map") => import_relmap_file(
+                    &mut writer,
+                    pg_constants::DEFAULTTABLESPACE_OID,
+                    dboid,
+                    &direntry.path(),
+                )?,
+
+                // Load any relation files into the page server
+                _ => relfiles.push(direntry.path()),
+            }
+        }
+        for relfile in relfiles {
+            import_relfile(
+                &mut writer,
+                &relfile,
+                pg_constants::DEFAULTTABLESPACE_OID,
+                dboid,
+            )?;
+        }
+    }
+    for entry in fs::read_dir(path.join("pg_xact"))? {
+        let entry = entry?;
+        import_slru_file(&mut writer, SlruKind::Clog, &entry.path())?;
+    }
+    for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
+        let entry = entry?;
+        import_slru_file(&mut writer, SlruKind::MultiXactMembers, &entry.path())?;
+    }
+    for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
+        let entry = entry?;
+        import_slru_file(&mut writer, SlruKind::MultiXactOffsets, &entry.path())?;
+    }
+    for entry in fs::read_dir(path.join("pg_twophase"))? {
+        let entry = entry?;
+        // The transaction id is the file *name* in hex. Parsing the full path
+        // (as done previously) can never succeed, because of the directory part.
+        let file_name = entry.file_name();
+        let xid_str = file_name
+            .to_str()
+            .with_context(|| format!("non-UTF-8 file name in pg_twophase: {:?}", file_name))?;
+        let xid = u32::from_str_radix(xid_str, 16)?;
+        import_twophase_file(&mut writer, xid, &entry.path())?;
+    }
+    // TODO: Scan pg_tblspc
+
+    // We're done importing all the data files.
+    writer.finish()?;
+
+    // We expect the Postgres server to be shut down cleanly.
+    let pg_control = pg_control.context("pg_control file not found")?;
+    ensure!(
+        pg_control.state == DBState_DB_SHUTDOWNED,
+        "Postgres cluster was not shut down cleanly"
+    );
+    ensure!(
+        pg_control.checkPointCopy.redo == lsn.0,
+        "unexpected checkpoint REDO pointer"
+    );
+
+    // Import WAL. This is needed even when starting from a shutdown checkpoint, because
+    // this reads the checkpoint record itself, advancing the tip of the timeline to
+    // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
+    import_wal(
+        &path.join("pg_wal"),
+        tline,
+        Lsn(pg_control.checkPointCopy.redo),
+        lsn,
+    )?;
+
+    Ok(())
+}
+
+/// Subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
+///
+/// The relation, fork and segment numbers are parsed from the file name; the
+/// file is then imported block by block into 'timeline' under tablespace
+/// 'spcoid' and database 'dboid'.
+///
+/// Returns an error if the file name is not a valid relation file name, if
+/// the file size is not a multiple of BLCKSZ, if the file cannot be read, or
+/// if it is a non-first segment (segno != 0), which is not supported yet.
+fn import_relfile<R: Repository>(
+    timeline: &mut DatadirTimelineWriter<R>,
+    path: &Path,
+    spcoid: Oid,
+    dboid: Oid,
+) -> Result<()> {
+    // Does it look like a relation file?
+    trace!("importing rel file {}", path.display());
+
+    let file_name = path
+        .file_name()
+        .and_then(|name| name.to_str())
+        .with_context(|| format!("invalid relation file name: {}", path.display()))?;
+    let (relnode, forknum, segno) = match parse_relfilename(file_name) {
+        Ok(parsed) => parsed,
+        Err(e) => {
+            warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
+            return Err(e.into());
+        }
+    };
+
+    let mut file = File::open(path)?;
+    let mut buf: [u8; 8192] = [0u8; 8192];
+
+    let len = file.metadata()?.len();
+    ensure!(len % pg_constants::BLCKSZ as u64 == 0);
+    let nblocks = len / pg_constants::BLCKSZ as u64;
+
+    // Only the first segment of a relation can be imported for now. Report
+    // that as an error instead of panicking.
+    if segno != 0 {
+        bail!(
+            "importing relation segment {} of file {} is not supported",
+            segno,
+            path.display()
+        );
+    }
+
+    let rel = RelTag {
+        spcnode: spcoid,
+        dbnode: dboid,
+        relnode,
+        forknum,
+    };
+    timeline.put_rel_creation(rel, nblocks as u32)?;
+
+    let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
+    loop {
+        let r = file.read_exact(&mut buf);
+        match r {
+            Ok(_) => {
+                timeline.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
+            }
+
+            // read_exact() reports the end of the file as UnexpectedEof
+            Err(err) => match err.kind() {
+                std::io::ErrorKind::UnexpectedEof => {
+                    // reached EOF. That's expected, as long as we have seen
+                    // as many blocks as the file size promised.
+                    ensure!(blknum == nblocks as u32, "unexpected EOF");
+                    break;
+                }
+                _ => {
+                    bail!("error reading file {}: {:#}", path.display(), err);
+                }
+            },
+        };
+        blknum += 1;
+    }
+
+    Ok(())
+}
+
+/// Import a relmapper (pg_filenode.map) file into the repository.
+///
+/// The file at 'path' is read in full and stored for the given tablespace
+/// ('spcnode') and database ('dbnode').
+fn import_relmap_file<R: Repository>(
+    timeline: &mut DatadirTimelineWriter<R>,
+    spcnode: Oid,
+    dbnode: Oid,
+    path: &Path,
+) -> Result<()> {
+    // slurp the entire file into memory
+    let mut contents = Vec::new();
+    File::open(path)?.read_to_end(&mut contents)?;
+
+    trace!("importing relmap file {}", path.display());
+
+    let image = Bytes::copy_from_slice(contents.as_slice());
+    timeline.put_relmap_file(spcnode, dbnode, image)?;
+    Ok(())
+}
+
+/// Import a twophase state file (pg_twophase/<xid>) into the repository.
+///
+/// The file at 'path' is read in full and stored under transaction id 'xid'.
+fn import_twophase_file<R: Repository>(
+    timeline: &mut DatadirTimelineWriter<R>,
+    xid: TransactionId,
+    path: &Path,
+) -> Result<()> {
+    // slurp the entire file into memory
+    let mut contents = Vec::new();
+    File::open(path)?.read_to_end(&mut contents)?;
+
+    trace!("importing non-rel file {}", path.display());
+
+    let image = Bytes::copy_from_slice(contents.as_slice());
+    timeline.put_twophase_file(xid, image)?;
+    Ok(())
+}
+
+///
+/// Import pg_control file into the repository.
+///
+/// The raw file contents are stored unchanged. In addition, the checkpoint
+/// copy embedded in the control file is re-encoded and stored on its own.
+/// Returns the decoded control file data.
+fn import_control_file<R: Repository>(
+    timeline: &mut DatadirTimelineWriter<R>,
+    path: &Path,
+) -> Result<ControlFileData> {
+    // slurp the entire file into memory
+    let mut raw = Vec::new();
+    File::open(path)?.read_to_end(&mut raw)?;
+
+    trace!("importing control file {}", path.display());
+
+    // First the verbatim image ...
+    timeline.put_control_file(Bytes::copy_from_slice(&raw))?;
+
+    // ... then the checkpoint record extracted from it.
+    let pg_control = ControlFileData::decode(&raw)?;
+    timeline.put_checkpoint(pg_control.checkPointCopy.encode())?;
+
+    Ok(pg_control)
+}
+
+///
+/// Import an SLRU segment file
+///
+/// The segment number is parsed from the file name, which must be a
+/// hexadecimal number. The segment is registered with its page count, and
+/// then each page is imported in order.
+///
+/// Returns an error if the file size is not a multiple of BLCKSZ, if the file
+/// holds more than SLRU_PAGES_PER_SEGMENT pages, or if it cannot be read.
+/// Panics if the path has no UTF-8 file name, or if the metadata of the
+/// opened file cannot be fetched.
+///
+fn import_slru_file<R: Repository>(
+    timeline: &mut DatadirTimelineWriter<R>,
+    slru: SlruKind,
+    path: &Path,
+) -> Result<()> {
+    trace!("importing slru file {}", path.display());
+
+    let mut file = File::open(path)?;
+    let mut buf: [u8; 8192] = [0u8; 8192];
+    let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
+
+    let len = file.metadata().unwrap().len();
+    ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ
+    let nblocks = len / pg_constants::BLCKSZ as u64;
+
+    ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64);
+
+    timeline.put_slru_segment_creation(slru, segno, nblocks as u32)?;
+
+    // Page number relative to the start of this segment
+    let mut rpageno = 0;
+    loop {
+        let r = file.read_exact(&mut buf);
+        match r {
+            Ok(_) => {
+                timeline.put_slru_page_image(slru, segno, rpageno, Bytes::copy_from_slice(&buf))?;
+            }
+
+            // read_exact() reports the end of the file as UnexpectedEof
+            Err(err) => match err.kind() {
+                std::io::ErrorKind::UnexpectedEof => {
+                    // reached EOF. That's expected, as long as we have seen
+                    // as many pages as the file size promised.
+                    ensure!(rpageno == nblocks as u32, "unexpected EOF");
+                    break;
+                }
+                _ => {
+                    bail!("error reading file {}: {:#}", path.display(), err);
+                }
+            },
+        };
+        rpageno += 1;
+    }
+
+    Ok(())
+}
+
+/// Scan PostgreSQL WAL files in given directory and load all records between
+/// 'startpoint' and 'endpoint' into the repository.
+///
+/// Segments are read one at a time, starting with the one that contains
+/// 'startpoint'. A segment may also be present as a '.partial' file.
+/// Ingestion stops after the first record whose LSN is past 'endpoint'.
+///
+/// Returns an error if a needed segment file cannot be opened or read, or if
+/// decoding or ingesting a record fails.
+fn import_wal<R: Repository>(
+    walpath: &Path,
+    tline: &mut DatadirTimeline<R>,
+    startpoint: Lsn,
+    endpoint: Lsn,
+) -> Result<()> {
+    let mut waldecoder = WalStreamDecoder::new(startpoint);
+
+    let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
+    let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
+    let mut last_lsn = startpoint;
+
+    let mut walingest = WalIngest::new(tline, startpoint)?;
+
+    while last_lsn <= endpoint {
+        // FIXME: assume postgresql tli 1 for now
+        let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
+        let mut buf = Vec::new();
+
+        // Read local file
+        let mut path = walpath.join(&filename);
+
+        // It could be as .partial
+        if !path.exists() {
+            path = walpath.join(filename + ".partial");
+        }
+
+        // Slurp the WAL file
+        let mut file = File::open(&path)
+            .with_context(|| format!("could not open WAL file {}", path.display()))?;
+
+        // Only the first segment is read from the middle; later ones from the start.
+        if offset > 0 {
+            file.seek(SeekFrom::Start(offset as u64))?;
+        }
+
+        let nread = file.read_to_end(&mut buf)?;
+        if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
+            // Maybe allow this for .partial files?
+            error!("read only {} bytes from WAL file", nread);
+        }
+
+        waldecoder.feed_bytes(&buf);
+
+        let mut nrecords = 0;
+        while last_lsn <= endpoint {
+            match waldecoder.poll_decode()? {
+                Some((lsn, recdata)) => {
+                    walingest.ingest_record(tline, recdata, lsn)?;
+                    last_lsn = lsn;
+
+                    nrecords += 1;
+
+                    trace!("imported record at {} (end {})", lsn, endpoint);
+                }
+                // No complete record is buffered. Leave the inner loop so
+                // that the next segment gets fed to the decoder; polling
+                // again without new input would spin here forever.
+                None => break,
+            }
+        }
+
+        debug!("imported {} records up to {}", nrecords, last_lsn);
+
+        segno += 1;
+        offset = 0;
+    }
+
+    if last_lsn != startpoint {
+        debug!("reached end of WAL at {}", last_lsn);
+    } else {
+        info!("no WAL to import at {}", last_lsn);
+    }
+
+    Ok(())
+}
--- a/pageserver/src/keyspace.rs
+++ b/pageserver/src/keyspace.rs
@@ -0,0 +1,129 @@
+use crate::repository::{key_range_size, singleton_range, Key};
+use postgres_ffi::pg_constants;
+use std::ops::Range;
+
+// Target file size, when creating image and delta layers
+pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB
+
+///
+/// Represents a set of Keys, in a compact form.
+///
+pub struct KeySpace {
+    // Contiguous ranges of keys that belong to the key space. In key order, and
+    // with no overlap.
+    ranges: Vec<Range<Key>>,
+}
+
+impl KeySpace {
+    ///
+    /// Partition a key space into chunks of roughly 'target_size' bytes in
+    /// each partition.
+    ///
+    /// Consecutive ranges are packed into the same partition until the next
+    /// range would make it exceed the target. A single range that is larger
+    /// than the target is split into target-sized pieces, each forming a
+    /// partition of its own.
+    ///
+    /// A 'target_size' smaller than one block is treated as one block.
+    ///
+    pub fn partition(&self, target_size: u64) -> KeyPartitioning {
+        // Assume that each value is 8k in size.
+        //
+        // Use a budget of at least one block per partition: with a budget of
+        // zero, the splitting loop below would never make progress.
+        let target_nblocks = ((target_size / pg_constants::BLCKSZ as u64) as usize).max(1);
+
+        let mut partitions = Vec::new();
+        let mut current_part = Vec::new();
+        let mut current_part_size: usize = 0;
+        for range in &self.ranges {
+            // If appending the next contiguous range in the keyspace to the current
+            // partition would cause it to be too large, start a new partition.
+            let this_size = key_range_size(range) as usize;
+            if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
+                partitions.push(current_part);
+                current_part = Vec::new();
+                current_part_size = 0;
+            }
+
+            // If the next range is larger than 'target_size', split it into
+            // 'target_size' chunks. (The current partition is always empty
+            // at this point, because of the check above.)
+            let mut remain_size = this_size;
+            let mut start = range.start;
+            while remain_size > target_nblocks {
+                let next = start.add(target_nblocks as u32);
+                partitions.push(vec![start..next]);
+                start = next;
+                remain_size -= target_nblocks
+            }
+            current_part.push(start..range.end);
+            current_part_size += remain_size;
+        }
+
+        // add last partition that wasn't full yet.
+        if !current_part.is_empty() {
+            partitions.push(current_part);
+        }
+
+        KeyPartitioning { partitions }
+    }
+}
+
+///
+/// A division of the key space into partitions, where each partition is a
+/// list of key ranges.
+///
+/// The only partitioning we currently produce splits the key space into
+/// parts of roughly equal physical size (see KeySpace::partition), but the
+/// structure itself can hold any partitioning.
+///
+#[derive(Clone, Debug, Default)]
+pub struct KeyPartitioning {
+    pub partitions: Vec<Vec<Range<Key>>>,
+}
+
+impl KeyPartitioning {
+    /// Creates a partitioning with no partitions.
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
+///
+/// Builder for a KeySpace: collects keys and key ranges, merging each range
+/// that starts exactly where the previous one ended into one contiguous
+/// range.
+///
+/// Ranges must be added in ascending key order and must not overlap;
+/// add_range() asserts this.
+///
+#[derive(Clone, Debug, Default)]
+pub struct KeySpaceAccum {
+    // The range currently being extended, not yet moved into 'ranges'
+    accum: Option<Range<Key>>,
+
+    // Finished ranges, in key order
+    ranges: Vec<Range<Key>>,
+}
+
+impl KeySpaceAccum {
+    /// Creates an empty accumulator.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Adds a single key.
+    pub fn add_key(&mut self, key: Key) {
+        let single = singleton_range(key);
+        self.add_range(single)
+    }
+
+    /// Adds a range of keys. It must start at or after the end of the range
+    /// added last; panics otherwise.
+    pub fn add_range(&mut self, range: Range<Key>) {
+        if let Some(pending) = self.accum.as_mut() {
+            if range.start != pending.end {
+                // There is a gap: the pending range is complete, start a new one.
+                assert!(range.start > pending.end);
+                self.ranges.push(pending.clone());
+                *pending = range;
+            } else {
+                // Directly adjacent: just extend the pending range.
+                pending.end = range.end;
+            }
+            return;
+        }
+        self.accum = Some(range);
+    }
+
+    /// Finishes the accumulation and returns the collected KeySpace.
+    pub fn to_keyspace(self) -> KeySpace {
+        let Self { accum, mut ranges } = self;
+        // Flush the range that was still being extended, if any.
+        ranges.extend(accum);
+        KeySpace { ranges }
+    }
+}
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
--- a/pageserver/src/layered_repository/README.md
+++ b/pageserver/src/layered_repository/README.md
@@ -82,13 +82,15 @@ A layer can be in different states:

 - Open - a layer where new WAL records can be appended to.
 - Closed - a layer that is read-only, no new WAL records can be appended to it
- Historical: synonym for closed
- InMemory: A layer that is kept only in memory, and needs to be rebuilt from WAL
-  on pageserver start
+- Historic: synonym for closed
+- InMemory: A layer that needs to be rebuilt from WAL on pageserver start.
+To avoid OOM errors, InMemory layers can be spilled to disk into an ephemeral file.
 - OnDisk: A layer that is stored on disk. If its end-LSN is older than
  disk_consistent_lsn, it is known to be fully flushed and fsync'd to local disk.
 - Frozen layer: an in-memory layer that is Closed.

+TODO: Clarify the difference between Closed, Historic and Frozen.
+
 There are two kinds of OnDisk layers:
 - ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN.
 - DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one
--- a/pageserver/src/layered_repository/blob.rs
+++ b/pageserver/src/layered_repository/blob.rs
@@ -1,46 +0,0 @@
-use std::io::{Read, Write};
-use std::os::unix::prelude::FileExt;
-
-use anyhow::Result;
-use bookfile::{BookWriter, BoundedReader, ChapterId, ChapterWriter};
-use serde::{Deserialize, Serialize};
-
-#[derive(Serialize, Deserialize)]
-pub struct BlobRange {
-    offset: u64,
-    size: usize,
-}
-
-pub fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
-    let mut buf = vec![0u8; range.size];
-    reader.read_exact_at(&mut buf, range.offset)?;
-    Ok(buf)
-}
-
-pub struct BlobWriter<W> {
-    writer: ChapterWriter<W>,
-    offset: u64,
-}
-
-impl<W: Write> BlobWriter<W> {
-    // This function takes a BookWriter and creates a new chapter to ensure offset is 0.
-    pub fn new(book_writer: BookWriter<W>, chapter_id: impl Into<ChapterId>) -> Self {
-        let writer = book_writer.new_chapter(chapter_id);
-        Self { writer, offset: 0 }
-    }
-
-    pub fn write_blob_from_reader(&mut self, r: &mut impl Read) -> Result<BlobRange> {
-        let len = std::io::copy(r, &mut self.writer)?;
-
-        let range = BlobRange {
-            offset: self.offset,
-            size: len as usize,
-        };
-        self.offset += len as u64;
-        Ok(range)
-    }
-
-    pub fn close(self) -> bookfile::Result<BookWriter<W>> {
-        self.writer.close()
-    }
-}
--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -1,6 +1,5 @@
-//!
 //! A DeltaLayer represents a collection of WAL records or page images in a range of
-//! LSNs, for one segment. It is stored on a file on disk.
+//! LSNs, and in a range of Keys. It is stored on a file on disk.
 //!
 //! Usually a delta layer only contains differences - in the form of WAL records against
 //! a base LSN. However, if a segment is newly created, by creating a new relation or
@@ -11,86 +10,77 @@
 //! can happen when you create a new branch in the middle of a delta layer, and the WAL
 //! records on the new branch are put in a new delta layer.
 //!
-//! When a delta file needs to be accessed, we slurp the metadata and relsize chapters
+//! When a delta file needs to be accessed, we slurp the 'index' metadata
 //! into memory, into the DeltaLayerInner struct. See load() and unload() functions.
-//! To access a page/WAL record, we search `page_version_metas` for the block # and LSN.
-//! The byte ranges in the metadata can be used to find the page/WAL record in
-//! PAGE_VERSIONS_CHAPTER.
+//! To access a particular value, we search `index` for the given key.
+//! The byte offset in the index can be used to find the value in
+//! VALUES_CHAPTER.
 //!
 //! On disk, the delta files are stored in timelines/<timelineid> directory.
 //! Currently, there are no subdirectories, and each delta file is named like this:
 //!
-//!    <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
+//!    <key start>-<key end>__<start LSN>-<end LSN>
 //!
 //! For example:
 //!
-//!    1663_13990_2609_0_5_000000000169C348_000000000169C349
+//!    000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__00000000578C6B29-0000000057A50051
 //!
-//! If a relation is dropped, we add a '_DROPPED' to the end of the filename to indicate that.
-//! So the above example would become:
 //!
-//!    1663_13990_2609_0_5_000000000169C348_000000000169C349_DROPPED
+//! A delta file is constructed using the 'bookfile' crate. Each file consists of three
+//! parts: the 'index', the values, and a short summary header. They are stored as
+//! separate chapters.
 //!
-//! The end LSN indicates when it was dropped in that case, we don't store it in the
-//! file contents in any way.
-//!
-//! A detlta file is constructed using the 'bookfile' crate. Each file consists of two
-//! parts: the page versions and the relation sizes. They are stored as separate chapters.
-//!
-use crate::layered_repository::blob::BlobWriter;
+use crate::config::PageServerConf;
 use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
-use crate::layered_repository::page_versions::PageVersions;
 use crate::layered_repository::storage_layer::{
-    Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
+    Layer, ValueReconstructResult, ValueReconstructState,
 };
+use crate::layered_repository::utils;
+use crate::repository::{Key, Value};
 use crate::virtual_file::VirtualFile;
-use crate::waldecoder;
-use crate::PageServerConf;
+use crate::walrecord;
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{bail, ensure, Result};
+use anyhow::{bail, Result};
 use log::*;
 use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
 use zenith_utils::vec_map::VecMap;
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::fs;
-use std::io::{BufWriter, Write};
-use std::ops::Bound::Included;
+use std::io::BufWriter;
+use std::io::Write;
+use std::ops::Range;
+use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
-use std::sync::{Mutex, MutexGuard};
+use std::sync::{RwLock, RwLockReadGuard};

-use bookfile::{Book, BookWriter};
+use bookfile::{Book, BookWriter, ChapterWriter};

 use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;

-use super::blob::{read_blob, BlobRange};
-
 // Magic constant to identify a Zenith delta file
 pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01;

-/// Mapping from (block #, lsn) -> page/WAL record
-/// byte ranges in PAGE_VERSIONS_CHAPTER
-static PAGE_VERSION_METAS_CHAPTER: u64 = 1;
+/// Mapping from (key, lsn) -> page/WAL record
+/// byte offsets in VALUES_CHAPTER
+static INDEX_CHAPTER: u64 = 1;
+
 /// Page/WAL bytes - cannot be interpreted
-/// without PAGE_VERSION_METAS_CHAPTER
-static PAGE_VERSIONS_CHAPTER: u64 = 2;
-static REL_SIZES_CHAPTER: u64 = 3;
+/// without the offsets stored in the INDEX_CHAPTER
+static VALUES_CHAPTER: u64 = 2;

 /// Contains the [`Summary`] struct
-static SUMMARY_CHAPTER: u64 = 4;
+static SUMMARY_CHAPTER: u64 = 3;

 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
 struct Summary {
    tenantid: ZTenantId,
    timelineid: ZTimelineId,
-    seg: SegmentTag,
-
-    start_lsn: Lsn,
-    end_lsn: Lsn,
-
-    dropped: bool,
+    key_range: Range<Key>,
+    lsn_range: Range<Lsn>,
 }

 impl From<&DeltaLayer> for Summary {
@@ -98,12 +88,8 @@ impl From<&DeltaLayer> for Summary {
        Self {
            tenantid: layer.tenantid,
            timelineid: layer.timelineid,
-            seg: layer.seg,
-
-            start_lsn: layer.start_lsn,
-            end_lsn: layer.end_lsn,
-
-            dropped: layer.dropped,
+            key_range: layer.key_range.clone(),
+            lsn_range: layer.lsn_range.clone(),
        }
    }
 }
@@ -112,7 +98,7 @@ impl From<&DeltaLayer> for Summary {
 /// DeltaLayer is the in-memory data structure associated with an
 /// on-disk delta file.  We keep a DeltaLayer in memory for each
 /// file, in the LayerMap. If a layer is in "loaded" state, we have a
-/// copy of the file in memory, in 'inner'. Otherwise the struct is
+/// copy of the index in memory, in 'inner'. Otherwise the struct is
 /// just a placeholder for a file that exists on disk, and it needs to
 /// be loaded before using it in queries.
 ///
@@ -121,33 +107,24 @@ pub struct DeltaLayer {

    pub tenantid: ZTenantId,
    pub timelineid: ZTimelineId,
-    pub seg: SegmentTag,
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,

-    //
-    // This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
-    // start is inclusive, and end is exclusive.
-    //
-    pub start_lsn: Lsn,
-    pub end_lsn: Lsn,
-
-    dropped: bool,
-
-    inner: Mutex<DeltaLayerInner>,
+    inner: RwLock<DeltaLayerInner>,
 }

 pub struct DeltaLayerInner {
-    /// If false, the 'page_version_metas' and 'relsizes' have not been
-    /// loaded into memory yet.
+    /// If false, the 'index' has not been loaded into memory yet.
    loaded: bool,

+    ///
+    /// All versions of all pages in the layer are kept here.
+    /// Indexed by key and LSN. The value is an offset into the
+    /// chapter where the page version is stored.
+    ///
+    index: HashMap<Key, VecMap<Lsn, u64>>,
+
    book: Option<Book<VirtualFile>>,
-
-    /// All versions of all pages in the file are are kept here.
-    /// Indexed by block number and LSN.
-    page_version_metas: VecMap<(u32, Lsn), BlobRange>,
-
-    /// `relsizes` tracks the size of the relation at different points in time.
-    relsizes: VecMap<Lsn, u32>,
 }

 impl Layer for DeltaLayer {
@@ -159,144 +136,98 @@ impl Layer for DeltaLayer {
        self.timelineid
    }

-    fn get_seg_tag(&self) -> SegmentTag {
-        self.seg
+    fn get_key_range(&self) -> Range<Key> {
+        self.key_range.clone()
    }

-    fn is_dropped(&self) -> bool {
-        self.dropped
-    }
-
-    fn get_start_lsn(&self) -> Lsn {
-        self.start_lsn
-    }
-
-    fn get_end_lsn(&self) -> Lsn {
-        self.end_lsn
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.lsn_range.clone()
    }

    fn filename(&self) -> PathBuf {
        PathBuf::from(self.layer_name().to_string())
    }

-    /// Look up given page in the cache.
-    fn get_page_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
-        blknum: u32,
-        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
-        reconstruct_data: &mut PageReconstructData,
-    ) -> Result<PageReconstructResult> {
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+    ) -> Result<ValueReconstructResult> {
        let mut need_image = true;

-        assert!(self.seg.blknum_in_seg(blknum));
-
-        match &cached_img_lsn {
-            Some(cached_lsn) if &self.end_lsn <= cached_lsn => {
-                return Ok(PageReconstructResult::Cached)
-            }
-            _ => {}
-        }
+        assert!(self.key_range.contains(&key));

        {
            // Open the file and lock the metadata in memory
            let inner = self.load()?;
-            let page_version_reader = inner
+            let values_reader = inner
                .book
                .as_ref()
-                .unwrap()
-                .chapter_reader(PAGE_VERSIONS_CHAPTER)?;
+                .expect("should be loaded in load call above")
+                .chapter_reader(VALUES_CHAPTER)?;

-            // Scan the metadata BTreeMap backwards, starting from the given entry.
-            let minkey = (blknum, Lsn(0));
-            let maxkey = (blknum, lsn);
-            let iter = inner
-                .page_version_metas
-                .slice_range((Included(&minkey), Included(&maxkey)))
-                .iter()
-                .rev();
-            for ((_blknum, pv_lsn), blob_range) in iter {
-                match &cached_img_lsn {
-                    Some(cached_lsn) if pv_lsn <= cached_lsn => {
-                        return Ok(PageReconstructResult::Cached)
-                    }
-                    _ => {}
-                }
-
-                let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
-
-                match pv {
-                    PageVersion::Page(img) => {
-                        // Found a page image, return it
-                        reconstruct_data.page_img = Some(img);
-                        need_image = false;
-                        break;
-                    }
-                    PageVersion::Wal(rec) => {
-                        let will_init = rec.will_init;
-                        reconstruct_data.records.push((*pv_lsn, rec));
-                        if will_init {
-                            // This WAL record initializes the page, so no need to go further back
+            // Scan the versions of this key backwards, starting from the end of `lsn_range`.
+            if let Some(vec_map) = inner.index.get(&key) {
+                let slice = vec_map.slice_range(lsn_range);
+                for (entry_lsn, pos) in slice.iter().rev() {
+                    let val = Value::des(&utils::read_blob_from_chapter(&values_reader, *pos)?)?;
+                    match val {
+                        Value::Image(img) => {
+                            reconstruct_state.img = Some((*entry_lsn, img));
                            need_image = false;
                            break;
                        }
+                        Value::WalRecord(rec) => {
+                            let will_init = rec.will_init();
+                            reconstruct_state.records.push((*entry_lsn, rec));
+                            if will_init {
+                                // This WAL record initializes the page, so no need to go further back
+                                need_image = false;
+                                break;
+                            }
+                        }
                    }
                }
            }
-
            // release metadata lock and close the file
        }

        // If an older page image is needed to reconstruct the page, let the
        // caller know.
        if need_image {
-            Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
+            Ok(ValueReconstructResult::Continue)
        } else {
-            Ok(PageReconstructResult::Complete)
+            Ok(ValueReconstructResult::Complete)
        }
    }

-    /// Get size of the relation at given LSN
-    fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
-        assert!(lsn >= self.start_lsn);
-        ensure!(
-            self.seg.rel.is_blocky(),
-            "get_seg_size() called on a non-blocky rel"
-        );
+    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_> {
+        let inner = self.load().unwrap();

-        // Scan the BTreeMap backwards, starting from the given entry.
-        let inner = self.load()?;
-        let slice = inner
-            .relsizes
-            .slice_range((Included(&Lsn(0)), Included(&lsn)));
+        let mut pairs: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
+        pairs.sort_by_key(|x| x.0);

-        if let Some((_entry_lsn, entry)) = slice.last() {
-            Ok(*entry)
-        } else {
-            Err(anyhow::anyhow!("could not find seg size in delta layer"))
+        match DeltaValueIter::new(inner) {
+            Ok(iter) => Box::new(iter),
+            Err(err) => Box::new(std::iter::once(Err(err))),
        }
    }

-    /// Does this segment exist at given LSN?
-    fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
-        // Is the requested LSN after the rel was dropped?
-        if self.dropped && lsn >= self.end_lsn {
-            return Ok(false);
-        }
-
-        // Otherwise, it exists.
-        Ok(true)
-    }
-
    ///
    /// Release most of the memory used by this layer. If it's accessed again later,
    /// it will need to be loaded back.
    ///
    fn unload(&self) -> Result<()> {
-        let mut inner = self.inner.lock().unwrap();
-        inner.page_version_metas = VecMap::default();
-        inner.relsizes = VecMap::default();
-        inner.loaded = false;
+        if let Ok(mut inner) = self.inner.try_write() {
+            inner.index = HashMap::default();
+            inner.loaded = false;
+
+            // Note: we keep the Book open. Is that a good idea? The virtual file
+            // machinery has its own rules for closing the file descriptor if it's not
+            // needed, but the Book struct uses up some memory, too.
+        }
+
        Ok(())
    }

@@ -310,48 +241,59 @@ impl Layer for DeltaLayer {
        true
    }

+    fn is_in_memory(&self) -> bool {
+        false
+    }
+
    /// debugging function to print out the contents of the layer
    fn dump(&self) -> Result<()> {
        println!(
-            "----- delta layer for ten {} tli {} seg {} {}-{} ----",
-            self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn
+            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
+            self.tenantid,
+            self.timelineid,
+            self.key_range.start,
+            self.key_range.end,
+            self.lsn_range.start,
+            self.lsn_range.end
        );

-        println!("--- relsizes ---");
        let inner = self.load()?;
-        for (k, v) in inner.relsizes.as_slice() {
-            println!("  {}: {}", k, v);
-        }
-        println!("--- page versions ---");

        let path = self.path();
        let file = std::fs::File::open(&path)?;
        let book = Book::new(file)?;
+        let chapter = book.chapter_reader(VALUES_CHAPTER)?;

-        let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
-        for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
-            let mut desc = String::new();
+        let mut values: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
+        values.sort_by_key(|k| k.0);

-            let buf = read_blob(&chapter, blob_range)?;
-            let pv = PageVersion::des(&buf)?;
+        for (key, versions) in values {
+            for (lsn, off) in versions.as_slice() {
+                let mut desc = String::new();

-            match pv {
-                PageVersion::Page(img) => {
-                    write!(&mut desc, " img {} bytes", img.len())?;
-                }
-                PageVersion::Wal(rec) => {
-                    let wal_desc = waldecoder::describe_wal_record(&rec.rec);
-                    write!(
-                        &mut desc,
-                        " rec {} bytes will_init: {} {}",
-                        rec.rec.len(),
-                        rec.will_init,
-                        wal_desc
-                    )?;
+                let buf = utils::read_blob_from_chapter(&chapter, *off)?;
+                let val = Value::des(&buf);
+
+                match val {
+                    Ok(Value::Image(img)) => {
+                        write!(&mut desc, " img {} bytes", img.len())?;
+                    }
+                    Ok(Value::WalRecord(rec)) => {
+                        let wal_desc = walrecord::describe_wal_record(&rec);
+                        write!(
+                            &mut desc,
+                            " rec {} bytes will_init: {} {}",
+                            buf.len(),
+                            rec.will_init(),
+                            wal_desc
+                        )?;
+                    }
+                    Err(err) => {
+                        write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
+                    }
                }
+                println!("  key {} at {}: {}", key, lsn, desc);
            }
-
-            println!("  blk {} at {}: {}", blk, lsn, desc);
        }

        Ok(())
@@ -373,168 +315,64 @@ impl DeltaLayer {
        }
    }

-    /// Create a new delta file, using the given page versions and relsizes.
-    /// The page versions are passed in a PageVersions struct. If 'cutoff' is
-    /// given, only page versions with LSN < cutoff are included.
-    ///
-    /// This is used to write the in-memory layer to disk. The page_versions and
-    /// relsizes are thus passed in the same format as they are in the in-memory
-    /// layer, as that's expedient.
-    #[allow(clippy::too_many_arguments)]
-    pub fn create(
-        conf: &'static PageServerConf,
-        timelineid: ZTimelineId,
-        tenantid: ZTenantId,
-        seg: SegmentTag,
-        start_lsn: Lsn,
-        end_lsn: Lsn,
-        dropped: bool,
-        page_versions: &PageVersions,
-        cutoff: Option<Lsn>,
-        relsizes: VecMap<Lsn, u32>,
-    ) -> Result<DeltaLayer> {
-        if seg.rel.is_blocky() {
-            assert!(!relsizes.is_empty());
-        }
-
-        let delta_layer = DeltaLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            timelineid,
-            tenantid,
-            seg,
-            start_lsn,
-            end_lsn,
-            dropped,
-            inner: Mutex::new(DeltaLayerInner {
-                loaded: true,
-                book: None,
-                page_version_metas: VecMap::default(),
-                relsizes,
-            }),
-        };
-        let mut inner = delta_layer.inner.lock().unwrap();
-
-        // Write the data into a file
-        //
-        // Note: Because we open the file in write-only mode, we cannot
-        // reuse the same VirtualFile for reading later. That's why we don't
-        // set inner.book here. The first read will have to re-open it.
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let path = delta_layer.path();
-        let file = VirtualFile::create(&path)?;
-        let buf_writer = BufWriter::new(file);
-        let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
-
-        let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
-
-        let page_versions_iter = page_versions.ordered_page_version_iter(cutoff);
-        for (blknum, lsn, pos) in page_versions_iter {
-            let blob_range =
-                page_version_writer.write_blob_from_reader(&mut page_versions.reader(pos)?)?;
-
-            inner
-                .page_version_metas
-                .append((blknum, lsn), blob_range)
-                .unwrap();
-        }
-
-        let book = page_version_writer.close()?;
-
-        // Write out page versions
-        let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
-        let buf = VecMap::ser(&inner.page_version_metas)?;
-        chapter.write_all(&buf)?;
-        let book = chapter.close()?;
-
-        // and relsizes to separate chapter
-        let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
-        let buf = VecMap::ser(&inner.relsizes)?;
-        chapter.write_all(&buf)?;
-        let book = chapter.close()?;
-
-        let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
-        let summary = Summary {
-            tenantid,
-            timelineid,
-            seg,
-
-            start_lsn,
-            end_lsn,
-
-            dropped,
-        };
-        Summary::ser_into(&summary, &mut chapter)?;
-        let book = chapter.close()?;
-
-        // This flushes the underlying 'buf_writer'.
-        let writer = book.close()?;
-        writer.get_ref().sync_all()?;
-
-        trace!("saved {}", &path.display());
-
-        drop(inner);
-
-        Ok(delta_layer)
-    }
-
    ///
    /// Load the contents of the file into memory
    ///
-    fn load(&self) -> Result<MutexGuard<DeltaLayerInner>> {
-        // quick exit if already loaded
-        let mut inner = self.inner.lock().unwrap();
+    fn load(&self) -> Result<RwLockReadGuard<DeltaLayerInner>> {
+        loop {
+            // quick exit if already loaded
+            {
+                let inner = self.inner.read().unwrap();

-        if inner.loaded {
-            return Ok(inner);
-        }
-
-        let path = self.path();
-        let file = VirtualFile::open(&path)?;
-        let book = Book::new(file)?;
-
-        match &self.path_or_conf {
-            PathOrConf::Conf(_) => {
-                let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
-                let actual_summary = Summary::des(&chapter)?;
-
-                let expected_summary = Summary::from(self);
-
-                if actual_summary != expected_summary {
-                    bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
+                if inner.loaded {
+                    return Ok(inner);
                }
            }
-            PathOrConf::Path(path) => {
-                let actual_filename = Path::new(path.file_name().unwrap());
-                let expected_filename = self.filename();
+            // need to upgrade to write lock
+            let mut inner = self.inner.write().unwrap();

-                if actual_filename != expected_filename {
-                    println!(
-                        "warning: filename does not match what is expected from in-file summary"
-                    );
-                    println!("actual: {:?}", actual_filename);
-                    println!("expected: {:?}", expected_filename);
+            let path = self.path();
+
+            // Open the file if it's not open already.
+            if inner.book.is_none() {
+                let file = VirtualFile::open(&path)?;
+                inner.book = Some(Book::new(file)?);
+            }
+            let book = inner.book.as_ref().unwrap();
+
+            match &self.path_or_conf {
+                PathOrConf::Conf(_) => {
+                    let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
+                    let actual_summary = Summary::des(&chapter)?;
+
+                    let expected_summary = Summary::from(self);
+
+                    if actual_summary != expected_summary {
+                        bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
+                    }
+                }
+                PathOrConf::Path(path) => {
+                    let actual_filename = Path::new(path.file_name().unwrap());
+                    let expected_filename = self.filename();
+
+                    if actual_filename != expected_filename {
+                        println!(
+                            "warning: filename does not match what is expected from in-file summary"
+                        );
+                        println!("actual: {:?}", actual_filename);
+                        println!("expected: {:?}", expected_filename);
+                    }
                }
            }
+
+            let chapter = book.read_chapter(INDEX_CHAPTER)?;
+            let index = HashMap::des(&chapter)?;
+
+            debug!("loaded from {}", &path.display());
+
+            inner.index = index;
+            inner.loaded = true;
        }
-
-        let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
-        let page_version_metas = VecMap::des(&chapter)?;
-
-        let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
-        let relsizes = VecMap::des(&chapter)?;
-
-        debug!("loaded from {}", &path.display());
-
-        *inner = DeltaLayerInner {
-            loaded: true,
-            book: None,
-            page_version_metas,
-            relsizes,
-        };
-
-        Ok(inner)
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -548,15 +386,12 @@ impl DeltaLayer {
            path_or_conf: PathOrConf::Conf(conf),
            timelineid,
            tenantid,
-            seg: filename.seg,
-            start_lsn: filename.start_lsn,
-            end_lsn: filename.end_lsn,
-            dropped: filename.dropped,
-            inner: Mutex::new(DeltaLayerInner {
+            key_range: filename.key_range.clone(),
+            lsn_range: filename.lsn_range.clone(),
+            inner: RwLock::new(DeltaLayerInner {
                loaded: false,
                book: None,
-                page_version_metas: VecMap::default(),
-                relsizes: VecMap::default(),
+                index: HashMap::default(),
            }),
        }
    }
@@ -566,7 +401,7 @@ impl DeltaLayer {
    /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
    pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
    where
-        F: std::os::unix::prelude::FileExt,
+        F: FileExt,
    {
        let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
        let summary = Summary::des(&chapter)?;
@@ -575,25 +410,20 @@ impl DeltaLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
            timelineid: summary.timelineid,
            tenantid: summary.tenantid,
-            seg: summary.seg,
-            start_lsn: summary.start_lsn,
-            end_lsn: summary.end_lsn,
-            dropped: summary.dropped,
-            inner: Mutex::new(DeltaLayerInner {
+            key_range: summary.key_range,
+            lsn_range: summary.lsn_range,
+            inner: RwLock::new(DeltaLayerInner {
                loaded: false,
                book: None,
-                page_version_metas: VecMap::default(),
-                relsizes: VecMap::default(),
+                index: HashMap::default(),
            }),
        })
    }

    fn layer_name(&self) -> DeltaFileName {
        DeltaFileName {
-            seg: self.seg,
-            start_lsn: self.start_lsn,
-            end_lsn: self.end_lsn,
-            dropped: self.dropped,
+            key_range: self.key_range.clone(),
+            lsn_range: self.lsn_range.clone(),
        }
    }

@@ -607,3 +437,247 @@ impl DeltaLayer {
        )
    }
 }
+
+/// A builder object for constructing a new delta layer.
+///
+/// Usage:
+///
+/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
+///
+/// 2. Write the contents by calling `put_value` for every page
+///    version to store in the layer.
+///
+/// 3. Call `finish`, or `abort` to discard the unfinished file.
+///
+pub struct DeltaLayerWriter {
+    conf: &'static PageServerConf,
+    // Path of the temporary file being written. `finish` renames it to the
+    // final layer file name, `abort` removes it.
+    path: PathBuf,
+    timelineid: ZTimelineId,
+    tenantid: ZTenantId,
+
+    // Start of the layer's key range. The end is passed to `finish`.
+    key_start: Key,
+    lsn_range: Range<Lsn>,
+
+    // For each key, the offset in the values chapter of the value stored at
+    // each LSN. Serialized into the index chapter by `finish`.
+    index: HashMap<Key, VecMap<Lsn, u64>>,
+
+    // Writer for the values chapter; `put_value` appends to it.
+    values_writer: ChapterWriter<BufWriter<VirtualFile>>,
+    // Sum of the lengths returned by `utils::write_blob` so far; used as the
+    // offset of the next value.
+    end_offset: u64,
+}
+
+impl DeltaLayerWriter {
+    ///
+    /// Start building a new delta layer.
+    ///
+    /// Creates a temporary file in the timeline directory and opens the
+    /// values chapter in it. 'key_start' and 'lsn_range' are remembered and
+    /// stored in the layer by `finish`.
+    ///
+    /// Returns an error if the file cannot be created or the book header
+    /// cannot be written.
+    ///
+    pub fn new(
+        conf: &'static PageServerConf,
+        timelineid: ZTimelineId,
+        tenantid: ZTenantId,
+        key_start: Key,
+        lsn_range: Range<Lsn>,
+    ) -> Result<DeltaLayerWriter> {
+        // Create the file
+        //
+        // Note: This overwrites any existing file. There shouldn't be any.
+        // FIXME: throw an error instead?
+
+        // The temporary name has a literal "XXX" where the final name has the
+        // end of the key range; presumably because the end key is only given
+        // to `finish`.
+        let path = conf.timeline_path(&timelineid, &tenantid).join(format!(
+            "{}-XXX__{:016X}-{:016X}.temp",
+            key_start,
+            u64::from(lsn_range.start),
+            u64::from(lsn_range.end)
+        ));
+        info!("temp deltalayer path {}", path.display());
+        let file = VirtualFile::create(&path)?;
+        let buf_writer = BufWriter::new(file);
+        let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
+
+        // Open the values chapter for writing. The calls to
+        // `put_value` will use this to write the contents.
+        let values_writer = book.new_chapter(VALUES_CHAPTER);
+
+        Ok(DeltaLayerWriter {
+            conf,
+            path,
+            timelineid,
+            tenantid,
+            key_start,
+            lsn_range,
+            index: HashMap::new(),
+            values_writer,
+            end_offset: 0,
+        })
+    }
+
+    ///
+    /// Append a key-value pair to the file.
+    ///
+    /// The values must be appended in key, lsn order.
+    ///
+    /// Returns an error if the value cannot be serialized or written, if the
+    /// index rejects 'lsn' for this key, or if a value for the same key and
+    /// LSN has already been added. Panics if 'lsn' is below the start of the
+    /// layer's LSN range.
+    ///
+    pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
+        //info!("DELTA: key {} at {} on {}", key, lsn, self.path.display());
+        assert!(self.lsn_range.start <= lsn);
+        // Remember the offset and size metadata. The metadata is written
+        // to a separate chapter, in `finish`.
+        let off = self.end_offset;
+        let len = utils::write_blob(&mut self.values_writer, &Value::ser(&val)?)?;
+        self.end_offset += len;
+        let vec_map = self.index.entry(key).or_default();
+        // Report a rejected LSN (presumably one that is out of order for this
+        // key -- confirm against VecMap) as an error instead of panicking,
+        // like the duplicate-LSN case below.
+        let old = vec_map
+            .append_or_update_last(lsn, off)
+            .map_err(|err| {
+                anyhow::anyhow!(
+                    "could not add {} at {} to delta layer being built: {:?}",
+                    key,
+                    lsn,
+                    err
+                )
+            })?
+            .0;
+        if old.is_some() {
+            // We already had an entry for this LSN. That's odd..
+            bail!(
+                "Value for {} at {} already exists in delta layer being built",
+                key,
+                lsn
+            );
+        }
+
+        Ok(())
+    }
+
+    /// Total number of bytes appended to the values chapter so far, i.e. the
+    /// sum of the lengths returned by `utils::write_blob` in `put_value`.
+    pub fn size(&self) -> u64 {
+        self.end_offset
+    }
+
+    ///
+    /// Finish writing the delta layer.
+    ///
+    /// 'key_end' is the exclusive end of the layer's key range; together with
+    /// the 'key_start' given to `new` it forms the key range stored in the
+    /// summary and used in the final file name.
+    ///
+    /// Writes the index and summary chapters, closes the book and renames the
+    /// temporary file to its final name. Returns a DeltaLayer for the new
+    /// file, in the not-yet-loaded state.
+    ///
+    pub fn finish(self, key_end: Key) -> Result<DeltaLayer> {
+        // Close the values chapter
+        let book = self.values_writer.close()?;
+
+        // Write out the index
+        let mut chapter = book.new_chapter(INDEX_CHAPTER);
+        let buf = HashMap::ser(&self.index)?;
+        chapter.write_all(&buf)?;
+        let book = chapter.close()?;
+
+        // Write out the summary
+        let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
+        let summary = Summary {
+            tenantid: self.tenantid,
+            timelineid: self.timelineid,
+            key_range: self.key_start..key_end,
+            lsn_range: self.lsn_range.clone(),
+        };
+        Summary::ser_into(&summary, &mut chapter)?;
+        let book = chapter.close()?;
+
+        // This flushes the underlying 'buf_writer'.
+        book.close()?;
+
+        // Note: Because we opened the file in write-only mode, we cannot
+        // reuse the same VirtualFile for reading later. That's why we don't
+        // set inner.book here. The first read will have to re-open it.
+        let layer = DeltaLayer {
+            path_or_conf: PathOrConf::Conf(self.conf),
+            tenantid: self.tenantid,
+            timelineid: self.timelineid,
+            key_range: self.key_start..key_end,
+            lsn_range: self.lsn_range.clone(),
+            inner: RwLock::new(DeltaLayerInner {
+                loaded: false,
+                index: HashMap::new(),
+                book: None,
+            }),
+        };
+
+        // Rename the file to its final name
+        //
+        // Note: This overwrites any existing file. There shouldn't be any.
+        // FIXME: throw an error instead?
+        let final_path = DeltaLayer::path_for(
+            &PathOrConf::Conf(self.conf),
+            self.timelineid,
+            self.tenantid,
+            &DeltaFileName {
+                key_range: self.key_start..key_end,
+                lsn_range: self.lsn_range,
+            },
+        );
+        std::fs::rename(self.path, &final_path)?;
+
+        trace!("created delta layer {}", final_path.display());
+
+        Ok(layer)
+    }
+
+    ///
+    /// Abandon the layer being built.
+    ///
+    /// Closes the values chapter and the book, then removes the temporary
+    /// file. Failures are logged instead of returned, and the removal is
+    /// attempted even if closing failed.
+    ///
+    pub fn abort(self) {
+        match self.values_writer.close() {
+            Ok(book) => {
+                if let Err(err) = book.close() {
+                    error!("error while closing delta layer file: {}", err);
+                }
+            }
+            Err(err) => {
+                error!("error while closing chapter writer: {}", err);
+            }
+        }
+        if let Err(err) = std::fs::remove_file(self.path) {
+            error!("error removing unfinished delta layer file: {}", err);
+        }
+    }
+}
+
+///
+/// Iterator over all key-value pairs stored in a delta layer
+///
+/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
+/// That takes up quite a lot of memory. Should do this in a more streaming
+/// fashion.
+///
+struct DeltaValueIter<'a> {
+    // (key, LSN, offset of the value in VALUES_CHAPTER) for every entry of
+    // the layer's index, sorted by key.
+    all_offsets: Vec<(Key, Lsn, u64)>,
+    // Position in 'all_offsets' of the entry to return next.
+    next_idx: usize,
+
+    // Read guard on the layer's inner state; its 'book' is used to read the
+    // values.
+    inner: RwLockReadGuard<'a, DeltaLayerInner>,
+}
+
+impl<'a> Iterator for DeltaValueIter<'a> {
+    type Item = Result<(Key, Lsn, Value)>;
+
+    // `next_res` returns Result<Option<..>>; transpose() turns that into the
+    // Option<Result<..>> that Iterator expects, so an error is yielded as
+    // Some(Err(..)) and the end of the data as None.
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_res().transpose()
+    }
+}
+
+impl<'a> DeltaValueIter<'a> {
+    /// Builds the iterator from a read guard on the layer's inner state.
+    ///
+    /// Flattens the index into one list of (key, LSN, offset) entries,
+    /// sorted by key.
+    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
+        let mut entries: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
+        entries.sort_by_key(|entry| entry.0);
+
+        let all_offsets: Vec<(Key, Lsn, u64)> = entries
+            .iter()
+            .flat_map(|(key, vec_map)| {
+                vec_map
+                    .as_slice()
+                    .iter()
+                    .map(move |(lsn, off)| (**key, *lsn, *off))
+            })
+            .collect();
+
+        Ok(DeltaValueIter {
+            all_offsets,
+            next_idx: 0,
+            inner,
+        })
+    }
+
+    /// Reads and deserializes the next value, or returns Ok(None) once all
+    /// entries have been returned. The position only advances when the
+    /// value was read successfully.
+    fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
+        let (key, lsn, off) = match self.all_offsets.get(self.next_idx) {
+            Some(entry) => *entry,
+            None => return Ok(None),
+        };
+
+        let book = self
+            .inner
+            .book
+            .as_ref()
+            .expect("should be loaded in load call above");
+        let values_reader = book.chapter_reader(VALUES_CHAPTER)?;
+
+        let blob = utils::read_blob_from_chapter(&values_reader, off)?;
+        let val = Value::des(&blob)?;
+
+        self.next_idx += 1;
+        Ok(Some((key, lsn, val)))
+    }
+}
--- a/pageserver/src/layered_repository/ephemeral_file.rs
+++ b/pageserver/src/layered_repository/ephemeral_file.rs
@@ -1,8 +1,11 @@
+//! Implementation of append-only file data structure
+//! used to keep in-memory layers spilled on disk.
+
+use crate::config::PageServerConf;
 use crate::page_cache;
 use crate::page_cache::PAGE_SZ;
 use crate::page_cache::{ReadBufResult, WriteBufResult};
 use crate::virtual_file::VirtualFile;
-use crate::PageServerConf;
 use lazy_static::lazy_static;
 use std::cmp::min;
 use std::collections::HashMap;
@@ -92,6 +95,15 @@ impl EphemeralFile {
    }
 }

+/// Returns true if `filename` is "ephemeral-" followed by something that
+/// parses as a u32.
+pub fn is_ephemeral_file(filename: &str) -> bool {
+    filename
+        .strip_prefix("ephemeral-")
+        .map_or(false, |suffix| suffix.parse::<u32>().is_ok())
+}
+
 impl FileExt for EphemeralFile {
    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, Error> {
        // Look up the right page
@@ -163,7 +175,10 @@ impl Write for EphemeralFile {
    }

    fn flush(&mut self) -> Result<(), std::io::Error> {
-        todo!()
+        // we don't need to flush data:
+        // * we either write input bytes or not, not keeping any intermediate data buffered
+        // * rust unix file `flush` impl does not flush things either, returning `Ok(())`
+        Ok(())
    }
 }

--- a/pageserver/src/layered_repository/filename.rs
+++ b/pageserver/src/layered_repository/filename.rs
@@ -1,36 +1,53 @@
 //!
 //! Helper functions for dealing with filenames of the image and delta layer files.
 //!
-use crate::layered_repository::storage_layer::SegmentTag;
-use crate::relish::*;
-use crate::PageServerConf;
-use crate::{ZTenantId, ZTimelineId};
+use crate::config::PageServerConf;
+use crate::repository::Key;
+use std::cmp::Ordering;
 use std::fmt;
-use std::fs;
+use std::ops::Range;
 use std::path::PathBuf;

-use anyhow::Result;
-use log::*;
 use zenith_utils::lsn::Lsn;

-use super::metadata::METADATA_FILE_NAME;
-
 // Note: LayeredTimeline::load_layer_map() relies on this sort order
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub struct DeltaFileName {
-    pub seg: SegmentTag,
-    pub start_lsn: Lsn,
-    pub end_lsn: Lsn,
-    pub dropped: bool,
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+}
+
+// Range<Key> and Range<Lsn> provide no ordering to derive from, so the
+// ordering is written by hand. PartialOrd delegates to Ord so that the two
+// always agree.
+impl PartialOrd for DeltaFileName {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for DeltaFileName {
+    /// Orders by key range start, then key range end, then LSN range start,
+    /// and finally LSN range end.
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.key_range
+            .start
+            .cmp(&other.key_range.start)
+            .then_with(|| self.key_range.end.cmp(&other.key_range.end))
+            .then_with(|| self.lsn_range.start.cmp(&other.lsn_range.start))
+            .then_with(|| self.lsn_range.end.cmp(&other.lsn_range.end))
+    }
 }

 /// Represents the filename of a DeltaLayer
 ///
-///    <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>
-///
-/// or if it was dropped:
-///
-///    <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>_DROPPED
+///    <key start>-<key end>__<LSN start>-<LSN end>
 ///
 impl DeltaFileName {
    ///
@@ -38,269 +55,128 @@ impl DeltaFileName {
    /// match the expected pattern.
    ///
    pub fn parse_str(fname: &str) -> Option<Self> {
-        let rel;
-        let mut parts;
-        if let Some(rest) = fname.strip_prefix("rel_") {
-            parts = rest.split('_');
-            rel = RelishTag::Relation(RelTag {
-                spcnode: parts.next()?.parse::<u32>().ok()?,
-                dbnode: parts.next()?.parse::<u32>().ok()?,
-                relnode: parts.next()?.parse::<u32>().ok()?,
-                forknum: parts.next()?.parse::<u8>().ok()?,
-            });
-        } else if let Some(rest) = fname.strip_prefix("pg_xact_") {
-            parts = rest.split('_');
-            rel = RelishTag::Slru {
-                slru: SlruKind::Clog,
-                segno: u32::from_str_radix(parts.next()?, 16).ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
-            parts = rest.split('_');
-            rel = RelishTag::Slru {
-                slru: SlruKind::MultiXactMembers,
-                segno: u32::from_str_radix(parts.next()?, 16).ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
-            parts = rest.split('_');
-            rel = RelishTag::Slru {
-                slru: SlruKind::MultiXactOffsets,
-                segno: u32::from_str_radix(parts.next()?, 16).ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
-            parts = rest.split('_');
-            rel = RelishTag::FileNodeMap {
-                spcnode: parts.next()?.parse::<u32>().ok()?,
-                dbnode: parts.next()?.parse::<u32>().ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
-            parts = rest.split('_');
-            rel = RelishTag::TwoPhase {
-                xid: parts.next()?.parse::<u32>().ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
-            parts = rest.split('_');
-            rel = RelishTag::Checkpoint;
-        } else if let Some(rest) = fname.strip_prefix("pg_control_") {
-            parts = rest.split('_');
-            rel = RelishTag::ControlFile;
-        } else {
+        let mut parts = fname.split("__");
+        let mut key_parts = parts.next()?.split('-');
+        let mut lsn_parts = parts.next()?.split('-');
+
+        let key_start_str = key_parts.next()?;
+        let key_end_str = key_parts.next()?;
+        let lsn_start_str = lsn_parts.next()?;
+        let lsn_end_str = lsn_parts.next()?;
+        // Reject names with leftover components in any of the three splits.
+        if parts.next().is_some() || key_parts.next().is_some() || lsn_parts.next().is_some() {
            return None;
        }

-        let segno = parts.next()?.parse::<u32>().ok()?;
+        let key_start = Key::from_hex(key_start_str).ok()?;
+        let key_end = Key::from_hex(key_end_str).ok()?;

-        let seg = SegmentTag { rel, segno };
+        let start_lsn = Lsn::from_hex(lsn_start_str).ok()?;
+        let end_lsn = Lsn::from_hex(lsn_end_str).ok()?;

-        let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
-        let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
-
-        let mut dropped = false;
-        if let Some(suffix) = parts.next() {
-            if suffix == "DROPPED" {
-                dropped = true;
-            } else {
-                return None;
-            }
-        }
-        if parts.next().is_some() {
+        if start_lsn >= end_lsn {
            return None;
+            // or panic?
+        }
+
+        if key_start >= key_end {
+            return None;
+            // or panic?
        }

        Some(DeltaFileName {
-            seg,
-            start_lsn,
-            end_lsn,
-            dropped,
+            key_range: key_start..key_end,
+            lsn_range: start_lsn..end_lsn,
        })
    }
 }

 impl fmt::Display for DeltaFileName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let basename = match self.seg.rel {
-            RelishTag::Relation(reltag) => format!(
-                "rel_{}_{}_{}_{}",
-                reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
-            ),
-            RelishTag::Slru {
-                slru: SlruKind::Clog,
-                segno,
-            } => format!("pg_xact_{:04X}", segno),
-            RelishTag::Slru {
-                slru: SlruKind::MultiXactMembers,
-                segno,
-            } => format!("pg_multixact_members_{:04X}", segno),
-            RelishTag::Slru {
-                slru: SlruKind::MultiXactOffsets,
-                segno,
-            } => format!("pg_multixact_offsets_{:04X}", segno),
-            RelishTag::FileNodeMap { spcnode, dbnode } => {
-                format!("pg_filenodemap_{}_{}", spcnode, dbnode)
-            }
-            RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
-            RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
-            RelishTag::ControlFile => "pg_control".to_string(),
-        };
-
        write!(
            f,
-            "{}_{}_{:016X}_{:016X}{}",
-            basename,
-            self.seg.segno,
-            u64::from(self.start_lsn),
-            u64::from(self.end_lsn),
-            if self.dropped { "_DROPPED" } else { "" }
+            "{}-{}__{:016X}-{:016X}",
+            self.key_range.start,
+            self.key_range.end,
+            u64::from(self.lsn_range.start),
+            u64::from(self.lsn_range.end),
        )
    }
 }

-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub struct ImageFileName {
-    pub seg: SegmentTag,
+    pub key_range: Range<Key>,
    pub lsn: Lsn,
 }

+// Range<Key> provides no ordering to derive from, so the ordering is
+// written by hand. PartialOrd delegates to Ord so that the two always agree.
+impl PartialOrd for ImageFileName {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for ImageFileName {
+    /// Orders by key range start, then key range end, then LSN.
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.key_range
+            .start
+            .cmp(&other.key_range.start)
+            .then_with(|| self.key_range.end.cmp(&other.key_range.end))
+            .then_with(|| self.lsn.cmp(&other.lsn))
+    }
+}
+
 ///
 /// Represents the filename of an ImageLayer
 ///
-///    <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<LSN>
-///
+///    <key start>-<key end>__<LSN>
 impl ImageFileName {
    ///
    /// Parse a string as an image file name. Returns None if the filename does not
    /// match the expected pattern.
    ///
    pub fn parse_str(fname: &str) -> Option<Self> {
-        let rel;
-        let mut parts;
-        if let Some(rest) = fname.strip_prefix("rel_") {
-            parts = rest.split('_');
-            rel = RelishTag::Relation(RelTag {
-                spcnode: parts.next()?.parse::<u32>().ok()?,
-                dbnode: parts.next()?.parse::<u32>().ok()?,
-                relnode: parts.next()?.parse::<u32>().ok()?,
-                forknum: parts.next()?.parse::<u8>().ok()?,
-            });
-        } else if let Some(rest) = fname.strip_prefix("pg_xact_") {
-            parts = rest.split('_');
-            rel = RelishTag::Slru {
-                slru: SlruKind::Clog,
-                segno: u32::from_str_radix(parts.next()?, 16).ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
-            parts = rest.split('_');
-            rel = RelishTag::Slru {
-                slru: SlruKind::MultiXactMembers,
-                segno: u32::from_str_radix(parts.next()?, 16).ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
-            parts = rest.split('_');
-            rel = RelishTag::Slru {
-                slru: SlruKind::MultiXactOffsets,
-                segno: u32::from_str_radix(parts.next()?, 16).ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
-            parts = rest.split('_');
-            rel = RelishTag::FileNodeMap {
-                spcnode: parts.next()?.parse::<u32>().ok()?,
-                dbnode: parts.next()?.parse::<u32>().ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
-            parts = rest.split('_');
-            rel = RelishTag::TwoPhase {
-                xid: parts.next()?.parse::<u32>().ok()?,
-            };
-        } else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
-            parts = rest.split('_');
-            rel = RelishTag::Checkpoint;
-        } else if let Some(rest) = fname.strip_prefix("pg_control_") {
-            parts = rest.split('_');
-            rel = RelishTag::ControlFile;
-        } else {
+        let mut parts = fname.split("__");
+        let mut key_parts = parts.next()?.split('-');
+
+        let key_start_str = key_parts.next()?;
+        let key_end_str = key_parts.next()?;
+        let lsn_str = parts.next()?;
+        if parts.next().is_some() || key_parts.next().is_some() {
            return None;
        }

-        let segno = parts.next()?.parse::<u32>().ok()?;
+        let key_start = Key::from_hex(key_start_str).ok()?;
+        let key_end = Key::from_hex(key_end_str).ok()?;

-        let seg = SegmentTag { rel, segno };
+        let lsn = Lsn::from_hex(lsn_str).ok()?;

-        let lsn = Lsn::from_hex(parts.next()?).ok()?;
-
-        if parts.next().is_some() {
-            return None;
-        }
-
-        Some(ImageFileName { seg, lsn })
+        Some(ImageFileName {
+            key_range: key_start..key_end,
+            lsn,
+        })
    }
 }

 impl fmt::Display for ImageFileName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let basename = match self.seg.rel {
-            RelishTag::Relation(reltag) => format!(
-                "rel_{}_{}_{}_{}",
-                reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
-            ),
-            RelishTag::Slru {
-                slru: SlruKind::Clog,
-                segno,
-            } => format!("pg_xact_{:04X}", segno),
-            RelishTag::Slru {
-                slru: SlruKind::MultiXactMembers,
-                segno,
-            } => format!("pg_multixact_members_{:04X}", segno),
-            RelishTag::Slru {
-                slru: SlruKind::MultiXactOffsets,
-                segno,
-            } => format!("pg_multixact_offsets_{:04X}", segno),
-            RelishTag::FileNodeMap { spcnode, dbnode } => {
-                format!("pg_filenodemap_{}_{}", spcnode, dbnode)
-            }
-            RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
-            RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
-            RelishTag::ControlFile => "pg_control".to_string(),
-        };
-
        write!(
            f,
-            "{}_{}_{:016X}",
-            basename,
-            self.seg.segno,
+            "{}-{}__{:016X}",
+            self.key_range.start,
+            self.key_range.end,
            u64::from(self.lsn),
        )
    }
 }

-/// Scan timeline directory and create ImageFileName and DeltaFilename
-/// structs representing all files on disk
-///
-/// TODO: returning an Iterator would be more idiomatic
-pub fn list_files(
-    conf: &'static PageServerConf,
-    timelineid: ZTimelineId,
-    tenantid: ZTenantId,
-) -> Result<(Vec<ImageFileName>, Vec<DeltaFileName>)> {
-    let path = conf.timeline_path(&timelineid, &tenantid);
-
-    let mut deltafiles: Vec<DeltaFileName> = Vec::new();
-    let mut imgfiles: Vec<ImageFileName> = Vec::new();
-    for direntry in fs::read_dir(path)? {
-        let fname = direntry?.file_name();
-        let fname = fname.to_str().unwrap();
-
-        if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
-            deltafiles.push(deltafilename);
-        } else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
-            imgfiles.push(imgfilename);
-        } else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") {
-            // ignore these
-        } else {
-            warn!("unrecognized filename in timeline dir: {}", fname);
-        }
-    }
-    Ok((imgfiles, deltafiles))
-}
-
 /// Helper enum to hold a PageServerConf, or a path
 ///
 /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
--- a/pageserver/src/layered_repository/global_layer_map.rs
+++ b/pageserver/src/layered_repository/global_layer_map.rs
@@ -1,141 +0,0 @@
-//!
-//! Global registry of open layers.
-//!
-//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
-//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
-//! in-memory layers in the system, and know when we need to evict some to release
-//! memory.
-//!
-//! Each layer is assigned a unique ID when it's registered in the global registry.
-//! The ID can be used to relocate the layer later, without having to hold locks.
-//!
-
-use std::sync::atomic::{AtomicU8, Ordering};
-use std::sync::{Arc, RwLock};
-
-use super::inmemory_layer::OpenLayer;
-
-use lazy_static::lazy_static;
-
-const MAX_USAGE_COUNT: u8 = 5;
-
-lazy_static! {
-    pub static ref GLOBAL_LAYER_MAP: RwLock<OpenLayers> = RwLock::new(OpenLayers::default());
-}
-
-// TODO these types can probably be smaller
-#[derive(PartialEq, Eq, Clone, Copy)]
-pub struct LayerId {
-    index: usize,
-    tag: u64, // to avoid ABA problem
-}
-
-enum SlotData {
-    Occupied(Arc<OpenLayer>),
-    /// Vacant slots form a linked list, the value is the index
-    /// of the next vacant slot in the list.
-    Vacant(Option<usize>),
-}
-
-struct Slot {
-    tag: u64,
-    data: SlotData,
-    usage_count: AtomicU8, // for clock algorithm
-}
-
-#[derive(Default)]
-pub struct OpenLayers {
-    slots: Vec<Slot>,
-    num_occupied: usize,
-
-    // Head of free-slot list.
-    next_empty_slot_idx: Option<usize>,
-}
-
-impl OpenLayers {
-    pub fn insert(&mut self, layer: Arc<OpenLayer>) -> LayerId {
-        let slot_idx = match self.next_empty_slot_idx {
-            Some(slot_idx) => slot_idx,
-            None => {
-                let idx = self.slots.len();
-                self.slots.push(Slot {
-                    tag: 0,
-                    data: SlotData::Vacant(None),
-                    usage_count: AtomicU8::new(0),
-                });
-                idx
-            }
-        };
-        let slots_len = self.slots.len();
-
-        let slot = &mut self.slots[slot_idx];
-
-        match slot.data {
-            SlotData::Occupied(_) => {
-                panic!("an occupied slot was in the free list");
-            }
-            SlotData::Vacant(next_empty_slot_idx) => {
-                self.next_empty_slot_idx = next_empty_slot_idx;
-            }
-        }
-
-        slot.data = SlotData::Occupied(layer);
-        slot.usage_count.store(1, Ordering::Relaxed);
-
-        self.num_occupied += 1;
-        assert!(self.num_occupied <= slots_len);
-
-        LayerId {
-            index: slot_idx,
-            tag: slot.tag,
-        }
-    }
-
-    pub fn get(&self, layer_id: &LayerId) -> Option<Arc<OpenLayer>> {
-        let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
-        if slot.tag != layer_id.tag {
-            return None;
-        }
-
-        if let SlotData::Occupied(layer) = &slot.data {
-            let _ = slot.usage_count.fetch_update(
-                Ordering::Relaxed,
-                Ordering::Relaxed,
-                |old_usage_count| {
-                    if old_usage_count < MAX_USAGE_COUNT {
-                        Some(old_usage_count + 1)
-                    } else {
-                        None
-                    }
-                },
-            );
-            Some(Arc::clone(layer))
-        } else {
-            None
-        }
-    }
-
-    // TODO this won't be a public API in the future
-    pub fn remove(&mut self, layer_id: &LayerId) {
-        let slot = &mut self.slots[layer_id.index];
-
-        if slot.tag != layer_id.tag {
-            return;
-        }
-
-        match &slot.data {
-            SlotData::Occupied(_layer) => {
-                // TODO evict the layer
-            }
-            SlotData::Vacant(_) => unimplemented!(),
-        }
-
-        slot.data = SlotData::Vacant(self.next_empty_slot_idx);
-        self.next_empty_slot_idx = Some(layer_id.index);
-
-        assert!(self.num_occupied > 0);
-        self.num_occupied -= 1;
-
-        slot.tag = slot.tag.wrapping_add(1);
-    }
-}
--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -1,56 +1,58 @@
-//! An ImageLayer represents an image or a snapshot of a segment at one particular LSN.
-//! It is stored in a file on disk.
+//! An ImageLayer represents an image or a snapshot of a key-range at
+//! one particular LSN. It contains an image of all key-value pairs
+//! in its key-range. Any key that falls into the image layer's range
+//! but does not exist in the layer, does not exist.
 //!
-//! On disk, the image files are stored in timelines/<timelineid> directory.
-//! Currently, there are no subdirectories, and each image layer file is named like this:
+//! An image layer is stored in a file on disk. The file is stored in
+//! timelines/<timelineid> directory.  Currently, there are no
+//! subdirectories, and each image layer file is named like this:
 //!
-//! Note that segno is
-//!    <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
+//!    <key start>-<key end>__<LSN>
 //!
 //! For example:
 //!
-//!    1663_13990_2609_0_5_000000000169C348
+//!    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
 //!
 //! An image file is constructed using the 'bookfile' crate.
 //!
 //! Only metadata is loaded into memory by the load function.
 //! When images are needed, they are read directly from disk.
 //!
-//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
-//! All the images are required to be BLOCK_SIZE, which allows for random access.
-//!
-//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
-//!
+use crate::config::PageServerConf;
 use crate::layered_repository::filename::{ImageFileName, PathOrConf};
 use crate::layered_repository::storage_layer::{
-    Layer, PageReconstructData, PageReconstructResult, SegmentTag,
+    Layer, ValueReconstructResult, ValueReconstructState,
 };
-use crate::layered_repository::LayeredTimeline;
-use crate::layered_repository::RELISH_SEG_SIZE;
+use crate::layered_repository::utils;
+use crate::repository::{Key, Value};
 use crate::virtual_file::VirtualFile;
-use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{bail, Context, Result};
 use bytes::Bytes;
 use log::*;
 use serde::{Deserialize, Serialize};
-use std::convert::TryInto;
+use std::collections::HashMap;
 use std::fs;
 use std::io::{BufWriter, Write};
+use std::ops::Range;
 use std::path::{Path, PathBuf};
 use std::sync::{Mutex, MutexGuard};

-use bookfile::{Book, BookWriter};
+use bookfile::{Book, BookWriter, ChapterWriter};

 use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;

-// Magic constant to identify a Zenith segment image file
+// Magic constant to identify a Zenith image layer file
+// FIXME: bump all magics
 pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1;

+/// Mapping from key -> offset of the key's value
+/// in VALUES_CHAPTER
+const INDEX_CHAPTER: u64 = 1;
+
 /// Contains each block in block # order
-const BLOCKY_IMAGES_CHAPTER: u64 = 1;
-const NONBLOCKY_IMAGE_CHAPTER: u64 = 2;
+const VALUES_CHAPTER: u64 = 2;

 /// Contains the [`Summary`] struct
 const SUMMARY_CHAPTER: u64 = 3;
@@ -59,7 +61,7 @@ const SUMMARY_CHAPTER: u64 = 3;
 struct Summary {
    tenantid: ZTenantId,
    timelineid: ZTimelineId,
-    seg: SegmentTag,
+    key_range: Range<Key>,

    lsn: Lsn,
 }
@@ -69,19 +71,17 @@ impl From<&ImageLayer> for Summary {
        Self {
            tenantid: layer.tenantid,
            timelineid: layer.timelineid,
-            seg: layer.seg,
+            key_range: layer.key_range.clone(),

            lsn: layer.lsn,
        }
    }
 }

-const BLOCK_SIZE: usize = 8192;
-
 ///
 /// ImageLayer is the in-memory data structure associated with an on-disk image
 /// file.  We keep an ImageLayer in memory for each file, in the LayerMap. If a
-/// layer is in "loaded" state, we have a copy of the file in memory, in 'inner'.
+/// layer is in "loaded" state, we have a copy of the index in memory, in 'inner'.
 /// Otherwise the struct is just a placeholder for a file that exists on disk,
 /// and it needs to be loaded before using it in queries.
 ///
@@ -89,7 +89,7 @@ pub struct ImageLayer {
    path_or_conf: PathOrConf,
    pub tenantid: ZTenantId,
    pub timelineid: ZTimelineId,
-    pub seg: SegmentTag,
+    pub key_range: Range<Key>,

    // This entry contains an image of all pages as of this LSN
    pub lsn: Lsn,
@@ -97,18 +97,15 @@ pub struct ImageLayer {
    inner: Mutex<ImageLayerInner>,
 }

-#[derive(Clone)]
-enum ImageType {
-    Blocky { num_blocks: u32 },
-    NonBlocky,
-}
-
 pub struct ImageLayerInner {
-    /// If None, the 'image_type' has not been loaded into memory yet.
+    /// If false, the 'index' has not been loaded into memory yet.
+    loaded: bool,
+
+    /// If None, the layer file has not been opened yet.
    book: Option<Book<VirtualFile>>,

-    /// Derived from filename and bookfile chapter metadata
-    image_type: ImageType,
+    /// offset of each value
+    index: HashMap<Key, u64>,
 }

 impl Layer for ImageLayer {
@@ -124,90 +121,61 @@ impl Layer for ImageLayer {
        self.timelineid
    }

-    fn get_seg_tag(&self) -> SegmentTag {
-        self.seg
+    fn get_key_range(&self) -> Range<Key> {
+        self.key_range.clone()
    }

-    fn is_dropped(&self) -> bool {
-        false
-    }
-
-    fn get_start_lsn(&self) -> Lsn {
-        self.lsn
-    }
-
-    fn get_end_lsn(&self) -> Lsn {
+    fn get_lsn_range(&self) -> Range<Lsn> {
        // End-bound is exclusive
-        self.lsn + 1
+        self.lsn..(self.lsn + 1)
    }

    /// Look up given page in the file
-    fn get_page_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
-        blknum: u32,
-        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
-        reconstruct_data: &mut PageReconstructData,
-    ) -> Result<PageReconstructResult> {
-        assert!(lsn >= self.lsn);
-
-        match cached_img_lsn {
-            Some(cached_lsn) if self.lsn <= cached_lsn => return Ok(PageReconstructResult::Cached),
-            _ => {}
-        }
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+    ) -> Result<ValueReconstructResult> {
+        assert!(self.key_range.contains(&key));
+        assert!(lsn_range.end >= self.lsn);

        let inner = self.load()?;

-        let base_blknum = blknum % RELISH_SEG_SIZE;
+        if let Some(offset) = inner.index.get(&key) {
+            let chapter = inner
+                .book
+                .as_ref()
+                .unwrap()
+                .chapter_reader(VALUES_CHAPTER)?;

-        let buf = match &inner.image_type {
-            ImageType::Blocky { num_blocks } => {
-                if base_blknum >= *num_blocks {
-                    return Ok(PageReconstructResult::Missing(lsn));
-                }
+            let blob = utils::read_blob_from_chapter(&chapter, *offset).with_context(|| {
+                format!(
+                    "failed to read value from data file {} at offset {}",
+                    self.filename().display(),
+                    offset
+                )
+            })?;
+            let value = Bytes::from(blob);

-                let mut buf = vec![0u8; BLOCK_SIZE];
-                let offset = BLOCK_SIZE as u64 * base_blknum as u64;
-
-                let chapter = inner
-                    .book
-                    .as_ref()
-                    .unwrap()
-                    .chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
-                chapter.read_exact_at(&mut buf, offset)?;
-
-                buf
-            }
-            ImageType::NonBlocky => {
-                ensure!(base_blknum == 0);
-                inner
-                    .book
-                    .as_ref()
-                    .unwrap()
-                    .read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
-                    .into_vec()
-            }
-        };
-
-        reconstruct_data.page_img = Some(Bytes::from(buf));
-        Ok(PageReconstructResult::Complete)
-    }
-
-    /// Get size of the segment
-    fn get_seg_size(&self, _lsn: Lsn) -> Result<u32> {
-        let inner = self.load()?;
-        match inner.image_type {
-            ImageType::Blocky { num_blocks } => Ok(num_blocks),
-            ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")),
+            reconstruct_state.img = Some((self.lsn, value));
+            Ok(ValueReconstructResult::Complete)
+        } else {
+            Ok(ValueReconstructResult::Missing)
        }
    }

-    /// Does this segment exist at given LSN?
-    fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
-        Ok(true)
+    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
+        todo!();
    }

    fn unload(&self) -> Result<()> {
+        // Forget the in-memory index ('book' stays open). TODO: Or even better, don't
+        // hold it in memory but access it directly from the file (using the buffer cache)
+        let mut inner = self.inner.lock().unwrap();
+        inner.index = HashMap::default();
+        inner.loaded = false;
+
        Ok(())
    }

@@ -221,25 +189,24 @@ impl Layer for ImageLayer {
        false
    }

+    fn is_in_memory(&self) -> bool {
+        false
+    }
+
    /// debugging function to print out the contents of the layer
    fn dump(&self) -> Result<()> {
        println!(
-            "----- image layer for ten {} tli {} seg {} at {} ----",
-            self.tenantid, self.timelineid, self.seg, self.lsn
+            "----- image layer for ten {} tli {} key {}-{} at {} ----",
+            self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn
        );

        let inner = self.load()?;

-        match inner.image_type {
-            ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
-            ImageType::NonBlocky => {
-                let chapter = inner
-                    .book
-                    .as_ref()
-                    .unwrap()
-                    .read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
-                println!("non-blocky ({} bytes)", chapter.len());
-            }
+        let mut index_vec: Vec<(&Key, &u64)> = inner.index.iter().collect();
+        index_vec.sort_by_key(|x| x.1);
+
+        for (key, offset) in index_vec {
+            println!("key: {} offset {}", key, offset);
        }

        Ok(())
@@ -261,125 +228,6 @@ impl ImageLayer {
        }
    }

-    /// Create a new image file, using the given array of pages.
-    fn create(
-        conf: &'static PageServerConf,
-        timelineid: ZTimelineId,
-        tenantid: ZTenantId,
-        seg: SegmentTag,
-        lsn: Lsn,
-        base_images: Vec<Bytes>,
-    ) -> Result<ImageLayer> {
-        let image_type = if seg.rel.is_blocky() {
-            let num_blocks: u32 = base_images.len().try_into()?;
-            ImageType::Blocky { num_blocks }
-        } else {
-            assert_eq!(base_images.len(), 1);
-            ImageType::NonBlocky
-        };
-
-        let layer = ImageLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            timelineid,
-            tenantid,
-            seg,
-            lsn,
-            inner: Mutex::new(ImageLayerInner {
-                book: None,
-                image_type: image_type.clone(),
-            }),
-        };
-        let inner = layer.inner.lock().unwrap();
-
-        // Write the images into a file
-        //
-        // Note: Because we open the file in write-only mode, we cannot
-        // reuse the same VirtualFile for reading later. That's why we don't
-        // set inner.book here. The first read will have to re-open it.
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let path = layer.path();
-        let file = VirtualFile::create(&path)?;
-        let buf_writer = BufWriter::new(file);
-        let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
-
-        let book = match &image_type {
-            ImageType::Blocky { .. } => {
-                let mut chapter = book.new_chapter(BLOCKY_IMAGES_CHAPTER);
-                for block_bytes in base_images {
-                    assert_eq!(block_bytes.len(), BLOCK_SIZE);
-                    chapter.write_all(&block_bytes)?;
-                }
-                chapter.close()?
-            }
-            ImageType::NonBlocky => {
-                let mut chapter = book.new_chapter(NONBLOCKY_IMAGE_CHAPTER);
-                chapter.write_all(&base_images[0])?;
-                chapter.close()?
-            }
-        };
-
-        let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
-        let summary = Summary {
-            tenantid,
-            timelineid,
-            seg,
-
-            lsn,
-        };
-        Summary::ser_into(&summary, &mut chapter)?;
-        let book = chapter.close()?;
-
-        // This flushes the underlying 'buf_writer'.
-        let writer = book.close()?;
-        writer.get_ref().sync_all()?;
-
-        trace!("saved {}", path.display());
-
-        drop(inner);
-
-        Ok(layer)
-    }
-
-    // Create a new image file by materializing every page in a source layer
-    // at given LSN.
-    pub fn create_from_src(
-        conf: &'static PageServerConf,
-        timeline: &LayeredTimeline,
-        src: &dyn Layer,
-        lsn: Lsn,
-    ) -> Result<ImageLayer> {
-        let seg = src.get_seg_tag();
-        let timelineid = timeline.timelineid;
-
-        let startblk;
-        let size;
-        if seg.rel.is_blocky() {
-            size = src.get_seg_size(lsn)?;
-            startblk = seg.segno * RELISH_SEG_SIZE;
-        } else {
-            size = 1;
-            startblk = 0;
-        }
-
-        trace!(
-            "creating new ImageLayer for {} on timeline {} at {}",
-            seg,
-            timelineid,
-            lsn,
-        );
-
-        let mut base_images: Vec<Bytes> = Vec::new();
-        for blknum in startblk..(startblk + size) {
-            let img = timeline.materialize_page(seg, blknum, lsn, &*src)?;
-
-            base_images.push(img);
-        }
-
-        Self::create(conf, timelineid, timeline.tenantid, seg, lsn, base_images)
-    }
-
    ///
    /// Load the contents of the file into memory
    ///
@@ -387,19 +235,21 @@ impl ImageLayer {
        // quick exit if already loaded
        let mut inner = self.inner.lock().unwrap();

-        if inner.book.is_some() {
+        if inner.loaded {
            return Ok(inner);
        }

        let path = self.path();
-        let file = VirtualFile::open(&path)
-            .with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
-        let book = Book::new(file).with_context(|| {
-            format!(
-                "Failed to open virtual file '{}' as a bookfile",
-                path.display()
-            )
-        })?;
+
+        // Open the file if it's not open already.
+        if inner.book.is_none() {
+            let file = VirtualFile::open(&path)
+                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+            inner.book = Some(Book::new(file).with_context(|| {
+                format!("Failed to open file '{}' as a bookfile", path.display())
+            })?);
+        }
+        let book = inner.book.as_ref().unwrap();

        match &self.path_or_conf {
            PathOrConf::Conf(_) => {
@@ -426,23 +276,13 @@ impl ImageLayer {
            }
        }

-        let image_type = if self.seg.rel.is_blocky() {
-            let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
-            let images_len = chapter.len();
-            ensure!(images_len % BLOCK_SIZE as u64 == 0);
-            let num_blocks: u32 = (images_len / BLOCK_SIZE as u64).try_into()?;
-            ImageType::Blocky { num_blocks }
-        } else {
-            let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?;
-            ImageType::NonBlocky
-        };
+        let chapter = book.read_chapter(INDEX_CHAPTER)?;
+        let index = HashMap::des(&chapter)?;

-        debug!("loaded from {}", &path.display());
+        info!("loaded from {}", &path.display());

-        *inner = ImageLayerInner {
-            book: Some(book),
-            image_type,
-        };
+        inner.index = index;
+        inner.loaded = true;

        Ok(inner)
    }
@@ -458,11 +298,12 @@ impl ImageLayer {
            path_or_conf: PathOrConf::Conf(conf),
            timelineid,
            tenantid,
-            seg: filename.seg,
+            key_range: filename.key_range.clone(),
            lsn: filename.lsn,
            inner: Mutex::new(ImageLayerInner {
                book: None,
-                image_type: ImageType::Blocky { num_blocks: 0 },
+                index: HashMap::new(),
+                loaded: false,
            }),
        }
    }
@@ -481,18 +322,19 @@ impl ImageLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
            timelineid: summary.timelineid,
            tenantid: summary.tenantid,
-            seg: summary.seg,
+            key_range: summary.key_range,
            lsn: summary.lsn,
            inner: Mutex::new(ImageLayerInner {
                book: None,
-                image_type: ImageType::Blocky { num_blocks: 0 },
+                index: HashMap::new(),
+                loaded: false,
            }),
        })
    }

    fn layer_name(&self) -> ImageFileName {
        ImageFileName {
-            seg: self.seg,
+            key_range: self.key_range.clone(),
            lsn: self.lsn,
        }
    }
@@ -507,3 +349,158 @@ impl ImageLayer {
        )
    }
 }
+
+/// A builder object for constructing a new image layer.
+///
+/// Usage:
+///
+/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
+///
+/// 2. Write the contents by calling `put_image` for every key-value
+///    pair that goes into the layer.
+///
+/// 3. Call `finish`.
+/// If the writer is dropped before `finish` succeeds, the file is removed.
+pub struct ImageLayerWriter {
+    conf: &'static PageServerConf,
+    path: PathBuf,
+    timelineid: ZTimelineId,
+    tenantid: ZTenantId,
+    key_range: Range<Key>,
+    lsn: Lsn,
+    // Open values chapter; `finish` and drop take it, leaving None
+    values_writer: Option<ChapterWriter<BufWriter<VirtualFile>>>,
+    end_offset: u64, // offset at which the next `put_image` value starts
+    // Offset of each key's value; `finish` writes this out as the index chapter
+    index: HashMap<Key, u64>,
+    // Set by a successful `finish`; otherwise drop removes the file
+    finished: bool,
+}
+
+impl ImageLayerWriter {
+    /// Create the file for a new image layer and return a writer that is
+    /// ready to accept values through `put_image`.
+    ///
+    /// Fails if the file cannot be created or the bookfile cannot be started.
+    pub fn new(
+        conf: &'static PageServerConf,
+        timelineid: ZTimelineId,
+        tenantid: ZTenantId,
+        key_range: &Range<Key>,
+        lsn: Lsn,
+    ) -> Result<ImageLayerWriter> {
+        // The layer's path is computed by `path_for` from the tenant,
+        // timeline, key range and LSN.
+        let file_name = ImageFileName {
+            key_range: key_range.clone(),
+            lsn,
+        };
+        let path = ImageLayer::path_for(&PathOrConf::Conf(conf), timelineid, tenantid, &file_name);
+        info!("new image layer {}", path.display());
+
+        // Create the file
+        //
+        // Note: This overwrites any existing file. There shouldn't be any.
+        // FIXME: throw an error instead?
+        let sink = BufWriter::new(VirtualFile::create(&path)?);
+        let book = BookWriter::new(sink, IMAGE_FILE_MAGIC)?;
+
+        // The values chapter is opened right away and kept open in the
+        // writer: `put_image` appends to it and `finish` closes it.
+        let values_writer = Some(book.new_chapter(VALUES_CHAPTER));
+
+        Ok(ImageLayerWriter {
+            conf,
+            path,
+            timelineid,
+            tenantid,
+            key_range: key_range.clone(),
+            lsn,
+            values_writer,
+            end_offset: 0,
+            index: HashMap::new(),
+            finished: false,
+        })
+    }
+
+    ///
+    /// Append the image of `key` to the values chapter and remember the
+    /// offset it was written at in the index.
+    ///
+    /// NOTE(review): nothing here checks the order in which keys arrive.
+    pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
+        assert!(self.key_range.contains(&key));
+
+        // The values chapter is gone once `finish` has run; writing then is a bug.
+        let chapter = match self.values_writer.as_mut() {
+            Some(chapter) => chapter,
+            None => panic!(),
+        };
+        let start = self.end_offset;
+        self.end_offset += utils::write_blob(chapter, img)?;
+
+        // Each key may be written only once.
+        let old = self.index.insert(key, start);
+        assert!(old.is_none());
+        Ok(())
+    }
+
+    /// Close the values chapter, append the index and summary chapters, sync
+    /// the file to disk and return the new ImageLayer. Panics if called twice;
+    /// on error the writer stays unfinished, so dropping it removes the file.
+    pub fn finish(&mut self) -> Result<ImageLayer> {
+        // Close the values chapter
+        let book = self.values_writer.take().unwrap().close()?;
+
+        // Write out the index
+        let mut chapter = book.new_chapter(INDEX_CHAPTER);
+        let buf = HashMap::ser(&self.index)?;
+        chapter.write_all(&buf)?;
+        let book = chapter.close()?;
+
+        // Write out the summary chapter
+        let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
+        let summary = Summary {
+            tenantid: self.tenantid,
+            timelineid: self.timelineid,
+            key_range: self.key_range.clone(),
+            lsn: self.lsn,
+        };
+        Summary::ser_into(&summary, &mut chapter)?;
+        let book = chapter.close()?;
+
+        // This flushes the underlying 'buf_writer'. Then fsync the file for durability.
+        let writer = book.close()?;
+        writer.get_ref().sync_all()?;
+
+        // The file was opened write-only, so 'book' stays None; the first read re-opens it.
+        let layer = ImageLayer {
+            path_or_conf: PathOrConf::Conf(self.conf),
+            timelineid: self.timelineid,
+            tenantid: self.tenantid,
+            key_range: self.key_range.clone(),
+            lsn: self.lsn,
+            inner: Mutex::new(ImageLayerInner {
+                book: None,
+                loaded: false,
+                index: HashMap::new(),
+            }),
+        };
+        trace!("created image layer {}", layer.path().display());
+        self.finished = true;
+        Ok(layer)
+    }
+}
+
+impl Drop for ImageLayerWriter {
+    fn drop(&mut self) {
+        // Best effort: close a still-open values chapter; errors are ignored.
+        if let Some(Ok(book)) = self.values_writer.take().map(|chapter| chapter.close()) {
+            let _ = book.close();
+        }
+        // Unless `finish` succeeded, remove the incomplete file.
+        if !self.finished {
+            let _ = fs::remove_file(&self.path);
+        }
+    }
+}
--- a/pageserver/src/layered_repository/inmemory_layer.rs
+++ b/pageserver/src/layered_repository/inmemory_layer.rs
@@ -1,45 +1,32 @@
-//! FIXME
-//! An in-memory layer stores recently received page versions in memory. The page versions
-//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
+//! An in-memory layer stores recently received key-value pairs.
 //!
+//! The "in-memory" part of the name is a bit misleading: the actual page versions are
+//! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
+//! its position in the file, is kept in memory, though.
+//!
+use crate::config::PageServerConf;
+use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter};
 use crate::layered_repository::ephemeral_file::EphemeralFile;
-use crate::layered_repository::filename::DeltaFileName;
 use crate::layered_repository::storage_layer::{
-    Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
+    Layer, ValueReconstructResult, ValueReconstructState,
 };
-use crate::layered_repository::LayeredTimeline;
-use crate::layered_repository::ZERO_PAGE;
-use crate::layered_repository::{DeltaLayer, ImageLayer};
-use crate::repository::WALRecord;
-use crate::PageServerConf;
+use crate::layered_repository::utils;
+use crate::repository::{Key, Value};
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{ensure, Result};
-use bytes::Bytes;
-use lazy_static::lazy_static;
+use anyhow::Result;
 use log::*;
+use std::collections::HashMap;
+use std::ops::Range;
 use std::path::PathBuf;
-use std::sync::{Arc, RwLock};
+use std::sync::RwLock;
+use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;
 use zenith_utils::vec_map::VecMap;

-use super::page_versions::PageVersions;
-
-use zenith_metrics::{register_int_counter, IntCounter};
-
-lazy_static! {
-    static ref LATEST_IMG_UPDATE_COUNTER: IntCounter =
-        register_int_counter!("latest_img_updates", "Number of updates of latest img").unwrap();
-    static ref LATEST_IMG_MISS_COUNTER: IntCounter =
-        register_int_counter!("latest_img_misses", "Number of cache misses of latest img").unwrap();
-    static ref LATEST_IMG_HIT_COUNTER: IntCounter =
-        register_int_counter!("latest_img_hits", "Number of cache hits of latest img").unwrap();
-}
-
-pub struct OpenLayer {
+pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
    timelineid: ZTimelineId,
-    seg: SegmentTag,

    ///
    /// This layer contains all the changes from 'start_lsn'. The
@@ -47,83 +34,65 @@ pub struct OpenLayer {
    ///
    start_lsn: Lsn,

-    /// LSN of the oldest page version stored in this layer
-    oldest_pending_lsn: Lsn,
+    ///
+    /// LSN of the oldest value stored in this layer.
+    ///
+    /// This is different from 'start_lsn' in that we enforce that the 'start_lsn'
+    /// of a layer always matches the 'end_lsn' of its predecessor, even if there
+    /// are no page versions until at a later LSN. That way you can detect any
+    /// missing layer files more easily. 'oldest_lsn' is the first page version
+    /// actually stored in this layer. In the range between 'start_lsn' and
+    /// 'oldest_lsn', there are no changes to the segment.
+    /// 'oldest_lsn' is used to adjust 'disk_consistent_lsn' and that is why it should
+    /// point to the beginning of WAL record. This is the other difference with 'start_lsn'
+    /// which points to end of WAL record. This is why 'oldest_lsn' can be smaller than 'start_lsn'.
+    ///
+    oldest_lsn: Lsn,

    /// The above fields never change. The parts that do change are in 'inner',
    /// and protected by mutex.
-    inner: RwLock<OpenLayerInner>,
-
-    /// Predecessor layer might be needed?
-    incremental: bool,
+    inner: RwLock<InMemoryLayerInner>,
 }

-pub struct OpenLayerInner {
+pub struct InMemoryLayerInner {
    /// Frozen layers have an exclusive end LSN.
    /// Writes are only allowed when this is None
    end_lsn: Option<Lsn>,

-    /// If this relation was dropped, remember when that happened.
-    /// The drop LSN is recorded in [`end_lsn`].
-    dropped: bool,
+    ///
+    /// All versions of all values in the layer are tracked here.  Indexed
+    /// by key and LSN. The entry is an offset into the ephemeral
+    /// file where that version of the value is stored.
+    ///
+    index: HashMap<Key, VecMap<Lsn, u64>>,

-    ///
-    /// All versions of all pages in the layer are are kept here.
-    /// Indexed by block number and LSN.
-    ///
-    page_versions: PageVersions,
+    /// The values are stored in a serialized format in this file.
+    /// Each serialized Value is preceded by a 'u32' length field.
+    /// The 'index' map above stores offsets into this file.
+    file: EphemeralFile,

-    ///
-    /// `segsizes` tracks the size of the segment at different points in time.
-    ///
-    /// For a blocky rel, there is always one entry, at the layer's start_lsn,
-    /// so that determining the size never depends on the predecessor layer. For
-    /// a non-blocky rel, 'segsizes' is not used and is always empty.
-    ///
-    segsizes: VecMap<Lsn, u32>,
+    end_offset: u64,
 }

-impl OpenLayerInner {
+impl InMemoryLayerInner {
    fn assert_writeable(&self) {
        assert!(self.end_lsn.is_none());
    }
-
-    fn get_seg_size(&self, lsn: Lsn) -> u32 {
-        // Scan the BTreeMap backwards, starting from the given entry.
-        let slice = self.segsizes.slice_range(..=lsn);
-
-        // We make sure there is always at least one entry
-        if let Some((_entry_lsn, entry)) = slice.last() {
-            *entry
-        } else {
-            panic!("could not find seg size in in-memory layer");
-        }
-    }
 }

-impl Layer for OpenLayer {
-    // FIXME
-    // An open layer doesn't really have a filename as it's not stored on disk,
-    // but we construct a filename as if it was a delta layer
+impl Layer for InMemoryLayer {
+    // An in-memory layer can be spilled to disk into ephemeral file,
+    // This function is used only for debugging, so we don't need to be very precise.
+    // Construct a filename as if it was a delta layer.
    fn filename(&self) -> PathBuf {
        let inner = self.inner.read().unwrap();

-        let end_lsn;
-        if let Some(drop_lsn) = inner.end_lsn {
-            end_lsn = drop_lsn;
-        } else {
-            end_lsn = Lsn(u64::MAX);
-        }
+        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));

-        let delta_filename = DeltaFileName {
-            seg: self.seg,
-            start_lsn: self.start_lsn,
-            end_lsn,
-            dropped: inner.dropped,
-        }
-        .to_string();
-
-        PathBuf::from(format!("inmem-{}", delta_filename))
+        PathBuf::from(format!(
+            "inmem-{:016X}-{:016X}",
+            self.start_lsn.0, end_lsn.0
+        ))
    }

    fn get_tenant_id(&self) -> ZTenantId {
@@ -134,82 +103,54 @@ impl Layer for OpenLayer {
        self.timelineid
    }

-    fn get_seg_tag(&self) -> SegmentTag {
-        self.seg
+    fn get_key_range(&self) -> Range<Key> {
+        Key::MIN..Key::MAX
    }

-    fn get_start_lsn(&self) -> Lsn {
-        self.start_lsn
-    }
-
-    fn get_end_lsn(&self) -> Lsn {
+    fn get_lsn_range(&self) -> Range<Lsn> {
        let inner = self.inner.read().unwrap();

-        if let Some(end_lsn) = inner.end_lsn {
+        let end_lsn = if let Some(end_lsn) = inner.end_lsn {
            end_lsn
        } else {
            Lsn(u64::MAX)
-        }
+        };
+        self.start_lsn..end_lsn
    }

-    fn is_dropped(&self) -> bool {
-        let inner = self.inner.read().unwrap();
-        inner.dropped
-    }
-
-    /// Look up given page in the cache.
-    fn get_page_reconstruct_data(
+    /// Look up given value in the layer.
+    fn get_value_reconstruct_data(
        &self,
-        blknum: u32,
-        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
-        reconstruct_data: &mut PageReconstructData,
-    ) -> Result<PageReconstructResult> {
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+    ) -> Result<ValueReconstructResult> {
+        assert!(lsn_range.start <= self.start_lsn);
        let mut need_image = true;

-        assert!(self.seg.blknum_in_seg(blknum));
+        let inner = self.inner.read().unwrap();

-        {
-            let inner = self.inner.read().unwrap();
-
-            let latest = inner.page_versions.get_latest(blknum);
-
-            // Scan the page versions backwards, starting from `lsn`.
-            let iter = inner
-                .page_versions
-                .get_block_lsn_range(blknum, ..=lsn)
-                .iter()
-                .rev();
-            for (entry_lsn, pos) in iter {
-                match &cached_img_lsn {
-                    Some(cached_lsn) if entry_lsn <= cached_lsn => {
-                        return Ok(PageReconstructResult::Cached)
+        // Scan this key's versions within `lsn_range` backwards, from the end of the range.
+        if let Some(vec_map) = inner.index.get(&key) {
+            let slice = vec_map.slice_range(lsn_range);
+            for (entry_lsn, pos) in slice.iter().rev() {
+                match &reconstruct_state.img {
+                    Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
+                        return Ok(ValueReconstructResult::Complete)
                    }
                    _ => {}
                }

-                let pv = inner.page_versions.get_page_version(*pos)?;
-                match pv {
-                    PageVersion::Page(img) => {
-                        reconstruct_data.page_img = Some(img);
-                        need_image = false;
-                        break;
+                let value = Value::des(&utils::read_blob(&inner.file, *pos)?)?;
+                match value {
+                    Value::Image(img) => {
+                        reconstruct_state.img = Some((*entry_lsn, img));
+                        return Ok(ValueReconstructResult::Complete);
                    }
-                    PageVersion::Wal(rec) => {
-                        if let Some((latest_lsn, latest_pos)) = latest {
-                            if latest_lsn == entry_lsn {
-                                // we had this cached, nice!
-                                let img = inner.page_versions.fetch_cached_latest(*latest_pos)?;
-                                reconstruct_data.page_img = Some(img);
-                                need_image = false;
-                                LATEST_IMG_HIT_COUNTER.inc();
-                                break;
-                            }
-                        }
-                        LATEST_IMG_MISS_COUNTER.inc();
-
-                        reconstruct_data.records.push((*entry_lsn, rec.clone()));
-                        if rec.will_init {
+                    Value::WalRecord(rec) => {
+                        let will_init = rec.will_init();
+                        reconstruct_state.records.push((*entry_lsn, rec));
+                        if will_init {
                            // This WAL record initializes the page, so no need to go further back
                            need_image = false;
                            break;
@@ -217,60 +158,21 @@ impl Layer for OpenLayer {
                    }
                }
            }
-            // release lock on 'inner'
        }

+        // (the read lock on 'inner' is not released here; it is held until we return)
+
        // If an older page image is needed to reconstruct the page, let the
-        // caller know
+        // caller know.
        if need_image {
-            if self.incremental {
-                Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
-            } else {
-                Ok(PageReconstructResult::Missing(self.start_lsn))
-            }
+            Ok(ValueReconstructResult::Continue)
        } else {
-            Ok(PageReconstructResult::Complete)
+            Ok(ValueReconstructResult::Complete)
        }
    }

-    fn cache_page_image(&self, blknum: u32, lsn: Lsn, img: &[u8]) -> Result<()> {
-        let mut inner = self.inner.write().unwrap();
-
-        LATEST_IMG_UPDATE_COUNTER.inc();
-
-        inner.page_versions.cache_latest(blknum, lsn, img)
-    }
-
-    /// Get size of the relation at given LSN
-    fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
-        assert!(lsn >= self.start_lsn);
-        ensure!(
-            self.seg.rel.is_blocky(),
-            "get_seg_size() called on a non-blocky rel"
-        );
-
-        let inner = self.inner.read().unwrap();
-        Ok(inner.get_seg_size(lsn))
-    }
-
-    /// Does this segment exist at given LSN?
-    fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
-        let inner = self.inner.read().unwrap();
-
-        // If the segment created after requested LSN,
-        // it doesn't exist in the layer. But we shouldn't
-        // have requested it in the first place.
-        assert!(lsn >= self.start_lsn);
-
-        // Is the requested LSN after the segment was dropped?
-        if let Some(end_lsn) = inner.end_lsn {
-            if lsn >= end_lsn {
-                return Ok(false);
-            }
-        }
-
-        // Otherwise, it exists
-        Ok(true)
+    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
+        todo!();
    }

    /// Cannot unload anything in an in-memory layer, since there's no backing
@@ -283,11 +185,16 @@ impl Layer for OpenLayer {
    /// Nothing to do here. When you drop the last reference to the layer, it will
    /// be deallocated.
    fn delete(&self) -> Result<()> {
-        Ok(())
+        panic!("can't delete an InMemoryLayer")
    }

    fn is_incremental(&self) -> bool {
-        self.incremental
+        // in-memory layer is always considered incremental.
+        true
+    }
+
+    fn is_in_memory(&self) -> bool {
+        true
    }

    /// debugging function to print out the contents of the layer
@@ -301,38 +208,37 @@ impl Layer for OpenLayer {
            .unwrap_or_default();

        println!(
-            "----- in-memory layer for tli {} seg {} {}-{} {} ----",
-            self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
+            "----- in-memory layer for tli {} LSNs {}-{} ----",
+            self.timelineid,
+            self.start_lsn,
+            end_str,
+            //inner.dropped,
        );

-        for (k, v) in inner.segsizes.as_slice() {
-            println!("segsizes {}: {}", k, v);
-        }
+        // FIXME
+        /*
+           for (blknum, versions) in page_versions {
+               for (lsn, off) in versions.as_slice() {
+                   let pv = inner.read_pv(*off);
+                   let pv_description = match pv {
+                       Ok(PageVersion::Page(_img)) => "page",
+                       Ok(PageVersion::Wal(_rec)) => "wal",
+                       Err(_err) => "INVALID",
+                   };

-        for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) {
-            let pv = inner.page_versions.get_page_version(pos)?;
-            let pv_description = match pv {
-                PageVersion::Page(_img) => "page",
-                PageVersion::Wal(_rec) => "wal",
-            };
-
-            println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
-        }
+                   println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
+               }
+           }
+        */

        Ok(())
    }
 }

-/// A result of an inmemory layer data being written to disk.
-pub struct LayersOnDisk {
-    pub delta_layers: Vec<DeltaLayer>,
-    pub image_layers: Vec<ImageLayer>,
-}
-
-impl OpenLayer {
+impl InMemoryLayer {
    /// Return the oldest page version that's stored in this layer
-    pub fn get_oldest_pending_lsn(&self) -> Lsn {
-        self.oldest_pending_lsn
+    pub fn get_oldest_lsn(&self) -> Lsn {
+        self.oldest_lsn
    }

    ///
@@ -342,261 +248,96 @@ impl OpenLayer {
        conf: &'static PageServerConf,
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
-        seg: SegmentTag,
        start_lsn: Lsn,
-        oldest_pending_lsn: Lsn,
-    ) -> Result<OpenLayer> {
+        oldest_lsn: Lsn,
+    ) -> Result<InMemoryLayer> {
        trace!(
-            "initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
-            seg,
+            "initializing new empty InMemoryLayer for writing on timeline {} at {}",
            timelineid,
            start_lsn
        );

-        // The segment is initially empty, so initialize 'segsizes' with 0.
-        let mut segsizes = VecMap::default();
-        if seg.rel.is_blocky() {
-            segsizes.append(start_lsn, 0).unwrap();
-        }
-
        let file = EphemeralFile::create(conf, tenantid, timelineid)?;

-        Ok(OpenLayer {
+        Ok(InMemoryLayer {
            conf,
            timelineid,
            tenantid,
-            seg,
            start_lsn,
-            oldest_pending_lsn,
-            incremental: false,
-            inner: RwLock::new(OpenLayerInner {
+            oldest_lsn,
+            inner: RwLock::new(InMemoryLayerInner {
                end_lsn: None,
-                dropped: false,
-                page_versions: PageVersions::new(file),
-                segsizes,
+                index: HashMap::new(),
+                file,
+                end_offset: 0,
            }),
        })
    }

    // Write operations

-    /// Remember new page version, as a WAL record over previous version
-    pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> Result<u32> {
-        self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
-    }
-
-    /// Remember new page version, as a full page image
-    pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<u32> {
-        self.put_page_version(blknum, lsn, PageVersion::Page(img))
-    }
-
    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<u32> {
-        assert!(self.seg.blknum_in_seg(blknum));
-
-        trace!(
-            "put_page_version blk {} of {} at {}/{}",
-            blknum,
-            self.seg.rel,
-            self.timelineid,
-            lsn
-        );
+    pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
+        trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
        let mut inner = self.inner.write().unwrap();

        inner.assert_writeable();

-        let old = inner.page_versions.append_or_update_last(blknum, lsn, pv)?;
+        let off = inner.end_offset;
+        let len = utils::write_blob(&mut inner.file, &Value::ser(&val)?)?;
+        inner.end_offset += len;

+        let vec_map = inner.index.entry(key).or_default();
+        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
-            warn!(
-                "Page version of rel {} blk {} at {} already exists",
-                self.seg.rel, blknum, lsn
-            );
+            warn!("Key {} at {} already exists", key, lsn);
        }

-        // Also update the relation size, if this extended the relation.
-        if self.seg.rel.is_blocky() {
-            let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1;
-
-            // use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock,
-            // which we've just acquired above
-            let oldsize = inner.get_seg_size(lsn);
-            if newsize > oldsize {
-                trace!(
-                    "enlarging segment {} from {} to {} blocks at {}",
-                    self.seg,
-                    oldsize,
-                    newsize,
-                    lsn
-                );
-
-                // If we are extending the relation by more than one page, initialize the "gap"
-                // with zeros
-                //
-                // XXX: What if the caller initializes the gap with subsequent call with same LSN?
-                // I don't think that can happen currently, but that is highly dependent on how
-                // PostgreSQL writes its WAL records and there's no guarantee of it. If it does
-                // happen, we would hit the "page version already exists" warning above on the
-                // subsequent call to initialize the gap page.
-                let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize;
-                for gapblknum in gapstart..blknum {
-                    let zeropv = PageVersion::Page(ZERO_PAGE.clone());
-                    trace!(
-                        "filling gap blk {} with zeros for write of {}",
-                        gapblknum,
-                        blknum
-                    );
-                    let old = inner
-                        .page_versions
-                        .append_or_update_last(gapblknum, lsn, zeropv)?;
-                    // We already had an entry for this LSN. That's odd..
-
-                    if old.is_some() {
-                        warn!(
-                            "Page version of rel {} blk {} at {} already exists",
-                            self.seg.rel, blknum, lsn
-                        );
-                    }
-                }
-
-                inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
-                return Ok(newsize - oldsize);
-            }
-        }
-
-        Ok(0)
+        Ok(())
    }

-    /// Remember that the relation was truncated at given LSN
-    pub fn put_truncation(&self, lsn: Lsn, segsize: u32) {
-        assert!(
-            self.seg.rel.is_blocky(),
-            "put_truncation() called on a non-blocky rel"
-        );
+    pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
+        // TODO: Currently, we just leak the storage for any deleted keys

-        let mut inner = self.inner.write().unwrap();
-        inner.assert_writeable();
-
-        // check that this we truncate to a smaller size than segment was before the truncation
-        let oldsize = inner.get_seg_size(lsn);
-        assert!(segsize < oldsize);
-
-        let (old, _delta_size) = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
-
-        if old.is_some() {
-            // We already had an entry for this LSN. That's odd..
-            warn!("Inserting truncation, but had an entry for the LSN already");
-        }
-    }
-
-    /// Remember that the segment was dropped at given LSN
-    pub fn drop_segment(&self, lsn: Lsn) {
-        let mut inner = self.inner.write().unwrap();
-
-        assert!(inner.end_lsn.is_none());
-        assert!(!inner.dropped);
-        inner.dropped = true;
-        assert!(self.start_lsn < lsn);
-        inner.end_lsn = Some(lsn);
-
-        trace!("dropped segment {} at {}", self.seg, lsn);
-    }
-
-    ///
-    /// Initialize a new OpenLayer for, by copying the state at the given
-    /// point in time from given existing layer.
-    ///
-    pub fn create_successor_layer(
-        conf: &'static PageServerConf,
-        src: Arc<dyn Layer>,
-        timelineid: ZTimelineId,
-        tenantid: ZTenantId,
-        start_lsn: Lsn,
-        oldest_pending_lsn: Lsn,
-    ) -> Result<OpenLayer> {
-        let seg = src.get_seg_tag();
-
-        assert!(oldest_pending_lsn.is_aligned());
-        assert!(oldest_pending_lsn >= start_lsn);
-
-        trace!(
-            "initializing new OpenLayer for writing {} on timeline {} at {}",
-            seg,
-            timelineid,
-            start_lsn,
-        );
-
-        // Copy the segment size at the start LSN from the predecessor layer.
-        let mut segsizes = VecMap::default();
-        if seg.rel.is_blocky() {
-            let size = src.get_seg_size(start_lsn)?;
-            segsizes.append(start_lsn, size).unwrap();
-        }
-
-        let file = EphemeralFile::create(conf, tenantid, timelineid)?;
-
-        Ok(OpenLayer {
-            conf,
-            timelineid,
-            tenantid,
-            seg,
-            start_lsn,
-            oldest_pending_lsn,
-            incremental: true,
-            inner: RwLock::new(OpenLayerInner {
-                end_lsn: None,
-                dropped: false,
-                page_versions: PageVersions::new(file),
-                segsizes,
-            }),
-        })
-    }
-
-    pub fn is_writeable(&self) -> bool {
-        let inner = self.inner.read().unwrap();
-        inner.end_lsn.is_none()
+        Ok(())
    }

    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
-    /// `end_lsn` is inclusive
+    /// `end_lsn` is exclusive
    pub fn freeze(&self, end_lsn: Lsn) {
        let mut inner = self.inner.write().unwrap();

-        if inner.end_lsn.is_some() {
-            assert!(inner.dropped);
-        } else {
-            assert!(!inner.dropped);
-            assert!(self.start_lsn < end_lsn + 1);
-            inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
+        assert!(self.start_lsn < end_lsn);
+        inner.end_lsn = Some(end_lsn);

-            if let Some((lsn, _)) = inner.segsizes.as_slice().last() {
-                assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
-            }
+        // FIXME
+        /*
+                for perseg in inner.segs.values() {
+                    if let Some((lsn, _)) = perseg.seg_sizes.as_slice().last() {
+                        assert!(lsn < &end_lsn, "{:?} {:?}", lsn, end_lsn);
+                    }

-            for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
-                assert!(lsn <= end_lsn);
-            }
-        }
+                    for (_blk, vec_map) in perseg.page_versions.iter() {
+                        for (lsn, _pos) in vec_map.as_slice() {
+                            assert!(*lsn < end_lsn);
+                        }
+                    }
+                }
+        */
    }

-    /// Write the this frozen in-memory layer to disk.
+    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns new layers that replace this one.
-    /// If not dropped, returns a new image layer containing the page versions
+    /// If not dropped and reconstruct_pages is true, returns a new image layer containing the page versions
    /// at the `end_lsn`. Can also return a DeltaLayer that includes all the
    /// WAL records between start and end LSN. (The delta layer is not needed
    /// when a new relish is created with a single LSN, so that the start and
    /// end LSN are the same.)
-    pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<LayersOnDisk> {
-        trace!(
-            "write_to_disk {} get_end_lsn is {}",
-            self.filename().display(),
-            self.get_end_lsn()
-        );
-
+    pub fn write_to_disk(&self) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -607,84 +348,31 @@ impl OpenLayer {
        // would have to wait until we release it. That race condition is very
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().unwrap();
-        let end_lsn_exclusive = inner.end_lsn.unwrap();

-        if inner.dropped {
-            let delta_layer = DeltaLayer::create(
-                self.conf,
-                self.timelineid,
-                self.tenantid,
-                self.seg,
-                self.start_lsn,
-                end_lsn_exclusive,
-                true,
-                &inner.page_versions,
-                None,
-                inner.segsizes.clone(),
-            )?;
-            trace!(
-                "freeze: created delta layer for dropped segment {} {}-{}",
-                self.seg,
-                self.start_lsn,
-                end_lsn_exclusive
-            );
-            return Ok(LayersOnDisk {
-                delta_layers: vec![delta_layer],
-                image_layers: Vec::new(),
-            });
+        let mut delta_layer_writer = DeltaLayerWriter::new(
+            self.conf,
+            self.timelineid,
+            self.tenantid,
+            Key::MIN,
+            self.start_lsn..inner.end_lsn.unwrap(),
+        )?;
+
+        let mut do_steps = || -> Result<()> {
+            for (key, vec_map) in inner.index.iter() {
+                // Write all page versions
+                for (lsn, pos) in vec_map.as_slice() {
+                    let val = Value::des(&utils::read_blob(&inner.file, *pos)?)?;
+                    delta_layer_writer.put_value(*key, *lsn, val)?;
+                }
+            }
+            Ok(())
+        };
+        if let Err(err) = do_steps() {
+            delta_layer_writer.abort();
+            return Err(err);
        }

-        // Since `end_lsn` is inclusive, subtract 1.
-        // We want to make an ImageLayer for the last included LSN,
-        // so the DeltaLayer should exclude that LSN.
-        let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
-
-        let mut delta_layers = Vec::new();
-
-        if self.start_lsn != end_lsn_inclusive {
-            let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive);
-            // Write the page versions before the cutoff to disk.
-            let delta_layer = DeltaLayer::create(
-                self.conf,
-                self.timelineid,
-                self.tenantid,
-                self.seg,
-                self.start_lsn,
-                end_lsn_inclusive,
-                false,
-                &inner.page_versions,
-                Some(end_lsn_inclusive),
-                segsizes,
-            )?;
-            delta_layers.push(delta_layer);
-            trace!(
-                "freeze: created delta layer {} {}-{}",
-                self.seg,
-                self.start_lsn,
-                end_lsn_inclusive
-            );
-        } else {
-            assert!(inner
-                .page_versions
-                .ordered_page_version_iter(None)
-                .next()
-                .is_none());
-        }
-
-        drop(inner);
-
-        // Write a new base image layer at the cutoff point
-        let image_layer =
-            ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
-        trace!(
-            "freeze: created image layer {} at {}",
-            self.seg,
-            end_lsn_inclusive
-        );
-
-        Ok(LayersOnDisk {
-            delta_layers,
-            image_layers: vec![image_layer],
-        })
+        let delta_layer = delta_layer_writer.finish(Key::MAX)?;
+        Ok(delta_layer)
    }
 }
--- a/pageserver/src/layered_repository/interval_tree.rs
+++ b/pageserver/src/layered_repository/interval_tree.rs
@@ -1,468 +0,0 @@
-///
-/// IntervalTree is data structure for holding intervals. It is generic
-/// to make unit testing possible, but the only real user of it is the layer map,
-///
-/// It's inspired by the "segment tree" or a "statistic tree" as described in
-/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold
-/// the points instead of a binary tree. This is called an "interval tree" instead
-/// of "segment tree" because the term "segment" is already using Zenith to mean
-/// something else. To add to the confusion, there is another data structure known
-/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree),
-/// for storing intervals, but this isn't that.
-///
-/// The basic idea is to have a B-tree of "interesting Points". At each Point,
-/// there is a list of intervals that contain the point. The Points are formed
-/// from the start bounds of each interval; there is a Point for each distinct
-/// start bound.
-///
-/// Operations:
-///
-/// To find intervals that contain a given point, you search the b-tree to find
-/// the nearest Point <= search key. Then you just return the list of intervals.
-///
-/// To insert an interval, find the Point with start key equal to the inserted item.
-/// If the Point doesn't exist yet, create it, by copying all the items from the
-/// previous Point that cover the new Point. Then walk right, inserting the new
-/// interval to all the Points that are contained by the new interval (including the
-/// newly created Point).
-///
-/// To remove an interval, you scan the tree for all the Points that are contained by
-/// the removed interval, and remove it from the list in each Point.
-///
-/// Requirements and assumptions:
-///
-/// - Can store overlapping items
-/// - But there are not many overlapping items
-/// - The interval bounds don't change after it is added to the tree
-/// - Intervals are uniquely identified by pointer equality. You must not be insert the
-///   same interval object twice, and `remove` uses pointer equality to remove the right
-///   interval. It is OK to have two intervals with the same bounds, however.
-///
-use std::collections::BTreeMap;
-use std::fmt::Debug;
-use std::ops::Range;
-use std::sync::Arc;
-
-pub struct IntervalTree<I: ?Sized>
-where
-    I: IntervalItem,
-{
-    points: BTreeMap<I::Key, Point<I>>,
-}
-
-struct Point<I: ?Sized> {
-    /// All intervals that contain this point, in no particular order.
-    ///
-    /// We assume that there aren't a lot of overlappingg intervals, so that this vector
-    /// never grows very large. If that assumption doesn't hold, we could keep this ordered
-    /// by the end bound, to speed up `search`. But as long as there are only a few elements,
-    /// a linear search is OK.
-    elements: Vec<Arc<I>>,
-}
-
-/// Abstraction for an interval that can be stored in the tree
-///
-/// The start bound is inclusive and the end bound is exclusive. End must be greater
-/// than start.
-pub trait IntervalItem {
-    type Key: Ord + Copy + Debug + Sized;
-
-    fn start_key(&self) -> Self::Key;
-    fn end_key(&self) -> Self::Key;
-
-    fn bounds(&self) -> Range<Self::Key> {
-        self.start_key()..self.end_key()
-    }
-}
-
-impl<I: ?Sized> IntervalTree<I>
-where
-    I: IntervalItem,
-{
-    /// Return an element that contains 'key', or precedes it.
-    ///
-    /// If there are multiple candidates, returns the one with the highest 'end' key.
-    pub fn search(&self, key: I::Key) -> Option<Arc<I>> {
-        // Find the greatest point that precedes or is equal to the search key. If there is
-        // none, returns None.
-        let (_, p) = self.points.range(..=key).next_back()?;
-
-        // Find the element with the highest end key at this point
-        let highest_item = p
-            .elements
-            .iter()
-            .reduce(|a, b| {
-                // starting with Rust 1.53, could use `std::cmp::min_by_key` here
-                if a.end_key() > b.end_key() {
-                    a
-                } else {
-                    b
-                }
-            })
-            .unwrap();
-        Some(Arc::clone(highest_item))
-    }
-
-    /// Iterate over all items with start bound >= 'key'
-    pub fn iter_newer(&self, key: I::Key) -> IntervalIter<I> {
-        IntervalIter {
-            point_iter: self.points.range(key..),
-            elem_iter: None,
-        }
-    }
-
-    /// Iterate over all items
-    pub fn iter(&self) -> IntervalIter<I> {
-        IntervalIter {
-            point_iter: self.points.range(..),
-            elem_iter: None,
-        }
-    }
-
-    pub fn insert(&mut self, item: Arc<I>) {
-        let start_key = item.start_key();
-        let end_key = item.end_key();
-        assert!(start_key < end_key);
-        let bounds = start_key..end_key;
-
-        // Find the starting point and walk forward from there
-        let mut found_start_point = false;
-        let iter = self.points.range_mut(bounds);
-        for (point_key, point) in iter {
-            if *point_key == start_key {
-                found_start_point = true;
-                // It is an error to insert the same item to the tree twice.
-                assert!(
-                    !point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
-                    "interval is already in the tree"
-                );
-            }
-            point.elements.push(Arc::clone(&item));
-        }
-        if !found_start_point {
-            // Create a new Point for the starting point
-
-            // Look at the previous point, and copy over elements that overlap with this
-            // new point
-            let mut new_elements: Vec<Arc<I>> = Vec::new();
-            if let Some((_, prev_point)) = self.points.range(..start_key).next_back() {
-                let overlapping_prev_elements = prev_point
-                    .elements
-                    .iter()
-                    .filter(|x| x.bounds().contains(&start_key))
-                    .cloned();
-
-                new_elements.extend(overlapping_prev_elements);
-            }
-            new_elements.push(item);
-
-            let new_point = Point {
-                elements: new_elements,
-            };
-            self.points.insert(start_key, new_point);
-        }
-    }
-
-    pub fn remove(&mut self, item: &Arc<I>) {
-        // range search points
-        let start_key = item.start_key();
-        let end_key = item.end_key();
-        let bounds = start_key..end_key;
-
-        let mut points_to_remove: Vec<I::Key> = Vec::new();
-        let mut found_start_point = false;
-        for (point_key, point) in self.points.range_mut(bounds) {
-            if *point_key == start_key {
-                found_start_point = true;
-            }
-            let len_before = point.elements.len();
-            point.elements.retain(|other| !Arc::ptr_eq(other, item));
-            let len_after = point.elements.len();
-            assert_eq!(len_after + 1, len_before);
-            if len_after == 0 {
-                points_to_remove.push(*point_key);
-            }
-        }
-        assert!(found_start_point);
-
-        for k in points_to_remove {
-            self.points.remove(&k).unwrap();
-        }
-    }
-}
-
-pub struct IntervalIter<'a, I: ?Sized>
-where
-    I: IntervalItem,
-{
-    point_iter: std::collections::btree_map::Range<'a, I::Key, Point<I>>,
-    elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc<I>>)>,
-}
-
-impl<'a, I> Iterator for IntervalIter<'a, I>
-where
-    I: IntervalItem + ?Sized,
-{
-    type Item = Arc<I>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        // Iterate over all elements in all the points in 'point_iter'. To avoid
-        // returning the same element twice, we only return each element at its
-        // starting point.
-        loop {
-            // Return next remaining element from the current point
-            if let Some((point_key, elem_iter)) = &mut self.elem_iter {
-                for elem in elem_iter {
-                    if elem.start_key() == *point_key {
-                        return Some(Arc::clone(elem));
-                    }
-                }
-            }
-            // No more elements at this point. Move to next point.
-            if let Some((point_key, point)) = self.point_iter.next() {
-                self.elem_iter = Some((*point_key, point.elements.iter()));
-                continue;
-            } else {
-                // No more points, all done
-                return None;
-            }
-        }
-    }
-}
-
-impl<I: ?Sized> Default for IntervalTree<I>
-where
-    I: IntervalItem,
-{
-    fn default() -> Self {
-        IntervalTree {
-            points: BTreeMap::new(),
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::fmt;
-
-    #[derive(Debug)]
-    struct MockItem {
-        start_key: u32,
-        end_key: u32,
-        val: String,
-    }
-    impl IntervalItem for MockItem {
-        type Key = u32;
-
-        fn start_key(&self) -> u32 {
-            self.start_key
-        }
-        fn end_key(&self) -> u32 {
-            self.end_key
-        }
-    }
-    impl MockItem {
-        fn new(start_key: u32, end_key: u32) -> Self {
-            MockItem {
-                start_key,
-                end_key,
-                val: format!("{}-{}", start_key, end_key),
-            }
-        }
-        fn new_str(start_key: u32, end_key: u32, val: &str) -> Self {
-            MockItem {
-                start_key,
-                end_key,
-                val: format!("{}-{}: {}", start_key, end_key, val),
-            }
-        }
-    }
-    impl fmt::Display for MockItem {
-        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-            write!(f, "{}", self.val)
-        }
-    }
-    #[rustfmt::skip]
-    fn assert_search(
-        tree: &IntervalTree<MockItem>,
-        key: u32,
-        expected: &[&str],
-    ) -> Option<Arc<MockItem>> {
-        if let Some(v) = tree.search(key) {
-            let vstr = v.to_string();
-
-            assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
-            assert!(
-                expected.contains(&vstr.as_str()),
-                "search with {} returned {}, expected one of: {:?}",
-                key, v, expected,
-            );
-
-            Some(v)
-        } else {
-            assert!(
-                expected.is_empty(),
-                "search with {} returned None, expected one of {:?}",
-                key, expected
-            );
-            None
-        }
-    }
-
-    fn assert_contents(tree: &IntervalTree<MockItem>, expected: &[&str]) {
-        let mut contents: Vec<String> = tree.iter().map(|e| e.to_string()).collect();
-        contents.sort();
-        assert_eq!(contents, expected);
-    }
-
-    fn dump_tree(tree: &IntervalTree<MockItem>) {
-        for (point_key, point) in tree.points.iter() {
-            print!("{}:", point_key);
-            for e in point.elements.iter() {
-                print!(" {}", e);
-            }
-            println!();
-        }
-    }
-
-    #[test]
-    fn test_interval_tree_simple() {
-        let mut tree: IntervalTree<MockItem> = IntervalTree::default();
-
-        // Simple, non-overlapping ranges.
-        tree.insert(Arc::new(MockItem::new(10, 11)));
-        tree.insert(Arc::new(MockItem::new(11, 12)));
-        tree.insert(Arc::new(MockItem::new(12, 13)));
-        tree.insert(Arc::new(MockItem::new(18, 19)));
-        tree.insert(Arc::new(MockItem::new(17, 18)));
-        tree.insert(Arc::new(MockItem::new(15, 16)));
-
-        assert_search(&tree, 9, &[]);
-        assert_search(&tree, 10, &["10-11"]);
-        assert_search(&tree, 11, &["11-12"]);
-        assert_search(&tree, 12, &["12-13"]);
-        assert_search(&tree, 13, &["12-13"]);
-        assert_search(&tree, 14, &["12-13"]);
-        assert_search(&tree, 15, &["15-16"]);
-        assert_search(&tree, 16, &["15-16"]);
-        assert_search(&tree, 17, &["17-18"]);
-        assert_search(&tree, 18, &["18-19"]);
-        assert_search(&tree, 19, &["18-19"]);
-        assert_search(&tree, 20, &["18-19"]);
-
-        // remove a few entries and search around them again
-        tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry
-        tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle
-        tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry
-        assert_search(&tree, 9, &[]);
-        assert_search(&tree, 10, &[]);
-        assert_search(&tree, 11, &["11-12"]);
-        assert_search(&tree, 12, &["11-12"]);
-        assert_search(&tree, 14, &["11-12"]);
-        assert_search(&tree, 15, &["15-16"]);
-        assert_search(&tree, 17, &["17-18"]);
-        assert_search(&tree, 18, &["17-18"]);
-    }
-
-    #[test]
-    fn test_interval_tree_overlap() {
-        let mut tree: IntervalTree<MockItem> = IntervalTree::default();
-
-        // Overlapping items
-        tree.insert(Arc::new(MockItem::new(22, 24)));
-        tree.insert(Arc::new(MockItem::new(23, 25)));
-        let x24_26 = Arc::new(MockItem::new(24, 26));
-        tree.insert(Arc::clone(&x24_26));
-        let x26_28 = Arc::new(MockItem::new(26, 28));
-        tree.insert(Arc::clone(&x26_28));
-        tree.insert(Arc::new(MockItem::new(25, 27)));
-
-        assert_search(&tree, 22, &["22-24"]);
-        assert_search(&tree, 23, &["22-24", "23-25"]);
-        assert_search(&tree, 24, &["23-25", "24-26"]);
-        assert_search(&tree, 25, &["24-26", "25-27"]);
-        assert_search(&tree, 26, &["25-27", "26-28"]);
-        assert_search(&tree, 27, &["26-28"]);
-        assert_search(&tree, 28, &["26-28"]);
-        assert_search(&tree, 29, &["26-28"]);
-
-        tree.remove(&x24_26);
-        tree.remove(&x26_28);
-        assert_search(&tree, 23, &["22-24", "23-25"]);
-        assert_search(&tree, 24, &["23-25"]);
-        assert_search(&tree, 25, &["25-27"]);
-        assert_search(&tree, 26, &["25-27"]);
-        assert_search(&tree, 27, &["25-27"]);
-        assert_search(&tree, 28, &["25-27"]);
-        assert_search(&tree, 29, &["25-27"]);
-    }
-
-    #[test]
-    fn test_interval_tree_nested() {
-        let mut tree: IntervalTree<MockItem> = IntervalTree::default();
-
-        // Items containing other items
-        tree.insert(Arc::new(MockItem::new(31, 39)));
-        tree.insert(Arc::new(MockItem::new(32, 34)));
-        tree.insert(Arc::new(MockItem::new(33, 35)));
-        tree.insert(Arc::new(MockItem::new(30, 40)));
-
-        assert_search(&tree, 30, &["30-40"]);
-        assert_search(&tree, 31, &["30-40", "31-39"]);
-        assert_search(&tree, 32, &["30-40", "32-34", "31-39"]);
-        assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]);
-        assert_search(&tree, 34, &["30-40", "33-35", "31-39"]);
-        assert_search(&tree, 35, &["30-40", "31-39"]);
-        assert_search(&tree, 36, &["30-40", "31-39"]);
-        assert_search(&tree, 37, &["30-40", "31-39"]);
-        assert_search(&tree, 38, &["30-40", "31-39"]);
-        assert_search(&tree, 39, &["30-40"]);
-        assert_search(&tree, 40, &["30-40"]);
-        assert_search(&tree, 41, &["30-40"]);
-    }
-
-    #[test]
-    fn test_interval_tree_duplicates() {
-        let mut tree: IntervalTree<MockItem> = IntervalTree::default();
-
-        // Duplicate keys
-        let item_a = Arc::new(MockItem::new_str(55, 56, "a"));
-        tree.insert(Arc::clone(&item_a));
-        let item_b = Arc::new(MockItem::new_str(55, 56, "b"));
-        tree.insert(Arc::clone(&item_b));
-        let item_c = Arc::new(MockItem::new_str(55, 56, "c"));
-        tree.insert(Arc::clone(&item_c));
-        let item_d = Arc::new(MockItem::new_str(54, 56, "d"));
-        tree.insert(Arc::clone(&item_d));
-        let item_e = Arc::new(MockItem::new_str(55, 57, "e"));
-        tree.insert(Arc::clone(&item_e));
-
-        dump_tree(&tree);
-
-        assert_search(
-            &tree,
-            55,
-            &["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"],
-        );
-        tree.remove(&item_b);
-        dump_tree(&tree);
-
-        assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]);
-
-        tree.remove(&item_d);
-        dump_tree(&tree);
-        assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]);
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_interval_tree_insert_twice() {
-        let mut tree: IntervalTree<MockItem> = IntervalTree::default();
-
-        // Inserting the same item twice is not cool
-        let item = Arc::new(MockItem::new(1, 2));
-        tree.insert(Arc::clone(&item));
-        tree.insert(Arc::clone(&item)); // fails assertion
-    }
-}
--- a/pageserver/src/layered_repository/layer_map.rs
+++ b/pageserver/src/layered_repository/layer_map.rs
@@ -3,30 +3,26 @@
 //!
 //! When the timeline is first accessed, the server lists of all layer files
 //! in the timelines/<timelineid> directory, and populates this map with
-//! ImageLayer and DeltaLayer structs corresponding to each file. When new WAL
-//! is received, we create InMemoryLayers to hold the incoming records. Now and
-//! then, in the checkpoint() function, the in-memory layers are frozen, forming
-//! new image and delta layers and corresponding files are written to disk.
+//! ImageLayer and DeltaLayer structs corresponding to each file. When the first
+//! new WAL record is received, we create an InMemoryLayer to hold the incoming
+//! records. Now and then, in the checkpoint() function, the in-memory layer is
+//! frozen, and it is split up into new image and delta layers and the
+//! corresponding files are written to disk.
 //!

-use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
-use crate::layered_repository::storage_layer::{Layer, SegmentTag};
-use crate::layered_repository::OpenLayer;
-use crate::relish::*;
+use crate::layered_repository::storage_layer::Layer;
+use crate::layered_repository::storage_layer::{range_eq, range_overlaps};
+use crate::layered_repository::InMemoryLayer;
+use crate::repository::Key;
 use anyhow::Result;
 use lazy_static::lazy_static;
-use std::cmp::Ordering;
-use std::collections::{BinaryHeap, HashMap};
+use std::ops::Range;
 use std::sync::Arc;
+use tracing::*;
 use zenith_metrics::{register_int_gauge, IntGauge};
 use zenith_utils::lsn::Lsn;

-use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
-
 lazy_static! {
-    static ref NUM_INMEMORY_LAYERS: IntGauge =
-        register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
-            .expect("failed to define a metric");
    static ref NUM_ONDISK_LAYERS: IntGauge =
        register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
            .expect("failed to define a metric");
@@ -37,98 +33,135 @@ lazy_static! {
 ///
 #[derive(Default)]
 pub struct LayerMap {
-    /// All the layers keyed by segment tag
-    segs: HashMap<SegmentTag, SegEntry>,
+    //
+    // 'open_layer' holds the current InMemoryLayer that is accepting new
+    // records. If it is None, 'next_open_layer_at' will be set instead, indicating
+    // the start LSN of the next InMemoryLayer that is to be created.
+    //
+    pub open_layer: Option<Arc<InMemoryLayer>>,
+    pub next_open_layer_at: Option<Lsn>,

-    /// All in-memory layers, ordered by 'oldest_pending_lsn' and generation
-    /// of each layer. This allows easy access to the in-memory layer that
-    /// contains the oldest WAL record.
-    open_layers: BinaryHeap<OpenLayerEntry>,
+    ///
+    /// The frozen layer, if any, contains WAL older than the current 'open_layer'
+    /// or 'next_open_layer_at', but newer than any historic layer. The frozen
+    /// layer is used during checkpointing, when an InMemoryLayer is being written out
+    /// to disk.
+    ///
+    pub frozen_layer: Option<Arc<InMemoryLayer>>,

-    /// Generation number, used to distinguish newly inserted entries in the
-    /// binary heap from older entries during checkpoint.
-    current_generation: u64,
+    /// All the historic layers are kept here
+
+    /// TODO: This is a placeholder implementation of a data structure
+    /// to hold information about all the layer files on disk and in
+    /// S3. Currently, it's just a vector and all operations perform a
+    /// linear scan over it.  That obviously becomes slow as the
+    /// number of layers grows. I'm imagining that an R-tree or some
+    /// other 2D data structure would be the long-term solution here.
+    historic_layers: Vec<Arc<dyn Layer>>,
+}
+
+pub struct SearchResult {
+    pub layer: Arc<dyn Layer>,
+    pub lsn_floor: Lsn,
 }

 impl LayerMap {
-    ///
-    /// Look up a layer using the given segment tag and LSN. This differs from a
-    /// plain key-value lookup in that if there is any layer that covers the
-    /// given LSN, or precedes the given LSN, it is returned. In other words,
-    /// you don't need to know the exact start LSN of the layer.
-    ///
-    pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
-        let segentry = self.segs.get(tag)?;
+    pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult>> {
+        // linear search
+        // Find the latest image layer that covers the given key
+        let mut latest_img: Option<Arc<dyn Layer>> = None;
+        let mut latest_img_lsn: Option<Lsn> = None;
+        for l in self.historic_layers.iter() {
+            if l.is_incremental() {
+                continue;
+            }
+            if !l.get_key_range().contains(&key) {
+                continue;
+            }
+            let img_lsn = l.get_lsn_range().start;

-        segentry.get(lsn)
-    }
+            if img_lsn >= end_lsn {
+                // too new
+                continue;
+            }
+            if Lsn(img_lsn.0 + 1) == end_lsn {
+                // found exact match
+                return Ok(Some(SearchResult {
+                    layer: Arc::clone(l),
+                    lsn_floor: img_lsn,
+                }));
+            }
+            if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
+                latest_img = Some(Arc::clone(l));
+                latest_img_lsn = Some(img_lsn);
+            }
+        }

-    ///
-    /// Get the open layer for given segment for writing. Or None if no open
-    /// layer exists.
-    ///
-    pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<OpenLayer>> {
-        let segentry = self.segs.get(tag)?;
-
-        segentry
-            .open_layer_id
-            .and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
-    }
-
-    ///
-    /// Insert an open in-memory layer
-    ///
-    pub fn insert_open(&mut self, layer: Arc<OpenLayer>) {
-        let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
-
-        let layer_id = segentry.update_open(Arc::clone(&layer));
-
-        let oldest_pending_lsn = layer.get_oldest_pending_lsn();
-
-        // After a crash and restart, 'oldest_pending_lsn' of the oldest in-memory
-        // layer becomes the WAL streaming starting point, so it better not point
-        // in the middle of a WAL record.
-        assert!(oldest_pending_lsn.is_aligned());
-
-        // Also add it to the binary heap
-        let open_layer_entry = OpenLayerEntry {
-            oldest_pending_lsn: layer.get_oldest_pending_lsn(),
-            layer_id,
-            generation: self.current_generation,
-        };
-        self.open_layers.push(open_layer_entry);
-
-        NUM_INMEMORY_LAYERS.inc();
-    }
-
-    /// Remove an open in-memory layer
-    pub fn remove_open(&mut self, layer_id: LayerId) {
-        // Note: we don't try to remove the entry from the binary heap.
-        // It will be removed lazily by peek_oldest_open() when it's made it to
-        // the top of the heap.
-
-        let layer_opt = {
-            let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
-            let layer_opt = global_map.get(&layer_id);
-            global_map.remove(&layer_id);
-            // TODO it's bad that a ref can still exist after being evicted from cache
-            layer_opt
-        };
-
-        if let Some(layer) = layer_opt {
-            let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
-
-            if segentry.open_layer_id == Some(layer_id) {
-                // Also remove it from the SegEntry of this segment
-                segentry.open_layer_id = None;
-            } else {
-                // We could have already updated segentry.open for
-                // dropped (non-writeable) layer. This is fine.
-                assert!(!layer.is_writeable());
-                assert!(layer.is_dropped());
+        // Search the delta layers
+        let mut latest_delta: Option<Arc<dyn Layer>> = None;
+        for l in self.historic_layers.iter() {
+            if !l.is_incremental() {
+                continue;
+            }
+            if !l.get_key_range().contains(&key) {
+                continue;
            }

-            NUM_INMEMORY_LAYERS.dec();
+            if l.get_lsn_range().start >= end_lsn {
+                // too new
+                continue;
+            }
+
+            if l.get_lsn_range().end >= end_lsn {
+                // this layer contains the requested point in the key/lsn space.
+                // No need to search any further
+                trace!(
+                    "found layer {} for request on {} at {}",
+                    l.filename().display(),
+                    key,
+                    end_lsn
+                );
+                latest_delta.replace(Arc::clone(l));
+                break;
+            }
+            // this layer's end LSN is smaller than the requested point. If there's
+            // nothing newer, this is what we need to return. Remember this.
+            if let Some(ref old_candidate) = latest_delta {
+                if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
+                    latest_delta.replace(Arc::clone(l));
+                }
+            } else {
+                latest_delta.replace(Arc::clone(l));
+            }
+        }
+        if let Some(l) = latest_delta {
+            trace!(
+                "found (old) layer {} for request on {} at {}",
+                l.filename().display(),
+                key,
+                end_lsn
+            );
+            let lsn_floor = std::cmp::max(
+                Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
+                l.get_lsn_range().start,
+            );
+            Ok(Some(SearchResult {
+                lsn_floor,
+                layer: l,
+            }))
+        } else if let Some(l) = latest_img {
+            trace!(
+                "found img layer and no deltas for request on {} at {}",
+                key,
+                end_lsn
+            );
+            Ok(Some(SearchResult {
+                lsn_floor: latest_img_lsn.unwrap(),
+                layer: l,
+            }))
+        } else {
+            trace!("no layer found for request on {} at {}", key, end_lsn);
+            Ok(None)
        }
    }

@@ -136,9 +169,7 @@ impl LayerMap {
    /// Insert an on-disk layer
    ///
    pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
-        let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
-        segentry.insert_historic(layer);
-
+        self.historic_layers.push(layer);
        NUM_ONDISK_LAYERS.inc();
    }

@@ -147,55 +178,62 @@ impl LayerMap {
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
+    #[allow(dead_code)]
    pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
-        let tag = layer.get_seg_tag();
+        let len_before = self.historic_layers.len();

-        if let Some(segentry) = self.segs.get_mut(&tag) {
-            segentry.historic.remove(&layer);
-        }
+        // FIXME: ptr_eq might fail to return true for 'dyn'
+        // references.  Clippy complains about this. In practice it
+        // seems to work (the assertion below would be triggered
+        // otherwise), but this ought to be fixed.
+        #[allow(clippy::vtable_address_comparisons)]
+        self.historic_layers
+            .retain(|other| !Arc::ptr_eq(other, &layer));
+
+        assert_eq!(self.historic_layers.len(), len_before - 1);
        NUM_ONDISK_LAYERS.dec();
    }

-    // List relations along with a flag that marks if they exist at the given lsn.
-    // spcnode 0 and dbnode 0 have special meanings and mean all tabespaces/databases.
-    // Pass Tag if we're only interested in some relations.
-    pub fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashMap<RelishTag, bool>> {
-        let mut rels: HashMap<RelishTag, bool> = HashMap::new();
-
-        for (seg, segentry) in self.segs.iter() {
-            match seg.rel {
-                RelishTag::Relation(reltag) => {
-                    if let Some(request_rel) = tag {
-                        if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
-                            && (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
-                        {
-                            if let Some(exists) = segentry.exists_at_lsn(lsn)? {
-                                rels.insert(seg.rel, exists);
-                            }
-                        }
-                    }
-                }
-                _ => {
-                    if tag == None {
-                        if let Some(exists) = segentry.exists_at_lsn(lsn)? {
-                            rels.insert(seg.rel, exists);
-                        }
-                    }
-                }
-            }
-        }
-        Ok(rels)
-    }
-
    /// Is there a newer image layer for given segment?
    ///
    /// This is used for garbage collection, to determine if an old layer can
    /// be deleted.
-    pub fn newer_image_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
-        if let Some(segentry) = self.segs.get(&seg) {
-            segentry.newer_image_layer_exists(lsn)
-        } else {
-            false
+    /// We ignore layers newer than disk_consistent_lsn because they will be removed at restart
+    /// We also only look at historic layers
+    //#[allow(dead_code)]
+    pub fn newer_image_layer_exists(
+        &self,
+        key_range: &Range<Key>,
+        lsn: Lsn,
+        disk_consistent_lsn: Lsn,
+    ) -> Result<bool> {
+        let mut range_remain = key_range.clone();
+
+        loop {
+            let mut made_progress = false;
+            for l in self.historic_layers.iter() {
+                if l.is_incremental() {
+                    continue;
+                }
+                let img_lsn = l.get_lsn_range().start;
+                if !l.is_incremental()
+                    && l.get_key_range().contains(&range_remain.start)
+                    && img_lsn > lsn
+                    && img_lsn < disk_consistent_lsn
+                {
+                    made_progress = true;
+                    let img_key_end = l.get_key_range().end;
+
+                    if img_key_end >= range_remain.end {
+                        return Ok(true);
+                    }
+                    range_remain.start = img_key_end;
+                }
+            }
+
+            if !made_progress {
+                return Ok(false);
+            }
        }
    }

@@ -205,279 +243,144 @@ impl LayerMap {
    /// used for garbage collection, to determine if some alive layer
    /// exists at the lsn. If so, we shouldn't delete a newer dropped layer
    /// to avoid incorrectly making it visible.
-    pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
-        Ok(if let Some(segentry) = self.segs.get(&seg) {
-            segentry.exists_at_lsn(lsn)?.unwrap_or(false)
-        } else {
-            false
-        })
+    /*
+        pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
+            Ok(if let Some(segentry) = self.historic_layers.get(&seg) {
+                segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false)
+            } else {
+                false
+            })
+        }
+    */
+
+    pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
+        self.historic_layers.iter()
    }

-    /// Return the oldest in-memory layer, along with its generation number.
-    pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<OpenLayer>, u64)> {
-        let global_map = GLOBAL_LAYER_MAP.read().unwrap();
+    fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
+        // Find the last image layer that covers the key
+        let mut candidate_lsn = Lsn(0);
+        let mut candidate = None;
+        for l in self.historic_layers.iter() {
+            if l.is_incremental() {
+                continue;
+            }

-        while let Some(oldest_entry) = self.open_layers.peek() {
-            if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
-                return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
-            } else {
-                self.open_layers.pop();
+            if !l.get_key_range().contains(&key) {
+                continue;
+            }
+
+            let this_lsn = l.get_lsn_range().start;
+            if this_lsn > lsn {
+                continue;
+            }
+            if this_lsn < candidate_lsn {
+                // our previous candidate was better
+                continue;
+            }
+            candidate_lsn = this_lsn;
+            candidate = Some(Arc::clone(l));
+        }
+
+        candidate
+    }
+
+    ///
+    /// Divide the whole given range of keys into sub-ranges based on the latest
+    /// image layer that covers each range. (This is used when creating new
+    /// image layers)
+    ///
+    // FIXME: clippy complains that the result type is very complex. She's probably
+    // right...
+    #[allow(clippy::type_complexity)]
+    pub fn image_coverage(
+        &self,
+        key_range: &Range<Key>,
+        lsn: Lsn,
+    ) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
+        let mut points: Vec<Key>;
+
+        points = vec![key_range.start];
+        for l in self.historic_layers.iter() {
+            if l.get_lsn_range().start > lsn {
+                continue;
+            }
+            let range = l.get_key_range();
+            if key_range.contains(&range.start) {
+                points.push(l.get_key_range().start);
+            }
+            if key_range.contains(&range.end) {
+                points.push(l.get_key_range().end);
            }
        }
-        None
-    }
+        points.push(key_range.end);

-    /// Increment the generation number used to stamp open in-memory layers. Layers
-    /// added with `insert_open` after this call will be associated with the new
-    /// generation. Returns the new generation number.
-    pub fn increment_generation(&mut self) -> u64 {
-        self.current_generation += 1;
-        self.current_generation
-    }
+        points.sort();
+        points.dedup();

-    pub fn iter_historic_layers(&self) -> HistoricLayerIter {
-        HistoricLayerIter {
-            seg_iter: self.segs.iter(),
-            iter: None,
+        // Ok, we now have a list of "interesting" points in the key space
+
+        // For each range between the points, find the latest image
+        let mut start = *points.first().unwrap();
+        let mut ranges = Vec::new();
+        for end in points[1..].iter() {
+            let img = self.find_latest_image(start, lsn);
+
+            ranges.push((start..*end, img));
+
+            start = *end;
        }
+        Ok(ranges)
+    }
+
+    pub fn count_deltas(&self, key_range: &Range<Key>, lsn_range: &Range<Lsn>) -> Result<usize> {
+        let mut result = 0;
+        for l in self.historic_layers.iter() {
+            if !l.is_incremental() {
+                continue;
+            }
+            if !range_overlaps(&l.get_lsn_range(), lsn_range) {
+                continue;
+            }
+            if !range_overlaps(&l.get_key_range(), key_range) {
+                continue;
+            }
+
+            // We ignore level0 delta layers, unless the whole keyspace fits
+            // into one partition.
+            if !range_eq(key_range, &(Key::MIN..Key::MAX))
+                && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX))
+            {
+                continue;
+            }
+
+            result += 1;
+        }
+        Ok(result)
+    }
+
+    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
+        let mut deltas = Vec::new();
+        for l in self.historic_layers.iter() {
+            if !l.is_incremental() {
+                continue;
+            }
+            if l.get_key_range() != (Key::MIN..Key::MAX) {
+                continue;
+            }
+            deltas.push(Arc::clone(l));
+        }
+        Ok(deltas)
    }

    /// debugging function to print out the contents of the layer map
    #[allow(unused)]
    pub fn dump(&self) -> Result<()> {
        println!("Begin dump LayerMap");
-        for (seg, segentry) in self.segs.iter() {
-            if let Some(open) = &segentry.open_layer_id {
-                if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
-                    layer.dump()?;
-                } else {
-                    println!("layer not found in global map");
-                }
-            }
-
-            for layer in segentry.historic.iter() {
-                layer.dump()?;
-            }
+        for layer in self.historic_layers.iter() {
+            layer.dump()?;
        }
        println!("End dump LayerMap");
        Ok(())
    }
 }
-
-impl IntervalItem for dyn Layer {
-    type Key = Lsn;
-
-    fn start_key(&self) -> Lsn {
-        self.get_start_lsn()
-    }
-    fn end_key(&self) -> Lsn {
-        self.get_end_lsn()
-    }
-}
-
-///
-/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers
-/// associated with the segment.
-///
-/// The last layer that is open for writes is always an OpenLayer,
-/// and is kept in a separate field, because there can be only one for
-/// each segment. The older layers, stored on disk, are kept in an
-/// IntervalTree.
-#[derive(Default)]
-struct SegEntry {
-    open_layer_id: Option<LayerId>,
-    historic: IntervalTree<dyn Layer>,
-}
-
-impl SegEntry {
-    /// Does the segment exist at given LSN?
-    /// Return None if object is not found in this SegEntry.
-    fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
-        if let Some(layer) = self.get(lsn) {
-            Ok(Some(layer.get_seg_exists(lsn)?))
-        } else {
-            Ok(None)
-        }
-    }
-
-    pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
-        if let Some(open_layer_id) = &self.open_layer_id {
-            let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
-            if open_layer.get_start_lsn() <= lsn {
-                return Some(open_layer);
-            }
-        }
-
-        self.historic.search(lsn)
-    }
-
-    pub fn newer_image_layer_exists(&self, lsn: Lsn) -> bool {
-        // We only check on-disk layers, because
-        // in-memory layers are not durable
-
-        self.historic
-            .iter_newer(lsn)
-            .any(|layer| !layer.is_incremental())
-    }
-
-    // Set new open layer for a SegEntry.
-    // It's ok to rewrite previous open layer,
-    // but only if it is not writeable anymore.
-    pub fn update_open(&mut self, layer: Arc<OpenLayer>) -> LayerId {
-        if let Some(prev_open_layer_id) = &self.open_layer_id {
-            if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
-            {
-                assert!(!prev_open_layer.is_writeable());
-            }
-        }
-        let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
-        self.open_layer_id = Some(open_layer_id);
-        open_layer_id
-    }
-
-    pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
-        self.historic.insert(layer);
-    }
-}
-
-/// Entry held in LayerMap::open_layers, with boilerplate comparison routines
-/// to implement a min-heap ordered by 'oldest_pending_lsn' and 'generation'
-///
-/// The generation number associated with each entry can be used to distinguish
-/// recently-added entries (i.e after last call to increment_generation()) from older
-/// entries with the same 'oldest_pending_lsn'.
-struct OpenLayerEntry {
-    oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
-    generation: u64,
-    layer_id: LayerId,
-}
-impl Ord for OpenLayerEntry {
-    fn cmp(&self, other: &Self) -> Ordering {
-        // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
-        // to get that. Entries with identical oldest_pending_lsn are ordered by generation
-        other
-            .oldest_pending_lsn
-            .cmp(&self.oldest_pending_lsn)
-            .then_with(|| other.generation.cmp(&self.generation))
-    }
-}
-impl PartialOrd for OpenLayerEntry {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-impl PartialEq for OpenLayerEntry {
-    fn eq(&self, other: &Self) -> bool {
-        self.cmp(other) == Ordering::Equal
-    }
-}
-impl Eq for OpenLayerEntry {}
-
-/// Iterator returned by LayerMap::iter_historic_layers()
-pub struct HistoricLayerIter<'a> {
-    seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
-    iter: Option<IntervalIter<'a, dyn Layer>>,
-}
-
-impl<'a> Iterator for HistoricLayerIter<'a> {
-    type Item = Arc<dyn Layer>;
-
-    fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
-        loop {
-            if let Some(x) = &mut self.iter {
-                if let Some(x) = x.next() {
-                    return Some(Arc::clone(&x));
-                }
-            }
-            if let Some((_tag, segentry)) = self.seg_iter.next() {
-                self.iter = Some(segentry.historic.iter());
-                continue;
-            } else {
-                return None;
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::PageServerConf;
-    use std::str::FromStr;
-    use zenith_utils::zid::{ZTenantId, ZTimelineId};
-
-    /// Arbitrary relation tag, for testing.
-    const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
-        spcnode: 0,
-        dbnode: 111,
-        relnode: 1000,
-        forknum: 0,
-    });
-
-    lazy_static! {
-        static ref DUMMY_TIMELINEID: ZTimelineId =
-            ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
-        static ref DUMMY_TENANTID: ZTenantId =
-            ZTenantId::from_str("00000000000000000000000000000000").unwrap();
-    }
-
-    /// Construct a dummy OpenLayer for testing
-    fn dummy_open_layer(
-        conf: &'static PageServerConf,
-        segno: u32,
-        start_lsn: Lsn,
-        oldest_pending_lsn: Lsn,
-    ) -> Arc<OpenLayer> {
-        Arc::new(
-            OpenLayer::create(
-                conf,
-                *DUMMY_TIMELINEID,
-                *DUMMY_TENANTID,
-                SegmentTag {
-                    rel: TESTREL_A,
-                    segno,
-                },
-                start_lsn,
-                oldest_pending_lsn,
-            )
-            .unwrap(),
-        )
-    }
-
-    #[test]
-    fn test_open_layers() -> Result<()> {
-        let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_open_layer"));
-        let conf = Box::leak(Box::new(conf));
-        std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;
-
-        let mut layers = LayerMap::default();
-
-        let gen1 = layers.increment_generation();
-        layers.insert_open(dummy_open_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
-        layers.insert_open(dummy_open_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
-        layers.insert_open(dummy_open_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
-        layers.insert_open(dummy_open_layer(conf, 3, Lsn(0x100), Lsn(0x110)));
-
-        let gen2 = layers.increment_generation();
-        layers.insert_open(dummy_open_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
-        layers.insert_open(dummy_open_layer(conf, 5, Lsn(0x100), Lsn(0x100)));
-
-        // A helper function (closure) to pop the next oldest open entry from the layer map,
-        // and assert that it is what we'd expect
-        let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
-            let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
-            assert!(l.get_seg_tag().segno == expected_segno);
-            assert!(generation == expected_generation);
-            layers.remove_open(layer_id);
-        };
-
-        assert_pop_layer(0, gen1); // 0x100
-        assert_pop_layer(5, gen2); // 0x100
-        assert_pop_layer(3, gen1); // 0x110
-        assert_pop_layer(4, gen2); // 0x110
-        assert_pop_layer(2, gen1); // 0x120
-        assert_pop_layer(1, gen1); // 0x200
-
-        Ok(())
-    }
-}
--- a/pageserver/src/layered_repository/metadata.rs
+++ b/pageserver/src/layered_repository/metadata.rs
@@ -15,10 +15,12 @@ use zenith_utils::{
    zid::{ZTenantId, ZTimelineId},
 };

-use crate::{
-    layered_repository::{METADATA_CHECKSUM_SIZE, METADATA_MAX_DATA_SIZE, METADATA_MAX_SAFE_SIZE},
-    PageServerConf,
-};
+use crate::config::PageServerConf;
+
+// Taken from PG_CONTROL_MAX_SAFE_SIZE
+const METADATA_MAX_SAFE_SIZE: usize = 512;
+const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
+const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;

 /// The name of the metadata file pageserver creates per timeline.
 pub const METADATA_FILE_NAME: &str = "metadata";
@@ -42,6 +44,8 @@ pub struct TimelineMetadata {
    prev_record_lsn: Option<Lsn>,
    ancestor_timeline: Option<ZTimelineId>,
    ancestor_lsn: Lsn,
+    latest_gc_cutoff_lsn: Lsn,
+    initdb_lsn: Lsn,
 }

 /// Points to a place in pageserver's local directory,
@@ -61,12 +65,16 @@ impl TimelineMetadata {
        prev_record_lsn: Option<Lsn>,
        ancestor_timeline: Option<ZTimelineId>,
        ancestor_lsn: Lsn,
+        latest_gc_cutoff_lsn: Lsn,
+        initdb_lsn: Lsn,
    ) -> Self {
        Self {
            disk_consistent_lsn,
            prev_record_lsn,
            ancestor_timeline,
            ancestor_lsn,
+            latest_gc_cutoff_lsn,
+            initdb_lsn,
        }
    }

@@ -121,6 +129,14 @@ impl TimelineMetadata {
    pub fn ancestor_lsn(&self) -> Lsn {
        self.ancestor_lsn
    }
+
+    pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
+        self.latest_gc_cutoff_lsn
+    }
+
+    pub fn initdb_lsn(&self) -> Lsn {
+        self.initdb_lsn
+    }
 }

 /// This module is for direct conversion of metadata to bytes and back.
@@ -139,6 +155,8 @@ mod serialize {
        prev_record_lsn: &'a Option<Lsn>,
        ancestor_timeline: &'a Option<ZTimelineId>,
        ancestor_lsn: &'a Lsn,
+        latest_gc_cutoff_lsn: &'a Lsn,
+        initdb_lsn: &'a Lsn,
    }

    impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
@@ -148,6 +166,8 @@ mod serialize {
                prev_record_lsn: &other.prev_record_lsn,
                ancestor_timeline: &other.ancestor_timeline,
                ancestor_lsn: &other.ancestor_lsn,
+                latest_gc_cutoff_lsn: &other.latest_gc_cutoff_lsn,
+                initdb_lsn: &other.initdb_lsn,
            }
        }
    }
@@ -158,6 +178,8 @@ mod serialize {
        prev_record_lsn: Option<Lsn>,
        ancestor_timeline: Option<ZTimelineId>,
        ancestor_lsn: Lsn,
+        latest_gc_cutoff_lsn: Lsn,
+        initdb_lsn: Lsn,
    }

    impl From<DeTimelineMetadata> for TimelineMetadata {
@@ -167,6 +189,8 @@ mod serialize {
                prev_record_lsn: other.prev_record_lsn,
                ancestor_timeline: other.ancestor_timeline,
                ancestor_lsn: other.ancestor_lsn,
+                latest_gc_cutoff_lsn: other.latest_gc_cutoff_lsn,
+                initdb_lsn: other.initdb_lsn,
            }
        }
    }
@@ -185,6 +209,8 @@ mod tests {
            prev_record_lsn: Some(Lsn(0x100)),
            ancestor_timeline: Some(TIMELINE_ID),
            ancestor_lsn: Lsn(0),
+            latest_gc_cutoff_lsn: Lsn(0),
+            initdb_lsn: Lsn(0),
        };

        let metadata_bytes = original_metadata
--- a/pageserver/src/layered_repository/page_versions.rs
+++ b/pageserver/src/layered_repository/page_versions.rs
@@ -1,295 +0,0 @@
-//!
-//! Data structure to ingest incoming WAL into an append-only file.
-//!
-//! - The file is considered temporary, and will be discarded on crash
-//! - based on a B-tree
-//!
-
-use std::os::unix::fs::FileExt;
-use std::{collections::HashMap, ops::RangeBounds, slice};
-
-use anyhow::Result;
-use bytes::{Bytes, BytesMut};
-
-use std::cmp::min;
-use std::io::{Seek, SeekFrom};
-
-use zenith_utils::{lsn::Lsn, vec_map::VecMap};
-
-use super::storage_layer::PageVersion;
-use crate::layered_repository::ephemeral_file::EphemeralFile;
-
-use postgres_ffi::pg_constants::BLCKSZ;
-
-use zenith_utils::bin_ser::LeSer;
-
-const EMPTY_SLICE: &[(Lsn, u64)] = &[];
-
-pub struct PageVersions {
-    map: HashMap<u32, VecMap<Lsn, u64>>,
-
-    latest_map: HashMap<u32, (Lsn, u64)>,
-
-    /// The PageVersion structs are stored in a serialized format in this file.
-    /// Each serialized PageVersion is preceded by a 'u32' length field.
-    /// The 'map' stores offsets into this file.
-    file: EphemeralFile,
-}
-
-impl PageVersions {
-    pub fn new(file: EphemeralFile) -> PageVersions {
-        PageVersions {
-            map: HashMap::new(),
-            latest_map: HashMap::new(),
-            file,
-        }
-    }
-
-    pub fn cache_latest(&mut self, blknum: u32, lsn: Lsn, img: &[u8]) -> Result<()> {
-        if img.len() != BLCKSZ as usize {
-            return Ok(());
-        }
-
-        let pos = if let Some((_lsn, pos)) = self.latest_map.get(&blknum) {
-            *pos
-        } else {
-            let pos = self.file.stream_position()?;
-            // round up to nearest page boundary for performance
-            //let pos = (pos + BLCKSZ as u64 - 1) & !(BLCKSZ as u64 - 1);
-
-            self.file.seek(SeekFrom::Start(pos + BLCKSZ as u64))?;
-
-            pos
-        };
-
-        self.file.write_all_at(img, pos)?;
-
-        self.latest_map.insert(blknum, (lsn, pos));
-
-        Ok(())
-    }
-
-    pub fn get_latest(&self, blknum: u32) -> Option<&(Lsn, u64)> {
-        self.latest_map.get(&blknum)
-    }
-
-    pub fn fetch_cached_latest(&self, pos: u64) -> Result<Bytes, std::io::Error> {
-        let mut buf = BytesMut::with_capacity(BLCKSZ as usize);
-        buf.resize(BLCKSZ as usize, 0u8);
-        if let Err(err) = self.file.read_exact_at(buf.as_mut(), pos) {
-            tracing::error!("read_exact_at {} failed: {:?}", pos, err);
-        }
-        Ok(buf.freeze())
-    }
-
-    pub fn append_or_update_last(
-        &mut self,
-        blknum: u32,
-        lsn: Lsn,
-        page_version: PageVersion,
-    ) -> Result<Option<u64>> {
-        // remember starting position
-        let pos = self.file.stream_position()?;
-
-        // make room for the 'length' field by writing zeros as a placeholder.
-        self.file.seek(SeekFrom::Start(pos + 4)).unwrap();
-
-        page_version.ser_into(&mut self.file).unwrap();
-
-        // write the 'length' field.
-        let len = self.file.stream_position()? - pos - 4;
-        let lenbuf = u32::to_ne_bytes(len as u32);
-        self.file.write_all_at(&lenbuf, pos)?;
-
-        let map = self.map.entry(blknum).or_insert_with(VecMap::default);
-        Ok(map.append_or_update_last(lsn, pos as u64).unwrap().0)
-    }
-
-    /// Get all [`PageVersion`]s in a block
-    fn get_block_slice(&self, blknum: u32) -> &[(Lsn, u64)] {
-        self.map
-            .get(&blknum)
-            .map(VecMap::as_slice)
-            .unwrap_or(EMPTY_SLICE)
-    }
-
-    /// Get a range of [`PageVersions`] in a block
-    pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(&self, blknum: u32, range: R) -> &[(Lsn, u64)] {
-        self.map
-            .get(&blknum)
-            .map(|vec_map| vec_map.slice_range(range))
-            .unwrap_or(EMPTY_SLICE)
-    }
-
-    /// Iterate through [`PageVersion`]s in (block, lsn) order.
-    /// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
-    pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
-        let mut ordered_blocks: Vec<u32> = self.map.keys().cloned().collect();
-        ordered_blocks.sort_unstable();
-
-        let slice = ordered_blocks
-            .first()
-            .map(|&blknum| self.get_block_slice(blknum))
-            .unwrap_or(EMPTY_SLICE);
-
-        OrderedPageVersionIter {
-            page_versions: self,
-            ordered_blocks,
-            cur_block_idx: 0,
-            cutoff_lsn,
-            cur_slice_iter: slice.iter(),
-        }
-    }
-
-    /// Returns a 'Read' that reads the page version at given offset.
-    pub fn reader(&self, pos: u64) -> Result<PageVersionReader, std::io::Error> {
-        // read length
-        let mut lenbuf = [0u8; 4];
-        self.file.read_exact_at(&mut lenbuf, pos)?;
-        let len = u32::from_ne_bytes(lenbuf);
-
-        Ok(PageVersionReader {
-            file: &self.file,
-            pos: pos + 4,
-            end_pos: pos + 4 + len as u64,
-        })
-    }
-
-    pub fn get_page_version(&self, pos: u64) -> Result<PageVersion> {
-        let mut reader = self.reader(pos)?;
-        Ok(PageVersion::des_from(&mut reader)?)
-    }
-}
-
-pub struct PageVersionReader<'a> {
-    file: &'a EphemeralFile,
-    pos: u64,
-    end_pos: u64,
-}
-
-impl<'a> std::io::Read for PageVersionReader<'a> {
-    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
-        let len = min(buf.len(), (self.end_pos - self.pos) as usize);
-        let n = self.file.read_at(&mut buf[..len], self.pos)?;
-        self.pos += n as u64;
-        Ok(n)
-    }
-}
-
-pub struct OrderedPageVersionIter<'a> {
-    page_versions: &'a PageVersions,
-
-    ordered_blocks: Vec<u32>,
-    cur_block_idx: usize,
-
-    cutoff_lsn: Option<Lsn>,
-
-    cur_slice_iter: slice::Iter<'a, (Lsn, u64)>,
-}
-
-impl OrderedPageVersionIter<'_> {
-    fn is_lsn_before_cutoff(&self, lsn: &Lsn) -> bool {
-        if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() {
-            lsn < cutoff_lsn
-        } else {
-            true
-        }
-    }
-}
-
-impl<'a> Iterator for OrderedPageVersionIter<'a> {
-    type Item = (u32, Lsn, u64);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        loop {
-            if let Some((lsn, pos)) = self.cur_slice_iter.next() {
-                if self.is_lsn_before_cutoff(lsn) {
-                    let blknum = self.ordered_blocks[self.cur_block_idx];
-                    return Some((blknum, *lsn, *pos));
-                }
-            }
-
-            let next_block_idx = self.cur_block_idx + 1;
-            let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?;
-            self.cur_block_idx = next_block_idx;
-            self.cur_slice_iter = self.page_versions.get_block_slice(blknum).iter();
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use bytes::Bytes;
-
-    use super::*;
-    use crate::PageServerConf;
-    use std::fs;
-    use std::str::FromStr;
-    use zenith_utils::zid::{ZTenantId, ZTimelineId};
-
-    fn repo_harness(test_name: &str) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId)> {
-        let repo_dir = PageServerConf::test_repo_dir(test_name);
-        let _ = fs::remove_dir_all(&repo_dir);
-        let conf = PageServerConf::dummy_conf(repo_dir);
-        // Make a static copy of the config. This can never be free'd, but that's
-        // OK in a test.
-        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
-
-        let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
-        let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
-        fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
-
-        Ok((conf, tenantid, timelineid))
-    }
-
-    #[test]
-    fn test_ordered_iter() -> Result<()> {
-        let (conf, tenantid, timelineid) = repo_harness("test_ordered_iter")?;
-
-        let file = EphemeralFile::create(conf, tenantid, timelineid)?;
-
-        let mut page_versions = PageVersions::new(file);
-
-        const BLOCKS: u32 = 1000;
-        const LSNS: u64 = 50;
-
-        let empty_page = Bytes::from_static(&[0u8; 8192]);
-        let empty_page_version = PageVersion::Page(empty_page);
-
-        for blknum in 0..BLOCKS {
-            for lsn in 0..LSNS {
-                let old = page_versions.append_or_update_last(
-                    blknum,
-                    Lsn(lsn),
-                    empty_page_version.clone(),
-                )?;
-                assert!(old.is_none());
-            }
-        }
-
-        let mut iter = page_versions.ordered_page_version_iter(None);
-        for blknum in 0..BLOCKS {
-            for lsn in 0..LSNS {
-                let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
-                assert_eq!(actual_blknum, blknum);
-                assert_eq!(Lsn(lsn), actual_lsn);
-            }
-        }
-        assert!(iter.next().is_none());
-        assert!(iter.next().is_none()); // should be robust against excessive next() calls
-
-        const CUTOFF_LSN: Lsn = Lsn(30);
-        let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN));
-        for blknum in 0..BLOCKS {
-            for lsn in 0..CUTOFF_LSN.0 {
-                let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
-                assert_eq!(actual_blknum, blknum);
-                assert_eq!(Lsn(lsn), actual_lsn);
-            }
-        }
-        assert!(iter.next().is_none());
-        assert!(iter.next().is_none()); // should be robust against excessive next() calls
-
-        Ok(())
-    }
-}
--- a/pageserver/src/layered_repository/par_fsync.rs
+++ b/pageserver/src/layered_repository/par_fsync.rs
@@ -0,0 +1,55 @@
+use std::{
+    io,
+    path::{Path, PathBuf},
+    sync::atomic::{AtomicUsize, Ordering},
+};
+
+use crate::virtual_file::VirtualFile;
+
+fn fsync_path(path: &Path) -> io::Result<()> {
+    let file = VirtualFile::open(path)?; // an open failure is returned to the caller
+    file.sync_all() // the result of the sync is the function's result
+}
+
+fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> {
+    while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) { // claim the next index from the shared counter; `get` yields None past the end
+        fsync_path(path)?; // the first fsync error ends this worker and is returned
+    }
+
+    Ok(())
+}
+
+pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
+    const PARALLEL_PATH_THRESHOLD: usize = 1; // with this many paths or fewer, sync sequentially on the calling thread
+    if paths.len() <= PARALLEL_PATH_THRESHOLD {
+        for path in paths {
+            fsync_path(path)?;
+        }
+        return Ok(());
+    }
+
+    /// Upper bound on the number of worker threads (the calling thread included).
+    /// Increasing this limit will
+    /// - use more memory
+    /// - increase the cost of spawn/join latency
+    const MAX_NUM_THREADS: usize = 64;
+    let num_threads = paths.len().min(MAX_NUM_THREADS); // never more workers than paths
+    let next_path_idx = AtomicUsize::new(0); // shared cursor into `paths`, advanced by every worker
+
+    crossbeam_utils::thread::scope(|s| -> io::Result<()> {
+        let mut handles = vec![];
+        // Spawn `num_threads - 1` threads, as the current thread is also a worker.
+        for _ in 1..num_threads {
+            handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx)));
+        }
+
+        parallel_worker(paths, &next_path_idx)?; // the calling thread takes its share of the work too
+
+        for handle in handles {
+            handle.join().unwrap()?; // `?` forwards a worker's io::Error; NOTE(review): unwrap presumably panics if the worker panicked
+        }
+
+        Ok(())
+    })
+    .unwrap()
+}
--- a/pageserver/src/layered_repository/storage_layer.rs
+++ b/pageserver/src/layered_repository/storage_layer.rs
@@ -2,95 +2,84 @@
 //! Common traits and structs for layers
 //!

-use crate::relish::RelishTag;
-use crate::repository::WALRecord;
+use crate::repository::{Key, Value};
+use crate::walrecord::ZenithWalRecord;
 use crate::{ZTenantId, ZTimelineId};
 use anyhow::Result;
 use bytes::Bytes;
-use serde::{Deserialize, Serialize};
-use std::fmt;
+use std::ops::Range;
 use std::path::PathBuf;

 use zenith_utils::lsn::Lsn;

-// Size of one segment in pages (10 MB)
-pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
-
-///
-/// Each relish stored in the repository is divided into fixed-sized "segments",
-/// with 10 MB of key-space, or 1280 8k pages each.
-///
-#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
-pub struct SegmentTag {
-    pub rel: RelishTag,
-    pub segno: u32,
-}
-
-impl fmt::Display for SegmentTag {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "{}.{}", self.rel, self.segno)
+pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
+where
+    T: PartialOrd<T>,
+{
+    if a.start < b.start {
+        a.end > b.start
+    } else {
+        b.end > a.start
    }
 }

-impl SegmentTag {
-    pub const fn from_blknum(rel: RelishTag, blknum: u32) -> SegmentTag {
-        SegmentTag {
-            rel,
-            segno: blknum / RELISH_SEG_SIZE,
-        }
-    }
-
-    pub fn blknum_in_seg(&self, blknum: u32) -> bool {
-        blknum / RELISH_SEG_SIZE == self.segno
-    }
+pub fn range_eq<T>(a: &Range<T>, b: &Range<T>) -> bool
+where
+    T: PartialEq<T>,
+{
+    a.start == b.start && a.end == b.end
 }

+/// Struct used to communicate across calls to 'get_value_reconstruct_data'.
 ///
-/// Represents a version of a page at a specific LSN. The LSN is the key of the
-/// entry in the 'page_versions' hash, it is not duplicated here.
+/// Before first call, you can fill in 'img' if you have an older cached
+/// version of the page available. That can save work in
+/// 'get_value_reconstruct_data', as it can stop searching for page versions
+/// when all the WAL records going back to the cached image have been collected.
 ///
-/// A page version can be stored as a full page image, or as WAL record that needs
-/// to be applied over the previous page version to reconstruct this version.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum PageVersion {
-    Page(Bytes),
-    Wal(WALRecord),
-}
-
+/// When get_value_reconstruct_data returns Complete, 'img' is set to an image
+/// of the page, or the oldest WAL record in 'records' is a will_init-type
+/// record that initializes the page without requiring a previous image.
 ///
-/// Data needed to reconstruct a page version
+/// If 'get_value_reconstruct_data' returns Continue, some 'records' may have
+/// been collected, but there are more records outside the current layer. Pass
+/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
+/// call, to collect more records.
 ///
-/// 'page_img' is the old base image of the page to start the WAL replay with.
-/// It can be None, if the first WAL record initializes the page (will_init)
-/// 'records' contains the records to apply over the base image.
-///
-pub struct PageReconstructData {
-    pub records: Vec<(Lsn, WALRecord)>,
-    pub page_img: Option<Bytes>,
+#[derive(Debug)]
+pub struct ValueReconstructState {
+    pub records: Vec<(Lsn, ZenithWalRecord)>,
+    pub img: Option<(Lsn, Bytes)>,
 }

 /// Return value from Layer::get_page_reconstruct_data
-pub enum PageReconstructResult {
+#[derive(Clone, Copy, Debug)]
+pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
    Complete,
    /// This layer didn't contain all the required data, the caller should look up
    /// the predecessor layer at the returned LSN and collect more data from there.
-    Continue(Lsn),
+    Continue,
+
    /// This layer didn't contain data needed to reconstruct the page version at
    /// the returned LSN. This is usually considered an error, but might be OK
    /// in some circumstances.
-    Missing(Lsn),
-    /// Use the cached image at `cached_img_lsn` as the base image
-    Cached,
+    Missing,
 }

+/// A Layer contains all data in a "rectangle" consisting of a range of keys and
+/// range of LSNs.
 ///
-/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
 /// There are two kinds of layers, in-memory and on-disk layers. In-memory
-/// layers are used to ingest incoming WAL, and provide fast access
-/// to the recent page versions. On-disk layers are stored as files on disk, and
-/// are immutable. This trait presents the common functionality of
-/// in-memory and on-disk layers.
+/// layers are used to ingest incoming WAL, and provide fast access to the
+/// recent page versions. On-disk layers are stored as files on disk, and are
+/// immutable. This trait presents the common functionality of in-memory and
+/// on-disk layers.
+///
+/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
+/// A delta layer contains all modifications within a range of LSNs and keys.
+/// An image layer is a snapshot of all the data in a key-range, at a single
+/// LSN
 ///
 pub trait Layer: Send + Sync {
    fn get_tenant_id(&self) -> ZTenantId;
@@ -98,21 +87,16 @@ pub trait Layer: Send + Sync {
    /// Identify the timeline this relish belongs to
    fn get_timeline_id(&self) -> ZTimelineId;

-    /// Identify the relish segment
-    fn get_seg_tag(&self) -> SegmentTag;
+    /// Range of keys that this layer covers
+    fn get_key_range(&self) -> Range<Key>;

    /// Inclusive start bound of the LSN range that this layer holds
-    fn get_start_lsn(&self) -> Lsn;
-
    /// Exclusive end bound of the LSN range that this layer holds.
    ///
    /// - For an open in-memory layer, this is MAX_LSN.
    /// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
    /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
-    fn get_end_lsn(&self) -> Lsn;
-
-    /// Is the segment represented by this layer dropped by PostgreSQL?
-    fn is_dropped(&self) -> bool;
+    fn get_lsn_range(&self) -> Range<Lsn>;

    /// Filename used to store this layer on disk. (Even in-memory layers
    /// implement this, to print a handy unique identifier for the layer for
@@ -125,35 +109,18 @@ pub trait Layer: Send + Sync {
    /// It is up to the caller to collect more data from previous layer and
    /// perform WAL redo, if necessary.
    ///
-    /// Note that the 'blknum' is the offset of the page from the beginning
-    /// of the *relish*, not the beginning of the segment. The requested
-    /// 'blknum' must be covered by this segment.
-    ///
-    /// `cached_img_lsn` should be set to a cached page image's lsn < `lsn`.
-    /// This function will only return data after `cached_img_lsn`.
-    ///
    /// See PageReconstructResult for possible return values. The collected data
    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call. If this returns PageReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data'
-    /// to collect more data.
-    fn get_page_reconstruct_data(
+    /// on first call, or a struct with a cached older image of the page if one
+    /// is available. If this returns ValueReconstructResult::Continue, look up
+    /// the predecessor layer and call again with the same 'reconstruct_data' to
+    /// collect more data.
+    fn get_value_reconstruct_data(
        &self,
-        blknum: u32,
-        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
-        reconstruct_data: &mut PageReconstructData,
-    ) -> Result<PageReconstructResult>;
-
-    fn cache_page_image(&self, _blknum: u32, _lsn: Lsn, _img: &[u8]) -> Result<()> {
-        Ok(())
-    }
-
-    /// Return size of the segment at given LSN. (Only for blocky relations.)
-    fn get_seg_size(&self, lsn: Lsn) -> Result<u32>;
-
-    /// Does the segment exist at given LSN? Or was it dropped before it.
-    fn get_seg_exists(&self, lsn: Lsn) -> Result<bool>;
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+    ) -> Result<ValueReconstructResult>;

    /// Does this layer only contain some data for the segment (incremental),
    /// or does it contain a version of every page? This is important to know
@@ -161,6 +128,12 @@ pub trait Layer: Send + Sync {
    /// the previous non-incremental layer.
    fn is_incremental(&self) -> bool;

+    /// Returns true for layers that are represented in memory.
+    fn is_in_memory(&self) -> bool;
+
+    /// Iterate through all keys and values stored in the layer
+    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
+
    /// Release memory used by this layer. There is no corresponding 'load'
    /// function, that's done implicitly when you call one of the get-functions.
    fn unload(&self) -> Result<()>;
--- a/pageserver/src/layered_repository/utils.rs
+++ b/pageserver/src/layered_repository/utils.rs
@@ -0,0 +1,48 @@
+// Utilities for reading and writing Values
+use std::io::{Error, Write};
+use std::os::unix::fs::FileExt;
+
+use bookfile::BoundedReader;
+
+pub fn read_blob<F: FileExt>(file: &F, off: u64) -> Result<Vec<u8>, Error> {
+    // Read the 4-byte length prefix stored at `off`.
+    let mut len_buf = [0u8; 4];
+    file.read_exact_at(&mut len_buf, off)?;
+
+    let len = u32::from_ne_bytes(len_buf); // the prefix is decoded in native byte order
+
+    let mut buf: Vec<u8> = Vec::new();
+    buf.resize(len as usize, 0); // zero-filled buffer of exactly `len` bytes
+    file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?; // the payload starts right after the prefix
+
+    Ok(buf)
+}
+
+pub fn read_blob_from_chapter<F: FileExt>(
+    file: &BoundedReader<&F>,
+    off: u64,
+) -> Result<Vec<u8>, Error> {
+    // Read the 4-byte length prefix stored at `off`.
+    let mut len_buf = [0u8; 4];
+    file.read_exact_at(&mut len_buf, off)?;
+
+    let len = u32::from_ne_bytes(len_buf); // the prefix is decoded in native byte order
+
+    let mut buf: Vec<u8> = Vec::new();
+    buf.resize(len as usize, 0); // zero-filled buffer of exactly `len` bytes
+    file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?; // the payload starts right after the prefix
+
+    Ok(buf)
+}
+
+pub fn write_blob<W: Write>(writer: &mut W, buf: &[u8]) -> Result<u64, Error> {
+    let val_len = buf.len() as u32; // NOTE(review): the cast truncates lengths above u32::MAX
+
+    // Write the 4-byte native-endian 'length' field, followed by the payload.
+    let lenbuf = u32::to_ne_bytes(val_len);
+
+    writer.write_all(&lenbuf)?;
+    writer.write_all(buf)?;
+
+    Ok(4 + val_len as u64) // returned size: length prefix plus payload
+}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,55 +1,31 @@
-use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
-use zenith_utils::postgres_backend::AuthType;
-use zenith_utils::zid::{ZTenantId, ZTimelineId};
-
-use std::path::PathBuf;
-use std::time::Duration;
-
-use lazy_static::lazy_static;
-use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
-
 pub mod basebackup;
 pub mod branches;
+pub mod config;
 pub mod http;
+pub mod import_datadir;
+pub mod keyspace;
 pub mod layered_repository;
 pub mod page_cache;
 pub mod page_service;
+pub mod pgdatadir_mapping;
 pub mod relish;
 pub mod remote_storage;
 pub mod repository;
-pub mod restore_local_repo;
 pub mod tenant_mgr;
 pub mod tenant_threads;
+pub mod thread_mgr;
 pub mod virtual_file;
-pub mod waldecoder;
+pub mod walingest;
 pub mod walreceiver;
+pub mod walrecord;
 pub mod walredo;

-pub mod defaults {
-    use const_format::formatcp;
-    use std::time::Duration;
+use lazy_static::lazy_static;
+use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
+use zenith_utils::zid::{ZTenantId, ZTimelineId};

-    pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
-    pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
-    pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
-    pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
-
-    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
-    // would be more appropriate. But a low value forces the code to be exercised more,
-    // which is good for now to trigger bugs.
-    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
-    pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);
-
-    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
-    pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
-
-    pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
-    pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
-
-    pub const DEFAULT_OPEN_MEM_LIMIT: usize = 128 * 1024 * 1024;
-    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
-    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
-}
+use layered_repository::LayeredRepository;
+use pgdatadir_mapping::DatadirTimeline;

 lazy_static! {
    static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
@@ -62,172 +38,17 @@ lazy_static! {

 pub const LOG_FILE_NAME: &str = "pageserver.log";

-#[derive(Debug, Clone)]
-pub struct PageServerConf {
-    pub daemonize: bool,
-    pub listen_pg_addr: String,
-    pub listen_http_addr: String,
-    // Flush out an inmemory layer, if it's holding WAL older than this
-    // This puts a backstop on how much WAL needs to be re-digested if the
-    // page server crashes.
-    pub checkpoint_distance: u64,
-    pub checkpoint_period: Duration,
-
-    pub gc_horizon: u64,
-    pub gc_period: Duration,
-    pub superuser: String,
-
-    pub open_mem_limit: usize,
-    pub page_cache_size: usize,
-    pub max_file_descriptors: usize,
-
-    // Repository directory, relative to current working directory.
-    // Normally, the page server changes the current working directory
-    // to the repository, and 'workdir' is always '.'. But we don't do
-    // that during unit testing, because the current directory is global
-    // to the process but different unit tests work on different
-    // repositories.
-    pub workdir: PathBuf,
-
-    pub pg_distrib_dir: PathBuf,
-
-    pub auth_type: AuthType,
-
-    pub auth_validation_public_key_path: Option<PathBuf>,
-    pub remote_storage_config: Option<RemoteStorageConfig>,
-}
-
-impl PageServerConf {
-    //
-    // Repository paths, relative to workdir.
-    //
-
-    fn tenants_path(&self) -> PathBuf {
-        self.workdir.join(TENANTS_SEGMENT_NAME)
-    }
-
-    fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenants_path().join(tenantid.to_string())
-    }
-
-    fn tags_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenant_path(tenantid).join("refs").join("tags")
-    }
-
-    fn tag_path(&self, tag_name: &str, tenantid: &ZTenantId) -> PathBuf {
-        self.tags_path(tenantid).join(tag_name)
-    }
-
-    fn branches_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenant_path(tenantid).join("refs").join("branches")
-    }
-
-    fn branch_path(&self, branch_name: &str, tenantid: &ZTenantId) -> PathBuf {
-        self.branches_path(tenantid).join(branch_name)
-    }
-
-    fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
-    }
-
-    fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
-        self.timelines_path(tenantid).join(timelineid.to_string())
-    }
-
-    fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
-        self.timeline_path(timelineid, tenantid).join("ancestor")
-    }
-
-    //
-    // Postgres distribution paths
-    //
-
-    pub fn pg_bin_dir(&self) -> PathBuf {
-        self.pg_distrib_dir.join("bin")
-    }
-
-    pub fn pg_lib_dir(&self) -> PathBuf {
-        self.pg_distrib_dir.join("lib")
-    }
-
-    #[cfg(test)]
-    fn test_repo_dir(test_name: &str) -> PathBuf {
-        PathBuf::from(format!("../tmp_check/test_{}", test_name))
-    }
-
-    #[cfg(test)]
-    fn dummy_conf(repo_dir: PathBuf) -> Self {
-        PageServerConf {
-            daemonize: false,
-            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_period: Duration::from_secs(10),
-            gc_horizon: defaults::DEFAULT_GC_HORIZON,
-            gc_period: Duration::from_secs(10),
-            open_mem_limit: defaults::DEFAULT_OPEN_MEM_LIMIT,
-            page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
-            max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
-            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
-            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
-            superuser: "zenith_admin".to_string(),
-            workdir: repo_dir,
-            pg_distrib_dir: "".into(),
-            auth_type: AuthType::Trust,
-            auth_validation_public_key_path: None,
-            remote_storage_config: None,
-        }
-    }
-}
-
 /// Config for the Repository checkpointer
 #[derive(Debug, Clone, Copy)]
 pub enum CheckpointConfig {
    // Flush in-memory data that is older than this
    Distance(u64),
    // Flush all in-memory data
+    Flush,
+    // Flush all in-memory data and reconstruct all page images
    Forced,
 }

-/// External backup storage configuration, enough for creating a client for that storage.
-#[derive(Debug, Clone)]
-pub struct RemoteStorageConfig {
-    /// Limits the number of concurrent sync operations between pageserver and the remote storage.
-    pub max_concurrent_sync: usize,
-    /// The storage connection configuration.
-    pub storage: RemoteStorageKind,
-}
+pub type RepositoryImpl = LayeredRepository;

-/// A kind of a remote storage to connect to, with its connection configuration.
-#[derive(Debug, Clone)]
-pub enum RemoteStorageKind {
-    /// Storage based on local file system.
-    /// Specify a root folder to place all stored relish data into.
-    LocalFs(PathBuf),
-    /// AWS S3 based storage, storing all relishes into the root
-    /// of the S3 bucket from the config.
-    AwsS3(S3Config),
-}
-
-/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone)]
-pub struct S3Config {
-    /// Name of the bucket to connect to.
-    pub bucket_name: String,
-    /// The region where the bucket is located at.
-    pub bucket_region: String,
-    /// "Login" to use when connecting to bucket.
-    /// Can be empty for cases like AWS k8s IAM
-    /// where we can allow certain pods to connect
-    /// to the bucket directly without any credentials.
-    pub access_key_id: Option<String>,
-    /// "Password" to use when connecting to bucket.
-    pub secret_access_key: Option<String>,
-}
-
-impl std::fmt::Debug for S3Config {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("S3Config")
-            .field("bucket_name", &self.bucket_name)
-            .field("bucket_region", &self.bucket_region)
-            .finish()
-    }
-}
+pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -53,7 +53,7 @@ use zenith_utils::{
 };

 use crate::layered_repository::writeback_ephemeral_file;
-use crate::{relish::RelTag, PageServerConf};
+use crate::{config::PageServerConf, relish::RelTag};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 10;
@@ -233,6 +233,7 @@ impl std::ops::Deref for PageWriteGuard<'_> {
 impl PageWriteGuard<'_> {
    /// Mark that the buffer contents are now valid.
    pub fn mark_valid(&mut self) {
+        assert!(self.inner.key.is_some());
        assert!(
            !self.valid,
            "mark_valid called on a buffer that was already valid"
@@ -240,6 +241,11 @@ impl PageWriteGuard<'_> {
        self.valid = true;
    }
    pub fn mark_dirty(&mut self) {
+        // only ephemeral pages can be dirty ATM.
+        assert!(matches!(
+            self.inner.key,
+            Some(CacheKey::EphemeralPage { .. })
+        ));
        self.inner.dirty = true;
    }
 }
@@ -251,10 +257,12 @@ impl Drop for PageWriteGuard<'_> {
    /// initializing it, remove the mapping from the page cache.
    ///
    fn drop(&mut self) {
+        assert!(self.inner.key.is_some());
        if !self.valid {
            let self_key = self.inner.key.as_ref().unwrap();
            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
            self.inner.key = None;
+            self.inner.dirty = false;
        }
    }
 }
@@ -347,6 +355,8 @@ impl PageCache {
        }
    }

+    // Section 1.2: Public interface functions for working with Ephemeral pages.
+
    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };

@@ -371,6 +381,7 @@ impl PageCache {
                        // remove mapping for old buffer
                        self.remove_mapping(key);
                        inner.key = None;
+                        inner.dirty = false;
                    }
                    _ => {}
                }
@@ -381,8 +392,7 @@ impl PageCache {
    //
    // Section 2: Internal interface functions for lookup/update.
    //
-    // Currently, the page cache only stores materialized page images. In the
-    // future, to add support for a new kind of "thing" to cache, you will need
+    // To add support for a new kind of "thing" to cache, you will need
    // to add public interface routines above, and code to deal with the
    // "mappings" after this section. But the routines in this section should
    // not require changes.
@@ -472,6 +482,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
+            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return ReadBufResult::NotFound(PageWriteGuard {
@@ -502,7 +513,7 @@ impl PageCache {

    /// Return a write-locked buffer for given block.
    ///
-    /// Similar to read_for_read(), but the returned buffer is write-locked and
+    /// Similar to lock_for_read(), but the returned buffer is write-locked and
    /// may be modified by the caller even if it's already found in the cache.
    fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
        loop {
@@ -532,6 +543,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
+            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return WriteBufResult::NotFound(PageWriteGuard {
@@ -672,7 +684,7 @@ impl PageCache {
    }

    //
-    // Section 5: Misc internal helpers
+    // Section 4: Misc internal helpers
    //

    /// Find a slot to evict.
@@ -724,7 +736,7 @@ impl PageCache {
                hash_key: _,
                lsn: _,
            } => {
-                panic!("unexpected dirty materialize page");
+                panic!("unexpected dirty materialized page");
            }
            CacheKey::EphemeralPage { file_id, blkno } => {
                writeback_ephemeral_file(*file_id, *blkno, buf)
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,16 +10,15 @@
 //     *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
 //

-use anyhow::{anyhow, bail, ensure, Result};
+use anyhow::{bail, ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use lazy_static::lazy_static;
 use regex::Regex;
+use std::io;
 use std::net::TcpListener;
 use std::str;
 use std::str::FromStr;
-use std::sync::Arc;
-use std::thread;
-use std::{io, net::TcpStream};
+use std::sync::{Arc, RwLockReadGuard};
 use tracing::*;
 use zenith_metrics::{register_histogram_vec, HistogramVec};
 use zenith_utils::auth::{self, JwtAuth};
@@ -28,18 +27,20 @@ use zenith_utils::lsn::Lsn;
 use zenith_utils::postgres_backend::is_socket_read_timed_out;
 use zenith_utils::postgres_backend::PostgresBackend;
 use zenith_utils::postgres_backend::{self, AuthType};
-use zenith_utils::pq_proto::{
-    BeMessage, FeMessage, RowDescriptor, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC,
-};
+use zenith_utils::pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC};
 use zenith_utils::zid::{ZTenantId, ZTimelineId};

 use crate::basebackup;
-use crate::branches;
+use crate::config::PageServerConf;
+use crate::pgdatadir_mapping::DatadirTimeline;
 use crate::relish::*;
+use crate::repository::Repository;
 use crate::repository::Timeline;
 use crate::tenant_mgr;
+use crate::thread_mgr;
+use crate::thread_mgr::ThreadKind;
 use crate::walreceiver;
-use crate::PageServerConf;
+use crate::CheckpointConfig;

 // Wrapped in libpq CopyData
 enum PagestreamFeMessage {
@@ -188,30 +189,61 @@ pub fn thread_main(
    listener: TcpListener,
    auth_type: AuthType,
 ) -> anyhow::Result<()> {
-    let mut join_handles = Vec::new();
+    listener.set_nonblocking(true)?;
+    let basic_rt = tokio::runtime::Builder::new_current_thread()
+        .enable_io()
+        .build()?;

-    while !tenant_mgr::shutdown_requested() {
-        let (socket, peer_addr) = listener.accept()?;
-        debug!("accepted connection from {}", peer_addr);
-        socket.set_nodelay(true).unwrap();
-        let local_auth = auth.clone();
+    let tokio_listener = {
+        let _guard = basic_rt.enter();
+        tokio::net::TcpListener::from_std(listener)
+    }?;

-        let handle = thread::Builder::new()
-            .name("serving Page Service thread".into())
-            .spawn(move || {
-                if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
-                    error!(%err, "page server thread exited with error");
+    // Wait for a new connection to arrive, or for server shutdown.
+    while let Some(res) = basic_rt.block_on(async {
+        let shutdown_watcher = thread_mgr::shutdown_watcher();
+        tokio::select! {
+            biased;
+
+            _ = shutdown_watcher => {
+                // We were requested to shut down.
+                None
+            }
+
+            res = tokio_listener.accept() => {
+                Some(res)
+            }
+        }
+    }) {
+        match res {
+            Ok((socket, peer_addr)) => {
+                // Connection established. Spawn a new thread to handle it.
+                debug!("accepted connection from {}", peer_addr);
+                let local_auth = auth.clone();
+
+                // PageRequestHandler threads are not associated with any particular
+                // timeline in the thread manager. In practice most connections will
+                // only deal with a particular timeline, but we don't know which one
+                // yet.
+                if let Err(err) = thread_mgr::spawn(
+                    ThreadKind::PageRequestHandler,
+                    None,
+                    None,
+                    "serving Page Service thread",
+                    move || page_service_conn_main(conf, local_auth, socket, auth_type),
+                ) {
+                    // Thread creation failed. Log the error and continue.
+                    error!("could not spawn page service thread: {:?}", err);
                }
-            })
-            .unwrap();
-
-        join_handles.push(handle);
+            }
+            Err(err) => {
+                // accept() failed. Log the error, and loop back to retry on next connection.
+                error!("accept() failed: {:?}", err);
+            }
+        }
    }

-    debug!("page_service loop terminated. wait for connections to cancel");
-    for handle in join_handles.into_iter() {
-        handle.join().unwrap();
-    }
+    debug!("page_service loop terminated");

    Ok(())
 }
@@ -219,10 +251,10 @@ pub fn thread_main(
 fn page_service_conn_main(
    conf: &'static PageServerConf,
    auth: Option<Arc<JwtAuth>>,
-    socket: TcpStream,
+    socket: tokio::net::TcpStream,
    auth_type: AuthType,
 ) -> anyhow::Result<()> {
-    // Immediatsely increment the gauge, then create a job to decrement it on thread exit.
+    // Immediately increment the gauge, then create a job to decrement it on thread exit.
    // One of the pros of `defer!` is that this will *most probably*
    // get called, even in presence of panics.
    let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
@@ -231,6 +263,19 @@ fn page_service_conn_main(
        gauge.dec();
    }

+    // We use Tokio to accept the connection, but the rest of the code works with a
+    // regular socket. Convert.
+    let socket = socket
+        .into_std()
+        .context("could not convert tokio::net:TcpStream to std::net::TcpStream")?;
+    socket
+        .set_nonblocking(false)
+        .context("could not put socket to blocking mode")?;
+
+    socket
+        .set_nodelay(true)
+        .context("could not set TCP_NODELAY")?;
+
    let mut conn_handler = PageServerHandler::new(conf, auth);
    let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
    pgbackend.run(&mut conn_handler)
@@ -279,12 +324,13 @@ impl PageServerHandler {
        let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();

        // Check that the timeline exists
-        let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
+        let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
+            .context("Cannot handle pagerequests for a remote timeline")?;

        /* switch client to COPYBOTH */
        pgb.write_message(&BeMessage::CopyBothResponse)?;

-        while !tenant_mgr::shutdown_requested() {
+        while !thread_mgr::is_shutdown_requested() {
            match pgb.read_message() {
                Ok(message) => {
                    if let Some(message) = message {
@@ -301,24 +347,24 @@ impl PageServerHandler {
                            PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
                                .with_label_values(&["get_rel_exists"])
                                .observe_closure_duration(|| {
-                                    self.handle_get_rel_exists_request(&*timeline, &req)
+                                    self.handle_get_rel_exists_request(timeline.as_ref(), &req)
                                }),
                            PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
                                .with_label_values(&["get_rel_size"])
                                .observe_closure_duration(|| {
-                                    self.handle_get_nblocks_request(&*timeline, &req)
+                                    self.handle_get_nblocks_request(timeline.as_ref(), &req)
                                }),
                            PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
                                .with_label_values(&["get_page_at_lsn"])
                                .observe_closure_duration(|| {
-                                    self.handle_get_page_at_lsn_request(&*timeline, &req)
+                                    self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
                                }),
                        };

                        let response = response.unwrap_or_else(|e| {
                            // print the all details to the log with {:#}, but for the client the
                            // error message is enough
-                            error!("error reading relation or page version: {:#}", e);
+                            error!("error reading relation or page version: {:?}", e);
                            PagestreamBeMessage::Error(PagestreamErrorResponse {
                                message: e.to_string(),
                            })
@@ -351,7 +397,12 @@ impl PageServerHandler {
    /// In either case, if the page server hasn't received the WAL up to the
    /// requested LSN yet, we will wait for it to arrive. The return value is
    /// the LSN that should be used to look up the page versions.
-    fn wait_or_get_last_lsn(timeline: &dyn Timeline, lsn: Lsn, latest: bool) -> Result<Lsn> {
+    fn wait_or_get_last_lsn<R: Repository>(
+        timeline: &DatadirTimeline<R>,
+        mut lsn: Lsn,
+        latest: bool,
+        latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
+    ) -> Result<Lsn> {
        if latest {
            // Latest page version was requested. If LSN is given, it is a hint
            // to the page server that there have been no modifications to the
@@ -372,72 +423,79 @@ impl PageServerHandler {
            // walsender completes the authentication and starts streaming the
            // WAL.
            if lsn <= last_record_lsn {
-                Ok(last_record_lsn)
+                lsn = last_record_lsn;
            } else {
-                timeline.wait_lsn(lsn)?;
+                timeline.tline.wait_lsn(lsn)?;
                // Since we waited for 'lsn' to arrive, that is now the last
                // record LSN. (Or close enough for our purposes; the
                // last-record LSN can advance immediately after we return
                // anyway)
-                Ok(lsn)
            }
        } else {
            if lsn == Lsn(0) {
                bail!("invalid LSN(0) in request");
            }
-            timeline.wait_lsn(lsn)?;
-            Ok(lsn)
+            timeline.tline.wait_lsn(lsn)?;
        }
+        ensure!(
+            lsn >= **latest_gc_cutoff_lsn,
+            "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+            lsn, **latest_gc_cutoff_lsn
+        );
+        Ok(lsn)
    }

-    fn handle_get_rel_exists_request(
+    fn handle_get_rel_exists_request<R: Repository>(
        &self,
-        timeline: &dyn Timeline,
+        timeline: &DatadirTimeline<R>,
        req: &PagestreamExistsRequest,
    ) -> Result<PagestreamBeMessage> {
        let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();

-        let tag = RelishTag::Relation(req.rel);
-        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
+        let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;

-        let exists = timeline.get_rel_exists(tag, lsn)?;
+        let exists = timeline.get_rel_exists(req.rel, lsn)?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
            exists,
        }))
    }

-    fn handle_get_nblocks_request(
+    fn handle_get_nblocks_request<R: Repository>(
        &self,
-        timeline: &dyn Timeline,
+        timeline: &DatadirTimeline<R>,
        req: &PagestreamNblocksRequest,
    ) -> Result<PagestreamBeMessage> {
        let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
-        let tag = RelishTag::Relation(req.rel);
-        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
+        let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;

-        let n_blocks = timeline.get_relish_size(tag, lsn)?;
-
-        // Return 0 if relation is not found.
-        // This is what postgres smgr expects.
-        let n_blocks = n_blocks.unwrap_or(0);
+        let n_blocks = timeline.get_rel_size(req.rel, lsn)?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
            n_blocks,
        }))
    }

-    fn handle_get_page_at_lsn_request(
+    fn handle_get_page_at_lsn_request<R: Repository>(
        &self,
-        timeline: &dyn Timeline,
+        timeline: &DatadirTimeline<R>,
        req: &PagestreamGetPageRequest,
    ) -> Result<PagestreamBeMessage> {
        let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
            .entered();
-        let tag = RelishTag::Relation(req.rel);
-        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
-
-        let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?;
+        let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
+        /*
+        // Add a 1s delay to some requests. The delay causes the requests to
+        // hit the race condition from github issue #1047 more easily.
+        use rand::Rng;
+        if rand::thread_rng().gen::<u8>() < 5 {
+            std::thread::sleep(std::time::Duration::from_millis(1000));
+        }
+        */
+        let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn)?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
@@ -455,7 +513,14 @@ impl PageServerHandler {
        let _enter = span.enter();

        // check that the timeline exists
-        let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
+        let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
+            .context("Cannot handle basebackup request for a remote timeline")?;
+        let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
+        if let Some(lsn) = lsn {
+            timeline
+                .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
+                .context("invalid basebackup lsn")?;
+        }

        // switch client to COPYOUT
        pgb.write_message(&BeMessage::CopyOutResponse)?;
@@ -525,17 +590,10 @@ impl postgres_backend::Handler for PageServerHandler {
    fn process_query(
        &mut self,
        pgb: &mut PostgresBackend,
-        query_string: Bytes,
+        query_string: &str,
    ) -> anyhow::Result<()> {
        debug!("process query {:?}", query_string);

-        // remove null terminator, if any
-        let mut query_string = query_string;
-        if query_string.last() == Some(&0) {
-            query_string.truncate(query_string.len() - 1);
-        }
-        let query_string = std::str::from_utf8(&query_string)?;
-
        if query_string.starts_with("pagestream ") {
            let (_, params_raw) = query_string.split_at("pagestream ".len());
            let params = params_raw.split(' ').collect::<Vec<_>>();
@@ -578,7 +636,7 @@ impl postgres_backend::Handler for PageServerHandler {
            let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) ([[:xdigit:]]+) (.*)$").unwrap();
            let caps = re
                .captures(query_string)
-                .ok_or_else(|| anyhow!("invalid callmemaybe: '{}'", query_string))?;
+                .with_context(|| format!("invalid callmemaybe: '{}'", query_string))?;

            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
@@ -590,84 +648,27 @@ impl postgres_backend::Handler for PageServerHandler {
                info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();

            // Check that the timeline exists
-            tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
+            tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
+                .context("Failed to fetch local timeline for callmemaybe requests")?;

-            walreceiver::launch_wal_receiver(self.conf, timelineid, &connstr, tenantid.to_owned());
+            walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?;

            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("branch_create ") {
-            let err = || anyhow!("invalid branch_create: '{}'", query_string);
-
-            // branch_create <tenantid> <branchname> <startpoint>
-            // TODO lazy static
-            // TODO: escaping, to allow branch names with spaces
-            let re = Regex::new(r"^branch_create ([[:xdigit:]]+) (\S+) ([^\r\n\s;]+)[\r\n\s;]*;?$")
-                .unwrap();
-            let caps = re.captures(query_string).ok_or_else(err)?;
-
-            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
-            let branchname = caps.get(2).ok_or_else(err)?.as_str().to_owned();
-            let startpoint_str = caps.get(3).ok_or_else(err)?.as_str().to_owned();
-
-            self.check_permission(Some(tenantid))?;
-
-            let _enter =
-                info_span!("branch_create", name = %branchname, tenant = %tenantid).entered();
-
-            let branch =
-                branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
-            let branch = serde_json::to_vec(&branch)?;
-
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&BeMessage::DataRow(&[Some(&branch)]))?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("branch_list ") {
-            // branch_list <zenith tenantid as hex string>
-            let re = Regex::new(r"^branch_list ([[:xdigit:]]+)$").unwrap();
-            let caps = re
-                .captures(query_string)
-                .ok_or_else(|| anyhow!("invalid branch_list: '{}'", query_string))?;
-
-            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
-
-            // since these handlers for tenant/branch commands are deprecated (in favor of http based ones)
-            // just use false in place of include non incremental logical size
-            let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
-            let branches_buf = serde_json::to_vec(&branches)?;
-
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("tenant_list") {
-            let tenants = crate::tenant_mgr::list_tenants()?;
-            let tenants_buf = serde_json::to_vec(&tenants)?;
-
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&BeMessage::DataRow(&[Some(&tenants_buf)]))?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("tenant_create") {
-            let err = || anyhow!("invalid tenant_create: '{}'", query_string);
-
-            // tenant_create <tenantid>
-            let re = Regex::new(r"^tenant_create ([[:xdigit:]]+)$").unwrap();
-            let caps = re.captures(query_string).ok_or_else(err)?;
-
-            self.check_permission(None)?;
-
-            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
-
-            tenant_mgr::create_repository_for_tenant(self.conf, tenantid)?;
-
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("status") {
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&HELLO_WORLD_ROW)?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.to_ascii_lowercase().starts_with("set ") {
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
            // on connect
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.starts_with("failpoints ") {
+            let (_, failpoints) = query_string.split_at("failpoints ".len());
+            for failpoint in failpoints.split(';') {
+                if let Some((name, actions)) = failpoint.split_once('=') {
+                    info!("cfg failpoint: {} {}", name, actions);
+                    fail::cfg(name, actions).unwrap();
+                } else {
+                    bail!("Invalid failpoints format");
+                }
+            }
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("do_gc ") {
            // Run GC immediately on given timeline.
            // FIXME: This is just for tests. See test_runner/batch_others/test_gc.py.
@@ -681,7 +682,7 @@ impl postgres_backend::Handler for PageServerHandler {

            let caps = re
                .captures(query_string)
-                .ok_or_else(|| anyhow!("invalid do_gc: '{}'", query_string))?;
+                .with_context(|| format!("invalid do_gc: '{}'", query_string))?;

            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
@@ -691,74 +692,43 @@ impl postgres_backend::Handler for PageServerHandler {
                .unwrap_or(Ok(self.conf.gc_horizon))?;

            let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-
            let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
-
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
-                RowDescriptor::int8_col(b"layer_relfiles_total"),
-                RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
-                RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
-                RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
-                RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
-                RowDescriptor::int8_col(b"layer_relfiles_removed"),
-                RowDescriptor::int8_col(b"layer_relfiles_dropped"),
-                RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
-                RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
-                RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
-                RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
-                RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
-                RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
-                RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
+                RowDescriptor::int8_col(b"layers_total"),
+                RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
+                RowDescriptor::int8_col(b"layers_needed_by_branches"),
+                RowDescriptor::int8_col(b"layers_not_updated"),
+                RowDescriptor::int8_col(b"layers_removed"),
                RowDescriptor::int8_col(b"elapsed"),
            ]))?
            .write_message_noflush(&BeMessage::DataRow(&[
-                Some(result.ondisk_relfiles_total.to_string().as_bytes()),
-                Some(
-                    result
-                        .ondisk_relfiles_needed_by_cutoff
-                        .to_string()
-                        .as_bytes(),
-                ),
-                Some(
-                    result
-                        .ondisk_relfiles_needed_by_branches
-                        .to_string()
-                        .as_bytes(),
-                ),
-                Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
-                Some(
-                    result
-                        .ondisk_relfiles_needed_as_tombstone
-                        .to_string()
-                        .as_bytes(),
-                ),
-                Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
-                Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
-                Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
-                Some(
-                    result
-                        .ondisk_nonrelfiles_needed_by_cutoff
-                        .to_string()
-                        .as_bytes(),
-                ),
-                Some(
-                    result
-                        .ondisk_nonrelfiles_needed_by_branches
-                        .to_string()
-                        .as_bytes(),
-                ),
-                Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
-                Some(
-                    result
-                        .ondisk_nonrelfiles_needed_as_tombstone
-                        .to_string()
-                        .as_bytes(),
-                ),
-                Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
-                Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
+                Some(result.layers_total.to_string().as_bytes()),
+                Some(result.layers_needed_by_cutoff.to_string().as_bytes()),
+                Some(result.layers_needed_by_branches.to_string().as_bytes()),
+                Some(result.layers_not_updated.to_string().as_bytes()),
+                Some(result.layers_removed.to_string().as_bytes()),
                Some(result.elapsed.as_millis().to_string().as_bytes()),
            ]))?
            .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.starts_with("checkpoint ") {
+            // Run checkpoint immediately on given timeline.
+
+            // checkpoint <tenant_id> <timeline_id>
+            let re = Regex::new(r"^checkpoint ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap();
+
+            let caps = re
+                .captures(query_string)
+                .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?;
+
+            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
+            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
+
+            let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
+                .context("Failed to fetch local timeline for checkpoint request")?;
+
+            timeline.tline.checkpoint(CheckpointConfig::Forced)?;
+            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
+                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else {
            bail!("unknown command");
        }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
--- a/pageserver/src/relish.rs
+++ b/pageserver/src/relish.rs
@@ -1,4 +1,6 @@
 //!
+//! FIXME: relishes are obsolete
+//!
 //! Zenith stores PostgreSQL relations, and some other files, in the
 //! repository.  The relations (i.e. tables and indexes) take up most
 //! of the space in a typical installation, while the other files are
@@ -24,110 +26,11 @@
 //!

 use serde::{Deserialize, Serialize};
+use std::cmp::Ordering;
 use std::fmt;

 use postgres_ffi::relfile_utils::forknumber_to_name;
-use postgres_ffi::{Oid, TransactionId};
-
-///
-/// RelishTag identifies one relish.
-///
-#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
-pub enum RelishTag {
-    // Relations correspond to PostgreSQL relation forks. Each
-    // PostgreSQL relation fork is considered a separate relish.
-    Relation(RelTag),
-
-    // SLRUs include pg_clog, pg_multixact/members, and
-    // pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
-    // they don't need to be stored permanently (e.g. pg_subtrans),
-    // or we do not support them in zenith yet (pg_commit_ts).
-    //
-    // These are currently never requested directly by the compute
-    // nodes, although in principle that would be possible. However,
-    // when a new compute node is created, these are included in the
-    // tarball that we send to the compute node to initialize the
-    // PostgreSQL data directory.
-    //
-    // Each SLRU segment in PostgreSQL is considered a separate
-    // relish. For example, pg_clog/0000, pg_clog/0001, and so forth.
-    //
-    // SLRU segments are divided into blocks, like relations.
-    Slru { slru: SlruKind, segno: u32 },
-
-    // Miscellaneous other files that need to be included in the
-    // tarball at compute node creation. These are non-blocky, and are
-    // expected to be small.
-
-    //
-    // FileNodeMap represents PostgreSQL's 'pg_filenode.map'
-    // files. They are needed to map catalog table OIDs to filenode
-    // numbers. Usually the mapping is done by looking up a relation's
-    // 'relfilenode' field in the 'pg_class' system table, but that
-    // doesn't work for 'pg_class' itself and a few other such system
-    // relations. See PostgreSQL relmapper.c for details.
-    //
-    // Each database has a map file for its local mapped catalogs,
-    // and there is a separate map file for shared catalogs.
-    //
-    // These files are always 512 bytes long (although we don't check
-    // or care about that in the page server).
-    //
-    FileNodeMap { spcnode: Oid, dbnode: Oid },
-
-    //
-    // State files for prepared transactions (e.g pg_twophase/1234)
-    //
-    TwoPhase { xid: TransactionId },
-
-    // The control file, stored in global/pg_control
-    ControlFile,
-
-    // Special entry that represents PostgreSQL checkpoint. It doesn't
-    // correspond to to any physical file in PostgreSQL, but we use it
-    // to track fields needed to restore the checkpoint data in the
-    // control file, when a compute node is created.
-    Checkpoint,
-}
-
-impl RelishTag {
-    pub const fn is_blocky(&self) -> bool {
-        match self {
-            // These relishes work with blocks
-            RelishTag::Relation(_) | RelishTag::Slru { slru: _, segno: _ } => true,
-
-            // and these don't
-            RelishTag::FileNodeMap {
-                spcnode: _,
-                dbnode: _,
-            }
-            | RelishTag::TwoPhase { xid: _ }
-            | RelishTag::ControlFile
-            | RelishTag::Checkpoint => false,
-        }
-    }
-
-    // Physical relishes represent files and use
-    // RelationSizeEntry to track existing and dropped files.
-    // They can be both blocky and non-blocky.
-    pub const fn is_physical(&self) -> bool {
-        match self {
-            // These relishes represent physical files
-            RelishTag::Relation(_)
-            | RelishTag::Slru { .. }
-            | RelishTag::FileNodeMap { .. }
-            | RelishTag::TwoPhase { .. } => true,
-
-            // and these don't
-            RelishTag::ControlFile | RelishTag::Checkpoint => false,
-        }
-    }
-
-    // convenience function to check if this relish is a normal relation.
-    pub const fn is_relation(&self) -> bool {
-        matches!(self, RelishTag::Relation(_))
-    }
-}
+use postgres_ffi::Oid;

 ///
 /// Relation data file segment id throughout the Postgres cluster.
@@ -144,7 +47,10 @@ impl RelishTag {
 /// are used for the same purpose.
 /// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
 ///
-#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
+// FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
+// Then we could replace the custom Ord and PartialOrd implementations below with
+// deriving them.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
@@ -152,6 +58,34 @@ pub struct RelTag {
    pub relnode: Oid,
 }

+impl PartialOrd for RelTag {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for RelTag {
+    fn cmp(&self, other: &Self) -> Ordering {
+        let mut cmp;
+
+        cmp = self.spcnode.cmp(&other.spcnode);
+        if cmp != Ordering::Equal {
+            return cmp;
+        }
+        cmp = self.dbnode.cmp(&other.dbnode);
+        if cmp != Ordering::Equal {
+            return cmp;
+        }
+        cmp = self.relnode.cmp(&other.relnode);
+        if cmp != Ordering::Equal {
+            return cmp;
+        }
+        cmp = self.forknum.cmp(&other.forknum);
+
+        cmp
+    }
+}
+
 /// Display RelTag in the same format that's used in most PostgreSQL debug messages:
 ///
 /// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
@@ -170,34 +104,6 @@ impl fmt::Display for RelTag {
    }
 }

-/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
-///
-/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
-///
-impl fmt::Display for RelishTag {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            RelishTag::Relation(rel) => rel.fmt(f),
-            RelishTag::Slru { slru, segno } => {
-                // e.g. pg_clog/0001
-                write!(f, "{}/{:04X}", slru.to_str(), segno)
-            }
-            RelishTag::FileNodeMap { spcnode, dbnode } => {
-                write!(f, "relmapper file for spc {} db {}", spcnode, dbnode)
-            }
-            RelishTag::TwoPhase { xid } => {
-                write!(f, "pg_twophase/{:08X}", xid)
-            }
-            RelishTag::ControlFile => {
-                write!(f, "control file")
-            }
-            RelishTag::Checkpoint => {
-                write!(f, "checkpoint")
-            }
-        }
-    }
-}
-
 ///
 /// Non-relation transaction status files (clog (a.k.a. pg_xact) and
 /// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer,
@@ -224,8 +130,3 @@ impl SlruKind {
        }
    }
 }
-
-pub const FIRST_NONREL_RELISH_TAG: RelishTag = RelishTag::Slru {
-    slru: SlruKind::Clog,
-    segno: 0,
-};
--- a/pageserver/src/remote_storage.rs
+++ b/pageserver/src/remote_storage.rs
@@ -5,11 +5,18 @@
 //! There are a few components the storage machinery consists of:
 //! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
 //!     * [`local_fs`] allows to use local file system as an external storage
-//!     * [`rust_s3`] uses AWS S3 bucket entirely as an external storage
+//!     * [`rust_s3`] uses AWS S3 bucket as an external storage
 //!
 //! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
+//! Synchronization internals are split into submodules
+//!     * [`storage_sync::compression`] for a custom remote storage format used to store timeline files in archives
+//!     * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files
+//!     * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively
 //!
-//! * public API via to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_upload`]
+//! * public API to interact with the external world:
+//!     * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization
+//!     * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks,
+//!       to be processed by the async loop
 //!
 //! Here's a schematic overview of all interactions backup and the rest of the pageserver perform:
 //!
@@ -17,10 +24,10 @@
 //! |                        |  - - - (init async loop) - - - ->  |                 |
 //! |                        |                                    |                 |
 //! |                        |  ------------------------------->  |      async      |
-//! |       pageserver       |   (schedule frozen layer upload)   | upload/download |
+//! |       pageserver       |    (enqueue timeline sync task)    | upload/download |
 //! |                        |                                    |      loop       |
 //! |                        |  <-------------------------------  |                 |
-//! |                        |    (register downloaded layers)    |                 |
+//! |                        |  (apply new timeline sync states)  |                 |
 //! +------------------------+                                    +---------<-------+
 //!                                                                         |
 //!                                                                         |
@@ -36,89 +43,263 @@
 //!                                                            | access to this storage |
 //!                                                            +------------------------+
 //!
-//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop unitialised, if configured so.
+//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
+//! The loop inits the storage connection and checks the remote files stored.
+//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
+//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can
+//! query their downloads later if they are accessed.
+//!
 //! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
-//! If the storage sync loop was successfully started before, pageserver schedules the new image uploads after every checkpoint.
+//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint.
+//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either).
 //! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
 //!
+//! Synchronization logic is able to communicate back with updated timeline sync states, [`TimelineSyncState`],
+//! submitted via [`crate::tenant_mgr::set_timeline_states`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
+//! Such submissions happen in two cases:
+//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
+//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
+//!
+//! When the pageserver terminates, the upload loop finishes a current sync task (if any) and exits.
+//!
 //! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`).
 //! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
 //! by the storage upload, if enabled.
-//! When a certain image gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same image state.
-//! No files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
-//! when the newer timeline is downloaded.
+//! Yet timeline cannot alter already existing files, and normally cannot remove those either: only a GC process is capable of removing unused files.
+//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable":
+//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state
+//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
+//! when the newer image is downloaded
 //!
-//! Meanwhile, the loop inits the storage connection and checks the remote files stored.
-//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
-//! Based on the remote image data, the storage sync logic queues image downloads, while accepting any potential upload tasks from pageserver and managing the tasks by their priority.
-//! On the image download, a [`crate::tenant_mgr::register_relish_download`] function is called to register the new image in pageserver, initializing all related threads and internal state.
-//!
-//! When the pageserver terminates, the upload loop finishes a current image sync task (if any) and exits.
+//! To optimize S3 storage (and access), the sync loop compresses the checkpoint files before placing them to S3, and uncompresses them back, keeping track of timeline files and metadata.
+//! Also, the remote file list is queried once only, at startup, to avoid possible extra costs and latency issues.
 //!
 //! NOTES:
 //! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage
 //! (i.e. using different directories in the local filesystem external storage), but totally up to the storage implementation and not covered with the trait API.
 //!
-//! * the uploads do not happen right after pageserver startup, they are registered when
-//!     1. pageserver does the checkpoint, which happens further in the future after the server start
-//!     2. pageserver loads the timeline from disk for the first time
-//!
-//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with bigger priority could be waiting already
-//!
-//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happens on an image scale: a big set of remote files,
-//! enough to represent (and recover, if needed) a certain timeline state. On the contrary, all internal storage CRUD calls are made per reilsh file from those images.
-//! This way, the synchronization is able to download the image partially, if some state was synced before, but exposes correctly synced images only.
+//! * the sync tasks may not be processed immediately after the submission: if they error and get re-enqueued, their execution might be backed off to ensure error cap is not exceeded too fast.
+//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.

 mod local_fs;
 mod rust_s3;
 mod storage_sync;

 use std::{
+    collections::HashMap,
+    ffi, fs,
    path::{Path, PathBuf},
-    thread,
 };

-use anyhow::{anyhow, ensure, Context};
+use anyhow::{bail, Context};
 use tokio::io;
-use zenith_utils::zid::{ZTenantId, ZTimelineId};
+use tracing::{error, info};
+use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};

-pub use self::storage_sync::schedule_timeline_upload;
+pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download};
 use self::{local_fs::LocalFs, rust_s3::S3};
 use crate::{
-    layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
-    PageServerConf, RemoteStorageKind,
+    config::{PageServerConf, RemoteStorageKind},
+    layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
+    repository::TimelineSyncState,
 };

+pub use storage_sync::compression;
+
+/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
+/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still,
+/// to simplify the receiving code.
+pub struct SyncStartupData {
+    /// A sync state, derived from initial comparison of local timeline files and the remote archives,
+    /// before any sync tasks are executed.
+    /// To reuse the local file scan logic, the timeline states are returned even if no sync loop gets started during init:
+    /// in this case, no remote files exist and all local timelines with correct metadata files are considered ready.
+    pub initial_timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
+}
+
 /// Based on the config, initiates the remote storage connection and starts a separate thread
 /// that ensures that pageserver and the remote storage are in sync with each other.
-/// If no external configuraion connection given, no thread or storage initialization is done.
-pub fn run_storage_sync_thread(
+/// If no remote storage configuration is given, no thread or storage initialization is done.
+/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states.
+pub fn start_local_timeline_sync(
    config: &'static PageServerConf,
-) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
+) -> anyhow::Result<SyncStartupData> {
+    let local_timeline_files = local_tenant_timeline_files(config)
+        .context("Failed to collect local tenant timeline files")?;
+
    match &config.remote_storage_config {
-        Some(storage_config) => {
-            let max_concurrent_sync = storage_config.max_concurrent_sync;
-            let handle = match &storage_config.storage {
-                RemoteStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
+        Some(storage_config) => match &storage_config.storage {
+            RemoteStorageKind::LocalFs(root) => {
+                info!("Using fs root '{}' as a remote storage", root.display());
+                storage_sync::spawn_storage_sync_thread(
                    config,
+                    local_timeline_files,
                    LocalFs::new(root.clone(), &config.workdir)?,
-                    max_concurrent_sync,
-                ),
-                RemoteStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
+                    storage_config.max_concurrent_sync,
+                    storage_config.max_sync_errors,
+                )
+            },
+            RemoteStorageKind::AwsS3(s3_config) => {
+                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
+                    s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
+                storage_sync::spawn_storage_sync_thread(
                    config,
+                    local_timeline_files,
                    S3::new(s3_config, &config.workdir)?,
-                    max_concurrent_sync,
-                ),
-            };
-            handle.map(Some)
+                    storage_config.max_concurrent_sync,
+                    storage_config.max_sync_errors,
+                )
+            },
+        }
+        .context("Failed to spawn the storage sync thread"),
+        None => {
+            info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
+            let mut initial_timeline_states: HashMap<
+                ZTenantId,
+                HashMap<ZTimelineId, TimelineSyncState>,
+            > = HashMap::new();
+            for (ZTenantTimelineId{tenant_id, timeline_id}, (timeline_metadata, _)) in
+                local_timeline_files
+            {
+                initial_timeline_states
+                    .entry(tenant_id)
+                    .or_default()
+                    .insert(
+                        timeline_id,
+                        TimelineSyncState::Ready(timeline_metadata.disk_consistent_lsn()),
+                    );
+            }
+            Ok(SyncStartupData {
+                initial_timeline_states,
+            })
        }
-        None => Ok(None),
    }
 }

+fn local_tenant_timeline_files(
+    config: &'static PageServerConf,
+) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>> {
+    let mut local_tenant_timeline_files = HashMap::new();
+    let tenants_dir = config.tenants_path();
+    for tenants_dir_entry in fs::read_dir(&tenants_dir)
+        .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
+    {
+        match &tenants_dir_entry {
+            Ok(tenants_dir_entry) => {
+                match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) {
+                    Ok(collected_files) => {
+                        local_tenant_timeline_files.extend(collected_files.into_iter())
+                    }
+                    Err(e) => error!(
+                        "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
+                        tenants_dir.display(),
+                        tenants_dir_entry,
+                        e
+                    ),
+                }
+            }
+            Err(e) => error!(
+                "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
+                tenants_dir_entry,
+                tenants_dir.display(),
+                e
+            ),
+        }
+    }
+
+    Ok(local_tenant_timeline_files)
+}
+
+fn collect_timelines_for_tenant(
+    config: &'static PageServerConf,
+    tenant_path: &Path,
+) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>> {
+    let mut timelines: HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)> =
+        HashMap::new();
+    let tenant_id = tenant_path
+        .file_name()
+        .and_then(ffi::OsStr::to_str)
+        .unwrap_or_default()
+        .parse::<ZTenantId>()
+        .context("Could not parse tenant id out of the tenant dir name")?;
+    let timelines_dir = config.timelines_path(&tenant_id);
+
+    for timelines_dir_entry in fs::read_dir(&timelines_dir).with_context(|| {
+        format!(
+            "Failed to list timelines dir entry for tenant {}",
+            tenant_id
+        )
+    })? {
+        match timelines_dir_entry {
+            Ok(timelines_dir_entry) => {
+                let timeline_path = timelines_dir_entry.path();
+                match collect_timeline_files(&timeline_path) {
+                    Ok((timeline_id, metadata, timeline_files)) => {
+                        timelines.insert(
+                            ZTenantTimelineId {
+                                tenant_id,
+                                timeline_id,
+                            },
+                            (metadata, timeline_files),
+                        );
+                    }
+                    Err(e) => error!(
+                        "Failed to process timeline dir contents at '{}', reason: {:?}",
+                        timeline_path.display(),
+                        e
+                    ),
+                }
+            }
+            Err(e) => error!(
+                "Failed to list timelines for entry tenant {}, reason: {:?}",
+                tenant_id, e
+            ),
+        }
+    }
+
+    Ok(timelines)
+}
+
+fn collect_timeline_files(
+    timeline_dir: &Path,
+) -> anyhow::Result<(ZTimelineId, TimelineMetadata, Vec<PathBuf>)> {
+    let mut timeline_files = Vec::new();
+    let mut timeline_metadata_path = None;
+
+    let timeline_id = timeline_dir
+        .file_name()
+        .and_then(ffi::OsStr::to_str)
+        .unwrap_or_default()
+        .parse::<ZTimelineId>()
+        .context("Could not parse timeline id out of the timeline dir name")?;
+    let timeline_dir_entries =
+        fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
+    for entry in timeline_dir_entries {
+        let entry_path = entry.context("Failed to list timeline dir entry")?.path();
+        if entry_path.is_file() {
+            if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) {
+                timeline_metadata_path = Some(entry_path);
+            } else {
+                timeline_files.push(entry_path);
+            }
+        }
+    }
+
+    let timeline_metadata_path = match timeline_metadata_path {
+        Some(path) => path,
+        None => bail!("No metadata file found in the timeline directory"),
+    };
+    let metadata = TimelineMetadata::from_bytes(
+        &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
+    )
+    .context("Failed to parse timeline metadata file bytes")?;
+
+    Ok((timeline_id, metadata, timeline_files))
+}
+
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
-/// providing basic CRUD operations with storage files.
+/// providing basic CRUD operations for storage files.
 #[async_trait::async_trait]
 trait RemoteStorage: Send + Sync {
    /// A way to uniquely reference a file in the remote storage.
@@ -127,8 +308,8 @@ trait RemoteStorage: Send + Sync {
    /// Attempts to derive the storage path out of the local path, if the latter is correct.
    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;

-    /// Gets the layered storage information about the given entry.
-    fn info(&self, storage_path: &Self::StoragePath) -> anyhow::Result<RemoteFileInfo>;
+    /// Gets the download path of the given storage file.
+    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf>;

    /// Lists all items the storage has right now.
    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;
@@ -159,16 +340,6 @@ trait RemoteStorage: Send + Sync {
    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
 }

-/// Information about a certain remote storage entry.
-#[derive(Debug, PartialEq, Eq)]
-struct RemoteFileInfo {
-    tenant_id: ZTenantId,
-    timeline_id: ZTimelineId,
-    /// Path in the pageserver workdir where the file should go to.
-    download_destination: PathBuf,
-    is_metadata: bool,
-}
-
 fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
    if prefix == path {
        anyhow::bail!(
@@ -185,147 +356,3 @@ fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a
        })
    }
 }
-
-fn parse_ids_from_path<'a, R: std::fmt::Display>(
-    path_segments: impl Iterator<Item = &'a str>,
-    path_log_representation: &R,
-) -> anyhow::Result<(ZTenantId, ZTimelineId)> {
-    let mut segments = path_segments.skip_while(|&segment| segment != TENANTS_SEGMENT_NAME);
-    let tenants_segment = segments.next().ok_or_else(|| {
-        anyhow!(
-            "Found no '{}' segment in the storage path '{}'",
-            TENANTS_SEGMENT_NAME,
-            path_log_representation
-        )
-    })?;
-    ensure!(
-        tenants_segment == TENANTS_SEGMENT_NAME,
-        "Failed to extract '{}' segment from storage path '{}'",
-        TENANTS_SEGMENT_NAME,
-        path_log_representation
-    );
-    let tenant_id = segments
-        .next()
-        .ok_or_else(|| {
-            anyhow!(
-                "Found no tenant id in the storage path '{}'",
-                path_log_representation
-            )
-        })?
-        .parse::<ZTenantId>()
-        .with_context(|| {
-            format!(
-                "Failed to parse tenant id from storage path '{}'",
-                path_log_representation
-            )
-        })?;
-
-    let timelines_segment = segments.next().ok_or_else(|| {
-        anyhow!(
-            "Found no '{}' segment in the storage path '{}'",
-            TIMELINES_SEGMENT_NAME,
-            path_log_representation
-        )
-    })?;
-    ensure!(
-        timelines_segment == TIMELINES_SEGMENT_NAME,
-        "Failed to extract '{}' segment from storage path '{}'",
-        TIMELINES_SEGMENT_NAME,
-        path_log_representation
-    );
-    let timeline_id = segments
-        .next()
-        .ok_or_else(|| {
-            anyhow!(
-                "Found no timeline id in the storage path '{}'",
-                path_log_representation
-            )
-        })?
-        .parse::<ZTimelineId>()
-        .with_context(|| {
-            format!(
-                "Failed to parse timeline id from storage path '{}'",
-                path_log_representation
-            )
-        })?;
-
-    Ok((tenant_id, timeline_id))
-}
-
-/// A set of common test utils to share in unit tests inside the module tree.
-#[cfg(test)]
-mod test_utils {
-    use std::path::{Path, PathBuf};
-
-    use anyhow::ensure;
-
-    use crate::{
-        layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
-        repository::repo_harness::{RepoHarness, TIMELINE_ID},
-    };
-
-    /// Gives a timeline path with pageserver workdir stripped off.
-    pub fn relative_timeline_path(harness: &RepoHarness) -> anyhow::Result<PathBuf> {
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-        Ok(timeline_path
-            .strip_prefix(&harness.conf.workdir)?
-            .to_path_buf())
-    }
-
-    /// Creates a path with custom tenant id in one of its segments.
-    /// Useful for emulating paths with wrong ids.
-    pub fn custom_tenant_id_path(
-        path_with_tenant_id: &Path,
-        new_tenant_id: &str,
-    ) -> anyhow::Result<PathBuf> {
-        let mut new_path = PathBuf::new();
-        let mut is_tenant_id = false;
-        let mut tenant_id_replaced = false;
-        for segment in path_with_tenant_id {
-            match segment.to_str() {
-                Some(TENANTS_SEGMENT_NAME) => is_tenant_id = true,
-                Some(_tenant_id_str) if is_tenant_id => {
-                    is_tenant_id = false;
-                    new_path.push(new_tenant_id);
-                    tenant_id_replaced = true;
-                    continue;
-                }
-                _ => {}
-            }
-            new_path.push(segment)
-        }
-
-        ensure!(tenant_id_replaced, "Found no tenant id segment to replace");
-        Ok(new_path)
-    }
-
-    /// Creates a path with custom timeline id in one of its segments.
-    /// Useful for emulating paths with wrong ids.
-    pub fn custom_timeline_id_path(
-        path_with_timeline_id: &Path,
-        new_timeline_id: &str,
-    ) -> anyhow::Result<PathBuf> {
-        let mut new_path = PathBuf::new();
-        let mut is_timeline_id = false;
-        let mut timeline_id_replaced = false;
-        for segment in path_with_timeline_id {
-            match segment.to_str() {
-                Some(TIMELINES_SEGMENT_NAME) => is_timeline_id = true,
-                Some(_timeline_id_str) if is_timeline_id => {
-                    is_timeline_id = false;
-                    new_path.push(new_timeline_id);
-                    timeline_id_replaced = true;
-                    continue;
-                }
-                _ => {}
-            }
-            new_path.push(segment)
-        }
-
-        ensure!(
-            timeline_id_replaced,
-            "Found no timeline id segment to replace"
-        );
-        Ok(new_path)
-    }
-}
--- a/pageserver/src/remote_storage/README.md
+++ b/pageserver/src/remote_storage/README.md
@@ -16,8 +16,16 @@ This way, the backups are managed in background, not affecting directly other pa

 Current implementation
 * provides remote storage wrappers for AWS S3 and local FS
-* uploads layers, frozen by pageserver checkpoint thread
-* downloads and registers layers, found on the remote storage, but missing locally
+* synchronizes the differences between local timelines and remote states as fast as possible
+* uploads new relishes, frozen by pageserver checkpoint thread
+* downloads and registers timelines, found on the remote storage, but missing locally, if those are requested somehow via pageserver (e.g. http api, gc)
+* uses compression when dealing with files, for better S3 usage
+* maintains an index of what's stored remotely
+* evicts failing tasks and stops the corresponding timelines
+
+The tasks are delayed with every retry and the retries are capped, to avoid poisonous tasks.
+After any task eviction, or any error at startup checks (e.g. obviously different and wrong local and remote states for the same timeline),
+the timeline has to be stopped from submitting further checkpoint upload tasks, which is done along the corresponding timeline status change.

 No good optimisations or performance testing is done, the feature is disabled by default and gets polished over time.
 It's planned to deal with all questions that are currently on and prepare the feature to be enabled by default in cloud environments.
@@ -27,28 +35,16 @@ It's planned to deal with all questions that are currently on and prepare the fe
 As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
 Here's the list of known compromises with comments:

-* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage.
-This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
-AWS charges per API call and for traffic either, layers are expected to be updated frequently, so this model most probably is ineffective.
-Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
+* Remote storage file model is currently a custom archive format that cannot be deserialized without our own Rust code (including `serde`).
+We also don't optimize the archiving and pack every timeline checkpoint separately, so the size of the resulting blob that gets uploaded to S3 could be arbitrary.
+But, it's a single blob, which is way better than storing ~780 small files separately.

-Storage sync API operates images when backing up or restoring a backup, so we're fluent to repack the layer contents the way we want to, which most probably will be done later.
+* Archive index restoration requires reading every blob's head.
+This could be avoided by a background thread/future storing the serialized index in the remote storage.

 * no proper file comparison

-Currently, every layer contains `Lsn` in their name, to map the data it holds against a certain DB state.
-Then the images with same ids and different `Lsn`'s are compared, files are considered equal if their local file paths are equal (for remote files, "local file path" is their download destination).
-No file contents assertion is done currently, but should be.
-AWS S3 returns file checksums during the `list` operation, so that can be used to ensure the backup consistency, but that needs further research and, since current pageserver impl also needs to deal with layer file checksums.
-
-For now, due to this, we consider local workdir files as source of truth, not removing them ever and adjusting remote files instead, if image files mismatch.
-
-* no proper retry management
-
-Now, the storage sync attempts to redo the upload/download operation for the image files that failed.
-No proper task eviction or backpressure is implemented currently: the tasks will stay in the queue forever, reattempting the downloads.
-
-This will be fixed when more details on the file consistency model will be agreed on.
+No file checksum assertion is done currently, but should be (AWS S3 returns file checksums during the `list` operation)

 * sad rust-s3 api

@@ -62,21 +58,15 @@ But it's already used in the project, so for now it's reused to avoid bloating t
 Based on previous evaluation, even `rusoto-s3` could be a better choice over this library, but needs further benchmarking.


-* gc and branches are ignored
+* gc is ignored

-So far, we don't consider non-main images and don't adjust the remote storage based on GC thread loop results.
-Only checkpointer loop affects the remote storage.
+So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage.
+Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives.

-* more layers should be downloaded on demand
+* branches implementation could be improved

-Since we download and load remote layers into pageserver, there's a possibility a need for those layers' ancestors arise.
-Most probably, every downloaded image's ancestor is not present in locally too, but currently there's no logic for downloading such ancestors and their metadata,
-so the pageserver is unable to respond property on requests to such ancestors.
+Currently, there's code to sync the branches along with the timeline files: on upload, all local branch files that are missing remotely are uploaded,
+on the timeline download, missing remote branch files are downloaded.

-To implement the downloading, more `tenant_mgr` refactoring is needed to properly handle web requests for layers and handle the state changes.
-[Here](https://github.com/zenithdb/zenith/pull/689#issuecomment-931216193) are the details about initial state management updates needed.
-
-* no IT tests
-
-Automated S3 testing is lacking currently, due to no convenient way to enable backups during the tests.
-After it's fixed, benchmark runs should also be carried out to find bottlenecks.
+A branch is a per-tenant entity, yet the current implementation requires synchronizing a timeline first to get the branch files locally.
+Currently, there's no other way to know about the remote branch files, and the file contents are neither verified nor updated.
--- a/pageserver/src/remote_storage/local_fs.rs
+++ b/pageserver/src/remote_storage/local_fs.rs
@@ -5,7 +5,6 @@
 //! volume is mounted to the local FS.

 use std::{
-    ffi::OsStr,
    future::Future,
    path::{Path, PathBuf},
    pin::Pin,
@@ -18,8 +17,7 @@ use tokio::{
 };
 use tracing::*;

-use super::{parse_ids_from_path, strip_path_prefix, RemoteFileInfo, RemoteStorage};
-use crate::layered_repository::metadata::METADATA_FILE_NAME;
+use super::{strip_path_prefix, RemoteStorage};

 pub struct LocalFs {
    pageserver_workdir: &'static Path,
@@ -68,26 +66,14 @@ impl RemoteStorage for LocalFs {
        ))
    }

-    fn info(&self, storage_path: &Self::StoragePath) -> anyhow::Result<RemoteFileInfo> {
-        let is_metadata =
-            storage_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME);
+    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
        let relative_path = strip_path_prefix(&self.root, storage_path)
            .context("local path does not belong to this storage")?;
-        let download_destination = self.pageserver_workdir.join(relative_path);
-        let (tenant_id, timeline_id) = parse_ids_from_path(
-            relative_path.iter().filter_map(|segment| segment.to_str()),
-            &relative_path.display(),
-        )?;
-        Ok(RemoteFileInfo {
-            tenant_id,
-            timeline_id,
-            download_destination,
-            is_metadata,
-        })
+        Ok(self.pageserver_workdir.join(relative_path))
    }

    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
-        Ok(get_all_files(&self.root).await?.into_iter().collect())
+        get_all_files(&self.root).await
    }

    async fn upload(
@@ -113,11 +99,18 @@ impl RemoteStorage for LocalFs {

        io::copy(&mut from, &mut destination)
            .await
-            .context("Failed to upload a file to the local storage")?;
-        destination
-            .flush()
-            .await
-            .context("Failed to upload a file to the local storage")?;
+            .with_context(|| {
+                format!(
+                    "Failed to upload file to the local storage at '{}'",
+                    target_file_path.display()
+                )
+            })?;
+        destination.flush().await.with_context(|| {
+            format!(
+                "Failed to upload file to the local storage at '{}'",
+                target_file_path.display()
+            )
+        })?;
        Ok(())
    }

@@ -141,9 +134,13 @@ impl RemoteStorage for LocalFs {
                        )
                    })?,
            );
-            io::copy(&mut source, to)
-                .await
-                .context("Failed to download a file from the local storage")?;
+            io::copy(&mut source, to).await.with_context(|| {
+                format!(
+                    "Failed to download file '{}' from the local storage",
+                    file_path.display()
+                )
+            })?;
+            source.flush().await?;
            Ok(())
        } else {
            bail!(
@@ -275,9 +272,6 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
 mod pure_tests {
    use crate::{
        layered_repository::metadata::METADATA_FILE_NAME,
-        remote_storage::test_utils::{
-            custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
-        },
        repository::repo_harness::{RepoHarness, TIMELINE_ID},
    };

@@ -345,8 +339,8 @@ mod pure_tests {
    }

    #[test]
-    fn info_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("info_positive")?;
+    fn local_path_positive() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("local_path_positive")?;
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
            pageserver_workdir: &repo_harness.conf.workdir,
@@ -356,15 +350,12 @@ mod pure_tests {
        let name = "not a metadata";
        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
        assert_eq!(
-            RemoteFileInfo {
-                tenant_id: repo_harness.tenant_id,
-                timeline_id: TIMELINE_ID,
-                download_destination: local_path.clone(),
-                is_metadata: false,
-            },
+            local_path,
            storage
-                .info(&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?))
-                .expect("For a valid input, valid S3 info should be parsed"),
+                .local_path(
+                    &storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?)
+                )
+                .expect("For a valid input, valid local path should be parsed"),
            "Should be able to parse metadata out of the correctly named remote delta file"
        );

@@ -373,15 +364,10 @@ mod pure_tests {
            .join(METADATA_FILE_NAME);
        let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
        assert_eq!(
-            RemoteFileInfo {
-                tenant_id: repo_harness.tenant_id,
-                timeline_id: TIMELINE_ID,
-                download_destination: local_metadata_path,
-                is_metadata: true,
-            },
+            local_metadata_path,
            storage
-                .info(&remote_metadata_path)
-                .expect("For a valid input, valid S3 info should be parsed"),
+                .local_path(&remote_metadata_path)
+                .expect("For a valid input, valid local path should be parsed"),
            "Should be able to parse metadata out of the correctly named remote metadata file"
        );

@@ -389,53 +375,30 @@ mod pure_tests {
    }

    #[test]
-    fn info_negatives() -> anyhow::Result<()> {
+    fn local_path_negatives() -> anyhow::Result<()> {
        #[track_caller]
-        #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.info` parameter requirements
-        fn storage_info_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
-            match storage.info(storage_path) {
-                Ok(wrong_info) => panic!(
-                    "Expected storage path input {:?} to cause an error, but got file info: {:?}",
-                    storage_path, wrong_info,
+        #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements
+        fn local_path_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
+            match storage.local_path(storage_path) {
+                Ok(wrong_path) => panic!(
+                    "Expected local path input {:?} to cause an error, but got file path: {:?}",
+                    storage_path, wrong_path,
                ),
                Err(e) => format!("{:?}", e),
            }
        }

-        let repo_harness = RepoHarness::create("info_negatives")?;
+        let repo_harness = RepoHarness::create("local_path_negatives")?;
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root.clone(),
+            root: storage_root,
        };

        let totally_wrong_path = "wrong_wrong_wrong";
-        let error_message = storage_info_error(&storage, &PathBuf::from(totally_wrong_path));
+        let error_message = local_path_error(&storage, &PathBuf::from(totally_wrong_path));
        assert!(error_message.contains(totally_wrong_path));

-        let relative_timeline_path = relative_timeline_path(&repo_harness)?;
-
-        let relative_file_path = custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?
-            .join("wrong_tenant_id_name");
-        let wrong_tenant_id_path = storage_root.join(&relative_file_path);
-        let error_message = storage_info_error(&storage, &wrong_tenant_id_path);
-        assert!(
-            error_message.contains(relative_file_path.to_str().unwrap()),
-            "Error message '{}' does not contain the expected substring",
-            error_message
-        );
-
-        let relative_file_path =
-            custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?
-                .join("wrong_timeline_id_name");
-        let wrong_timeline_id_path = storage_root.join(&relative_file_path);
-        let error_message = storage_info_error(&storage, &wrong_timeline_id_path);
-        assert!(
-            error_message.contains(relative_file_path.to_str().unwrap()),
-            "Error message '{}' does not contain the expected substring",
-            error_message
-        );
-
        Ok(())
    }

@@ -451,7 +414,7 @@ mod pure_tests {
        };

        let storage_path = dummy_storage.storage_path(&original_path)?;
-        let download_destination = dummy_storage.info(&storage_path)?.download_destination;
+        let download_destination = dummy_storage.local_path(&storage_path)?;

        assert_eq!(
            original_path, download_destination,
@@ -465,9 +428,7 @@ mod pure_tests {
 #[cfg(test)]
 mod fs_tests {
    use super::*;
-    use crate::{
-        remote_storage::test_utils::relative_timeline_path, repository::repo_harness::RepoHarness,
-    };
+    use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};

    use std::io::Write;
    use tempfile::tempdir;
@@ -597,7 +558,7 @@ mod fs_tests {
        assert_eq!(
            first_part_local,
            first_part_remote.as_slice(),
-            "First part bytes should be returned when requrested"
+            "First part bytes should be returned when requested"
        );

        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
@@ -614,7 +575,7 @@ mod fs_tests {
        assert_eq!(
            second_part_local,
            second_part_remote.as_slice(),
-            "Second part bytes should be returned when requrested"
+            "Second part bytes should be returned when requested"
        );

        Ok(())
@@ -680,14 +641,13 @@ mod fs_tests {
    }

    async fn upload_dummy_file(
-        harness: &RepoHarness,
+        harness: &RepoHarness<'_>,
        storage: &LocalFs,
        name: &str,
    ) -> anyhow::Result<PathBuf> {
-        let storage_path = storage
-            .root
-            .join(relative_timeline_path(harness)?)
-            .join(name);
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+        let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
+        let storage_path = storage.root.join(relative_timeline_path).join(name);
        storage
            .upload(
                create_file_for_upload(
--- a/pageserver/src/remote_storage/rust_s3.rs
+++ b/pageserver/src/remote_storage/rust_s3.rs
@@ -1,17 +1,19 @@
 //! AWS S3 storage wrapper around `rust_s3` library.
-//! Currently does not allow multiple pageservers to use the same bucket concurrently: objects are
-//! placed in the root of the bucket.
+//!
+//! Respects `prefix_in_bucket` property from [`S3Config`],
+//! allowing multiple pageservers to independently work with the same S3 bucket, if
+//! their bucket prefixes are both specified and different.

 use std::path::{Path, PathBuf};

 use anyhow::Context;
 use s3::{bucket::Bucket, creds::Credentials, region::Region};
 use tokio::io::{self, AsyncWriteExt};
+use tracing::debug;

 use crate::{
-    layered_repository::metadata::METADATA_FILE_NAME,
-    remote_storage::{parse_ids_from_path, strip_path_prefix, RemoteFileInfo, RemoteStorage},
-    S3Config,
+    config::S3Config,
+    remote_storage::{strip_path_prefix, RemoteStorage},
 };

 const S3_FILE_SEPARATOR: char = '/';
@@ -24,8 +26,26 @@ impl S3ObjectKey {
        &self.0
    }

-    fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf {
-        pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>())
+    fn download_destination(
+        &self,
+        pageserver_workdir: &Path,
+        prefix_to_strip: Option<&str>,
+    ) -> PathBuf {
+        let path_without_prefix = match prefix_to_strip {
+            Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| {
+                panic!(
+                    "Could not strip prefix '{}' from S3 object key '{}'",
+                    prefix, self.0
+                )
+            }),
+            None => &self.0,
+        };
+
+        pageserver_workdir.join(
+            path_without_prefix
+                .split(S3_FILE_SEPARATOR)
+                .collect::<PathBuf>(),
+        )
    }
 }

@@ -33,15 +53,27 @@ impl S3ObjectKey {
 pub struct S3 {
    pageserver_workdir: &'static Path,
    bucket: Bucket,
+    prefix_in_bucket: Option<String>,
 }

 impl S3 {
    /// Creates the storage, errors if incorrect AWS S3 configuration provided.
    pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
-        let region = aws_config
-            .bucket_region
-            .parse::<Region>()
-            .context("Failed to parse the s3 region from config")?;
+        debug!(
+            "Creating s3 remote storage around bucket {}",
+            aws_config.bucket_name
+        );
+        let region = match aws_config.endpoint.clone() {
+            Some(endpoint) => Region::Custom {
+                endpoint,
+                region: aws_config.bucket_region.clone(),
+            },
+            None => aws_config
+                .bucket_region
+                .parse::<Region>()
+                .context("Failed to parse the s3 region from config")?,
+        };
+
        let credentials = Credentials::new(
            aws_config.access_key_id.as_deref(),
            aws_config.secret_access_key.as_deref(),
@@ -50,6 +82,20 @@ impl S3 {
            None,
        )
        .context("Failed to create the s3 credentials")?;
+
+        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
+            let mut prefix = prefix;
+            while prefix.starts_with(S3_FILE_SEPARATOR) {
+                prefix = &prefix[1..]
+            }
+
+            let mut prefix = prefix.to_string();
+            while prefix.ends_with(S3_FILE_SEPARATOR) {
+                prefix.pop();
+            }
+            prefix
+        });
+
        Ok(Self {
            bucket: Bucket::new_with_path_style(
                aws_config.bucket_name.as_str(),
@@ -58,6 +104,7 @@ impl S3 {
            )
            .context("Failed to create the s3 bucket")?,
            pageserver_workdir,
+            prefix_in_bucket,
        })
    }
 }
@@ -68,7 +115,7 @@ impl RemoteStorage for S3 {

    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
        let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
-        let mut key = String::new();
+        let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
        for segment in relative_path {
            key.push(S3_FILE_SEPARATOR);
            key.push_str(&segment.to_string_lossy());
@@ -76,25 +123,15 @@ impl RemoteStorage for S3 {
        Ok(S3ObjectKey(key))
    }

-    fn info(&self, storage_path: &Self::StoragePath) -> anyhow::Result<RemoteFileInfo> {
-        let storage_path_key = &storage_path.0;
-        let is_metadata =
-            storage_path_key.ends_with(&format!("{}{}", S3_FILE_SEPARATOR, METADATA_FILE_NAME));
-        let download_destination = storage_path.download_destination(self.pageserver_workdir);
-        let (tenant_id, timeline_id) =
-            parse_ids_from_path(storage_path_key.split(S3_FILE_SEPARATOR), storage_path_key)?;
-        Ok(RemoteFileInfo {
-            tenant_id,
-            timeline_id,
-            download_destination,
-            is_metadata,
-        })
+    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
+        Ok(storage_path
+            .download_destination(self.pageserver_workdir, self.prefix_in_bucket.as_deref()))
    }

    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
        let list_response = self
            .bucket
-            .list(String::new(), None)
+            .list(self.prefix_in_bucket.clone().unwrap_or_default(), None)
            .await
            .context("Failed to list s3 objects")?;

@@ -212,9 +249,7 @@ impl RemoteStorage for S3 {
 #[cfg(test)]
 mod tests {
    use crate::{
-        remote_storage::test_utils::{
-            custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
-        },
+        layered_repository::metadata::METADATA_FILE_NAME,
        repository::repo_harness::{RepoHarness, TIMELINE_ID},
    };

@@ -239,7 +274,7 @@ mod tests {

        assert_eq!(
            local_path,
-            key.download_destination(&repo_harness.conf.workdir),
+            key.download_destination(&repo_harness.conf.workdir, None),
            "Download destination should consist of s3 path joined with the pageserver workdir prefix"
        );

@@ -253,14 +288,18 @@ mod tests {
        let segment_1 = "matching";
        let segment_2 = "file";
        let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
+
+        let storage = dummy_storage(&repo_harness.conf.workdir);
+
        let expected_key = S3ObjectKey(format!(
-            "{SEPARATOR}{}{SEPARATOR}{}",
+            "{}{SEPARATOR}{}{SEPARATOR}{}",
+            storage.prefix_in_bucket.as_deref().unwrap_or_default(),
            segment_1,
            segment_2,
            SEPARATOR = S3_FILE_SEPARATOR,
        ));

-        let actual_key = dummy_storage(&repo_harness.conf.workdir)
+        let actual_key = storage
            .storage_path(local_path)
            .expect("Matching path should map to S3 path normally");
        assert_eq!(
@@ -316,35 +355,38 @@ mod tests {
    }

    #[test]
-    fn info_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("info_positive")?;
+    fn local_path_positive() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("local_path_positive")?;
        let storage = dummy_storage(&repo_harness.conf.workdir);
-        let relative_timeline_path = relative_timeline_path(&repo_harness)?;
+        let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
+        let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?;

-        let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata"));
+        let s3_key = create_s3_key(
+            &relative_timeline_path.join("not a metadata"),
+            storage.prefix_in_bucket.as_deref(),
+        );
        assert_eq!(
-            RemoteFileInfo {
-                tenant_id: repo_harness.tenant_id,
-                timeline_id: TIMELINE_ID,
-                download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
-                is_metadata: false,
-            },
+            s3_key.download_destination(
+                &repo_harness.conf.workdir,
+                storage.prefix_in_bucket.as_deref()
+            ),
            storage
-                .info(&s3_key)
+                .local_path(&s3_key)
                .expect("For a valid input, valid S3 info should be parsed"),
            "Should be able to parse metadata out of the correctly named remote delta file"
        );

-        let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME));
+        let s3_key = create_s3_key(
+            &relative_timeline_path.join(METADATA_FILE_NAME),
+            storage.prefix_in_bucket.as_deref(),
+        );
        assert_eq!(
-            RemoteFileInfo {
-                tenant_id: repo_harness.tenant_id,
-                timeline_id: TIMELINE_ID,
-                download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
-                is_metadata: true,
-            },
+            s3_key.download_destination(
+                &repo_harness.conf.workdir,
+                storage.prefix_in_bucket.as_deref()
+            ),
            storage
-                .info(&s3_key)
+                .local_path(&s3_key)
                .expect("For a valid input, valid S3 info should be parsed"),
            "Should be able to parse metadata out of the correctly named remote metadata file"
        );
@@ -352,43 +394,6 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn info_negatives() -> anyhow::Result<()> {
-        #[track_caller]
-        fn storage_info_error(storage: &S3, s3_key: &S3ObjectKey) -> String {
-            match storage.info(s3_key) {
-                Ok(wrong_info) => panic!(
-                    "Expected key {:?} to error, but got file info: {:?}",
-                    s3_key, wrong_info,
-                ),
-                Err(e) => e.to_string(),
-            }
-        }
-
-        let repo_harness = RepoHarness::create("info_negatives")?;
-        let storage = dummy_storage(&repo_harness.conf.workdir);
-        let relative_timeline_path = relative_timeline_path(&repo_harness)?;
-
-        let totally_wrong_path = "wrong_wrong_wrong";
-        let error_message =
-            storage_info_error(&storage, &S3ObjectKey(totally_wrong_path.to_string()));
-        assert!(error_message.contains(totally_wrong_path));
-
-        let wrong_tenant_id = create_s3_key(
-            &custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?.join("name"),
-        );
-        let error_message = storage_info_error(&storage, &wrong_tenant_id);
-        assert!(error_message.contains(&wrong_tenant_id.0));
-
-        let wrong_timeline_id = create_s3_key(
-            &custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?.join("name"),
-        );
-        let error_message = storage_info_error(&storage, &wrong_timeline_id);
-        assert!(error_message.contains(&wrong_timeline_id.0));
-
-        Ok(())
-    }
-
    #[test]
    fn download_destination_matches_original_path() -> anyhow::Result<()> {
        let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
@@ -397,7 +402,7 @@ mod tests {
        let dummy_storage = dummy_storage(&repo_harness.conf.workdir);

        let key = dummy_storage.storage_path(&original_path)?;
-        let download_destination = dummy_storage.info(&key)?.download_destination;
+        let download_destination = dummy_storage.local_path(&key)?;

        assert_eq!(
            original_path, download_destination,
@@ -416,18 +421,18 @@ mod tests {
                Credentials::anonymous().unwrap(),
            )
            .unwrap(),
+            prefix_in_bucket: Some("dummy_prefix/".to_string()),
        }
    }

-    fn create_s3_key(relative_file_path: &Path) -> S3ObjectKey {
-        S3ObjectKey(
-            relative_file_path
-                .iter()
-                .fold(String::new(), |mut path_string, segment| {
-                    path_string.push(S3_FILE_SEPARATOR);
-                    path_string.push_str(segment.to_str().unwrap());
-                    path_string
-                }),
-        )
+    fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> S3ObjectKey {
+        S3ObjectKey(relative_file_path.iter().fold(
+            prefix.unwrap_or_default().to_string(),
+            |mut path_string, segment| {
+                path_string.push(S3_FILE_SEPARATOR);
+                path_string.push_str(segment.to_str().unwrap());
+                path_string
+            },
+        ))
    }
 }
--- a/pageserver/src/remote_storage/storage_sync.rs
+++ b/pageserver/src/remote_storage/storage_sync.rs
--- a/pageserver/src/remote_storage/storage_sync/compression.rs
+++ b/pageserver/src/remote_storage/storage_sync/compression.rs
@@ -0,0 +1,613 @@
+//! A set of structs to represent a compressed part of the timeline, and methods to asynchronously compress and uncompress a stream of data,
+//! without holding the entire data in memory.
+//! For the latter, both compress and uncompress functions operate on buffered streams (currently hardcoded size of [`ARCHIVE_STREAM_BUFFER_SIZE_BYTES`]),
+//! not attempting to hold the entire archive in memory.
+//!
+//! The compression is done with <a href="https://datatracker.ietf.org/doc/html/rfc8878">zstd</a> streaming algorithm via the `async-compression` crate.
+//! The crate does not contain any knobs to tweak the compression, but otherwise is one of the only ones that's both async and has an API to manage the part of an archive.
+//! Zstd was picked as the best algorithm among the ones available in the crate, after testing the initial timeline file compression.
+//!
+//! Archiving is almost agnostic to timeline file types, with an exception of the metadata file, that's currently distinguished in the [un]compression code.
+//! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting the metadata file.
+//! When compressed, the metadata file is always required and stored as the last file in the archive stream.
+//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other relishes are decompressed successfully first.
+//!
+//! Archive structure:
+//! +----------------------------------------+
+//! | header | file_1, ..., file_k, metadata |
+//! +----------------------------------------+
+//!
+//! The archive consists of two separate zstd archives:
+//! * header archive, that contains all file names and their sizes and relative paths in the timeline directory
+//! Header is a Rust structure, serialized into bytes and compressed with zstd.
+//! * files archive, that has metadata file as the last one, all compressed with zstd into a single binary blob
+//!
+//! Header offset is stored in the file name, along with the `disk_consistent_lsn` from the metadata file.
+//! See [`parse_archive_name`] and [`ARCHIVE_EXTENSION`] for the name details, example: `00000000016B9150-.zst_9732`.
+//! This way, the header could be retrieved without reading an entire archive file.
+
+use std::{
+    collections::BTreeSet,
+    future::Future,
+    io::Cursor,
+    path::{Path, PathBuf},
+    sync::Arc,
+};
+
+use anyhow::{bail, ensure, Context};
+use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
+use serde::{Deserialize, Serialize};
+use tokio::{
+    fs,
+    io::{self, AsyncReadExt, AsyncWriteExt},
+};
+use tracing::*;
+use zenith_utils::{bin_ser::BeSer, lsn::Lsn};
+
+use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME};
+
+use super::index::RelativePath;
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct ArchiveHeader {
+    /// All regular timeline files, excluding the metadata file.
+    pub files: Vec<FileEntry>,
+    /// Size of the metadata file, in bytes. Its name and location relative to the timeline dir
+    /// are known to the system, so nothing but the size needs to be stored.
+    pub metadata_file_size: u64,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub struct FileEntry {
+    /// Uncompressed file size, bytes.
+    pub size: u64,
+    /// A path, relative to the directory root, used when compressing the directory contents.
+    pub subpath: RelativePath,
+}
+
+const ARCHIVE_EXTENSION: &str = "-.zst_";
+const ARCHIVE_STREAM_BUFFER_SIZE_BYTES: usize = 4 * 1024 * 1024;
+
+/// Streams an archive of files given into a stream target, defined by the closure.
+///
+/// The closure approach is picked for cases like S3, where we would need a name of the file before we can get a stream to write the bytes into.
+/// Current idea is to place the header size in the name of the file, to enable the fast partial remote file index restoration without actually reading remote storage file contents.
+///
+/// Performs the compression in multiple steps:
+/// * prepares an archive header, stripping the `source_dir` prefix from the `files`
+/// * generates the name of the archive
+/// * prepares archive producer future, knowing the header and the file list
+/// An `impl AsyncRead` and `impl AsyncWrite` pair of connected streams is created to implement the partial contents streaming.
+/// The writer end gets into the archive producer future, to put the header and a stream of compressed files.
+/// * prepares archive consumer future, by executing the provided closure
+/// The closure gets the reader end stream and the name of the file to create a future that would stream the file contents elsewhere.
+/// * runs and waits for both futures to complete
+/// * on a successful completion of both futures, header, its size and the user-defined consumer future return data is returned
+/// Due to the design above, the archive name and related data is visible inside the consumer future only, so it's possible to return the data,
+/// needed for future processing.
+pub async fn archive_files_as_stream<Cons, ConsRet, Fut>(
+    source_dir: &Path,
+    files: impl Iterator<Item = &PathBuf>,
+    metadata: &TimelineMetadata,
+    create_archive_consumer: Cons,
+) -> anyhow::Result<(ArchiveHeader, u64, ConsRet)>
+where
+    Cons: FnOnce(Box<dyn io::AsyncRead + Unpin + Send + Sync + 'static>, String) -> Fut
+        + Send
+        + 'static,
+    Fut: Future<Output = anyhow::Result<ConsRet>> + Send + 'static,
+    ConsRet: Send + Sync + 'static,
+{
+    let metadata_bytes = metadata
+        .to_bytes()
+        .context("Failed to create metadata bytes")?;
+    let (archive_header, compressed_header_bytes) =
+        prepare_header(source_dir, files, &metadata_bytes)
+            .await
+            .context("Failed to prepare file for archivation")?;
+
+    let header_size = compressed_header_bytes.len() as u64;
+    let (write, read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
+    let archive_filler = write_archive_contents(
+        source_dir.to_path_buf(),
+        archive_header.clone(),
+        metadata_bytes,
+        write,
+    );
+    let archive_name = archive_name(metadata.disk_consistent_lsn(), header_size);
+    let archive_stream =
+        Cursor::new(compressed_header_bytes).chain(ZstdEncoder::new(io::BufReader::new(read)));
+
+    let (archive_creation_result, archive_upload_result) = tokio::join!(
+        tokio::spawn(archive_filler),
+        tokio::spawn(async move {
+            create_archive_consumer(Box::new(archive_stream), archive_name).await
+        })
+    );
+    archive_creation_result
+        .context("Failed to spawn archive creation future")?
+        .context("Failed to create an archive")?;
+    let upload_return_value = archive_upload_result
+        .context("Failed to spawn archive upload future")?
+        .context("Failed to upload the archive")?;
+
+    Ok((archive_header, header_size, upload_return_value))
+}
+
+/// Similar to [`archive_files_as_stream`], creates a pair of streams to uncompress the 2nd part of the archive,
+/// that contains files and is located after the header.
+/// S3 allows downloading partial file contents for a given file key (i.e. name), to accommodate this retrieval,
+/// a closure is used.
+/// Same concepts with two concurrent futures, user-defined closure, future and return value apply here, but the
+/// consumer and the receiver ends are swapped, since the uncompression happens.
+pub async fn uncompress_file_stream_with_index<Prod, ProdRet, Fut>(
+    destination_dir: PathBuf,
+    files_to_skip: Arc<BTreeSet<PathBuf>>,
+    disk_consistent_lsn: Lsn,
+    header: ArchiveHeader,
+    header_size: u64,
+    create_archive_file_part: Prod,
+) -> anyhow::Result<ProdRet>
+where
+    Prod: FnOnce(Box<dyn io::AsyncWrite + Unpin + Send + Sync + 'static>, String) -> Fut
+        + Send
+        + 'static,
+    Fut: Future<Output = anyhow::Result<ProdRet>> + Send + 'static,
+    ProdRet: Send + Sync + 'static,
+{
+    let (write, mut read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
+    let archive_name = archive_name(disk_consistent_lsn, header_size);
+
+    let (archive_download_result, archive_uncompress_result) = tokio::join!(
+        tokio::spawn(async move { create_archive_file_part(Box::new(write), archive_name).await }),
+        tokio::spawn(async move {
+            uncompress_with_header(&files_to_skip, &destination_dir, header, &mut read).await
+        })
+    );
+
+    let download_value = archive_download_result
+        .context("Failed to spawn archive download future")?
+        .context("Failed to download an archive")?;
+    archive_uncompress_result
+        .context("Failed to spawn archive uncompress future")?
+        .context("Failed to uncompress the archive")?;
+
+    Ok(download_value)
+}
+
+/// Reads the archive header from the given stream:
+/// * parses the archive file name to get the compressed header size
+/// * reads exactly that many bytes from the stream
+/// * uncompresses (zstd) and deserializes those into an [`ArchiveHeader`]
+pub async fn read_archive_header<A: io::AsyncRead + Send + Sync + Unpin>(
+    archive_name: &str,
+    from: &mut A,
+) -> anyhow::Result<ArchiveHeader> {
+    let (_, header_size) = parse_archive_name(Path::new(archive_name))?;
+
+    let mut compressed_header_bytes = vec![0; header_size as usize]; // sized by the name, not the stream
+    from.read_exact(&mut compressed_header_bytes)
+        .await
+        .with_context(|| {
+            format!(
+                "Failed to read header from the archive {}",
+                archive_name
+            )
+        })?;
+
+    let mut header_bytes = Vec::new();
+    ZstdDecoder::new(io::BufReader::new(compressed_header_bytes.as_slice()))
+        .read_to_end(&mut header_bytes)
+        .await
+        .context("Failed to decompress a header from the archive")?;
+
+    ArchiveHeader::des(&header_bytes)
+        .context("Failed to deserialize a header from the archive")
+}
+
+/// Parses the archive file name, `<LSN as hex><ARCHIVE_EXTENSION><header size>`, and returns:
+/// * `disk_consistent_lsn` of the checkpoint that was archived
+/// * size of the archive header, as stored in the name
+pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> {
+    let archive_name = archive_path
+        .file_name()
+        .with_context(|| format!("Archive '{}' has no file name", archive_path.display()))?
+        .to_string_lossy();
+    let (lsn_str, header_size_str) =
+        archive_name
+            .rsplit_once(ARCHIVE_EXTENSION)
+            .with_context(|| {
+                format!(
+                    "Archive '{}' has incorrect extension, expected to contain '{}'",
+                    archive_path.display(),
+                    ARCHIVE_EXTENSION
+                )
+            })?;
+    let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| {
+        format!(
+            "Archive '{}' has an invalid disk consistent lsn in its extension",
+            archive_path.display(),
+        )
+    })?;
+    let header_size = header_size_str.parse::<u64>().with_context(|| {
+        format!(
+            "Archive '{}' has an invalid header offset number in its extension",
+            archive_path.display(),
+        )
+    })?;
+    Ok((disk_consistent_lsn, header_size))
+}
+
+/// Builds the archive file name out of its parts, the reverse of [`parse_archive_name`].
+fn archive_name(disk_consistent_lsn: Lsn, header_size: u64) -> String {
+    format!(
+        "{:016X}{ARCHIVE_EXTENSION}{}",
+        u64::from(disk_consistent_lsn),
+        header_size,
+        ARCHIVE_EXTENSION = ARCHIVE_EXTENSION,
+    )
+}
+
+pub async fn uncompress_with_header(
+    files_to_skip: &BTreeSet<PathBuf>,
+    destination_dir: &Path,
+    header: ArchiveHeader,
+    archive_after_header: impl io::AsyncRead + Send + Sync + Unpin,
+) -> anyhow::Result<()> {
+    debug!("Uncompressing archive into {}", destination_dir.display());
+    let mut archive = ZstdDecoder::new(io::BufReader::new(archive_after_header));
+
+    if !destination_dir.exists() {
+        fs::create_dir_all(&destination_dir)
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to create target directory at {}",
+                    destination_dir.display()
+                )
+            })?;
+    } else if !destination_dir.is_dir() {
+        bail!(
+            "Destination path '{}' is not a valid directory",
+            destination_dir.display()
+        );
+    }
+    debug!("Will extract {} files from the archive", header.files.len());
+    for entry in header.files {
+        uncompress_entry(
+            &mut archive,
+            &entry.subpath.as_path(destination_dir),
+            entry.size,
+            files_to_skip,
+        )
+        .await
+        .with_context(|| format!("Failed to uncompress archive entry {:?}", entry))?;
+    }
+    uncompress_entry(
+        &mut archive,
+        &destination_dir.join(METADATA_FILE_NAME),
+        header.metadata_file_size,
+        files_to_skip,
+    )
+    .await
+    .context("Failed to uncompress the metadata entry")?;
+    Ok(())
+}
+
+async fn uncompress_entry(
+    archive: &mut ZstdDecoder<io::BufReader<impl io::AsyncRead + Send + Sync + Unpin>>,
+    destination_path: &Path,
+    entry_size: u64,
+    files_to_skip: &BTreeSet<PathBuf>,
+) -> anyhow::Result<()> {
+    if let Some(parent) = destination_path.parent() {
+        fs::create_dir_all(parent).await.with_context(|| {
+            format!(
+                "Failed to create parent directory for {}",
+                destination_path.display()
+            )
+        })?;
+    };
+
+    if files_to_skip.contains(destination_path) {
+        debug!("Skipping {}", destination_path.display());
+        copy_n_bytes(entry_size, archive, &mut io::sink())
+            .await
+            .context("Failed to skip bytes in the archive")?;
+        return Ok(());
+    }
+
+    let mut destination =
+        io::BufWriter::new(fs::File::create(&destination_path).await.with_context(|| {
+            format!(
+                "Failed to open file {} for extraction",
+                destination_path.display()
+            )
+        })?);
+    copy_n_bytes(entry_size, archive, &mut destination)
+        .await
+        .with_context(|| {
+            format!(
+                "Failed to write extracted archive contents into file {}",
+                destination_path.display()
+            )
+        })?;
+    destination
+        .flush()
+        .await
+        .context("Failed to flush the streaming archive bytes")?;
+    Ok(())
+}
+
+/// Streams every header file, then the metadata bytes, into `archive_input` and shuts it down.
+async fn write_archive_contents(
+    source_dir: PathBuf,
+    header: ArchiveHeader,
+    metadata_bytes: Vec<u8>,
+    mut archive_input: io::DuplexStream,
+) -> anyhow::Result<()> {
+    debug!("Starting writing files into archive");
+    for file_entry in header.files {
+        let path = file_entry.subpath.as_path(&source_dir);
+        let mut source_file =
+            io::BufReader::new(fs::File::open(&path).await.with_context(|| {
+                format!(
+                    "Failed to open file for archiving to path {}",
+                    path.display()
+                )
+            })?);
+        let bytes_written = io::copy(&mut source_file, &mut archive_input)
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to add a file into archive, file path {}",
+                    path.display()
+                )
+            })?;
+        ensure!(
+            file_entry.size == bytes_written, // the header size must match what was streamed
+            "File {} was written to the archive incompletely",
+            path.display()
+        );
+        trace!(
+            "Added file '{}' ({} bytes) into the archive",
+            path.display(),
+            bytes_written
+        );
+    }
+    let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input)
+        .await
+        .context("Failed to add metadata into the archive")?;
+    ensure!(
+        header.metadata_file_size == metadata_bytes_written,
+        "Metadata file was written to the archive incompletely",
+    );
+    archive_input
+        .shutdown()
+        .await
+        .context("Failed to finalize the archive")?;
+    debug!("Successfully streamed all files into the archive");
+    Ok(())
+}
+
+async fn prepare_header(
+    source_dir: &Path,
+    files: impl Iterator<Item = &PathBuf>,
+    metadata_bytes: &[u8],
+) -> anyhow::Result<(ArchiveHeader, Vec<u8>)> {
+    let mut archive_files = Vec::new();
+    for file_path in files {
+        let file_metadata = fs::metadata(file_path).await.with_context(|| {
+            format!(
+                "Failed to read metadata during archive indexing for {}",
+                file_path.display()
+            )
+        })?;
+        ensure!(
+            file_metadata.is_file(),
+            "Archive indexed path {} is not a file",
+            file_path.display()
+        );
+
+        if file_path.file_name().and_then(|name| name.to_str()) != Some(METADATA_FILE_NAME) {
+            let entry = FileEntry {
+                subpath: RelativePath::new(source_dir, file_path).with_context(|| {
+                    format!(
+                        "File '{}' does not belong to pageserver workspace",
+                        file_path.display()
+                    )
+                })?,
+                size: file_metadata.len(),
+            };
+            archive_files.push(entry);
+        }
+    }
+
+    let header = ArchiveHeader {
+        files: archive_files,
+        metadata_file_size: metadata_bytes.len() as u64,
+    };
+
+    debug!("Appending a header for {} files", header.files.len());
+    let header_bytes = header.ser().context("Failed to serialize a header")?;
+    debug!("Header bytes len {}", header_bytes.len());
+    let mut compressed_header_bytes = Vec::new();
+    ZstdEncoder::new(io::BufReader::new(header_bytes.as_slice()))
+        .read_to_end(&mut compressed_header_bytes)
+        .await
+        .context("Failed to compress header bytes")?;
+    debug!(
+        "Compressed header bytes len {}",
+        compressed_header_bytes.len()
+    );
+    Ok((header, compressed_header_bytes))
+}
+
+async fn copy_n_bytes(
+    n: u64,
+    from: &mut (impl io::AsyncRead + Send + Sync + Unpin),
+    into: &mut (impl io::AsyncWrite + Send + Sync + Unpin),
+) -> anyhow::Result<()> {
+    let bytes_written = io::copy(&mut from.take(n), into).await?;
+    ensure!(
+        bytes_written == n,
+        "Failed to read exactly {} bytes from the input, bytes written: {}",
+        n,
+        bytes_written,
+    );
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use tokio::{fs, io::AsyncSeekExt};
+
+    use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
+
+    use super::*;
+
+    #[tokio::test]
+    async fn compress_and_uncompress() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("compress_and_uncompress")?;
+        let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
+        init_directory(
+            &timeline_dir,
+            vec![
+                ("first", "first_contents"),
+                ("second", "second_contents"),
+                (METADATA_FILE_NAME, "wrong_metadata"),
+            ],
+        )
+        .await?;
+        let timeline_files = list_file_paths_with_contents(&timeline_dir).await?;
+        assert_eq!(
+            timeline_files,
+            vec![
+                (
+                    timeline_dir.join("first"),
+                    FileContents::Text("first_contents".to_string())
+                ),
+                (
+                    timeline_dir.join(METADATA_FILE_NAME),
+                    FileContents::Text("wrong_metadata".to_string())
+                ),
+                (
+                    timeline_dir.join("second"),
+                    FileContents::Text("second_contents".to_string())
+                ),
+            ],
+            "Initial timeline contents should contain two normal files and a wrong metadata file"
+        );
+
+        let metadata = TimelineMetadata::new(Lsn(0x30), None, None, Lsn(0), Lsn(0), Lsn(0));
+        let paths_to_archive = timeline_files
+            .into_iter()
+            .map(|(path, _)| path)
+            .collect::<Vec<_>>();
+
+        let tempdir = tempfile::tempdir()?;
+        let base_path = tempdir.path().to_path_buf();
+        let (header, header_size, archive_target) = archive_files_as_stream(
+            &timeline_dir,
+            paths_to_archive.iter(),
+            &metadata,
+            move |mut archive_streamer, archive_name| async move {
+                let archive_target = base_path.join(&archive_name);
+                let mut archive_file = fs::File::create(&archive_target).await?;
+                io::copy(&mut archive_streamer, &mut archive_file).await?;
+                Ok(archive_target)
+            },
+        )
+        .await?;
+
+        let mut file = fs::File::open(&archive_target).await?;
+        file.seek(io::SeekFrom::Start(header_size)).await?;
+        let target_dir = tempdir.path().join("extracted");
+        uncompress_with_header(&BTreeSet::new(), &target_dir, header, file).await?;
+
+        let extracted_files = list_file_paths_with_contents(&target_dir).await?;
+
+        assert_eq!(
+            extracted_files,
+            vec![
+                (
+                    target_dir.join("first"),
+                    FileContents::Text("first_contents".to_string())
+                ),
+                (
+                    target_dir.join(METADATA_FILE_NAME),
+                    FileContents::Binary(metadata.to_bytes()?)
+                ),
+                (
+                    target_dir.join("second"),
+                    FileContents::Text("second_contents".to_string())
+                ),
+            ],
+            "Extracted files should contain all local timeline files besides its metadata, which should be taken from the arguments"
+        );
+
+        Ok(())
+    }
+
+    async fn init_directory(
+        root: &Path,
+        files_with_contents: Vec<(&str, &str)>,
+    ) -> anyhow::Result<()> {
+        fs::create_dir_all(root).await?;
+        for (file_name, contents) in files_with_contents {
+            fs::File::create(root.join(file_name))
+                .await?
+                .write_all(contents.as_bytes())
+                .await?;
+        }
+        Ok(())
+    }
+
+    #[derive(PartialEq, Eq, PartialOrd, Ord)]
+    enum FileContents {
+        Text(String),
+        Binary(Vec<u8>),
+    }
+
+    impl std::fmt::Debug for FileContents {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            match self {
+                Self::Text(text) => f.debug_tuple("Text").field(text).finish(),
+                Self::Binary(bytes) => f
+                    .debug_tuple("Binary")
+                    .field(&format!("{} bytes", bytes.len()))
+                    .finish(),
+            }
+        }
+    }
+
+    async fn list_file_paths_with_contents(
+        root: &Path,
+    ) -> anyhow::Result<Vec<(PathBuf, FileContents)>> {
+        let mut file_paths = Vec::new();
+
+        let mut dir_listings = vec![fs::read_dir(root).await?];
+        while let Some(mut dir_listing) = dir_listings.pop() {
+            while let Some(entry) = dir_listing.next_entry().await? {
+                let entry_path = entry.path();
+                if entry_path.is_file() {
+                    let contents = match String::from_utf8(fs::read(&entry_path).await?) {
+                        Ok(text) => FileContents::Text(text),
+                        Err(e) => FileContents::Binary(e.into_bytes()),
+                    };
+                    file_paths.push((entry_path, contents));
+                } else if entry_path.is_dir() {
+                    dir_listings.push(fs::read_dir(entry_path).await?);
+                } else {
+                    info!(
+                        "Skipping path '{}' as it's not a file or a directory",
+                        entry_path.display()
+                    );
+                }
+            }
+        }
+
+        file_paths.sort();
+        Ok(file_paths)
+    }
+}
--- a/pageserver/src/remote_storage/storage_sync/download.rs
+++ b/pageserver/src/remote_storage/storage_sync/download.rs
@@ -0,0 +1,428 @@
+//! Timeline synchronization logic to put files from archives on remote storage into pageserver's local directory.
+//! Currently, tenant branch files are also downloaded, but this does not appear final.
+
+use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
+
+use anyhow::{ensure, Context};
+use futures::{stream::FuturesUnordered, StreamExt};
+use tokio::{fs, sync::RwLock};
+use tracing::{debug, error, trace, warn};
+use zenith_utils::{lsn::Lsn, zid::ZTenantId};
+
+use crate::{
+    config::PageServerConf,
+    layered_repository::metadata::{metadata_path, TimelineMetadata},
+    remote_storage::{
+        storage_sync::{
+            compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files,
+            update_index_description, SyncKind, SyncTask,
+        },
+        RemoteStorage, ZTenantTimelineId,
+    },
+};
+
+use super::{
+    index::{ArchiveId, RemoteTimeline, RemoteTimelineIndex},
+    TimelineDownload,
+};
+
+/// Timeline download result, with extra data, needed for downloading.
+pub(super) enum DownloadedTimeline {
+    /// Remote timeline data is either absent or corrupt, no download possible.
+    Abort,
+    /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
+    /// Initial download failed due to some error, the download task is rescheduled for another retry.
+    FailedAndRescheduled { disk_consistent_lsn: Lsn },
+    /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
+    /// Initial download successful.
+    Successful { disk_consistent_lsn: Lsn },
+}
+
+/// Attempts to download and uncompress files from all remote archives for the timeline given.
+/// Timeline files that already exist locally are skipped during the download, but the local metadata file is
+/// updated at the end of every checkpoint archive extraction.
+///
+/// Before any archives are considered, the branch files are checked locally and remotely, all remote-only files are downloaded.
+///
+/// On an error, bumps the retries count and reschedules the download, with updated archive skip list
+/// (for any new successful archive downloads and extractions).
+pub(super) async fn download_timeline<
+    P: std::fmt::Debug + Send + Sync + 'static,
+    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+>(
+    conf: &'static PageServerConf,
+    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
+    sync_id: ZTenantTimelineId,
+    mut download: TimelineDownload,
+    retries: u32,
+) -> DownloadedTimeline {
+    debug!("Downloading layers for sync id {}", sync_id);
+
+    let ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    } = sync_id;
+    let index_read = remote_assets.1.read().await;
+    let remote_timeline = match index_read.timeline_entry(&sync_id) {
+        None => {
+            error!("Cannot download: no timeline is present in the index for given ids");
+            return DownloadedTimeline::Abort;
+        }
+        Some(index_entry) => match index_entry {
+            TimelineIndexEntry::Full(remote_timeline) => Cow::Borrowed(remote_timeline),
+            TimelineIndexEntry::Description(_) => {
+                let remote_disk_consistent_lsn = index_entry.disk_consistent_lsn();
+                drop(index_read);
+                debug!("Found timeline description for the given ids, downloading the full index");
+                match update_index_description(
+                    remote_assets.as_ref(),
+                    &conf.timeline_path(&timeline_id, &tenant_id),
+                    sync_id,
+                )
+                .await
+                {
+                    Ok(remote_timeline) => Cow::Owned(remote_timeline),
+                    Err(e) => {
+                        error!("Failed to download full timeline index: {:?}", e);
+                        return match remote_disk_consistent_lsn {
+                            Some(disk_consistent_lsn) => {
+                                sync_queue::push(SyncTask::new(
+                                    sync_id,
+                                    retries,
+                                    SyncKind::Download(download),
+                                ));
+                                DownloadedTimeline::FailedAndRescheduled {
+                                    disk_consistent_lsn,
+                                }
+                            }
+                            None => {
+                                error!("Cannot download: no disk consistent Lsn is present for the index entry");
+                                DownloadedTimeline::Abort
+                            }
+                        };
+                    }
+                }
+            }
+        },
+    };
+    let disk_consistent_lsn = match remote_timeline.checkpoints().max() {
+        Some(lsn) => lsn,
+        None => {
+            debug!("Cannot download: no disk consistent Lsn is present for the remote timeline");
+            return DownloadedTimeline::Abort;
+        }
+    };
+
+    if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.tenant_id).await
+    {
+        error!(
+            "Failed to download missing branches for sync id {}: {:?}",
+            sync_id, e
+        );
+        sync_queue::push(SyncTask::new(
+            sync_id,
+            retries,
+            SyncKind::Download(download),
+        ));
+        return DownloadedTimeline::FailedAndRescheduled {
+            disk_consistent_lsn,
+        };
+    }
+
+    debug!("Downloading timeline archives");
+    let archives_to_download = remote_timeline
+        .checkpoints()
+        .map(ArchiveId)
+        .filter(|remote_archive| !download.archives_to_skip.contains(remote_archive))
+        .collect::<Vec<_>>();
+
+    let archives_total = archives_to_download.len();
+    debug!("Downloading {} archives of a timeline", archives_total);
+    trace!("Archives to download: {:?}", archives_to_download);
+
+    for (archives_downloaded, archive_id) in archives_to_download.into_iter().enumerate() {
+        match try_download_archive(
+            conf,
+            sync_id,
+            Arc::clone(&remote_assets),
+            remote_timeline.as_ref(),
+            archive_id,
+            Arc::clone(&download.files_to_skip),
+        )
+        .await
+        {
+            Err(e) => {
+                let archives_left = archives_total - archives_downloaded;
+                error!(
+                    "Failed to download archive {:?} (archives downloaded: {}; archives left: {}) for tenant {} timeline {}, requeueing the download: {:?}",
+                    archive_id, archives_downloaded, archives_left, tenant_id, timeline_id, e
+                );
+                sync_queue::push(SyncTask::new(
+                    sync_id,
+                    retries,
+                    SyncKind::Download(download),
+                ));
+                return DownloadedTimeline::FailedAndRescheduled {
+                    disk_consistent_lsn,
+                };
+            }
+            Ok(()) => {
+                debug!("Successfully downloaded archive {:?}", archive_id);
+                download.archives_to_skip.insert(archive_id);
+            }
+        }
+    }
+
+    debug!("Finished downloading all timeline's archives");
+    DownloadedTimeline::Successful {
+        disk_consistent_lsn,
+    }
+}
+
+/// Downloads one checkpoint archive of a timeline and unpacks it into the local timeline directory.
+///
+/// The archive data and its header are looked up in `remote_timeline`; an archive id unknown to it is an error.
+/// If the local timeline metadata can be read, the download is refused when the local `disk_consistent_lsn`
+/// is greater than the archive's one. A failure to read the local metadata is only logged as a warning.
+///
+/// Returns an error if the archive is not known, its header cannot be restored, the Lsn check fails
+/// or `compression::uncompress_file_stream_with_index` fails.
+async fn try_download_archive<
+    P: Send + Sync + 'static,
+    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+>(
+    conf: &'static PageServerConf,
+    ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    }: ZTenantTimelineId,
+    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
+    remote_timeline: &RemoteTimeline,
+    archive_id: ArchiveId,
+    files_to_skip: Arc<BTreeSet<PathBuf>>,
+) -> anyhow::Result<()> {
+    debug!("Downloading archive {:?}", archive_id);
+    let archive_to_download = remote_timeline
+        .archive_data(archive_id)
+        .with_context(|| format!("Archive {:?} not found in remote storage", archive_id))?;
+    // The header is rebuilt from the in-memory index data instead of being read from the remote archive.
+    let (archive_header, header_size) = remote_timeline
+        .restore_header(archive_id)
+        .context("Failed to restore header when downloading an archive")?;
+
+    // Refuse to go back in time: an archive older than the local state must not be unpacked over it.
+    match read_local_metadata(conf, timeline_id, tenant_id).await {
+        Ok(local_metadata) => ensure!(
+            // need to allow `<=` instead of `<` due to cases when a failed archive can be redownloaded
+            local_metadata.disk_consistent_lsn() <= archive_to_download.disk_consistent_lsn(),
+            "Cannot download archive with Lsn {} since it's earlier than local Lsn {}",
+            archive_to_download.disk_consistent_lsn(),
+            local_metadata.disk_consistent_lsn()
+        ),
+        Err(e) => warn!("Failed to read local metadata file, assuming it's safe to override its with the download. Read: {:#}", e),
+    }
+    // The closure receives a write target and the archive name and fills the target from the remote storage.
+    // NOTE(review): `header_size` is passed as the second argument of `download_range`, presumably the start offset
+    // (skipping the already known header) with no end bound — confirm against `RemoteStorage::download_range`.
+    compression::uncompress_file_stream_with_index(
+        conf.timeline_path(&timeline_id, &tenant_id),
+        files_to_skip,
+        archive_to_download.disk_consistent_lsn(),
+        archive_header,
+        header_size,
+        move |mut archive_target, archive_name| async move {
+            let archive_local_path = conf
+                .timeline_path(&timeline_id, &tenant_id)
+                .join(&archive_name);
+            let remote_storage = &remote_assets.0;
+            remote_storage
+                .download_range(
+                    &remote_storage.storage_path(&archive_local_path)?,
+                    header_size,
+                    None,
+                    &mut archive_target,
+                )
+                .await
+        },
+    )
+    .await?;
+
+    Ok(())
+}
+
+/// Reads the local metadata file of the given timeline and parses it.
+///
+/// Returns an error if the file bytes cannot be read or if they cannot be parsed into [`TimelineMetadata`].
+/// The two failures carry different contexts, so a caller that logs the error chain
+/// can tell a missing or unreadable file from a corrupt one.
+async fn read_local_metadata(
+    conf: &'static PageServerConf,
+    timeline_id: zenith_utils::zid::ZTimelineId,
+    tenant_id: ZTenantId,
+) -> anyhow::Result<TimelineMetadata> {
+    let local_metadata_path = metadata_path(conf, timeline_id, tenant_id);
+    let local_metadata_bytes = fs::read(&local_metadata_path)
+        .await
+        .context("Failed to read local metadata file bytes")?;
+    // `context` already yields `anyhow::Result`, no `Ok(..?)` re-wrapping is needed.
+    TimelineMetadata::from_bytes(&local_metadata_bytes)
+        .context("Failed to parse local metadata file bytes")
+}
+
+/// Downloads every branch file of the tenant that the remote index tracks, but that is missing locally.
+///
+/// The local branches directory is created first, if absent. All missing files are then downloaded concurrently;
+/// every failed download is logged, and a single error is returned at the end if at least one of them failed.
+/// A tenant without any branch files in the index results in no downloads and `Ok(())`.
+async fn download_missing_branches<
+    P: std::fmt::Debug + Send + Sync + 'static,
+    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+>(
+    conf: &'static PageServerConf,
+    (storage, index): &(S, RwLock<RemoteTimelineIndex>),
+    tenant_id: ZTenantId,
+) -> anyhow::Result<()> {
+    let local_branches = tenant_branch_files(conf, tenant_id)
+        .await
+        .context("Failed to list local branch files for the tenant")?;
+    let local_branches_dir = conf.branches_path(&tenant_id);
+    if !local_branches_dir.exists() {
+        fs::create_dir_all(&local_branches_dir)
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to create local branches directory at path '{}'",
+                    local_branches_dir.display()
+                )
+            })?;
+    }
+
+    // Snapshot the remote branch set and release the index lock before any download starts:
+    // a guard created in an `if let` scrutinee would live for the whole body
+    // and keep index writers waiting for the duration of all downloads.
+    let index_read = index.read().await;
+    let remote_branches = index_read
+        .branch_files(tenant_id)
+        .cloned()
+        .unwrap_or_default();
+    drop(index_read);
+
+    let mut remote_only_branches_downloads = remote_branches
+        .difference(&local_branches)
+        .map(|remote_only_branch| async move {
+            let branches_dir = conf.branches_path(&tenant_id);
+            let remote_branch_path = remote_only_branch.as_path(&branches_dir);
+            let storage_path = storage.storage_path(&remote_branch_path).with_context(|| {
+                format!(
+                    "Failed to derive a storage path for branch with local path '{}'",
+                    remote_branch_path.display()
+                )
+            })?;
+            // `create_new` makes the open fail instead of overwriting a file that has appeared locally meanwhile.
+            let mut target_file = fs::OpenOptions::new()
+                .write(true)
+                .create_new(true)
+                .open(&remote_branch_path)
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to create local branch file at '{}'",
+                        remote_branch_path.display()
+                    )
+                })?;
+            storage
+                .download(&storage_path, &mut target_file)
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to download branch file from the remote path {:?}",
+                        storage_path
+                    )
+                })?;
+            Ok::<_, anyhow::Error>(())
+        })
+        .collect::<FuturesUnordered<_>>();
+
+    // Drain all downloads even after a failure, so that every error gets logged.
+    let mut branch_downloads_failed = false;
+    while let Some(download_result) = remote_only_branches_downloads.next().await {
+        if let Err(e) = download_result {
+            branch_downloads_failed = true;
+            error!("Failed to download a branch file: {:?}", e);
+        }
+    }
+    ensure!(
+        !branch_downloads_failed,
+        "Failed to download all branch files"
+    );
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::BTreeSet;
+
+    use tempfile::tempdir;
+    use tokio::fs;
+    use zenith_utils::lsn::Lsn;
+
+    use crate::{
+        remote_storage::{
+            local_fs::LocalFs,
+            storage_sync::test_utils::{
+                assert_index_descriptions, assert_timeline_files_match, create_local_timeline,
+                dummy_metadata, ensure_correct_timeline_upload, expect_timeline,
+            },
+        },
+        repository::repo_harness::{RepoHarness, TIMELINE_ID},
+    };
+
+    use super::*;
+
+    /// Uploads two checkpoints of one timeline to a `LocalFs` storage, removes the local timeline directory,
+    /// downloads the timeline back and compares the index and the timeline files with the uploaded state.
+    #[tokio::test]
+    async fn test_download_timeline() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("test_download_timeline")?;
+        let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
+        // NOTE(review): the `TempDir` guard is a temporary here and is dropped at the end of this statement;
+        // presumably `LocalFs::new` (re)creates the directory itself — confirm.
+        let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
+        let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
+            repo_harness.conf,
+            storage
+                .list()
+                .await?
+                .into_iter()
+                .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+        ));
+        let remote_assets = Arc::new((storage, index));
+        // Rebind to references into the shared tuple: the owned values were moved into `remote_assets`.
+        let storage = &remote_assets.0;
+        let index = &remote_assets.1;
+
+        let regular_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
+        let regular_timeline = create_local_timeline(
+            &repo_harness,
+            TIMELINE_ID,
+            &["a", "b"],
+            dummy_metadata(Lsn(0x30)),
+        )?;
+        ensure_correct_timeline_upload(
+            &repo_harness,
+            Arc::clone(&remote_assets),
+            TIMELINE_ID,
+            regular_timeline,
+        )
+        .await;
+        // upload multiple checkpoints for the same timeline
+        let regular_timeline = create_local_timeline(
+            &repo_harness,
+            TIMELINE_ID,
+            &["c", "d"],
+            dummy_metadata(Lsn(0x40)),
+        )?;
+        ensure_correct_timeline_upload(
+            &repo_harness,
+            Arc::clone(&remote_assets),
+            TIMELINE_ID,
+            regular_timeline,
+        )
+        .await;
+
+        // Wipe the local timeline, so that everything has to come from the remote storage.
+        fs::remove_dir_all(&regular_timeline_path).await?;
+        let remote_regular_timeline = expect_timeline(index, sync_id).await;
+
+        // Nothing is skipped: all archives and all their files are requested.
+        // NOTE(review): the returned download status is discarded, only the resulting state is checked below.
+        download_timeline(
+            repo_harness.conf,
+            Arc::clone(&remote_assets),
+            sync_id,
+            TimelineDownload {
+                files_to_skip: Arc::new(BTreeSet::new()),
+                archives_to_skip: BTreeSet::new(),
+            },
+            0,
+        )
+        .await;
+        // The in-memory index is compared with an index parsed anew from the storage listing.
+        assert_index_descriptions(
+            index,
+            RemoteTimelineIndex::try_parse_descriptions_from_paths(
+                repo_harness.conf,
+                remote_assets
+                    .0
+                    .list()
+                    .await
+                    .unwrap()
+                    .into_iter()
+                    .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+            ),
+        )
+        .await;
+        assert_timeline_files_match(&repo_harness, TIMELINE_ID, remote_regular_timeline);
+
+        Ok(())
+    }
+}
--- a/pageserver/src/remote_storage/storage_sync/index.rs
+++ b/pageserver/src/remote_storage/storage_sync/index.rs
@@ -0,0 +1,427 @@
+//! In-memory index to track the tenant files on the remote storage, mitigating the storage format differences between the local and remote files.
+//! Able to restore itself from the storage archive data and reconstruct archive indices on demand.
+//!
+//! The index is intended to be portable, so deliberately does not store any local paths inside.
+//! This way in the future, the index could be restored fast from its serialized stored form.
+
+use std::{
+    collections::{BTreeMap, BTreeSet, HashMap, HashSet},
+    path::{Path, PathBuf},
+};
+
+use anyhow::{bail, ensure, Context};
+use serde::{Deserialize, Serialize};
+use tracing::debug;
+use zenith_utils::{
+    lsn::Lsn,
+    zid::{ZTenantId, ZTimelineId},
+};
+
+use crate::{
+    config::PageServerConf,
+    layered_repository::TIMELINES_SEGMENT_NAME,
+    remote_storage::{
+        storage_sync::compression::{parse_archive_name, FileEntry},
+        ZTenantTimelineId,
+    },
+};
+
+use super::compression::ArchiveHeader;
+
+/// A path fragment kept without its root: it has to be joined with a base directory to become a usable path.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct RelativePath(String);
+
+impl RelativePath {
+    /// Builds a relative path by removing the `base` prefix from `path`.
+    /// Fails if `path` does not start with `base`.
+    pub fn new<P: AsRef<Path>>(base: &Path, path: P) -> anyhow::Result<Self> {
+        path.as_ref()
+            .strip_prefix(base)
+            .map(|stripped| Self(stripped.to_string_lossy().to_string()))
+            .context("path is not relative to base")
+    }
+
+    /// Produces a full path by appending this relative part to `base`.
+    pub fn as_path(&self, base: &Path) -> PathBuf {
+        let mut full_path = base.to_path_buf();
+        full_path.push(&self.0);
+        full_path
+    }
+}
+
+/// Tracks which tenant files are present on the remote storage.
+/// At the moment, these are timeline archives and branch files.
+#[derive(Debug, Clone)]
+pub struct RemoteTimelineIndex {
+    branch_files: HashMap<ZTenantId, HashSet<RelativePath>>,
+    timeline_files: HashMap<ZTenantTimelineId, TimelineIndexEntry>,
+}
+
+impl RemoteTimelineIndex {
+    /// Builds an index out of file paths alone (file contents are not inspected),
+    /// registering every path that can be tracked with the index.
+    /// A path that fails to parse is logged and skipped, so unsuitable paths may produce an empty index.
+    pub fn try_parse_descriptions_from_paths<P: AsRef<Path>>(
+        conf: &'static PageServerConf,
+        paths: impl Iterator<Item = P>,
+    ) -> Self {
+        let mut index = Self {
+            branch_files: HashMap::new(),
+            timeline_files: HashMap::new(),
+        };
+        paths.for_each(|path| {
+            let path = path.as_ref();
+            if let Err(e) = try_parse_index_entry(&mut index, conf, path) {
+                debug!(
+                    "Failed to parse path '{}' as index entry: {:#}",
+                    path.display(),
+                    e
+                );
+            }
+        });
+        index
+    }
+
+    /// Looks up the entry tracked for the given tenant and timeline pair.
+    pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&TimelineIndexEntry> {
+        self.timeline_files.get(id)
+    }
+
+    /// Same as [`Self::timeline_entry`], but allows modifying the entry in place.
+    pub fn timeline_entry_mut(
+        &mut self,
+        id: &ZTenantTimelineId,
+    ) -> Option<&mut TimelineIndexEntry> {
+        self.timeline_files.get_mut(id)
+    }
+
+    /// Stores the entry for the given id, replacing the previous one, if any.
+    pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: TimelineIndexEntry) {
+        self.timeline_files.insert(id, entry);
+    }
+
+    /// Iterates over the ids of all timelines that have an entry in the index.
+    pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
+        self.timeline_files.keys().copied()
+    }
+
+    /// Registers a branch file for the tenant; adding the same path again changes nothing.
+    pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) {
+        let tenant_branches = self.branch_files.entry(tenant_id).or_default();
+        tenant_branches.insert(path);
+    }
+
+    /// Returns the branch files registered for the tenant, `None` if nothing was registered for it.
+    pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet<RelativePath>> {
+        self.branch_files.get(&tenant_id)
+    }
+}
+
+/// What the index knows about the remote archives of a single timeline.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TimelineIndexEntry {
+    /// Archives that were found on the remote storage, but not downloaded yet:
+    /// only the metadata from their storage paths is available, without archive contents.
+    Description(BTreeMap<ArchiveId, ArchiveDescription>),
+    /// Complete archive metadata, file list included, parsed from the archive header.
+    Full(RemoteTimeline),
+}
+
+impl TimelineIndexEntry {
+    /// Collects the Lsns of all checkpoints that are tracked as uploaded for the timeline.
+    pub fn uploaded_checkpoints(&self) -> BTreeSet<Lsn> {
+        let mut checkpoint_lsns = BTreeSet::new();
+        match self {
+            Self::Description(descriptions) => {
+                checkpoint_lsns.extend(descriptions.keys().map(|&ArchiveId(lsn)| lsn));
+            }
+            Self::Full(remote_timeline) => {
+                checkpoint_lsns.extend(
+                    remote_timeline
+                        .checkpoint_archives
+                        .keys()
+                        .map(|&ArchiveId(lsn)| lsn),
+                );
+            }
+        }
+        checkpoint_lsns
+    }
+
+    /// Gets the disk consistent Lsn of the latest uploaded checkpoint of the timeline, `None` if there are no checkpoints.
+    pub fn disk_consistent_lsn(&self) -> Option<Lsn> {
+        // Archive ids are ordered by the Lsn they wrap, so the greatest id holds the greatest Lsn.
+        let latest_archive_id = match self {
+            Self::Description(descriptions) => descriptions.keys().max(),
+            Self::Full(remote_timeline) => remote_timeline.checkpoint_archives.keys().max(),
+        };
+        latest_archive_id.map(|&ArchiveId(lsn)| lsn)
+    }
+}
+
+/// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
+pub struct ArchiveId(pub(super) Lsn);
+
+/// Id of a file inside a checkpoint archive: the archive the file belongs to and the file's position in that archive's header.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
+struct FileId(ArchiveId, ArchiveEntryNumber);
+
+/// Zero-based position of a file entry in the file list of an archive header.
+type ArchiveEntryNumber = usize;
+
+/// All archives and files in them, representing a certain timeline.
+/// Uses file and archive IDs to reference those without ownership issues.
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct RemoteTimeline {
+    // File entries of all archives, keyed by the (archive id, position in the archive header) pair.
+    timeline_files: BTreeMap<FileId, FileEntry>,
+    // Per-archive metadata; the files of an archive are referenced by their ids into `timeline_files`.
+    checkpoint_archives: BTreeMap<ArchiveId, CheckpointArchive>,
+}
+
+/// Archive metadata, enough to restore a header with the timeline data.
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct CheckpointArchive {
+    // Lsn the archive was registered with; the same value is wrapped by the archive's id.
+    disk_consistent_lsn: Lsn,
+    // Copied from the `metadata_file_size` field of the archive header.
+    metadata_file_size: u64,
+    // Ids of the files that belong to this archive.
+    files: BTreeSet<FileId>,
+    // Header size that was reported when the archive got registered.
+    archive_header_size: u64,
+}
+
+impl CheckpointArchive {
+    /// Returns the disk consistent Lsn of the checkpoint stored in this archive.
+    pub fn disk_consistent_lsn(&self) -> Lsn {
+        self.disk_consistent_lsn
+    }
+}
+
+impl RemoteTimeline {
+    /// Creates a timeline that tracks neither archives nor files.
+    pub fn empty() -> Self {
+        Self {
+            checkpoint_archives: BTreeMap::new(),
+            timeline_files: BTreeMap::new(),
+        }
+    }
+
+    /// Iterates over the disk consistent Lsns of all tracked checkpoint archives.
+    pub fn checkpoints(&self) -> impl Iterator<Item = Lsn> + '_ {
+        self.checkpoint_archives
+            .values()
+            .map(|archive| archive.disk_consistent_lsn)
+    }
+
+    /// Lists all relish files of the remote timeline as paths inside `timeline_dir`. The metadata file is not included.
+    pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet<PathBuf> {
+        let mut stored = BTreeSet::new();
+        for file_entry in self.timeline_files.values() {
+            stored.insert(file_entry.subpath.as_path(timeline_dir));
+        }
+        stored
+    }
+
+    /// Tells whether an archive for the checkpoint with the given Lsn is tracked.
+    pub fn contains_checkpoint_at(&self, disk_consistent_lsn: Lsn) -> bool {
+        self.archive_data(ArchiveId(disk_consistent_lsn)).is_some()
+    }
+
+    /// Returns the metadata of the archive with the given id, if it is tracked.
+    pub fn archive_data(&self, archive_id: ArchiveId) -> Option<&CheckpointArchive> {
+        self.checkpoint_archives.get(&archive_id)
+    }
+
+    /// Rebuilds the header of a remote archive out of the data kept in memory.
+    /// Returns the header together with its compressed size in the archive: both are needed to uncompress the archive.
+    ///
+    /// Fails if the archive is not tracked, if its file positions have a gap, or if a file entry is missing.
+    pub fn restore_header(&self, archive_id: ArchiveId) -> anyhow::Result<(ArchiveHeader, u64)> {
+        let archive = self
+            .archive_data(archive_id)
+            .with_context(|| format!("Archive {:?} not found", archive_id))?;
+
+        // File ids are iterated in their sorted order, so the positions must come out as 0, 1, 2, ... without gaps.
+        let header_files = archive
+            .files
+            .iter()
+            .enumerate()
+            .map(|(expected_position, file_id)| {
+                let &FileId(file_archive_id, actual_position) = file_id;
+                ensure!(
+                    expected_position == actual_position,
+                    "Archive header is corrupt, file # {} from archive {:?} header is missing",
+                    expected_position,
+                    file_archive_id,
+                );
+                let file_entry = self.timeline_files.get(file_id).with_context(|| {
+                    format!(
+                        "File with id {:?} not found for archive {:?}",
+                        file_id, file_archive_id
+                    )
+                })?;
+                Ok::<_, anyhow::Error>(file_entry.clone())
+            })
+            .collect::<anyhow::Result<Vec<_>>>()?;
+
+        let restored_header = ArchiveHeader {
+            files: header_files,
+            metadata_file_size: archive.metadata_file_size,
+        };
+        Ok((restored_header, archive.archive_header_size))
+    }
+
+    /// Registers the contents of an archive, creating the archive record if it is not tracked yet.
+    /// For an already tracked archive, only the file data is updated: the sizes stored at its creation are kept.
+    pub fn update_archive_contents(
+        &mut self,
+        disk_consistent_lsn: Lsn,
+        header: ArchiveHeader,
+        header_size: u64,
+    ) {
+        let archive_id = ArchiveId(disk_consistent_lsn);
+        let ArchiveHeader {
+            files,
+            metadata_file_size,
+        } = header;
+
+        let archive = self
+            .checkpoint_archives
+            .entry(archive_id)
+            .or_insert_with(|| CheckpointArchive {
+                disk_consistent_lsn,
+                metadata_file_size,
+                files: BTreeSet::new(),
+                archive_header_size: header_size,
+            });
+        // The position of a file in the header becomes a part of its id.
+        for (file_index, file_entry) in files.into_iter().enumerate() {
+            let file_id = FileId(archive_id, file_index);
+            self.timeline_files.insert(file_id, file_entry);
+            archive.files.insert(file_id);
+        }
+    }
+}
+
+/// Metadata about timeline checkpoint archive, parsed from its remote storage path.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ArchiveDescription {
+    /// Header size, as returned by `parse_archive_name` for the archive path.
+    pub header_size: u64,
+    /// Disk consistent Lsn, as returned by `parse_archive_name` for the archive path.
+    pub disk_consistent_lsn: Lsn,
+    /// File name (the last path component) of the archive.
+    pub archive_name: String,
+}
+
+/// Parses a single path and registers it in the index, either as a tenant branch file or as a timeline archive description.
+///
+/// The tenant id is taken from the first path segment after the tenants directory. A path under the tenant's
+/// branches directory is added as a branch file; a path under the tenant's timelines directory has to contain
+/// a timeline id segment and a parsable archive name, and is added as an archive description.
+///
+/// Returns an error if the path is outside of the tenants directory, if an id or the archive name cannot be parsed,
+/// if the path matches neither of the two locations, or if the timeline already has a `Full` entry in the index.
+fn try_parse_index_entry(
+    index: &mut RemoteTimelineIndex,
+    conf: &'static PageServerConf,
+    path: &Path,
+) -> anyhow::Result<()> {
+    let tenants_dir = conf.tenants_path();
+    // The first path segment after the tenants directory is expected to be the tenant id.
+    let tenant_id = path
+        .strip_prefix(&tenants_dir)
+        .with_context(|| {
+            format!(
+                "Path '{}' does not belong to tenants directory '{}'",
+                path.display(),
+                tenants_dir.display(),
+            )
+        })?
+        .iter()
+        .next()
+        .with_context(|| format!("Found no tenant id in path '{}'", path.display()))?
+        .to_string_lossy()
+        .parse::<ZTenantId>()
+        .with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;
+
+    let branches_path = conf.branches_path(&tenant_id);
+    let timelines_path = conf.timelines_path(&tenant_id);
+    // Try both locations at once, so that every combination of the outcomes gets its own handling below.
+    match (
+        RelativePath::new(&branches_path, &path),
+        path.strip_prefix(&timelines_path),
+    ) {
+        (Ok(_), Ok(_)) => bail!(
+            "Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes",
+            path.display(),
+            branches_path.display(),
+            timelines_path.display()
+        ),
+        (Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry),
+        (Err(_), Ok(timelines_subpath)) => {
+            // The first segment under the timelines directory is expected to be the timeline id.
+            let mut segments = timelines_subpath.iter();
+            let timeline_id = segments
+                .next()
+                .with_context(|| {
+                    format!(
+                        "{} directory of tenant {} (path '{}') is not an index entry",
+                        TIMELINES_SEGMENT_NAME,
+                        tenant_id,
+                        path.display()
+                    )
+                })?
+                .to_string_lossy()
+                .parse::<ZTimelineId>()
+                .with_context(|| {
+                    format!("Failed to parse timeline id from path '{}'", path.display())
+                })?;
+
+            let (disk_consistent_lsn, header_size) =
+                parse_archive_name(path).with_context(|| {
+                    format!(
+                        "Failed to parse archive name out in path '{}'",
+                        path.display()
+                    )
+                })?;
+
+            let archive_name = path
+                .file_name()
+                .with_context(|| format!("Archive '{}' has no file name", path.display()))?
+                .to_string_lossy()
+                .to_string();
+
+            let sync_id = ZTenantTimelineId {
+                tenant_id,
+                timeline_id,
+            };
+            // A timeline seen for the first time starts with an empty set of descriptions.
+            let timeline_index_entry = index
+                .timeline_files
+                .entry(sync_id)
+                .or_insert_with(|| TimelineIndexEntry::Description(BTreeMap::new()));
+            match timeline_index_entry {
+                TimelineIndexEntry::Description(descriptions) => {
+                    descriptions.insert(
+                        ArchiveId(disk_consistent_lsn),
+                        ArchiveDescription {
+                            header_size,
+                            disk_consistent_lsn,
+                            archive_name,
+                        },
+                    );
+                }
+                // Path-derived descriptions are never mixed into an entry that already holds full archive data.
+                TimelineIndexEntry::Full(_) => {
+                    bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id)
+                }
+            }
+        }
+        (Err(branches_error), Err(timelines_strip_error)) => {
+            bail!(
+                "Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'",
+                path.display(),
+                branches_error,
+                timelines_strip_error,
+            )
+        }
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A header registered in a remote timeline has to be restored with its files in the very same order,
+    /// even though the file names here are not sorted alphabetically.
+    #[test]
+    fn header_restoration_preserves_file_order() {
+        let file_entry = |size, name: &str| FileEntry {
+            size,
+            subpath: RelativePath(name.to_string()),
+        };
+        let original_header = ArchiveHeader {
+            files: vec![
+                file_entry(5, "one"),
+                file_entry(1, "two"),
+                file_entry(222, "zero"),
+            ],
+            metadata_file_size: 5,
+        };
+        let archive_lsn = Lsn(1);
+
+        let mut timeline = RemoteTimeline::empty();
+        timeline.update_archive_contents(archive_lsn, original_header.clone(), 15);
+        let (restored_header, _) = timeline
+            .restore_header(ArchiveId(archive_lsn))
+            .expect("Should be able to restore header from a valid remote timeline");
+
+        assert_eq!(
+            original_header, restored_header,
+            "Header restoration should preserve file order"
+        );
+    }
+}
--- a/pageserver/src/remote_storage/storage_sync/upload.rs
+++ b/pageserver/src/remote_storage/storage_sync/upload.rs
@@ -0,0 +1,573 @@
+//! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.
+//! Currently, tenant branch files are also uploaded, but this does not appear final.
+
+use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
+
+use anyhow::{ensure, Context};
+use futures::{stream::FuturesUnordered, StreamExt};
+use tokio::{fs, sync::RwLock};
+use tracing::{debug, error, warn};
+use zenith_utils::zid::ZTenantId;
+
+use crate::{
+    config::PageServerConf,
+    remote_storage::{
+        storage_sync::{
+            compression,
+            index::{RemoteTimeline, TimelineIndexEntry},
+            sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask,
+        },
+        RemoteStorage, ZTenantTimelineId,
+    },
+};
+
+use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoint};
+
+/// Attempts to compress and upload given checkpoint files.
+/// No extra checks for overlapping files are made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
+///
+/// Before the checkpoint files are uploaded, branch files are uploaded, if any local ones are missing remotely.
+///
+/// On an error, pushes the entire task back into the sync queue with the given `retries` value.
+/// NOTE(review): the counter is not incremented in this function — presumably `SyncTask::new` or the queue consumer bumps it; confirm.
+/// On success, populates index data with the new upload.
+///
+/// Returns `None` if a checkpoint with the same Lsn is already tracked as uploaded and the upload was skipped,
+/// `Some(true)` if the checkpoint was uploaded and the index was updated,
+/// `Some(false)` if any step failed and the task was requeued.
+pub(super) async fn upload_timeline_checkpoint<
+    P: std::fmt::Debug + Send + Sync + 'static,
+    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+>(
+    config: &'static PageServerConf,
+    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
+    sync_id: ZTenantTimelineId,
+    new_checkpoint: NewCheckpoint,
+    retries: u32,
+) -> Option<bool> {
+    debug!("Uploading checkpoint for sync id {}", sync_id);
+    if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.tenant_id).await
+    {
+        error!(
+            "Failed to upload missing branches for sync id {}: {:?}",
+            sync_id, e
+        );
+        sync_queue::push(SyncTask::new(
+            sync_id,
+            retries,
+            SyncKind::Upload(new_checkpoint),
+        ));
+        return Some(false);
+    }
+    let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn();
+
+    let index = &remote_assets.1;
+
+    let ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    } = sync_id;
+    let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
+
+    // NOTE(review): this read guard stays alive until the explicit `drop(index_read)` below, that is,
+    // also across the `update_index_description(..).await` call. If that helper locks the same index
+    // for writing, the task would wait on itself — confirm.
+    let index_read = index.read().await;
+    // `Cow` allows borrowing the timeline from the index when it is fully known
+    // and owning it when it had to be fetched.
+    let remote_timeline = match index_read.timeline_entry(&sync_id) {
+        None => None,
+        Some(TimelineIndexEntry::Full(remote_timeline)) => Some(Cow::Borrowed(remote_timeline)),
+        Some(TimelineIndexEntry::Description(_)) => {
+            debug!("Found timeline description for the given ids, downloading the full index");
+            match update_index_description(remote_assets.as_ref(), &timeline_dir, sync_id).await {
+                Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)),
+                Err(e) => {
+                    error!("Failed to download full timeline index: {:?}", e);
+                    sync_queue::push(SyncTask::new(
+                        sync_id,
+                        retries,
+                        SyncKind::Upload(new_checkpoint),
+                    ));
+                    return Some(false);
+                }
+            }
+        }
+    };
+
+    // A checkpoint that is already tracked remotely is not uploaded again and not requeued.
+    let already_contains_upload_lsn = remote_timeline
+        .as_ref()
+        .map(|remote_timeline| remote_timeline.contains_checkpoint_at(new_upload_lsn))
+        .unwrap_or(false);
+    if already_contains_upload_lsn {
+        warn!(
+            "Received a checkpoint with Lsn {} that's already been uploaded to remote storage, skipping the upload.",
+            new_upload_lsn
+        );
+        return None;
+    }
+
+    // Files that are tracked remotely already; an unknown timeline yields an empty set.
+    let already_uploaded_files = remote_timeline
+        .map(|timeline| timeline.stored_files(&timeline_dir))
+        .unwrap_or_default();
+    drop(index_read);
+
+    match try_upload_checkpoint(
+        config,
+        Arc::clone(&remote_assets),
+        sync_id,
+        &new_checkpoint,
+        already_uploaded_files,
+    )
+    .await
+    {
+        Ok((archive_header, header_size)) => {
+            let mut index_write = index.write().await;
+            match index_write.timeline_entry_mut(&sync_id) {
+                Some(TimelineIndexEntry::Full(remote_timeline)) => {
+                    remote_timeline.update_archive_contents(
+                        new_checkpoint.metadata.disk_consistent_lsn(),
+                        archive_header,
+                        header_size,
+                    );
+                }
+                // Without a full entry, a new one is created that contains the uploaded archive only;
+                // it replaces a `Description` entry, if there was one.
+                None | Some(TimelineIndexEntry::Description(_)) => {
+                    let mut new_timeline = RemoteTimeline::empty();
+                    new_timeline.update_archive_contents(
+                        new_checkpoint.metadata.disk_consistent_lsn(),
+                        archive_header,
+                        header_size,
+                    );
+                    index_write.add_timeline_entry(sync_id, TimelineIndexEntry::Full(new_timeline));
+                }
+            }
+            debug!("Checkpoint uploaded successfully");
+            Some(true)
+        }
+        Err(e) => {
+            error!(
+                "Failed to upload checkpoint: {:?}, requeueing the upload",
+                e
+            );
+            sync_queue::push(SyncTask::new(
+                sync_id,
+                retries,
+                SyncKind::Upload(new_checkpoint),
+            ));
+            Some(false)
+        }
+    }
+}
+
+/// Archives the checkpoint layers that are not uploaded yet together with the checkpoint metadata
+/// and streams the resulting archive to the remote storage.
+///
+/// Layers listed in `files_to_skip` are left out, every omission is logged; it is an error if no layers are left.
+/// On success, returns the header of the created archive and the header size.
+async fn try_upload_checkpoint<
+    P: Send + Sync + 'static,
+    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+>(
+    config: &'static PageServerConf,
+    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
+    sync_id: ZTenantTimelineId,
+    new_checkpoint: &NewCheckpoint,
+    files_to_skip: BTreeSet<PathBuf>,
+) -> anyhow::Result<(ArchiveHeader, u64)> {
+    let ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    } = sync_id;
+
+    let mut files_to_upload = Vec::new();
+    for layer_path in new_checkpoint.layers.iter() {
+        if files_to_skip.contains(layer_path) {
+            error!(
+                "Skipping file upload '{}', since it was already uploaded",
+                layer_path.display()
+            );
+            continue;
+        }
+        files_to_upload.push(layer_path);
+    }
+    ensure!(!files_to_upload.is_empty(), "No files to upload");
+
+    let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
+    let (archive_header, header_size, _) = compression::archive_files_as_stream(
+        &timeline_dir,
+        files_to_upload.into_iter(),
+        &new_checkpoint.metadata,
+        // The closure owns everything it needs, hence the timeline path is computed anew inside of it.
+        move |archive_streamer, archive_name| async move {
+            let archive_local_path = config
+                .timeline_path(&timeline_id, &tenant_id)
+                .join(&archive_name);
+            let remote_storage = &remote_assets.0;
+            let archive_storage_path = remote_storage.storage_path(&archive_local_path)?;
+            remote_storage
+                .upload(archive_streamer, &archive_storage_path)
+                .await
+        },
+    )
+    .await?;
+
+    Ok((archive_header, header_size))
+}
+
+/// Uploads the tenant's branch files that are present locally but absent from the
+/// remote index, and registers every successfully uploaded file in that index.
+///
+/// Every upload is attempted: a failed one is logged and does not stop the rest.
+/// Returns an error after all uploads have finished if at least one of them failed,
+/// or earlier if the local branch files cannot be listed.
+async fn upload_missing_branches<
+    P: std::fmt::Debug + Send + Sync + 'static,
+    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+>(
+    config: &'static PageServerConf,
+    (storage, index): &(S, RwLock<RemoteTimelineIndex>),
+    tenant_id: ZTenantId,
+) -> anyhow::Result<()> {
+    let local_branches = tenant_branch_files(config, tenant_id)
+        .await
+        .context("Failed to list local branch files for the tenant")?;
+    // Copy the remote branch set out of the index, so the read lock is released
+    // before any upload starts.
+    let remote_branches = {
+        let index_guard = index.read().await;
+        index_guard
+            .branch_files(tenant_id)
+            .cloned()
+            .unwrap_or_default()
+    };
+
+    let mut pending_uploads = local_branches
+        .difference(&remote_branches)
+        .map(|branch| async move {
+            let branch_local_path = branch.as_path(&config.branches_path(&tenant_id));
+            let branch_storage_path =
+                storage.storage_path(&branch_local_path).with_context(|| {
+                    format!(
+                        "Failed to derive a storage path for branch with local path '{}'",
+                        branch_local_path.display()
+                    )
+                })?;
+            let branch_file = fs::OpenOptions::new()
+                .read(true)
+                .open(&branch_local_path)
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to open local branch file {} for reading",
+                        branch_local_path.display()
+                    )
+                })?;
+            storage
+                .upload(branch_file, &branch_storage_path)
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to upload branch file to the remote path {:?}",
+                        branch_storage_path
+                    )
+                })?;
+            Ok::<_, anyhow::Error>(branch)
+        })
+        .collect::<FuturesUnordered<_>>();
+
+    // Drain the uploads in completion order, remembering whether any of them failed.
+    let mut any_upload_failed = false;
+    while let Some(upload_outcome) = pending_uploads.next().await {
+        let uploaded_branch = match upload_outcome {
+            Ok(branch) => branch,
+            Err(e) => {
+                error!("Failed to upload branch file: {:?}", e);
+                any_upload_failed = true;
+                continue;
+            }
+        };
+        index
+            .write()
+            .await
+            .add_branch_file(tenant_id, uploaded_branch.clone());
+    }
+
+    ensure!(!any_upload_failed, "Failed to upload all branch files");
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use tempfile::tempdir;
+    use zenith_utils::lsn::Lsn;
+
+    use crate::{
+        remote_storage::{
+            local_fs::LocalFs,
+            storage_sync::{
+                index::ArchiveId,
+                test_utils::{
+                    assert_index_descriptions, create_local_timeline, dummy_metadata,
+                    ensure_correct_timeline_upload, expect_timeline,
+                },
+            },
+        },
+        repository::repo_harness::{RepoHarness, TIMELINE_ID},
+    };
+
+    use super::*;
+
+    /// Uploads three checkpoints of the same timeline (Lsn 0x10, 0x40 and then the older 0x20)
+    /// and checks the archives, the latest Lsn and the stored files in the index after each upload.
+    #[tokio::test]
+    async fn reupload_timeline() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("reupload_timeline")?;
+        let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
+        // Keep the `TempDir` guard bound until the end of the test: the directory is removed
+        // when the guard is dropped, which must not happen while the storage is still in use.
+        let storage_dir = tempdir()?;
+        let storage = LocalFs::new(storage_dir.path().to_owned(), &repo_harness.conf.workdir)?;
+        let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
+            repo_harness.conf,
+            storage
+                .list()
+                .await?
+                .into_iter()
+                .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+        ));
+        let remote_assets = Arc::new((storage, index));
+        let index = &remote_assets.1;
+
+        // First upload: files "a" and "b" at Lsn 0x10.
+        let first_upload_metadata = dummy_metadata(Lsn(0x10));
+        let first_checkpoint = create_local_timeline(
+            &repo_harness,
+            TIMELINE_ID,
+            &["a", "b"],
+            first_upload_metadata.clone(),
+        )?;
+        let local_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
+        ensure_correct_timeline_upload(
+            &repo_harness,
+            Arc::clone(&remote_assets),
+            TIMELINE_ID,
+            first_checkpoint,
+        )
+        .await;
+
+        let uploaded_timeline = expect_timeline(index, sync_id).await;
+        let uploaded_archives = uploaded_timeline
+            .checkpoints()
+            .map(ArchiveId)
+            .collect::<Vec<_>>();
+        assert_eq!(
+            uploaded_archives.len(),
+            1,
+            "Only one archive is expected after a first upload"
+        );
+        let first_uploaded_archive = uploaded_archives.first().copied().unwrap();
+        assert_eq!(
+            uploaded_timeline.checkpoints().max(),
+            Some(first_upload_metadata.disk_consistent_lsn()),
+            "Metadata that was uploaded, should have its Lsn stored"
+        );
+        assert_eq!(
+            uploaded_timeline
+                .archive_data(first_uploaded_archive)
+                .unwrap()
+                .disk_consistent_lsn(),
+            first_upload_metadata.disk_consistent_lsn(),
+            "Uploaded archive should have corresponding Lsn"
+        );
+        assert_eq!(
+            uploaded_timeline.stored_files(&local_timeline_path),
+            vec![local_timeline_path.join("a"), local_timeline_path.join("b")]
+                .into_iter()
+                .collect(),
+            "Should have all files from the first checkpoint"
+        );
+
+        // Second upload: a newer checkpoint (Lsn 0x40) that shares file "b" with the first one.
+        let second_upload_metadata = dummy_metadata(Lsn(0x40));
+        let second_checkpoint = create_local_timeline(
+            &repo_harness,
+            TIMELINE_ID,
+            &["b", "c"],
+            second_upload_metadata.clone(),
+        )?;
+        assert!(
+            first_upload_metadata.disk_consistent_lsn()
+                < second_upload_metadata.disk_consistent_lsn()
+        );
+        ensure_correct_timeline_upload(
+            &repo_harness,
+            Arc::clone(&remote_assets),
+            TIMELINE_ID,
+            second_checkpoint,
+        )
+        .await;
+
+        let updated_timeline = expect_timeline(index, sync_id).await;
+        let mut updated_archives = updated_timeline
+            .checkpoints()
+            .map(ArchiveId)
+            .collect::<Vec<_>>();
+        assert_eq!(
+            updated_archives.len(),
+            2,
+            "Two archives are expected after a successful update of the upload"
+        );
+        updated_archives.retain(|archive_id| archive_id != &first_uploaded_archive);
+        assert_eq!(
+            updated_archives.len(),
+            1,
+            "Only one new archive is expected among the uploaded"
+        );
+        let second_uploaded_archive = updated_archives.last().copied().unwrap();
+        assert_eq!(
+            updated_timeline.checkpoints().max(),
+            Some(second_upload_metadata.disk_consistent_lsn()),
+            "Metadata that was uploaded, should have its Lsn stored"
+        );
+        assert_eq!(
+            updated_timeline
+                .archive_data(second_uploaded_archive)
+                .unwrap()
+                .disk_consistent_lsn(),
+            second_upload_metadata.disk_consistent_lsn(),
+            "Uploaded archive should have corresponding Lsn"
+        );
+        assert_eq!(
+            updated_timeline.stored_files(&local_timeline_path),
+            vec![
+                local_timeline_path.join("a"),
+                local_timeline_path.join("b"),
+                local_timeline_path.join("c"),
+            ]
+            .into_iter()
+            .collect(),
+            "Should have all files from both checkpoints without duplicates"
+        );
+
+        // Third upload: a checkpoint with an Lsn (0x20) between the two uploaded ones.
+        let third_upload_metadata = dummy_metadata(Lsn(0x20));
+        let third_checkpoint = create_local_timeline(
+            &repo_harness,
+            TIMELINE_ID,
+            &["d"],
+            third_upload_metadata.clone(),
+        )?;
+        assert_ne!(
+            third_upload_metadata.disk_consistent_lsn(),
+            first_upload_metadata.disk_consistent_lsn()
+        );
+        assert!(
+            third_upload_metadata.disk_consistent_lsn()
+                < second_upload_metadata.disk_consistent_lsn()
+        );
+        ensure_correct_timeline_upload(
+            &repo_harness,
+            Arc::clone(&remote_assets),
+            TIMELINE_ID,
+            third_checkpoint,
+        )
+        .await;
+
+        let updated_timeline = expect_timeline(index, sync_id).await;
+        let mut updated_archives = updated_timeline
+            .checkpoints()
+            .map(ArchiveId)
+            .collect::<Vec<_>>();
+        assert_eq!(
+            updated_archives.len(),
+            3,
+            "Three archives are expected after two successful updates of the upload"
+        );
+        updated_archives.retain(|archive_id| {
+            archive_id != &first_uploaded_archive && archive_id != &second_uploaded_archive
+        });
+        assert_eq!(
+            updated_archives.len(),
+            1,
+            "Only one new archive is expected among the uploaded"
+        );
+        let third_uploaded_archive = updated_archives.last().copied().unwrap();
+        assert!(
+            updated_timeline.checkpoints().max().unwrap()
+                > third_upload_metadata.disk_consistent_lsn(),
+            "Should not influence the last lsn by uploading an older checkpoint"
+        );
+        assert_eq!(
+            updated_timeline
+                .archive_data(third_uploaded_archive)
+                .unwrap()
+                .disk_consistent_lsn(),
+            third_upload_metadata.disk_consistent_lsn(),
+            "Uploaded archive should have corresponding Lsn"
+        );
+        assert_eq!(
+            updated_timeline.stored_files(&local_timeline_path),
+            vec![
+                local_timeline_path.join("a"),
+                local_timeline_path.join("b"),
+                local_timeline_path.join("c"),
+                local_timeline_path.join("d"),
+            ]
+            .into_iter()
+            .collect(),
+            "Should have all files from three checkpoints without duplicates"
+        );
+
+        Ok(())
+    }
+
+    /// After one regular upload, tries to upload a checkpoint without files and a checkpoint
+    /// that reuses the already uploaded Lsn, checking the index against the state captured
+    /// right after the first upload each time.
+    #[tokio::test]
+    async fn reupload_timeline_rejected() -> anyhow::Result<()> {
+        let repo_harness = RepoHarness::create("reupload_timeline_rejected")?;
+        let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
+        // Keep the `TempDir` guard bound until the end of the test: the directory is removed
+        // when the guard is dropped, which must not happen while the storage is still in use.
+        let storage_dir = tempdir()?;
+        let storage = LocalFs::new(storage_dir.path().to_owned(), &repo_harness.conf.workdir)?;
+        let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
+            repo_harness.conf,
+            storage
+                .list()
+                .await?
+                .into_iter()
+                .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+        ));
+        let remote_assets = Arc::new((storage, index));
+        let storage = &remote_assets.0;
+        let index = &remote_assets.1;
+
+        let first_upload_metadata = dummy_metadata(Lsn(0x10));
+        let first_checkpoint = create_local_timeline(
+            &repo_harness,
+            TIMELINE_ID,
+            &["a", "b"],
+            first_upload_metadata.clone(),
+        )?;
+        ensure_correct_timeline_upload(
+            &repo_harness,
+            Arc::clone(&remote_assets),
+            TIMELINE_ID,
+            first_checkpoint,
+        )
+        .await;
+        // Index contents rebuilt from what is in the storage after the first upload,
+        // used as the reference state for the rejected uploads below.
+        let after_first_uploads = RemoteTimelineIndex::try_parse_descriptions_from_paths(
+            repo_harness.conf,
+            remote_assets
+                .0
+                .list()
+                .await
+                .unwrap()
+                .into_iter()
+                .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+        );
+
+        let normal_upload_metadata = dummy_metadata(Lsn(0x20));
+        assert_ne!(
+            normal_upload_metadata.disk_consistent_lsn(),
+            first_upload_metadata.disk_consistent_lsn()
+        );
+
+        // A checkpoint with a new Lsn, but without any files.
+        let checkpoint_with_no_files = create_local_timeline(
+            &repo_harness,
+            TIMELINE_ID,
+            &[],
+            normal_upload_metadata.clone(),
+        )?;
+        upload_timeline_checkpoint(
+            repo_harness.conf,
+            Arc::clone(&remote_assets),
+            sync_id,
+            checkpoint_with_no_files,
+            0,
+        )
+        .await;
+        assert_index_descriptions(index, after_first_uploads.clone()).await;
+
+        // A checkpoint with new files, but with the Lsn of the first upload.
+        let checkpoint_with_uploaded_lsn = create_local_timeline(
+            &repo_harness,
+            TIMELINE_ID,
+            &["something", "new"],
+            first_upload_metadata.clone(),
+        )?;
+        upload_timeline_checkpoint(
+            repo_harness.conf,
+            Arc::clone(&remote_assets),
+            sync_id,
+            checkpoint_with_uploaded_lsn,
+            0,
+        )
+        .await;
+        assert_index_descriptions(index, after_first_uploads.clone()).await;
+
+        Ok(())
+    }
+}
--- a/Show More
+++ b/Show More