Fix cherry-pick

Allow prev_lsn hint for fullbackup (#2052 )
Fix test. Add zenith.signal to fullbackup
2026-02-15 16:40:37 +00:00 · 2022-07-12 13:11:06 -04:00 · 2022-07-12 13:10:58 -04:00 · 2022-07-12 13:10:51 -04:00 · 2022-07-12 13:10:36 -04:00 · 2022-03-21 12:46:07 +02:00
202 changed files with 17116 additions and 7820 deletions
--- a/.circleci/ansible/ansible.cfg
+++ b/.circleci/ansible/ansible.cfg
@@ -0,0 +1,10 @@
+[defaults]
+
+localhost_warning = False
+host_key_checking = False
+timeout = 30
+
+[ssh_connection]
+ssh_args   = -F ./ansible.ssh.cfg
+scp_if_ssh = True
+pipelining = True
--- a/.circleci/ansible/ansible.ssh.cfg
+++ b/.circleci/ansible/ansible.ssh.cfg
@@ -0,0 +1,11 @@
+Host tele.zenith.tech
+    User admin
+    Port 3023
+    StrictHostKeyChecking no
+    UserKnownHostsFile /dev/null
+
+Host * !tele.zenith.tech
+    User admin
+    StrictHostKeyChecking no
+    UserKnownHostsFile /dev/null
+    ProxyJump tele.zenith.tech
--- a/.circleci/ansible/deploy.yaml
+++ b/.circleci/ansible/deploy.yaml
@@ -0,0 +1,174 @@
+- name: Upload Zenith binaries
+  hosts: pageservers:safekeepers
+  gather_facts: False
+  remote_user: admin
+  vars:
+    force_deploy: false
+
+  tasks:
+    # current_version: version of the local tarball; remote_version: version found on the host.
+    - name: get latest version of Zenith binaries
+      ignore_errors: true
+      register: current_version_file
+      set_fact:
+        current_version: "{{ lookup('file', '.zenith_current_version') | trim }}"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: set zero value for current_version
+      when: current_version_file is failed
+      set_fact:
+        current_version: "0"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: get deployed version from content of remote file
+      ignore_errors: true
+      ansible.builtin.slurp:
+        src: /usr/local/.zenith_current_version
+      register: remote_version_file
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: decode remote file content
+      when: remote_version_file is succeeded
+      set_fact:
+        remote_version: "{{ remote_version_file['content'] | b64decode | trim }}"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: set zero value for remote_version
+      when: remote_version_file is failed
+      set_fact:
+        remote_version: "0"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: inform about versions
+      debug: msg="Version to deploy - {{ current_version }}, version on storage node - {{ remote_version }}"
+      tags:
+      - pageserver
+      - safekeeper
+
+    # Compare versions as integers (as strings "100" > "99" is false); force_deploy may be a string when passed via -e.
+    - name: upload and extract Zenith binaries to /usr/local
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.unarchive:
+        owner: root
+        group: root
+        src: zenith_install.tar.gz
+        dest: /usr/local
+      become: true
+      tags:
+      - pageserver
+      - safekeeper
+      - binaries
+      - putbinaries
+
+- name: Deploy pageserver
+  hosts: pageservers
+  gather_facts: False
+  remote_user: admin
+  vars:
+    force_deploy: false
+  # current_version / remote_version are string facts from the "Upload Zenith binaries" play; compare them as integers.
+  tasks:
+    - name: init pageserver
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      shell:
+        cmd: sudo -u pageserver /usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
+      args:
+        creates: "/storage/pageserver/data/tenants"
+      environment:
+        ZENITH_REPO_DIR: "/storage/pageserver/data"
+        LD_LIBRARY_PATH: "/usr/local/lib"
+      become: true
+      tags:
+      - pageserver
+
+    - name: upload systemd service definition
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.template:
+        src: systemd/pageserver.service
+        dest: /etc/systemd/system/pageserver.service
+        owner: root
+        group: root
+        mode: '0644'
+      become: true
+      tags:
+      - pageserver
+
+    - name: start systemd service
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.systemd:
+        daemon_reload: yes
+        name: pageserver
+        enabled: yes
+        state: restarted
+      become: true
+      tags:
+      - pageserver
+
+    - name: post version to console
+      when: ((current_version | int) > (remote_version | int) or (force_deploy | bool)) and console_mgmt_base_url is defined
+      shell:
+        cmd: |
+          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+          curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
+      tags:
+      - pageserver
+
+- name: Deploy safekeeper
+  hosts: safekeepers
+  gather_facts: False
+  remote_user: admin
+  vars:
+    force_deploy: false
+  # current_version / remote_version are string facts from the "Upload Zenith binaries" play; compare them as integers.
+  tasks:
+
+    # in the future safekeepers should discover pageservers by themselves
+    # but currently use first pageserver that was discovered
+    - name: set first pageserver var for safekeepers
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      set_fact:
+        first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}"
+      tags:
+      - safekeeper
+
+    - name: upload systemd service definition
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.template:
+        src: systemd/safekeeper.service
+        dest: /etc/systemd/system/safekeeper.service
+        owner: root
+        group: root
+        mode: '0644'
+      become: true
+      tags:
+      - safekeeper
+
+    - name: start systemd service
+      when: (current_version | int) > (remote_version | int) or (force_deploy | bool)
+      ansible.builtin.systemd:
+        daemon_reload: yes
+        name: safekeeper
+        enabled: yes
+        state: restarted
+      become: true
+      tags:
+      - safekeeper
+
+    - name: post version to console
+      when: ((current_version | int) > (remote_version | int) or (force_deploy | bool)) and console_mgmt_base_url is defined
+      shell:
+        cmd: |
+          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+          curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
+      tags:
+      - safekeeper
--- a/.circleci/ansible/get_binaries.sh
+++ b/.circleci/ansible/get_binaries.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+set -e
+
+RELEASE=${RELEASE:-false}
+
+# look at docker hub for latest tag fo zenith docker image
+if [ "${RELEASE}" = "true" ]; then
+    echo "search latest relase tag"
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
+    if [ -z "${VERSION}" ]; then
+        echo "no any docker tags found, exiting..."
+        exit 1
+    else
+        TAG="release-${VERSION}"
+    fi
+else
+    echo "search latest dev tag"
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep -v release | tail -1)
+    if [ -z "${VERSION}" ]; then
+        echo "no any docker tags found, exiting..."
+        exit 1
+    else
+        TAG="${VERSION}"
+    fi
+fi
+
+echo "found ${VERSION}"
+
+# do initial cleanup
+rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz .zenith_current_version
+mkdir zenith_install
+
+# retrive binaries from docker image
+echo "getting binaries from docker image"
+docker pull --quiet zenithdb/zenith:${TAG}
+ID=$(docker create zenithdb/zenith:${TAG})
+docker cp ${ID}:/data/postgres_install.tar.gz .
+tar -xzf postgres_install.tar.gz -C zenith_install
+docker cp ${ID}:/usr/local/bin/pageserver zenith_install/bin/
+docker cp ${ID}:/usr/local/bin/safekeeper zenith_install/bin/
+docker cp ${ID}:/usr/local/bin/proxy zenith_install/bin/
+docker cp ${ID}:/usr/local/bin/postgres zenith_install/bin/
+docker rm -vf ${ID}
+
+# store version to file (for ansible playbooks) and create binaries tarball
+echo ${VERSION} > zenith_install/.zenith_current_version
+echo ${VERSION} > .zenith_current_version
+tar -czf zenith_install.tar.gz -C zenith_install .
+
+# do final cleaup
+rm -rf zenith_install postgres_install.tar.gz
--- a/.circleci/ansible/production.hosts
+++ b/.circleci/ansible/production.hosts
@@ -0,0 +1,7 @@
+[pageservers]
+zenith-1-ps-1
+
+[safekeepers]
+zenith-1-sk-1
+zenith-1-sk-2
+zenith-1-sk-3
--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
@@ -0,0 +1,7 @@
+[pageservers]
+zenith-us-stage-ps-1
+
+[safekeepers]
+zenith-us-stage-sk-1
+zenith-us-stage-sk-2
+zenith-us-stage-sk-3
--- a/.circleci/ansible/systemd/pageserver.service
+++ b/.circleci/ansible/systemd/pageserver.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Zenith pageserver
+After=network.target auditd.service
+
+[Service]
+Type=simple
+User=pageserver
+Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
+ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data
+ExecReload=/bin/kill -HUP $MAINPID
+KillMode=mixed
+KillSignal=SIGINT
+Restart=on-failure
+TimeoutSec=10
+LimitNOFILE=30000000
+
+[Install]
+WantedBy=multi-user.target
--- a/.circleci/ansible/systemd/safekeeper.service
+++ b/.circleci/ansible/systemd/safekeeper.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Zenith safekeeper
+After=network.target auditd.service
+
+[Service]
+Type=simple
+User=safekeeper
+Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data
+ExecReload=/bin/kill -HUP $MAINPID
+KillMode=mixed
+KillSignal=SIGINT
+Restart=on-failure
+TimeoutSec=10
+LimitNOFILE=30000000
+
+[Install]
+WantedBy=multi-user.target
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,29 +1,28 @@
 version: 2.1

 executors:
-  zenith-build-executor:
+  zenith-xlarge-executor:
    resource_class: xlarge
    docker:
      # NB: when changed, do not forget to update rust image tag in all Dockerfiles
-      - image: cimg/rust:1.56.1
-  zenith-python-executor:
+      - image: zimg/rust:1.56
+  zenith-executor:
    docker:
-      - image: cimg/python:3.7.10  # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CirlceCI
+      - image: zimg/rust:1.56

 jobs:
  check-codestyle-rust:
-    executor: zenith-build-executor
+    executor: zenith-xlarge-executor
    steps:
      - checkout
      - run:
          name: rustfmt
          when: always
-          command: |
-            cargo fmt --all -- --check
+          command: cargo fmt --all -- --check

  # A job to build postgres
  build-postgres:
-    executor: zenith-build-executor
+    executor: zenith-xlarge-executor
    parameters:
      build_type:
        type: enum
@@ -38,8 +37,7 @@ jobs:
        # Note this works even though the submodule hasn't been checkout out yet.
      - run:
          name: Get postgres cache key
-          command: |
-            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
+          command: git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres

      - restore_cache:
          name: Restore postgres cache
@@ -47,15 +45,6 @@ jobs:
            # Restore ONLY if the rev key matches exactly
            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}

-        # FIXME We could cache our own docker container, instead of installing packages every time.
-      - run:
-          name: apt install dependencies
-          command: |
-            if [ ! -e tmp_install/bin/postgres ]; then
-              sudo apt update
-              sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
-            fi
-
        # Build postgres if the restore_cache didn't find a build.
        # `make` can't figure out whether the cache is valid, since
        # it only compares file timestamps.
@@ -65,7 +54,8 @@ jobs:
            if [ ! -e tmp_install/bin/postgres ]; then
              # "depth 1" saves some time by not cloning the whole repo
              git submodule update --init --depth 1
-              make postgres -j8
+              # bail out on any warnings
+              COPT='-Werror' mold -run make postgres -j$(nproc)
            fi

      - save_cache:
@@ -76,7 +66,7 @@ jobs:

  # A job to build zenith rust code
  build-zenith:
-    executor: zenith-build-executor
+    executor: zenith-xlarge-executor
    parameters:
      build_type:
        type: enum
@@ -84,12 +74,6 @@ jobs:
    environment:
      BUILD_TYPE: << parameters.build_type >>
    steps:
-      - run:
-          name: apt install dependencies
-          command: |
-            sudo apt update
-            sudo apt install libssl-dev clang
-
        # Checkout the git repo (without submodules)
      - checkout

@@ -127,7 +111,7 @@ jobs:
            fi

            export CARGO_INCREMENTAL=0
-            "${cov_prefix[@]}" cargo build $CARGO_FLAGS --bins --tests
+            "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests

      - save_cache:
          name: Save rust cache
@@ -211,6 +195,14 @@ jobs:
          command: |
            cp -a tmp_install /tmp/zenith/pg_install

+      - run:
+          name: Merge coverage data
+          command: |
+            # This will speed up workspace uploads
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
+            fi
+
        # Save the rust binaries and coverage data for other jobs in this workflow.
      - persist_to_workspace:
          root: /tmp/zenith
@@ -218,23 +210,30 @@ jobs:
            - "*"

  check-codestyle-python:
-    executor: zenith-python-executor
+    executor: zenith-executor
    steps:
      - checkout
+      - restore_cache:
+          keys:
+            - v1-python-deps-{{ checksum "poetry.lock" }}
      - run:
          name: Install deps
-          command: pipenv --python 3.7 install --dev
+          command: ./scripts/pysync
+      - save_cache:
+          key: v1-python-deps-{{ checksum "poetry.lock" }}
+          paths:
+            - /home/circleci/.cache/pypoetry/virtualenvs
      - run:
          name: Run yapf to ensure code format
          when: always
-          command: pipenv run yapf --recursive --diff .
+          command: poetry run yapf --recursive --diff .
      - run:
          name: Run mypy to check types
          when: always
-          command: pipenv run mypy .
+          command: poetry run mypy .

  run-pytest:
-    executor: zenith-python-executor
+    executor: zenith-executor
    parameters:
      # pytest args to specify the tests to run.
      #
@@ -273,9 +272,16 @@ jobs:
          condition: << parameters.needs_postgres_source >>
          steps:
            - run: git submodule update --init --depth 1
+      - restore_cache:
+          keys:
+            - v1-python-deps-{{ checksum "poetry.lock" }}
      - run:
          name: Install deps
-          command: pipenv --python 3.7 install
+          command: ./scripts/pysync
+      - save_cache:
+          key: v1-python-deps-{{ checksum "poetry.lock" }}
+          paths:
+            - /home/circleci/.cache/pypoetry/virtualenvs
      - run:
          name: Run pytest
          # pytest doesn't output test logs in real time, so CI job may fail with
@@ -292,6 +298,7 @@ jobs:
            - PLATFORM: zenith-local-ci
          command: |
            PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
+            rm -rf $PERF_REPORT_DIR

            TEST_SELECTION="test_runner/<< parameters.test_selection >>"
            EXTRA_PARAMS="<< parameters.extra_params >>"
@@ -327,7 +334,7 @@ jobs:
            # -n4 uses four processes to run tests via pytest-xdist
            # -s is not used to prevent pytest from capturing output, because tests are running
            # in parallel and logs are mixed between different tests
-            "${cov_prefix[@]}" pipenv run pytest \
+            "${cov_prefix[@]}" ./scripts/pytest \
              --junitxml=$TEST_OUTPUT/junit.xml \
              --tb=short \
              --verbose \
@@ -336,7 +343,6 @@ jobs:

            if << parameters.save_perf_report >>; then
              if [[ $CIRCLE_BRANCH == "main" ]]; then
-                # TODO: reuse scripts/git-upload
                export REPORT_FROM="$PERF_REPORT_DIR"
                export REPORT_TO=local
                scripts/generate_and_push_perf_report.sh
@@ -357,6 +363,13 @@ jobs:
      # The store_test_results step tells CircleCI where to find the junit.xml file.
      - store_test_results:
          path: /tmp/test_output
+      - run:
+          name: Merge coverage data
+          command: |
+            # This will speed up workspace uploads
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
+            fi
      # Save coverage data (if any)
      - persist_to_workspace:
          root: /tmp/zenith
@@ -364,7 +377,7 @@ jobs:
            - "*"

  coverage-report:
-    executor: zenith-build-executor
+    executor: zenith-xlarge-executor
    steps:
      - attach_workspace:
          at: /tmp/zenith
@@ -376,12 +389,6 @@ jobs:
            # there's no way to clean out old packages, so the cache grows every time something
            # changes.
            - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
-      - run:
-          name: Install llvm-tools
-          command: |
-            # TODO: install a proper symbol demangler, e.g. rustfilt
-            # TODO: we should embed this into a docker image
-            rustup component add llvm-tools-preview
      - run:
          name: Build coverage report
          command: |
@@ -433,8 +440,14 @@ jobs:
          command: |
            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
            DOCKER_TAG=$(git log --oneline|wc -l)
-            docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
-            docker tag zenithdb/zenith:latest zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG}
+            docker build \
+              --pull \
+              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
+              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
+              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
+              --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest .
+            docker push zenithdb/zenith:${DOCKER_TAG}
+            docker push zenithdb/zenith:latest

  # Build zenithdb/compute-node:latest image and push it to Docker hub
  docker-image-compute:
@@ -461,8 +474,63 @@ jobs:
          command: |
            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
            DOCKER_TAG=$(git log --oneline|wc -l)
-            docker build -t zenithdb/compute-node:latest vendor/postgres && docker push zenithdb/compute-node:latest
-            docker tag zenithdb/compute-node:latest zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG}
+            docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres
+            docker push zenithdb/compute-node:${DOCKER_TAG}
+            docker push zenithdb/compute-node:latest
+
+  # Build production zenithdb/zenith:release image and push it to Docker hub
+  docker-image-release:
+    docker:
+      - image: cimg/base:2021.04
+    steps:
+      - checkout
+      - setup_remote_docker:
+          docker_layer_caching: true
+      - run:
+          name: Init postgres submodule
+          command: git submodule update --init --depth 1
+      - run:
+          name: Build and push Docker image
+          command: |
+            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
+            DOCKER_TAG="release-$(git log --oneline|wc -l)"
+            docker build \
+              --pull \
+              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
+              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
+              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
+              --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release .
+            docker push zenithdb/zenith:${DOCKER_TAG}
+            docker push zenithdb/zenith:release
+
+  # Build production zenithdb/compute-node:release image and push it to Docker hub
+  docker-image-compute-release:
+    docker:
+      - image: cimg/base:2021.04
+    steps:
+      - checkout
+      - setup_remote_docker:
+          docker_layer_caching: true
+      # Build zenithdb/compute-tools:release image and push it to Docker hub
+      # TODO: this should probably also use versioned tag, not just :release.
+      # XXX: but should it? We build and use it only locally now.
+      - run:
+          name: Build and push compute-tools Docker image
+          command: |
+            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
+            docker build -t zenithdb/compute-tools:release -f Dockerfile.compute-tools .
+            docker push zenithdb/compute-tools:release
+      - run:
+          name: Init postgres submodule
+          command: git submodule update --init --depth 1
+      - run:
+          name: Build and push compute-node Docker image
+          command: |
+            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
+            DOCKER_TAG="release-$(git log --oneline|wc -l)"
+            docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres
+            docker push zenithdb/compute-node:${DOCKER_TAG}
+            docker push zenithdb/compute-node:release

  deploy-staging:
    docker:
@@ -470,40 +538,25 @@ jobs:
    steps:
      - checkout
      - setup_remote_docker
-      - run:
-          name: Get Zenith binaries
-          command: |
-            rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz
-            mkdir zenith_install
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            docker pull --quiet zenithdb/zenith:${DOCKER_TAG}
-            ID=$(docker create zenithdb/zenith:${DOCKER_TAG})
-            docker cp $ID:/data/postgres_install.tar.gz .
-            tar -xzf postgres_install.tar.gz -C zenith_install && rm postgres_install.tar.gz
-            docker cp $ID:/usr/local/bin/pageserver zenith_install/bin/
-            docker cp $ID:/usr/local/bin/safekeeper zenith_install/bin/
-            docker cp $ID:/usr/local/bin/proxy zenith_install/bin/
-            docker cp $ID:/usr/local/bin/postgres zenith_install/bin/
-            docker rm -v $ID
-            echo ${DOCKER_TAG} | tee zenith_install/.zenith_current_version
-            tar -czf zenith_install.tar.gz -C zenith_install .
-            ls -la zenith_install.tar.gz
      - run:
          name: Setup ansible
          command: |
            pip install --progress-bar off --user ansible boto3
-            ansible-galaxy collection install amazon.aws
      - run:
-          name: Apply re-deploy playbook
-          environment:
-            ANSIBLE_HOST_KEY_CHECKING: false
+          name: Redeploy
          command: |
-            echo "${STAGING_SSH_KEY}" | base64 --decode | ssh-add -
-            export AWS_REGION=${STAGING_AWS_REGION}
-            export AWS_ACCESS_KEY_ID=${STAGING_AWS_ACCESS_KEY_ID}
-            export AWS_SECRET_ACCESS_KEY=${STAGING_AWS_SECRET_ACCESS_KEY}
-            ansible-playbook .circleci/storage-redeploy.playbook.yml
-            rm -f zenith_install.tar.gz
+            cd "$(pwd)/.circleci/ansible"
+
+            ./get_binaries.sh
+
+            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
+            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+            chmod 0600 ssh-key
+            ssh-add ssh-key
+            rm -f ssh-key ssh-key-cert.pub
+
+            ansible-playbook deploy.yaml -i staging.hosts
+            rm -f zenith_install.tar.gz .zenith_current_version

  deploy-staging-proxy:
    docker:
@@ -526,7 +579,57 @@ jobs:
          name: Re-deploy proxy
          command: |
            DOCKER_TAG=$(git log --oneline|wc -l)
-            helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/proxy.staging.yaml --set image.tag=${DOCKER_TAG} --wait
+            helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+
+
+  deploy-release:
+    docker:
+      - image: cimg/python:3.10
+    steps:
+      - checkout
+      - setup_remote_docker
+      - run:
+          name: Setup ansible
+          command: |
+            pip install --progress-bar off --user ansible boto3
+      - run:
+          name: Redeploy
+          command: |
+            cd "$(pwd)/.circleci/ansible"
+
+            RELEASE=true ./get_binaries.sh
+
+            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
+            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+            chmod 0600 ssh-key
+            ssh-add ssh-key
+            rm -f ssh-key ssh-key-cert.pub
+
+            ansible-playbook deploy.yaml -i production.hosts -e console_mgmt_base_url=http://console-release.local
+            rm -f zenith_install.tar.gz .zenith_current_version
+
+  deploy-release-proxy:
+    docker:
+      - image: cimg/base:2021.04
+    environment:
+      KUBECONFIG: .kubeconfig
+    steps:
+      - checkout
+      - run:
+          name: Store kubeconfig file
+          command: |
+            echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
+            chmod 0600 ${KUBECONFIG}
+      - run:
+          name: Setup helm v3
+          command: |
+            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+            helm repo add zenithdb https://zenithdb.github.io/helm-charts
+      - run:
+          name: Re-deploy proxy
+          command: |
+            DOCKER_TAG="release-$(git log --oneline|wc -l)"
+            helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait

  # Trigger a new remote CI job
  remote-ci-trigger:
@@ -591,6 +694,7 @@ workflows:
            - build-postgres-<< matrix.build_type >>
      - run-pytest:
          name: pg_regress-tests-<< matrix.build_type >>
+          context: PERF_TEST_RESULT_CONNSTR
          matrix:
            parameters:
              build_type: ["debug", "release"]
@@ -608,6 +712,7 @@ workflows:
            - build-zenith-<< matrix.build_type >>
      - run-pytest:
          name: benchmarks
+          context: PERF_TEST_RESULT_CONNSTR
          build_type: release
          test_selection: performance
          run_in_parallel: false
@@ -660,6 +765,47 @@ workflows:
                - main
          requires:
            - docker-image
+
+      - docker-image-release:
+          # Context gives an ability to login
+          context: Docker Hub
+          # Build image only for commits to the release branch
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - pg_regress-tests-release
+            - other-tests-release
+      - docker-image-compute-release:
+          # Context gives an ability to login
+          context: Docker Hub
+          # Build image only for commits to main
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - pg_regress-tests-release
+            - other-tests-release
+      - deploy-release:
+          # Context gives an ability to login
+          context: Docker Hub
+          # deploy only for commits to the release branch
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - docker-image-release
+      - deploy-release-proxy:
+          # deploy only for commits to the release branch
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - docker-image-release
      - remote-ci-trigger:
          # Context passes credentials for gh api
          context: CI_ACCESS_TOKEN
--- a/.circleci/helm-values/production.proxy.yaml
+++ b/.circleci/helm-values/production.proxy.yaml
@@ -0,0 +1,35 @@
+# Helm chart values for zenith-proxy.
+# This is a YAML-formatted file.
+
+settings:
+  authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/"
+  uri: "https://console.zenith.tech/psql_session/"
+
+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy
+  zenith_env: production
+  zenith_region: us-west-2
+  zenith_region_slug: oregon
+
+service:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
+    external-dns.alpha.kubernetes.io/hostname: proxy-release.local
+  type: LoadBalancer
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: start.zenith.tech
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/staging.proxy.yaml
+++ b/.circleci/helm-values/staging.proxy.yaml
@@ -5,6 +5,13 @@ settings:
  authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/"
  uri: "https://console.stage.zenith.tech/psql_session/"

+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy
+  zenith_env: staging
+  zenith_region: us-east-1
+  zenith_region_slug: virginia
+
 exposedService:
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: external
@@ -17,4 +24,4 @@ metrics:
  serviceMonitor:
    enabled: true
    selector:
-      prometheus: zenith
+      release: kube-prometheus-stack
--- a/.circleci/storage-redeploy.playbook.yml
+++ b/.circleci/storage-redeploy.playbook.yml
@@ -1,138 +0,0 @@
- name: discover storage nodes
-  hosts: localhost
-  connection: local
-  gather_facts: False
-
-  tasks:
-
-    - name: discover safekeepers
-      no_log: true
-      ec2_instance_info:
-        filters:
-          "tag:zenith_env": "staging"
-          "tag:zenith_service": "safekeeper"
-      register: ec2_safekeepers
-
-    - name: discover pageservers
-      no_log: true
-      ec2_instance_info:
-        filters:
-          "tag:zenith_env": "staging"
-          "tag:zenith_service": "pageserver"
-      register: ec2_pageservers
-
-    - name: add safekeepers to host group
-      no_log: true
-      add_host:
-        name: safekeeper-{{ ansible_loop.index }}
-        ansible_host: "{{ item.public_ip_address }}"
-        groups:
-          - storage
-          - safekeepers
-      with_items: "{{ ec2_safekeepers.instances }}"
-      loop_control:
-        extended: yes
-
-    - name: add pageservers to host group
-      no_log: true
-      add_host:
-        name: pageserver-{{ ansible_loop.index }}
-        ansible_host: "{{ item.public_ip_address }}"
-        groups:
-          - storage
-          - pageservers
-      with_items: "{{ ec2_pageservers.instances }}"
-      loop_control:
-        extended: yes
-
- name: Retrive versions
-  hosts: storage
-  gather_facts: False
-  remote_user: admin
-
-  tasks:
-
-    - name: Get current version of binaries
-      set_fact:
-        current_version: "{{lookup('file', '../zenith_install/.zenith_current_version') }}"
-
-    - name: Check that file with version exists on host
-      stat:
-        path: /usr/local/.zenith_current_version
-      register: version_file
-
-    - name: Try to get current version from the host
-      when: version_file.stat.exists
-      ansible.builtin.fetch:
-        src: /usr/local/.zenith_current_version
-        dest: .remote_version.{{ inventory_hostname }}
-        fail_on_missing: no
-        flat: yes
-
-    - name: Store remote version to variable
-      when: version_file.stat.exists
-      set_fact:
-        remote_version: "{{ lookup('file', '.remote_version.{{ inventory_hostname }}') }}"
-
-    - name: Store default value of remote version to variable in case when remote version file not found
-      when: not version_file.stat.exists
-      set_fact:
-        remote_version: "000"
-
- name: Extract Zenith binaries
-  hosts: storage
-  gather_facts: False
-  remote_user: admin
-
-  tasks:
-
-    - name: Inform about version conflict
-      when: current_version <= remote_version
-      debug: msg="Current version {{ current_version }} LE than remote {{ remote_version }}"
-
-    - name: Extract Zenith binaries to /usr/local
-      when: current_version > remote_version
-      ansible.builtin.unarchive:
-        src: ../zenith_install.tar.gz
-        dest: /usr/local
-      become: true
-
- name: Restart safekeepers
-  hosts: safekeepers
-  gather_facts: False
-  remote_user: admin
-
-  tasks:
-
-    - name: Inform about version conflict
-      when: current_version <= remote_version
-      debug: msg="Current version {{ current_version }} LE than remote {{ remote_version }}"
-
-    - name: Restart systemd service
-      when: current_version > remote_version
-      ansible.builtin.systemd:
-        daemon_reload: yes
-        name: safekeeper
-        enabled: yes
-        state: restarted
-      become: true
-
- name: Restart pageservers
-  hosts: pageservers
-  gather_facts: False
-  remote_user: admin
-
-  tasks:
-
-    - name: Inform about version conflict
-      when: current_version <= remote_version
-      debug: msg="Current version {{ current_version }} LE than remote {{ remote_version }}"
-
-    - name: Restart systemd service
-      when: current_version > remote_version
-      ansible.builtin.systemd:
-        daemon_reload: yes
-        name: pageserver
-        enabled: yes
-        state: restarted
-      become: true
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -3,7 +3,7 @@ name: benchmarking
 on:
  # uncomment to run on push for debugging your PR
  # push:
-  #   branches: [ mybranch ]
+  #   branches: [ your branch ]
  schedule:
    # * is a special character in YAML so you have to quote this string
    #          ┌───────────── minute (0 - 59)
@@ -36,20 +36,20 @@ jobs:
    # see https://github.com/actions/setup-python/issues/162
    # and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
    # so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs.
-    # there is Python 3.7.10 already installed on the machine so use it to install pipenv and then use pipenv's virtuealenvs
-    - name: Install pipenv & deps
+    # there is Python 3.7.10 already installed on the machine so use it to install poetry and then use poetry's virtualenvs
+    - name: Install poetry & deps
      run: |
-        python3 -m pip install --upgrade pipenv wheel
-        # since pip/pipenv caches are reused there shouldn't be any troubles with install every time
-        pipenv install
+        python3 -m pip install --upgrade poetry wheel
+        # since pip/poetry caches are reused there shouldn't be any troubles with install every time
+        ./scripts/pysync

    - name: Show versions
      run: |
        echo Python
        python3 --version
-        pipenv run python3 --version
-        echo Pipenv
-        pipenv --version
+        poetry run python3 --version
+        echo Poetry
+        poetry --version
        echo Pgbench
        $PG_BIN/pgbench --version

@@ -89,11 +89,15 @@ jobs:
        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
        REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
      run: |
+        # make sure no data from a previous run is left over on the self-hosted runner,
+        # since stale results would produce duplicates when calling ingest_perf_test_result.py
+        rm -rf perf-report-staging
        mkdir -p perf-report-staging
-        pipenv run pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
+        ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging

    - name: Submit result
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
      run: |
        REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,3 +16,8 @@ members = [
 # This is useful for profiling and, to some extent, debug.
 # Besides, debug info should not affect the performance.
 debug = true
+
+# This is only needed for proxy's tests
+# TODO: we should probably fork tokio-postgres-rustls instead
+[patch.crates-io]
+tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,62 +1,62 @@
+# Build Postgres
 #
-# Docker image for console integration testing.
-#
+#FROM zimg/rust:1.56 AS pg-build
+FROM zenithdb/build:buster-20220309 AS pg-build
+WORKDIR /pg
+
+USER root
+
+COPY vendor/postgres vendor/postgres
+COPY Makefile Makefile

-#
-# Build Postgres separately --- this layer will be rebuilt only if one of
-# mentioned paths will get any changes.
-#
-FROM zenithdb/build:buster AS pg-build
-WORKDIR /zenith
-COPY ./vendor/postgres vendor/postgres
-COPY ./Makefile Makefile
 ENV BUILD_TYPE release
-RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
-RUN rm -rf postgres_install/build
+RUN set -e \
+    && make -j $(nproc) -s postgres \
+    && rm -rf tmp_install/build \
+    && tar -C tmp_install -czf /postgres_install.tar.gz .

-#
 # Build zenith binaries
 #
-# TODO: build cargo deps as separate layer. We used cargo-chef before but that was
-# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
-#
-FROM zenithdb/build:buster AS build
+#FROM zimg/rust:1.56 AS build
+FROM zenithdb/build:buster-20220309 AS build
+ARG GIT_VERSION=local

-ARG GIT_VERSION
-RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi
-
-WORKDIR /zenith
-COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
+ARG CACHEPOT_BUCKET=zenith-rust-cachepot
+ARG AWS_ACCESS_KEY_ID
+ARG AWS_SECRET_ACCESS_KEY
+#ENV RUSTC_WRAPPER cachepot
+ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot

+COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
 COPY . .
-RUN GIT_VERSION=$GIT_VERSION cargo build --release

+RUN cargo build --release
+
+# Build final image
 #
-# Copy binaries to resulting image.
-#
-FROM debian:buster-slim
+FROM debian:bullseye-slim
 WORKDIR /data

-RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \
-    mkdir zenith_install
+RUN set -e \
+    && apt-get update \
+    && apt-get install -y \
+        libreadline-dev \
+        libseccomp-dev \
+        openssl \
+        ca-certificates \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
+    && useradd -d /data zenith \
+    && chown -R zenith:zenith /data
+
+COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy      /usr/local/bin
+
+COPY --from=pg-build /pg/tmp_install/         /usr/local/
+COPY --from=pg-build /postgres_install.tar.gz /data/

-COPY --from=build /zenith/target/release/pageserver /usr/local/bin
-COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
-COPY --from=build /zenith/target/release/proxy /usr/local/bin
-COPY --from=pg-build /zenith/tmp_install postgres_install
 COPY docker-entrypoint.sh /docker-entrypoint.sh

-# Remove build artifacts (~ 500 MB)
-RUN rm -rf postgres_install/build && \
-    # 'Install' Postgres binaries locally
-    cp -r postgres_install/* /usr/local/ && \
-    # Prepare an archive of Postgres binaries (should be around 11 MB)
-    # and keep it inside container for an ease of deploy pipeline.
-    cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \
-    rm -rf postgres_install
-
-RUN useradd -d /data zenith && chown -R zenith:zenith /data
-
 VOLUME ["/data"]
 USER zenith
 EXPOSE 6400
--- a/Dockerfile.build
+++ b/Dockerfile.build
@@ -1,16 +1,23 @@
-#
-# Image with all the required dependencies to build https://github.com/zenithdb/zenith
-# and Postgres from https://github.com/zenithdb/postgres
-# Also includes some rust development and build tools.
-# NB: keep in sync with rust image version in .circle/config.yml
-#
 FROM rust:1.56.1-slim-buster
-WORKDIR /zenith
+WORKDIR /home/circleci/project

-# Install postgres and zenith build dependencies
-# clang is for rocksdb
-RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-                                          libseccomp-dev pkg-config libssl-dev clang
+RUN set -e \
+    && apt-get update \
+    && apt-get -yq install \
+        automake \
+        libtool \
+        build-essential \
+        bison \
+        flex \
+        libreadline-dev \
+        zlib1g-dev \
+        libxml2-dev \
+        libseccomp-dev \
+        pkg-config \
+        libssl-dev \
+        clang

-# Install rust tools
-RUN rustup component add clippy && cargo install cargo-audit
+RUN set -e \
+    && rustup component add clippy \
+    && cargo install cargo-audit \
+    && cargo install --git https://github.com/paritytech/cachepot
--- a/Pipfile
+++ b/Pipfile
@@ -1,30 +0,0 @@
-[[source]]
-url = "https://pypi.python.org/simple"
-verify_ssl = true
-name = "pypi"
-
-[packages]
-pytest = ">=6.0.0"
-typing-extensions = "*"
-pyjwt = {extras = ["crypto"], version = "*"}
-requests = "*"
-pytest-xdist = "*"
-asyncpg = "*"
-cached-property = "*"
-psycopg2-binary = "*"
-jinja2 = "*"
-
-[dev-packages]
-# Behavior may change slightly between versions. These are run continuously,
-# so we pin exact versions to avoid suprising breaks. Update if comfortable.
-yapf = "==0.31.0"
-mypy = "==0.910"
-# Non-pinned packages follow.
-pipenv = "*"
-flake8 = "*"
-types-requests = "*"
-types-psycopg2 = "*"
-
-[requires]
-# we need at least 3.7, but pipenv doesn't allow to say this directly
-python_version = "3"
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,652 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.python.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "asyncpg": {
-            "hashes": [
-                "sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
-                "sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
-                "sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
-                "sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
-                "sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
-                "sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
-                "sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
-                "sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
-                "sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
-                "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
-                "sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
-                "sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
-                "sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
-            ],
-            "index": "pypi",
-            "version": "==0.24.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
-                "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==21.2.0"
-        },
-        "cached-property": {
-            "hashes": [
-                "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
-                "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
-            ],
-            "index": "pypi",
-            "version": "==1.5.2"
-        },
-        "certifi": {
-            "hashes": [
-                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
-                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
-            ],
-            "version": "==2021.10.8"
-        },
-        "cffi": {
-            "hashes": [
-                "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
-                "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
-                "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
-                "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
-                "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
-                "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
-                "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
-                "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
-                "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
-                "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
-                "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
-                "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
-                "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
-                "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
-                "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
-                "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
-                "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
-                "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
-                "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
-                "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
-                "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
-                "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
-                "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
-                "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
-                "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
-                "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
-                "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
-                "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
-                "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
-                "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
-                "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
-                "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
-                "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
-                "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
-                "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
-                "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
-                "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
-                "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
-                "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
-                "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
-                "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
-                "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
-                "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
-                "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
-                "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
-                "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
-                "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
-                "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
-                "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
-                "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
-            ],
-            "version": "==1.15.0"
-        },
-        "charset-normalizer": {
-            "hashes": [
-                "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
-                "sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==2.0.7"
-        },
-        "cryptography": {
-            "hashes": [
-                "sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
-                "sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
-                "sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
-                "sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
-                "sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
-                "sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
-                "sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
-                "sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
-                "sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
-                "sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
-                "sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
-                "sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
-                "sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
-                "sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
-                "sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
-                "sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
-                "sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
-                "sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
-                "sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
-                "sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
-            ],
-            "version": "==35.0.0"
-        },
-        "execnet": {
-            "hashes": [
-                "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
-                "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==1.9.0"
-        },
-        "idna": {
-            "hashes": [
-                "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
-                "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==3.3"
-        },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
-                "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==4.8.1"
-        },
-        "iniconfig": {
-            "hashes": [
-                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
-                "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
-            ],
-            "version": "==1.1.1"
-        },
-        "jinja2": {
-            "hashes": [
-                "sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
-                "sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
-            ],
-            "index": "pypi",
-            "version": "==3.0.2"
-        },
-        "markupsafe": {
-            "hashes": [
-                "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
-                "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64",
-                "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b",
-                "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194",
-                "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567",
-                "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff",
-                "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724",
-                "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74",
-                "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646",
-                "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35",
-                "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6",
-                "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a",
-                "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6",
-                "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad",
-                "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26",
-                "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38",
-                "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac",
-                "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7",
-                "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6",
-                "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047",
-                "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75",
-                "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f",
-                "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b",
-                "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135",
-                "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8",
-                "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a",
-                "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a",
-                "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1",
-                "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9",
-                "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864",
-                "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914",
-                "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee",
-                "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f",
-                "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18",
-                "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8",
-                "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2",
-                "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d",
-                "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b",
-                "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b",
-                "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86",
-                "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6",
-                "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f",
-                "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb",
-                "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833",
-                "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28",
-                "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e",
-                "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415",
-                "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902",
-                "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f",
-                "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d",
-                "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9",
-                "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d",
-                "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145",
-                "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066",
-                "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c",
-                "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1",
-                "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a",
-                "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207",
-                "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f",
-                "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53",
-                "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd",
-                "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134",
-                "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85",
-                "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9",
-                "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5",
-                "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94",
-                "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509",
-                "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51",
-                "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==2.0.1"
-        },
-        "packaging": {
-            "hashes": [
-                "sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966",
-                "sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==21.2"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
-                "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==1.0.0"
-        },
-        "psycopg2-binary": {
-            "hashes": [
-                "sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975",
-                "sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd",
-                "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616",
-                "sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2",
-                "sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90",
-                "sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a",
-                "sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e",
-                "sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d",
-                "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f",
-                "sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed",
-                "sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a",
-                "sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140",
-                "sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32",
-                "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759",
-                "sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31",
-                "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e",
-                "sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a",
-                "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c",
-                "sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917",
-                "sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf",
-                "sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7",
-                "sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0",
-                "sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72",
-                "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698",
-                "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773",
-                "sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68",
-                "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76",
-                "sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4",
-                "sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f",
-                "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a",
-                "sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34",
-                "sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce",
-                "sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a",
-                "sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e"
-            ],
-            "index": "pypi",
-            "version": "==2.9.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
-                "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.10.0"
-        },
-        "pycparser": {
-            "hashes": [
-                "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
-                "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.20"
-        },
-        "pyjwt": {
-            "extras": [
-                "crypto"
-            ],
-            "hashes": [
-                "sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
-                "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
-            ],
-            "index": "pypi",
-            "version": "==2.3.0"
-        },
-        "pyparsing": {
-            "hashes": [
-                "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
-                "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.4.7"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
-                "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
-            ],
-            "index": "pypi",
-            "version": "==6.2.5"
-        },
-        "pytest-forked": {
-            "hashes": [
-                "sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
-                "sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==1.3.0"
-        },
-        "pytest-xdist": {
-            "hashes": [
-                "sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
-                "sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
-            ],
-            "index": "pypi",
-            "version": "==2.4.0"
-        },
-        "requests": {
-            "hashes": [
-                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
-                "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
-            ],
-            "index": "pypi",
-            "version": "==2.26.0"
-        },
-        "toml": {
-            "hashes": [
-                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
-                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.10.2"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
-                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
-                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
-            ],
-            "index": "pypi",
-            "version": "==3.10.0.2"
-        },
-        "urllib3": {
-            "hashes": [
-                "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
-                "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
-            "version": "==1.26.7"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
-                "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.6.0"
-        }
-    },
-    "develop": {
-        "backports.entry-points-selectable": {
-            "hashes": [
-                "sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
-                "sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
-            ],
-            "markers": "python_version >= '2.7'",
-            "version": "==1.1.0"
-        },
-        "certifi": {
-            "hashes": [
-                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
-                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
-            ],
-            "version": "==2021.10.8"
-        },
-        "distlib": {
-            "hashes": [
-                "sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
-                "sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
-            ],
-            "version": "==0.3.3"
-        },
-        "filelock": {
-            "hashes": [
-                "sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
-                "sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.3.2"
-        },
-        "flake8": {
-            "hashes": [
-                "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
-                "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
-            ],
-            "index": "pypi",
-            "version": "==4.0.1"
-        },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
-                "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==4.8.1"
-        },
-        "mccabe": {
-            "hashes": [
-                "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
-                "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
-            ],
-            "version": "==0.6.1"
-        },
-        "mypy": {
-            "hashes": [
-                "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
-                "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
-                "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
-                "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
-                "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
-                "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
-                "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
-                "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
-                "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
-                "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
-                "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
-                "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
-                "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
-                "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
-                "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
-                "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
-                "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
-                "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
-                "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
-                "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
-                "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
-                "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
-                "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
-            ],
-            "index": "pypi",
-            "version": "==0.910"
-        },
-        "mypy-extensions": {
-            "hashes": [
-                "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
-                "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
-            ],
-            "version": "==0.4.3"
-        },
-        "pipenv": {
-            "hashes": [
-                "sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
-                "sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
-            ],
-            "index": "pypi",
-            "version": "==2021.5.29"
-        },
-        "platformdirs": {
-            "hashes": [
-                "sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
-                "sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==2.4.0"
-        },
-        "pycodestyle": {
-            "hashes": [
-                "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
-                "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==2.8.0"
-        },
-        "pyflakes": {
-            "hashes": [
-                "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
-                "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.4.0"
-        },
-        "six": {
-            "hashes": [
-                "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
-                "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.16.0"
-        },
-        "toml": {
-            "hashes": [
-                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
-                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.10.2"
-        },
-        "typed-ast": {
-            "hashes": [
-                "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
-                "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
-                "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
-                "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
-                "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
-                "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
-                "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
-                "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
-                "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
-                "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
-                "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
-                "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
-                "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
-                "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
-                "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
-                "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
-                "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
-                "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
-                "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
-                "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
-                "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
-                "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
-                "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
-                "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
-                "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
-                "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
-                "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
-                "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
-                "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
-                "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==1.4.3"
-        },
-        "types-psycopg2": {
-            "hashes": [
-                "sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
-                "sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
-            ],
-            "index": "pypi",
-            "version": "==2.9.1"
-        },
-        "types-requests": {
-            "hashes": [
-                "sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
-                "sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
-            ],
-            "index": "pypi",
-            "version": "==2.25.11"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
-                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
-                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
-            ],
-            "index": "pypi",
-            "version": "==3.10.0.2"
-        },
-        "virtualenv": {
-            "hashes": [
-                "sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814",
-                "sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==20.10.0"
-        },
-        "virtualenv-clone": {
-            "hashes": [
-                "sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
-                "sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.5.7"
-        },
-        "yapf": {
-            "hashes": [
-                "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
-                "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
-            ],
-            "index": "pypi",
-            "version": "==0.31.0"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
-                "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.6.0"
-        }
-    }
-}
--- a/README.md
+++ b/README.md
@@ -28,12 +28,12 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec
 libssl-dev clang pkg-config libpq-dev
 ```

-[Rust] 1.55 or later is also required.
+[Rust] 1.56.1 or later is also required.

 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.

 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.7 or higher), and install python3 packages using `pipenv install` in the project directory.
+Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.

 2. Build zenith and patched postgres
 ```sh
@@ -57,12 +57,12 @@ pageserver init succeeded
 Starting pageserver at 'localhost:64000' in '.zenith'
 Pageserver started
 initializing for single for 7676
-Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
+Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
 Safekeeper started

 # start postgres compute node
 > ./target/debug/zenith pg start main
-Starting new postgres main on main...
+Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
 Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
 Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
 waiting for server to start.... done
@@ -70,8 +70,8 @@ server started

 # check list of running postgres instances
 > ./target/debug/zenith pg list
-BRANCH	ADDRESS		LSN		STATUS
-main	127.0.0.1:55432	0/1609610	running
+NODE	ADDRESS	TIMELINES	BRANCH NAME	LSN		STATUS
+main	127.0.0.1:55432	5b014a9e41b4b63ce1a1febc04503636	main	0/1609610	running
 ```

 4. Now it is possible to connect to postgres and run some queries:
@@ -91,13 +91,13 @@ postgres=# select * from t;
 5. And create branches and run postgres on them:
 ```sh
 # create branch named migration_check
-> ./target/debug/zenith branch migration_check main
-Created branch 'migration_check' at 0/1609610
+> ./target/debug/zenith timeline branch --branch-name migration_check
+Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'

 # check branches tree
-> ./target/debug/zenith branch
- main
- ┗━ @0/1609610: migration_check
+> ./target/debug/zenith timeline list
+ main [5b014a9e41b4b63ce1a1febc04503636]
+ ┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]

 # start postgres on that branch
 > ./target/debug/zenith pg start migration_check
@@ -128,8 +128,7 @@ INSERT 0 1
 ```sh
 git clone --recursive https://github.com/zenithdb/zenith.git
 make # builds also postgres and installs it to ./tmp_install
-cd test_runner
-pipenv run pytest
+./scripts/pytest
 ```

 ## Documentation
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -1,17 +1,14 @@
 [package]
 name = "compute_tools"
 version = "0.1.0"
-authors = ["Alexey Kondratov <kondratov.aleksey@gmail.com>"]
-edition = "2018"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+edition = "2021"

 [dependencies]
 libc = "0.2"
 anyhow = "1.0"
 chrono = "0.4"
-clap = "2.33"
-env_logger = "0.8"
+clap = "3.0"
+env_logger = "0.9"
 hyper = { version = "0.14", features = ["full"] }
 log = { version = "0.4", features = ["std", "serde"] }
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
--- a/compute_tools/src/bin/zenith_ctl.rs
+++ b/compute_tools/src/bin/zenith_ctl.rs
@@ -34,7 +34,7 @@ use std::sync::{Arc, RwLock};

 use anyhow::{Context, Result};
 use chrono::Utc;
-use libc::{prctl, PR_SET_PDEATHSIG, SIGINT};
+use clap::Arg;
 use log::info;
 use postgres::{Client, NoTls};

@@ -155,20 +155,6 @@ fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
 }

 fn main() -> Result<()> {
-    // During configuration we are starting Postgres as a child process. If we
-    // fail we do not want to leave it running. PR_SET_PDEATHSIG sets the signal
-    // that will be sent to the child process when the parent dies. NB: this is
-    // cleared for the child of a fork(). SIGINT means fast shutdown for Postgres.
-    // This does not matter much for Docker, where `zenith_ctl` is an entrypoint,
-    // so the whole container will exit if it exits. But could be useful when
-    // `zenith_ctl` is used in e.g. systemd.
-    // XXX: this appears to just don't work. When `main` exits, the child process
-    // `postgres` is re-assigned to a new parent (`/lib/systemd/systemd --user`
-    // in my case).
-    unsafe {
-        prctl(PR_SET_PDEATHSIG, SIGINT);
-    }
-
    // TODO: re-use `zenith_utils::logging` later
    init_logger(DEFAULT_LOG_LEVEL)?;

@@ -177,34 +163,34 @@ fn main() -> Result<()> {
    let matches = clap::App::new("zenith_ctl")
        .version(version.unwrap_or("unknown"))
        .arg(
-            clap::Arg::with_name("connstr")
-                .short("C")
+            Arg::new("connstr")
+                .short('C')
                .long("connstr")
                .value_name("DATABASE_URL")
                .required(true),
        )
        .arg(
-            clap::Arg::with_name("pgdata")
-                .short("D")
+            Arg::new("pgdata")
+                .short('D')
                .long("pgdata")
                .value_name("DATADIR")
                .required(true),
        )
        .arg(
-            clap::Arg::with_name("pgbin")
-                .short("b")
+            Arg::new("pgbin")
+                .short('b')
                .long("pgbin")
                .value_name("POSTGRES_PATH"),
        )
        .arg(
-            clap::Arg::with_name("spec")
-                .short("s")
+            Arg::new("spec")
+                .short('s')
                .long("spec")
                .value_name("SPEC_JSON"),
        )
        .arg(
-            clap::Arg::with_name("spec-path")
-                .short("S")
+            Arg::new("spec-path")
+                .short('S')
                .long("spec-path")
                .value_name("SPEC_PATH"),
        )
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -5,7 +5,7 @@ use std::process::Command;
 use std::str::FromStr;
 use std::{fs, thread, time};

-use anyhow::{anyhow, Result};
+use anyhow::{bail, Result};
 use postgres::{Client, Transaction};
 use serde::Deserialize;

@@ -171,7 +171,7 @@ impl PgQuote for PgIdent {
    /// always quotes provided string with `""` and escapes every `"`. Not idempotent,
    /// i.e. if string is already escaped it will be escaped again.
    fn quote(&self) -> String {
-        let result = format!("\"{}\"", self.replace("\"", "\"\""));
+        let result = format!("\"{}\"", self.replace('"', "\"\""));
        result
    }
 }
@@ -226,7 +226,7 @@ pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
        // but postgres starts listening almost immediately, even if it is not really
        // ready to accept connections).
        if slept >= POSTGRES_WAIT_TIMEOUT {
-            return Err(anyhow!("timed out while waiting for Postgres to start"));
+            bail!("timed out while waiting for Postgres to start");
        }

        if pid_path.exists() {
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -215,7 +215,7 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
        if let Some(r) = pg_db {
            // XXX: db owner name is returned as quoted string from Postgres,
            // when quoting is needed.
-            let new_owner = if r.owner.starts_with('\"') {
+            let new_owner = if r.owner.starts_with('"') {
                db.owner.quote()
            } else {
                db.owner.clone()
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -1,15 +1,13 @@
 [package]
 name = "control_plane"
 version = "0.1.0"
-authors = ["Stas Kelvich <stas@zenith.tech>"]
-edition = "2018"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+edition = "2021"

 [dependencies]
 tar = "0.4.33"
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
 serde = { version = "1.0", features = ["derive"] }
+serde_with = "1.12.0"
 toml = "0.5"
 lazy_static = "1.4"
 regex = "1"
@@ -20,5 +18,6 @@ url = "2.2.2"
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }

 pageserver = { path = "../pageserver" }
+walkeeper = { path = "../walkeeper" }
 zenith_utils = { path = "../zenith_utils" }
 workspace_hack = { path = "../workspace_hack" }
--- a/control_plane/safekeepers.conf
+++ b/control_plane/safekeepers.conf
@@ -1,20 +1,20 @@
 # Page server and three safekeepers.
 [pageserver]
-listen_pg_addr = 'localhost:64000'
-listen_http_addr = 'localhost:9898'
+listen_pg_addr = '127.0.0.1:64000'
+listen_http_addr = '127.0.0.1:9898'
 auth_type = 'Trust'

 [[safekeepers]]
-name = 'sk1'
+id = 1
 pg_port = 5454
 http_port = 7676

 [[safekeepers]]
-name = 'sk2'
+id = 2
 pg_port = 5455
 http_port = 7677

 [[safekeepers]]
-name = 'sk3'
+id = 3
 pg_port = 5456
 http_port = 7678
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -1,11 +1,11 @@
 # Minimal zenith environment with one safekeeper. This is equivalent to the built-in
 # defaults that you get with no --config
 [pageserver]
-listen_pg_addr = 'localhost:64000'
-listen_http_addr = 'localhost:9898'
+listen_pg_addr = '127.0.0.1:64000'
+listen_http_addr = '127.0.0.1:9898'
 auth_type = 'Trust'

 [[safekeepers]]
-name = 'single'
+id = 1
 pg_port = 5454
 http_port = 7676
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -37,7 +37,7 @@ impl ComputeControlPlane {
    // pgdatadirs
    // |- tenants
    // |  |- <tenant_id>
-    // |  |   |- <branch name>
+    // |  |   |- <node name>
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
        let pageserver = Arc::new(PageServerNode::from_env(&env));

@@ -52,7 +52,7 @@ impl ComputeControlPlane {
                .with_context(|| format!("failed to list {}", tenant_dir.path().display()))?
            {
                let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?;
-                nodes.insert((node.tenantid, node.name.clone()), Arc::new(node));
+                nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node));
            }
        }

@@ -73,44 +73,14 @@ impl ComputeControlPlane {
            .unwrap_or(self.base_port)
    }

-    // FIXME: see also parse_point_in_time in branches.rs.
-    fn parse_point_in_time(
-        &self,
-        tenantid: ZTenantId,
-        s: &str,
-    ) -> Result<(ZTimelineId, Option<Lsn>)> {
-        let mut strings = s.split('@');
-        let name = strings.next().unwrap();
-
-        let lsn: Option<Lsn>;
-        if let Some(lsnstr) = strings.next() {
-            lsn = Some(
-                Lsn::from_str(lsnstr)
-                    .with_context(|| "invalid LSN in point-in-time specification")?,
-            );
-        } else {
-            lsn = None
-        }
-
-        // Resolve the timeline ID, given the human-readable branch name
-        let timeline_id = self
-            .pageserver
-            .branch_get_by_name(&tenantid, name)?
-            .timeline_id;
-
-        Ok((timeline_id, lsn))
-    }
-
    pub fn new_node(
        &mut self,
-        tenantid: ZTenantId,
+        tenant_id: ZTenantId,
        name: &str,
-        timeline_spec: &str,
+        timeline_id: ZTimelineId,
+        lsn: Option<Lsn>,
        port: Option<u16>,
    ) -> Result<Arc<PostgresNode>> {
-        // Resolve the human-readable timeline spec into timeline ID and LSN
-        let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?;
-
        let port = port.unwrap_or_else(|| self.get_port());
        let node = Arc::new(PostgresNode {
            name: name.to_owned(),
@@ -118,9 +88,9 @@ impl ComputeControlPlane {
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
            is_test: false,
-            timelineid,
+            timeline_id,
            lsn,
-            tenantid,
+            tenant_id,
            uses_wal_proposer: false,
        });

@@ -128,7 +98,7 @@ impl ComputeControlPlane {
        node.setup_pg_conf(self.env.pageserver.auth_type)?;

        self.nodes
-            .insert((tenantid, node.name.clone()), Arc::clone(&node));
+            .insert((tenant_id, node.name.clone()), Arc::clone(&node));

        Ok(node)
    }
@@ -143,9 +113,9 @@ pub struct PostgresNode {
    pub env: LocalEnv,
    pageserver: Arc<PageServerNode>,
    is_test: bool,
-    pub timelineid: ZTimelineId,
+    pub timeline_id: ZTimelineId,
    pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
-    pub tenantid: ZTenantId,
+    pub tenant_id: ZTenantId,
    uses_wal_proposer: bool,
 }

@@ -177,8 +147,8 @@ impl PostgresNode {
        // Read a few options from the config file
        let context = format!("in config file {}", cfg_path_str);
        let port: u16 = conf.parse_field("port", &context)?;
-        let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
-        let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
+        let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
+        let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
        let uses_wal_proposer = conf.get("wal_acceptors").is_some();

        // parse recovery_target_lsn, if any
@@ -192,9 +162,9 @@ impl PostgresNode {
            env: env.clone(),
            pageserver: Arc::clone(pageserver),
            is_test: false,
-            timelineid,
+            timeline_id,
            lsn: recovery_target_lsn,
-            tenantid,
+            tenant_id,
            uses_wal_proposer,
        })
    }
@@ -245,24 +215,24 @@ impl PostgresNode {
        );

        let sql = if let Some(lsn) = lsn {
-            format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn)
+            format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
        } else {
-            format!("basebackup {} {}", self.tenantid, self.timelineid)
+            format!("basebackup {} {}", self.tenant_id, self.timeline_id)
        };

        let mut client = self
            .pageserver
            .page_server_psql_client()
-            .with_context(|| "connecting to page server failed")?;
+            .context("connecting to page server failed")?;

        let copyreader = client
            .copy_out(sql.as_str())
-            .with_context(|| "page server 'basebackup' command failed")?;
+            .context("page server 'basebackup' command failed")?;

        // Read the archive directly from the `CopyOutReader`
        tar::Archive::new(copyreader)
            .unpack(&self.pgdata())
-            .with_context(|| "extracting base backup failed")?;
+            .context("extracting base backup failed")?;

        Ok(())
    }
@@ -333,19 +303,31 @@ impl PostgresNode {
        conf.append("shared_preload_libraries", "zenith");
        conf.append_line("");
        conf.append("zenith.page_server_connstring", &pageserver_connstr);
-        conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
-        conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
+        conf.append("zenith.zenith_tenant", &self.tenant_id.to_string());
+        conf.append("zenith.zenith_timeline", &self.timeline_id.to_string());
        if let Some(lsn) = self.lsn {
            conf.append("recovery_target_lsn", &lsn.to_string());
        }
+
        conf.append_line("");
+        // Configure backpressure
+        // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
+        //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
+        //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
+        //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
+        //   updated pages are not requested from pageserver.
+        // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
+        //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
+        //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
+        //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
+        // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
+        //   To be able to restore database in case of pageserver node crash, safekeeper should not
+        //   remove WAL beyond this point. Too large a lag can cause space exhaustion in safekeepers
+        //   (if they are not able to upload WAL to S3).
+        conf.append("max_replication_write_lag", "500MB");
+        conf.append("max_replication_flush_lag", "10GB");

        if !self.env.safekeepers.is_empty() {
-            // Configure backpressure
-            // In setup with safekeepers apply_lag depends on
-            // speed of data checkpointing on pageserver (see disk_consistent_lsn).
-            conf.append("max_replication_apply_lag", "1500MB");
-
            // Configure the node to connect to the safekeepers
            conf.append("synchronous_standby_names", "walproposer");

@@ -358,11 +340,6 @@ impl PostgresNode {
                .join(",");
            conf.append("wal_acceptors", &wal_acceptors);
        } else {
-            // Configure backpressure
-            // In setup without safekeepers, flush_lag depends on
-            // speed of of data checkpointing on pageserver (see disk_consistent_lsn)
-            conf.append("max_replication_flush_lag", "1500MB");
-
            // We only use setup without safekeepers for tests,
            // and don't care about data durability on pageserver,
            // so set more relaxed synchronous_commit.
@@ -405,7 +382,7 @@ impl PostgresNode {
    }

    pub fn pgdata(&self) -> PathBuf {
-        self.env.pg_data_dir(&self.tenantid, &self.name)
+        self.env.pg_data_dir(&self.tenant_id, &self.name)
    }

    pub fn status(&self) -> &str {
@@ -443,7 +420,7 @@ impl PostgresNode {
        if let Some(token) = auth_token {
            cmd.env("ZENITH_AUTH_TOKEN", token);
        }
-        let pg_ctl = cmd.status().with_context(|| "pg_ctl failed")?;
+        let pg_ctl = cmd.status().context("pg_ctl failed")?;

        if !pg_ctl.success() {
            anyhow::bail!("pg_ctl failed");
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -3,16 +3,19 @@
 //! Now it also provides init method which acts like a stub for proper installation
 //! script which will use local paths.

-use anyhow::{bail, Context};
+use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use std::collections::HashMap;
 use std::env;
-use std::fmt::Write;
 use std::fs;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
 use zenith_utils::postgres_backend::AuthType;
-use zenith_utils::zid::{opt_display_serde, ZTenantId};
+use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId};
+
+use crate::safekeeper::SafekeeperNode;

 //
 // This data structures represents zenith CLI config
@@ -21,7 +24,8 @@ use zenith_utils::zid::{opt_display_serde, ZTenantId};
 // to 'zenith init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[serde_as]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
    // compute nodes).
@@ -45,9 +49,9 @@ pub struct LocalEnv {

    // Default tenant ID to use with the 'zenith' command line utility, when
    // --tenantid is not explicitly specified.
-    #[serde(with = "opt_display_serde")]
    #[serde(default)]
-    pub default_tenantid: Option<ZTenantId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub default_tenant_id: Option<ZTenantId>,

    // used to issue tokens during e.g pg start
    #[serde(default)]
@@ -57,11 +61,21 @@ pub struct LocalEnv {

    #[serde(default)]
    pub safekeepers: Vec<SafekeeperConf>,
+
+    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
+    #[serde(default)]
+    // A `HashMap<String, HashMap<ZTenantId, ZTimelineId>>` would be more appropriate here,
+    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
+    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
+    branch_name_mappings: HashMap<String, Vec<(ZTenantId, ZTimelineId)>>,
 }

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
 pub struct PageServerConf {
+    // node id
+    pub id: ZNodeId,
    // Pageserver connection settings
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
@@ -76,6 +90,7 @@ pub struct PageServerConf {
 impl Default for PageServerConf {
    fn default() -> Self {
        Self {
+            id: ZNodeId(0),
            listen_pg_addr: String::new(),
            listen_http_addr: String::new(),
            auth_type: AuthType::Trust,
@@ -84,10 +99,10 @@ impl Default for PageServerConf {
    }
 }

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
 pub struct SafekeeperConf {
-    pub name: String,
+    pub id: ZNodeId,
    pub pg_port: u16,
    pub http_port: u16,
    pub sync: bool,
@@ -96,7 +111,7 @@ pub struct SafekeeperConf {
 impl Default for SafekeeperConf {
    fn default() -> Self {
        Self {
-            name: String::new(),
+            id: ZNodeId(0),
            pg_port: 0,
            http_port: 0,
            sync: true,
@@ -136,8 +151,64 @@ impl LocalEnv {
        self.base_data_dir.clone()
    }

-    pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf {
-        self.base_data_dir.join("safekeepers").join(node_name)
+    pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf {
+        self.base_data_dir.join("safekeepers").join(data_dir_name)
+    }
+
+    pub fn register_branch_mapping(
+        &mut self,
+        branch_name: String,
+        tenant_id: ZTenantId,
+        timeline_id: ZTimelineId,
+    ) -> anyhow::Result<()> {
+        let existing_values = self
+            .branch_name_mappings
+            .entry(branch_name.clone())
+            .or_default();
+
+        let existing_ids = existing_values
+            .iter()
+            .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id);
+
+        if let Some((_, old_timeline_id)) = existing_ids {
+            if old_timeline_id == &timeline_id {
+                Ok(())
+            } else {
+                bail!(
+                    "branch '{}' is already mapped to timeline {}, cannot map to another timeline {}",
+                    branch_name,
+                    old_timeline_id,
+                    timeline_id
+                );
+            }
+        } else {
+            existing_values.push((tenant_id, timeline_id));
+            Ok(())
+        }
+    }
+
+    pub fn get_branch_timeline_id(
+        &self,
+        branch_name: &str,
+        tenant_id: ZTenantId,
+    ) -> Option<ZTimelineId> {
+        self.branch_name_mappings
+            .get(branch_name)?
+            .iter()
+            .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
+            .map(|&(_, timeline_id)| timeline_id)
+            .map(ZTimelineId::from)
+    }
+
+    pub fn timeline_name_mappings(&self) -> HashMap<ZTenantTimelineId, String> {
+        self.branch_name_mappings
+            .iter()
+            .flat_map(|(name, tenant_timelines)| {
+                tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
+                    (ZTenantTimelineId::new(tenant_id, timeline_id), name.clone())
+                })
+            })
+            .collect()
    }

    /// Create a LocalEnv from a config file.
@@ -179,8 +250,8 @@ impl LocalEnv {
        }

        // If no initial tenant ID was given, generate it.
-        if env.default_tenantid.is_none() {
-            env.default_tenantid = Some(ZTenantId::generate());
+        if env.default_tenant_id.is_none() {
+            env.default_tenant_id = Some(ZTenantId::generate());
        }

        env.base_data_dir = base_path();
@@ -210,6 +281,39 @@ impl LocalEnv {
        Ok(env)
    }

+    pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
+        // Currently, the user first passes a config file with 'zenith init --config=<path>'
+        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
+        // to .zenith/config. TODO: We lose any formatting and comments along the way, which is
+        // a bit sad.
+        let mut conf_content = r#"# This file describes a locale deployment of the page server
+# and safekeeeper node. It is read by the 'zenith' command-line
+# utility.
+"#
+        .to_string();
+
+        // Convert the LocalEnv to a toml file.
+        //
+        // This could be as simple as this:
+        //
+        // conf_content += &toml::to_string_pretty(env)?;
+        //
+        // But it results in a "values must be emitted before tables" error. I'm not sure
+        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
+        // Maybe rust reorders the fields to avoid padding or something?
+        // In any case, converting to toml::Value first, and serializing that, works.
+        // See https://github.com/alexcrichton/toml-rs/issues/142
+        conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
+
+        let target_config_path = base_path.join("config");
+        fs::write(&target_config_path, conf_content).with_context(|| {
+            format!(
+                "Failed to write config file into path '{}'",
+                target_config_path.display()
+            )
+        })
+    }
+
    // this function is used only for testing purposes in CLI e g generate tokens during init
    pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
        let private_key_path = if self.private_key_path.is_absolute() {
@@ -228,15 +332,15 @@ impl LocalEnv {
    pub fn init(&mut self) -> anyhow::Result<()> {
        // check if config already exists
        let base_path = &self.base_data_dir;
-        if base_path == Path::new("") {
-            bail!("repository base path is missing");
-        }
-        if base_path.exists() {
-            bail!(
-                "directory '{}' already exists. Perhaps already initialized?",
-                base_path.to_str().unwrap()
-            );
-        }
+        ensure!(
+            base_path != Path::new(""),
+            "repository base path is missing"
+        );
+        ensure!(
+            !base_path.exists(),
+            "directory '{}' already exists. Perhaps already initialized?",
+            base_path.display()
+        );

        fs::create_dir(&base_path)?;

@@ -251,7 +355,7 @@ impl LocalEnv {
                .arg("2048")
                .stdout(Stdio::null())
                .output()
-                .with_context(|| "failed to generate auth private key")?;
+                .context("failed to generate auth private key")?;
            if !keygen_output.status.success() {
                bail!(
                    "openssl failed: '{}'",
@@ -270,7 +374,7 @@ impl LocalEnv {
                .args(&["-out", public_key_path.to_str().unwrap()])
                .stdout(Stdio::null())
                .output()
-                .with_context(|| "failed to generate auth private key")?;
+                .context("failed to generate auth private key")?;
            if !keygen_output.status.success() {
                bail!(
                    "openssl failed: '{}'",
@@ -285,39 +389,10 @@ impl LocalEnv {
        fs::create_dir_all(self.pg_data_dirs_path())?;

        for safekeeper in &self.safekeepers {
-            fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
+            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
        }

-        let mut conf_content = String::new();
-
-        // Currently, the user first passes a config file with 'zenith init --config=<path>'
-        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
-        // to .zenith/config. TODO: We lose any formatting and comments along the way, which is
-        // a bit sad.
-        write!(
-            &mut conf_content,
-            r#"# This file describes a locale deployment of the page server
-# and safekeeeper node. It is read by the 'zenith' command-line
-# utility.
-"#
-        )?;
-
-        // Convert the LocalEnv to a toml file.
-        //
-        // This could be as simple as this:
-        //
-        // conf_content += &toml::to_string_pretty(env)?;
-        //
-        // But it results in a "values must be emitted before tables". I'm not sure
-        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
-        // Maybe rust reorders the fields to squeeze avoid padding or something?
-        // In any case, converting to toml::Value first, and serializing that, works.
-        // See https://github.com/alexcrichton/toml-rs/issues/142
-        conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
-
-        fs::write(base_path.join("config"), conf_content)?;
-
-        Ok(())
+        self.persist_config(base_path)
    }
 }

--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -4,7 +4,7 @@
 /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
 /// enough to extract a few settings we need in Zenith, assuming you don't do
 /// funny stuff like include-directives or funny escaping.
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{bail, Context, Result};
 use lazy_static::lazy_static;
 use regex::Regex;
 use std::collections::HashMap;
@@ -78,7 +78,7 @@ impl PostgresConf {
        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
    {
        self.get(field_name)
-            .ok_or_else(|| anyhow!("could not find '{}' option {}", field_name, context))?
+            .with_context(|| format!("could not find '{}' option {}", field_name, context))?
            .parse::<T>()
            .with_context(|| format!("could not parse '{}' option {}", field_name, context))
    }
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -14,7 +14,9 @@ use postgres::Config;
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
+use walkeeper::http::models::TimelineCreateRequest;
 use zenith_utils::http::error::HttpErrorBody;
+use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};

 use crate::local_env::{LocalEnv, SafekeeperConf};
 use crate::storage::PageServerNode;
@@ -61,7 +63,7 @@ impl ResponseErrorMessageExt for Response {
 //
 #[derive(Debug)]
 pub struct SafekeeperNode {
-    pub name: String,
+    pub id: ZNodeId,

    pub conf: SafekeeperConf,

@@ -77,15 +79,15 @@ impl SafekeeperNode {
    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
        let pageserver = Arc::new(PageServerNode::from_env(env));

-        println!("initializing for {} for {}", conf.name, conf.http_port);
+        println!("initializing for sk {} for {}", conf.id, conf.http_port);

        SafekeeperNode {
-            name: conf.name.clone(),
+            id: conf.id,
            conf: conf.clone(),
            pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://localhost:{}/v1", conf.http_port),
+            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
            pageserver,
        }
    }
@@ -93,13 +95,17 @@ impl SafekeeperNode {
    /// Construct libpq connection string for connecting to this safekeeper.
    fn safekeeper_connection_config(port: u16) -> Config {
        // TODO safekeeper authentication not implemented yet
-        format!("postgresql://no_user@localhost:{}/no_db", port)
+        format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
            .parse()
            .unwrap()
    }

+    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf {
+        env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
+    }
+
    pub fn datadir_path(&self) -> PathBuf {
-        self.env.safekeeper_data_dir(&self.name)
+        SafekeeperNode::datadir_path_by_id(&self.env, self.id)
    }

    pub fn pid_file(&self) -> PathBuf {
@@ -114,12 +120,13 @@ impl SafekeeperNode {
        );
        io::stdout().flush().unwrap();

-        let listen_pg = format!("localhost:{}", self.conf.pg_port);
-        let listen_http = format!("localhost:{}", self.conf.http_port);
+        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
+        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);

        let mut cmd = Command::new(self.env.safekeeper_bin()?);
        fill_rust_env_vars(
            cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
+                .args(&["--id", self.id.to_string().as_ref()])
                .args(&["--listen-pg", &listen_pg])
                .args(&["--listen-http", &listen_http])
                .args(&["--recall", "1 second"])
@@ -183,7 +190,7 @@ impl SafekeeperNode {
    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
        let pid_file = self.pid_file();
        if !pid_file.exists() {
-            println!("Safekeeper {} is already stopped", self.name);
+            println!("Safekeeper {} is already stopped", self.id);
            return Ok(());
        }
        let pid = read_pidfile(&pid_file)?;
@@ -255,4 +262,25 @@ impl SafekeeperNode {
            .error_from_body()?;
        Ok(())
    }
+
+    pub fn timeline_create(
+        &self,
+        tenant_id: ZTenantId,
+        timeline_id: ZTimelineId,
+        peer_ids: Vec<ZNodeId>,
+    ) -> Result<()> {
+        Ok(self
+            .http_request(
+                Method::POST,
+                format!("{}/{}", self.http_base_url, "timeline"),
+            )
+            .json(&TimelineCreateRequest {
+                tenant_id,
+                timeline_id,
+                peer_ids,
+            })
+            .send()?
+            .error_from_body()?
+            .json()?)
+    }
 }
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -5,22 +5,23 @@ use std::process::Command;
 use std::time::Duration;
 use std::{io, result, thread};

-use anyhow::bail;
+use anyhow::{bail, Context};
 use nix::errno::Errno;
 use nix::sys::signal::{kill, Signal};
 use nix::unistd::Pid;
-use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
+use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest};
+use pageserver::timelines::TimelineInfo;
 use postgres::{Config, NoTls};
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use zenith_utils::http::error::HttpErrorBody;
+use zenith_utils::lsn::Lsn;
 use zenith_utils::postgres_backend::AuthType;
-use zenith_utils::zid::ZTenantId;
+use zenith_utils::zid::{ZTenantId, ZTimelineId};

 use crate::local_env::LocalEnv;
 use crate::{fill_rust_env_vars, read_pidfile};
-use pageserver::branches::BranchInfo;
 use pageserver::tenant_mgr::TenantInfo;
 use zenith_utils::connstring::connection_address;

@@ -98,11 +99,14 @@ impl PageServerNode {

    pub fn init(
        &self,
-        create_tenant: Option<&str>,
+        create_tenant: Option<ZTenantId>,
+        initial_timeline_id: Option<ZTimelineId>,
        config_overrides: &[&str],
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<ZTimelineId> {
        let mut cmd = Command::new(self.env.pageserver_bin()?);

+        let id = format!("id={}", self.env.pageserver.id);
+
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let base_data_dir_param = self.env.base_data_dir.display().to_string();
        let pg_distrib_dir_param =
@@ -122,6 +126,7 @@ impl PageServerNode {
        args.extend(["-c", &authg_type_param]);
        args.extend(["-c", &listen_http_addr_param]);
        args.extend(["-c", &listen_pg_addr_param]);
+        args.extend(["-c", &id]);

        for config_override in config_overrides {
            args.extend(["-c", config_override]);
@@ -134,19 +139,24 @@ impl PageServerNode {
            ]);
        }

-        if let Some(tenantid) = create_tenant {
-            args.extend(["--create-tenant", tenantid])
+        let create_tenant = create_tenant.map(|id| id.to_string());
+        if let Some(tenant_id) = create_tenant.as_deref() {
+            args.extend(["--create-tenant", tenant_id])
        }

-        let status = fill_rust_env_vars(cmd.args(args))
-            .status()
-            .expect("pageserver init failed");
+        let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
+        let initial_timeline_id_string = initial_timeline_id.to_string();
+        args.extend(["--initial-timeline-id", &initial_timeline_id_string]);

-        if !status.success() {
+        let init_output = fill_rust_env_vars(cmd.args(args))
+            .output()
+            .context("pageserver init failed")?;
+
+        if !init_output.status.success() {
            bail!("pageserver init failed");
        }

-        Ok(())
+        Ok(initial_timeline_id)
    }

    pub fn repo_path(&self) -> PathBuf {
@@ -307,7 +317,7 @@ impl PageServerNode {
    }

    pub fn check_status(&self) -> Result<()> {
-        self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
+        self.http_request(Method::GET, format!("{}/status", self.http_base_url))
            .send()?
            .error_from_body()?;
        Ok(())
@@ -315,64 +325,69 @@ impl PageServerNode {

    pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
        Ok(self
-            .http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
+            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))
            .send()?
            .error_from_body()?
            .json()?)
    }

-    pub fn tenant_create(&self, tenantid: ZTenantId) -> Result<()> {
-        Ok(self
-            .http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant"))
-            .json(&TenantCreateRequest {
-                tenant_id: tenantid,
+    pub fn tenant_create(
+        &self,
+        new_tenant_id: Option<ZTenantId>,
+    ) -> anyhow::Result<Option<ZTenantId>> {
+        let tenant_id_string = self
+            .http_request(Method::POST, format!("{}/tenant", self.http_base_url))
+            .json(&TenantCreateRequest { new_tenant_id })
+            .send()?
+            .error_from_body()?
+            .json::<Option<String>>()?;
+
+        tenant_id_string
+            .map(|id| {
+                id.parse().with_context(|| {
+                    format!(
+                        "Failed to parse tennat creation response as tenant id: {}",
+                        id
+                    )
+                })
            })
-            .send()?
-            .error_from_body()?
-            .json()?)
+            .transpose()
    }

-    pub fn branch_list(&self, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
-        Ok(self
+    pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
+        let timeline_infos: Vec<TimelineInfo> = self
            .http_request(
                Method::GET,
-                format!("{}/branch/{}", self.http_base_url, tenantid),
+                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
            )
            .send()?
            .error_from_body()?
-            .json()?)
+            .json()?;
+
+        Ok(timeline_infos)
    }

-    pub fn branch_create(
+    pub fn timeline_create(
        &self,
-        branch_name: &str,
-        startpoint: &str,
-        tenantid: &ZTenantId,
-    ) -> Result<BranchInfo> {
-        Ok(self
-            .http_request(Method::POST, format!("{}/branch", self.http_base_url))
-            .json(&BranchCreateRequest {
-                tenant_id: tenantid.to_owned(),
-                name: branch_name.to_owned(),
-                start_point: startpoint.to_owned(),
+        tenant_id: ZTenantId,
+        new_timeline_id: Option<ZTimelineId>,
+        ancestor_start_lsn: Option<Lsn>,
+        ancestor_timeline_id: Option<ZTimelineId>,
+    ) -> anyhow::Result<Option<TimelineInfo>> {
+        let timeline_info_response = self
+            .http_request(
+                Method::POST,
+                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
+            )
+            .json(&TimelineCreateRequest {
+                new_timeline_id,
+                ancestor_start_lsn,
+                ancestor_timeline_id,
            })
            .send()?
            .error_from_body()?
-            .json()?)
-    }
+            .json::<Option<TimelineInfo>>()?;

-    pub fn branch_get_by_name(
-        &self,
-        tenantid: &ZTenantId,
-        branch_name: &str,
-    ) -> Result<BranchInfo> {
-        Ok(self
-            .http_request(
-                Method::GET,
-                format!("{}/branch/{}/{}", self.http_base_url, tenantid, branch_name),
-            )
-            .send()?
-            .error_for_status()?
-            .json()?)
+        Ok(timeline_info_response)
    }
 }
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -4,7 +4,7 @@ set -eux
 if [ "$1" = 'pageserver' ]; then
    if [ ! -d "/data/tenants" ]; then
        echo "Initializing pageserver data directory"
-        pageserver --init -D /data -c "pg_distrib_dir='/usr/local'"
+        pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10"
    fi
    echo "Staring pageserver at 0.0.0.0:6400"
    pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,32 +7,14 @@ Currently we build two main images:
 - [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
 - [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).

-And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
+And additional intermediate images:

- [zenithdb/build](https://hub.docker.com/repository/docker/zenithdb/build) — image with all the dependencies required to build Zenith and compute node images. This image is based on `rust:slim-buster`, so it also has a proper `rust` environment. Built from [/Dockerfile.build](/Dockerfile.build).
 - [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.

 ## Building pipeline

 1. Image `zenithdb/compute-tools` is re-built automatically.

-2. Image `zenithdb/build` is built manually. If you want to introduce any new compile time dependencies to Zenith or compute node you have to update this image as well, build it and push to Docker Hub.
+2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.

-Build:
-```sh
-docker build -t zenithdb/build:buster -f Dockerfile.build .
-```
-
-Login:
-```sh
-docker login
-```
-
-Push to Docker Hub:
-```sh
-docker push zenithdb/build:buster
-```
-
-3. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
-
-4. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.
+3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.
--- a/docs/glossary.md
+++ b/docs/glossary.md
@@ -2,6 +2,16 @@

 ### Authentication

+### Backpressure
+
+Backpressure is used to limit the lag between pageserver and compute node or WAL service.
+
+If compute node or WAL service run far ahead of Page Server,
+the time of serving page requests increases. This may lead to timeout errors.
+
+To tune backpressure limits use `max_replication_write_lag`, `max_replication_flush_lag` and `max_replication_apply_lag` settings.
+When lag between current LSN (pg_current_wal_flush_lsn() at compute node) and minimal write/flush/apply position of replica exceeds the limit
+backends performing writes are blocked until the replica is caught up.
 ### Base image (page image)

 ### Basebackup
@@ -76,7 +86,37 @@ The layer map tracks what layers exist for all the relishes in a timeline.
 Zenith repository implementation that keeps data in layers.
 ### LSN

+The Log Sequence Number (LSN) is a unique identifier of a WAL record in the WAL log.
+The insert position is a byte offset into the logs, increasing monotonically with each new record.
+Internally, an LSN is a 64-bit integer, representing a byte position in the write-ahead log stream.
+It is printed as two hexadecimal numbers of up to 8 digits each, separated by a slash.
+Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html)
+Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery.

+In postgres and Zenith lsns are used to describe certain points in WAL handling.
+
+PostgreSQL LSNs and functions to monitor them:
+* `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location.
+* `pg_current_wal_lsn()` - Returns the current write-ahead log write location.
+* `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location.
+* `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically.
+* `pg_last_wal_replay_lsn()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
+[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
+
+Zenith safekeeper LSNs. For more check [walkeeper/README_PROTO.md](/walkeeper/README_PROTO.md)
+* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
+* `RestartLSN`: position in WAL confirmed by all safekeepers.
+* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
+* `VCL`: the largest LSN for which we can guarantee availability of all prior records.
+
+Zenith pageserver LSNs:
+* `last_record_lsn` - the end of last processed WAL record.
+* `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN.
+* `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash.
+TODO: use this name consistently in remote storage code. Now `disk_consistent_lsn` is used and meaning depends on the context.
+* `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created)
+
+TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs.
 ### Page (block)

 The basic structure used to store relation data. All pages are of the same size.
--- a/docs/pageserver-tenant-migration.md
+++ b/docs/pageserver-tenant-migration.md
@@ -0,0 +1,22 @@
+## Pageserver tenant migration
+
+### Overview
+
+This feature allows migrating a timeline from one pageserver to another by utilizing the remote storage capability.
+
+### Migration process
+
+Pageserver implements two new http handlers: timeline attach and timeline detach.
+Timeline migration is performed in the following way:
+1. Timeline attach is called on a target pageserver. This asks pageserver to download latest checkpoint uploaded to s3.
+2. For now it is necessary to manually initialize the replication stream via a callmemaybe call so that the target pageserver initializes replication from the safekeeper (it is desired to avoid this and initialize replication directly in the attach handler, but this requires some refactoring, probably [#997](https://github.com/zenithdb/zenith/issues/997)/[#1049](https://github.com/zenithdb/zenith/issues/1049)).
+3. Replication state can be tracked via timeline detail pageserver call.
+4. Compute node should be restarted with the new pageserver connection string. The issue with multiple compute nodes for one timeline is handled on the safekeeper consensus level, so this is not a problem here. Currently the responsibility for rescheduling the compute with the updated config lies with the external coordinator (console).
+5. Timeline is detached from old pageserver. On disk data is removed.
+
+
+### Implementation details
+
+Now safekeeper needs to track which pageserver it is replicating to. This introduces complications into replication code:
+* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/zenithdb/zenith/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented).
+* We need to track which pageserver is the primary. This is needed to avoid reconnections to non-primary pageservers, because we shouldn't reconnect to them when they decide to stop their walreceiver. I.e. this can happen when there is a load on the compute and we are trying to detach a timeline from the old pageserver. In this case callmemaybe will try to reconnect to it because the replication termination condition is not met (a pageserver with an active compute could never catch up to the latest lsn, so there is always some wal tail).
--- a/docs/rfcs/002-storage.md
+++ b/docs/rfcs/002-storage.md
@@ -0,0 +1,186 @@
+# Zenith storage node — alternative
+
+## **Design considerations**
+
+Simplify storage operations for people => Gain adoption/installs on laptops and small private installation => Attract customers to DBaaS by seamless integration between our tooling and cloud.
+
+Proposed architecture addresses:
+
+- High availability -- tolerates n/2 - 1 failures
+- Multi-tenancy -- one storage for all databases
+- Elasticity -- increase storage size on the go by adding nodes
+- Snapshots / backups / PITR with S3 offload
+- Compression
+
+Minuses are:
+
+- Quite a lot of work
+- Single page access may touch few disk pages
+- Some bloat in data — may slowdown sequential scans
+
+## **Summary**
+
+Storage cluster is a sharded key-value store with ordered keys. Key (**page_key**) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. Value is either page or page diff/wal. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when it grows bigger than the soft 100GB limit~~ by having a fixed range of pageno's it is responsible for. Chunk placement on storage nodes is stored in a separate metadata service, so a chunk can be freely moved around the cluster if needed. A chunk itself is a filesystem directory with the following subdirectories:
+
+```
+
+|-chunk_42/
+  |-store/ -- contains lsm with pages/pagediffs ranging from
+  |	      page_key_lo to page_key_hi
+  |-wal/
+  |  |- db_1234/ db-specific wal files with pages from page_key_lo
+  |		 to page_key_hi
+  |
+  |-chunk.meta -- small file with snapshot references
+		  (page_key_prefix+lsn+name)
+		  and PITR regions (page_key_start, page_key_end)
+```
+
+## **Chunk**
+
+Chunk is responsible for storing pages potentially from different databases and relations. Each page is addressed by a lexicographically ordered tuple (**page_key**) with the following fields:
+
+- `pg_id` -- unique id of given postgres instance (or postgres cluster as it is called in postgres docs)
+- `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance
+- `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later
+- `rel_id` -- tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so table indices were closer to table itself on our global key space.
+- `(forkno, segno, pageno)` -- page coordinates in postgres data files
+- `lsn_timeline` -- postgres feature, increments when PITR was done.
+- `lsn` -- lsn of current page version.
+
+Chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. Processing node looks at page in wal record and sends record to a chunk responsible for this page range. When wal record arrives to a chunk it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then background process moves records from those wal files to the lsm tree in `chunk_id/store`. Or, more precisely, wal records would be materialized into lsm memtable and when that memtable is flushed to SSTable on disk we may trim the wal. That way some not durably (in the distributed sense) committed pages may enter the tree -- here we rely on processing node behavior: page request from processing node should contain proper lsn horizons so that storage node may respond with proper page version.
+
+LSM here is a usual LSM for variable-length values: at first data is stored in memory (we hold incoming wal records to be able to regenerate it after restart) at some balanced tree. When this tree grows big enough we dump it into disk file (SSTable) sorting records by key. Then SStables are mergesorted in the background to a different files. All file operation are sequential and do not require WAL for durability.
+
+Content of SSTable can be following:
+
+```jsx
+(pg_id, db_id, ... , pageno=42, lsn=100) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff)
+(pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data)
+```
+
+So query for `pageno=42 up to lsn=260` would need to find closest entry less than this key, iterate back to the latest full page and iterate forward to apply diffs. How often page is materialized in lsn-version sequence is up to us -- let's say each 5th version should be a full page.
+
+### **Page deletion**
+
+To delete old pages we insert blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into a lsm tree. During merges such marker would indicate that all pages with smaller lsn should be discarded. Delete marker will travel down the tree levels hierarchy until it reaches last level. In non-PITR scenario where old page version are not needed at all such deletion marker would (in average) prevent old page versions propagation down the tree -- so all bloat would concentrate at higher tree layers without affecting bigger bottom layers.
+
+### **Recovery**
+
+Upon storage node restart recent WAL files are applied to appropriate pages and resulting pages stored in lsm memtable. So this should be fast since we are not writing anything to disk.
+
+### **Checkpointing**
+
+No such mechanism is needed. Or we may look at the storage node as a kind of continuous checkpointer.
+
+### **Full page writes (torn page protection)**
+
+Storage node never updates individual pages, only merges SSTables, so torn pages are not an issue.
+
+### **Snapshot**
+
+That is the part that I like about this design -- snapshot creation is instant and cheap operation that can have flexible granularity level: whole instance, database, table. Snapshot creation inserts a record in `chunk.meta` file with lsn of this snapshot and key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits pages deletion within this range. Storage node may not know anything about page internals, but by changing number of fields in our prefix we may change snapshot granularity.
+
+It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)` so that snapshot of relation would include its indices. Also table snapshot would trickily interact with catalog. Probably all table snapshots should hold also a catalog snapshot. And when node is started with such snapshot it should check that only tables from snapshot are queried. I assume here that for snapshot reading one needs to start a new postgres instance.
+
+Storage consumed by snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on cost of different storages) about when to offload old snapshot to s3. For example, if current database has more than 40% of changed pages with respect to previous snapshot then we may offload that snapshot to s3, and release this space.
+
+**Starting db from snapshot**
+
+When we are starting database from snapshot it can be done in two ways. First, we may create new db_id, move all the data from snapshot to a new db and start a database. Second option is to create Copy-on-Write (CoW) instance out of snapshot and read old pages from old snapshot and store new pages separately. That is why there is `db_timeline` key field near `db_id` -- CoW (🐮) database should create new `db_timeline` and remember old `db_timeline`. Such a database can have a hashmap of pages that it has changed to query pages from proper snapshot on the first try. `db_timeline` is located near `db_id` so that new page versions generated by new instance would not bloat data of initial snapshot. It is not clear whether it is possible to effectively support "stacked" CoW snapshots, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- so we may scan neighboring pages and find right one. But again that way we bloat snapshot with unrelated data and may slow down full scans that are happening in different database).
+
+**Snapshot export/import**
+
+Once we may start CoW instances it is easy to run auxiliary postgres instance on this snapshot and run `COPY FROM (...) TO stdout` or `pg_dump` and export data from the snapshot to some portable formats. Also we may start postgres on a new empty database and run `COPY FROM stdin`. This way we can initialize new non-CoW databases and transfer snapshots via network.
+
+### **PITR area**
+
+In described scheme PITR is just a prohibition to delete any versions within some key prefix, either it is a database or a table key prefix. So PITR may have different settings for different tables, databases, etc.
+
+PITR is quite bloaty, so we may aggressively offload it to s3 -- we may push same (or bigger) SSTables to s3 and maintain lsm structure there.
+
+### **Compression**
+
+Since we are storing page diffs of variable sizes there is no structural dependency on a page size and we may compress it. Again that could be enabled only on pages with some key prefixes, so we may have this with db/table granularity.
+
+### **Chunk metadata**
+
+Chunk metadata is a file in the chunk directory that stores info about current snapshots and PITR regions. Chunk should always consult this data when merging SSTables and applying delete markers.
+
+### **Chunk splitting**
+
+*(NB: following paragraph is about how to avoid page splitting)*
+
+When a chunk hits some soft storage limit (let's say 100GB) it should be split in half and global metadata about chunk boundaries should be updated. Here I assume that chunk split is a local operation happening on a single node. The process of chunk splitting should look like the following:
+
+1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries.
+
+2. Prohibit WAL deletion and old SSTables deletion on original chunk.
+
+3. On each lsm layer we would need to split only one SSTable, all other would fit within left or right range. Symlink/split that files to new chunks.
+
+4. Start WAL replay on new chunks.
+
+5. Update global metadata about new chunk boundaries.
+
+6. Eventually (metadata update should be pushed to processing node by metadata service) storage node will start sending WAL and page requests to the new nodes.
+
+7. New chunk may start serving read queries when following conditions are met:
+
+a) it receives at least one WAL record from processing node
+
+b) it replayed all WAL up to the new received one
+
+c) checked by downlinks that there were no WAL gaps.
+
+Chunk split as it is described here is quite fast operation when it is happening on the local disk -- vast majority of files will be just moved without copying anything. I suggest to keep split always local and not to mix it with chunk moving around cluster. So if we want to split some chunk but there is small amount of free space left on the device, we should first move some chunks away from the node and then proceed with splitting.
+
+### Fixed chunks
+
+An alternative strategy is not to split at all and have pageno-fixed chunk boundaries. When a table is created we first materialize this chunk by storing the first new pages only, and the chunk is small. Then the chunk grows while the table is filled, but it can't grow substantially bigger than the allowed pageno range, so at max it would be 1GB or whatever limit we want + some bloat due to snapshots and old page versions.
+
+### **Chunk lsm internals**
+
+So how to implement chunk's lsm?
+
+- Write from scratch and use RocksDB to prototype/benchmark, then switch to own lsm implementation. RocksDB can provide some sanity check for performance of home-brewed implementation and it would be easier to prototype.
+- Use postgres as lego constructor. We may model memtable with postgres B-tree referencing some in-memory log of incoming records. SSTable merging may reuse postgres external merging algorithm, etc. One thing that would definitely not fit (or I didn't come up with an idea of how to fit that) -- is multi-tenancy. If we are storing pages from different databases we can't use postgres buffer pool, since there is no db_id in the page header. We can add new field there but IMO it would be no go for committing that to vanilla.
+
+Another possibility is not to try to fit several databases in one storage node. But that way it is no go for multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, each with its own local page cache. So that would be much closer to ordinary managed RDS.
+
+Multi-tenant storage makes sense even on a laptop, when you work with different databases, running tests with temp database, etc. And when installation grows bigger it start to make more and more sense, so it seems important.
+
+# Storage fleet
+
+# **Storage fleet**
+
+- When databases are smaller than a chunk size we naturally can store them in one chunk (since their page_key would fit in some chunk's [lo, hi) range).
+
+<img width="937" alt="Screenshot_2021-02-22_at_16 49 17" src="https://user-images.githubusercontent.com/284219/108729836-ffcbd200-753b-11eb-9412-db802ec30021.png">
+
+Few databases are stored in one chunk, replicated three times
+
+- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automation, but we always may manually move chunks around the cluster.
+
+<img width="940" alt="Screenshot_2021-02-22_at_16 49 10" src="https://user-images.githubusercontent.com/284219/108729815-fb071e00-753b-11eb-86e0-be6703e47d82.png">
+
+Here one big database occupies two set of nodes. Also some chunks were moved around to restore replication factor after disk failure. In this case we also have "sharded" storage for a big database and issue wal writes to different chunks in parallel.
+
+## **Chunk placement strategies**
+
+There are few scenarios where we may want to move chunks around the cluster:
+
+- disk usage on some node is big
+- some disk experienced a failure
+- some node experienced a failure or need maintenance
+
+## **Chunk replication**
+
+Chunk replication may be done by cloning page ranges with respect to some lsn from peer nodes, updating global metadata, waiting for WAL to come, replaying previous WAL and becoming online -- more or less like during chunk split.
+
--- a/docs/rfcs/003-laptop-cli.md
+++ b/docs/rfcs/003-laptop-cli.md
@@ -0,0 +1,267 @@
+# Command line interface (end-user)
+
+Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start.
+
+This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots.
+
+The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start temporary read-only postgres instance over any local snapshot. A more complex scenario would consist of several basic operations over snapshots.
+
+# Possible usage scenarios
+
+## Install zenith, run a postgres
+
+```
+> brew install pg-zenith 
+> zenith pg create # creates pgdata with default pattern pgdata$i
+> zenith pg list
+ID            PGDATA        USED    STORAGE            ENDPOINT
+primary1      pgdata1       0G      zenith-local       localhost:5432
+```
+
+## Import standalone postgres to zenith
+
+```
+> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg
+[====================------------] 60% | 20MB/s
+> zenith snapshot list
+ID          SIZE        PARENT
+oldpg       5G          -
+
+> zenith pg create --snapshot oldpg
+Started postgres on localhost:5432
+
+> zenith pg list
+ID            PGDATA        USED    STORAGE            ENDPOINT
+primary1      pgdata1       5G      zenith-local       localhost:5432
+
+> zenith snapshot destroy oldpg
+Ok
+```
+
+Also, we may start snapshot import implicitly by looking at snapshot schema
+
+```
+> zenith pg create --snapshot basebackup://replication@localhost:5432/
+Downloading snapshot... Done.
+Started postgres on localhost:5432
+Destroying snapshot... Done.
+```
+
+## Pull snapshot with some publicly shared database
+
+Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage).
+
+```
+> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies
+```
+
+## Create snapshot and push it to the cloud
+
+```
+> zenith snapshot create pgdata1@snap1
+> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1
+```
+
+## Rollback database to the snapshot
+
+One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`.
+
+```
+> zenith pg list
+ID            PGDATA        USED    STORAGE            ENDPOINT
+primary1      pgdata1       5G      zenith-local       localhost:5432
+
+> zenith snapshot create pgdata1@snap1
+
+> zenith snapshot list
+ID                    SIZE        PARENT
+oldpg                 5G          -
+pgdata1@snap1         6G          -
+pgdata1@CURRENT       6G          -
+
+> zenith pg checkout pgdata1@snap1
+Stopping postgres on pgdata1.
+Rolling back pgdata1@CURRENT to pgdata1@snap1.
+Starting postgres on pgdata1.
+
+> zenith snapshot list
+ID                    SIZE        PARENT
+oldpg                 5G          -
+pgdata1@snap1         6G          -
+pgdata1@HEAD{0}       6G          -
+pgdata1@CURRENT       6G          -
+```
+
+Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state of the database in the data directory. When we are checking out some snapshot CURRENT will be set to this snapshot and the old CURRENT state will be named HEAD{0} (0 is the number of postgres timeline, it would be incremented after each such checkout).
+
+## Configure PITR area (Point In Time Recovery).
+
+PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite).
+
+```
+> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month
+```
+
+Resetting the database to some state in the past would require creating a snapshot on some lsn / time in this PITR area.
+
+# Manual
+
+## storage
+
+Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
+
+**zenith storage attach** -t [native|s3] -c key=value -n name
+
+Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'.
+
+
+**zenith storage list**
+
+Show currently attached storages. For example:
+
+```
+> zenith storage list
+NAME            USED    TYPE                OPTIONS          PATH
+local           5.1G    zenith-local                         /opt/zenith/store/local
+local.compr     20.4G   zenith-local        compression=on   /opt/zenith/store/local.compr
+zcloud          60G     zenith-remote                        zenith.tech/stas/mystore
+s3tank          80G     S3
+```
+
+**zenith storage detach**
+
+**zenith storage show**
+
+
+
+## pg
+
+Manages postgres data directories and can start postgreses with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themself.
+
+Pg is a term for a single postgres running on some data. I'm trying to avoid separating datadir management and postgres instance management here -- both concepts are bundled together.
+
+**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
+
+Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr.
+
+--no-start: just init datadir without starting postgres
+
+--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1)
+
+--cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of a currently running database)
+
+**zenith pg destroy**
+
+**zenith pg start** [--replica] pgdata
+
+Start postgres with proper extensions preloaded/installed.
+
+**zenith pg checkout**
+
+Rollback data directory to some previous snapshot. 
+
+**zenith pg stop** pg_id
+
+**zenith pg list**
+
+```
+ROLE                 PGDATA        USED    STORAGE            ENDPOINT
+primary              my_pg         5.1G    local              localhost:5432
+replica-1                                                     localhost:5433
+replica-2                                                     localhost:5434
+primary              my_pg2        3.2G    local.compr        localhost:5435
+-                    my_pg3        9.2G    local.compr        -
+```
+
+**zenith pg show**
+
+```
+my_pg:
+    storage: local
+    space used on local: 5.1G
+    space used on all storages: 15.1G
+    snapshots:
+        on local:
+            snap1: 1G
+            snap2: 1G
+        on zcloud:
+            snap2: 1G
+        on s3tank:
+            snap5: 2G
+    pitr:
+        on s3tank:
+            pitr_one_month: 45G
+
+```
+
+**zenith pg start-rest/graphql** pgdata
+
+Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea.
+
+
+## snapshot
+
+Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout.
+
+**zenith snapshot create** pgdata_name@snap_name
+
+Creates a new snapshot in the same storage where pgdata_name exists.
+
+**zenith snapshot push** --to url pgdata_name@snap_name
+
+Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If the url has some special scheme like zenith:// the receiving side may require auth and start `zenith snapshot recv` on the go.
+
+**zenith snapshot recv**
+
+Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket.
+
+**zenith snapshot pull** --from url or path
+
+Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format.
+
+**zenith snapshot import** --from basebackup://<...>  or path
+
+Creates a new snapshot out of running postgres via basebackup protocol or basebackup files.
+
+**zenith snapshot export**
+
+Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay).
+
+**zenith snapshot diff** snap1 snap2
+
+Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses.
+
+**zenith snapshot destroy**
+
+## pitr
+
+Pitr represents wal stream and ttl policy for that stream
+
+XXX: any suggestions on a better name?
+
+**zenith pitr create** name
+
+--ttl = inf | period
+
+--size-limit = inf | limit
+
+--storage = storage_name
+
+**zenith pitr extract-snapshot** pitr_name --lsn xxx
+
+Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export)
+
+**zenith pitr gc** pitr_name
+
+Force garbage collection on some PITR area.
+
+**zenith pitr list**
+
+**zenith pitr destroy**
+
+
+## console
+
+**zenith console**
+
+Opens browser targeted at web console with the more or less same functionality as described here.
--- a/docs/rfcs/004-durability.md
+++ b/docs/rfcs/004-durability.md
@@ -0,0 +1,218 @@
+Durability & Consensus
+======================
+
+When a transaction commits, a commit record is generated in the WAL.
+When do we consider the WAL record as durable, so that we can
+acknowledge the commit to the client and be reasonably certain that we
+will not lose the transaction?
+
+Zenith uses a group of WAL safekeeper nodes to hold the generated WAL.
+A WAL record is considered durable, when it has been written to a
+majority of WAL safekeeper nodes. In this document, I use 5
+safekeepers, because I have five fingers. A WAL record is durable,
+when at least 3 safekeepers have written it to disk.
+
+First, assume that only one primary node can be running at a
+time. This can be achieved by Kubernetes or etcd or some
+cloud-provider specific facility, or we can implement it
+ourselves. These options are discussed in later chapters.  For now,
+assume that there is a Magic STONITH Fairy that ensures that.
+
+In addition to the WAL safekeeper nodes, the WAL is archived in
+S3. WAL that has been archived to S3 can be removed from the
+safekeepers, so the safekeepers don't need a lot of disk space.
+
+
+                                +----------------+
+                        +-----> | WAL safekeeper |
+                        |       +----------------+
+                        |       +----------------+
+                        +-----> | WAL safekeeper |
++------------+          |       +----------------+
+|  Primary   |          |       +----------------+
+| Processing | ---------+-----> | WAL safekeeper |
+|   Node     |          |       +----------------+
++------------+          |       +----------------+
+            \           +-----> | WAL safekeeper |
+             \          |       +----------------+
+              \         |       +----------------+
+               \        +-----> | WAL safekeeper |
+                \               +----------------+
+                 \
+                  \
+                   \
+                    \
+                     \      +--------+
+                      \     |        |
+                       +--> |   S3   |
+                            |        |
+                            +--------+
+
+
+Every WAL safekeeper holds a section of WAL, and a VCL value.
+The WAL can be divided into three portions:
+
+
+                                    VCL                   LSN
+                                     |                     |
+                                     V                     V
+.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
+Archived WAL       Completed WAL          In-flight WAL
+
+
+Note that all this WAL kept in a safekeeper is a contiguous section.
+This is different from Aurora: In Aurora, there can be holes in the
+WAL, and there is a Gossip protocol to fill the holes. That could be
+implemented in the future, but let's keep it simple for now. WAL needs
+to be written to a safekeeper in order. However, during crash
+recovery, In-flight WAL that has already been stored in a safekeeper
+can be truncated or overwritten.
+
+The Archived WAL has already been stored in S3, and can be removed from
+the safekeeper.
+
+The Completed WAL has been written to at least three safekeepers. The
+algorithm ensures that it is not lost, when at most two nodes fail at
+the same time.
+
+The In-flight WAL has been persisted in the safekeeper, but if a crash
+happens, it may still be overwritten or truncated.
+
+
+The VCL point is determined in the Primary. It is not strictly
+necessary to store it in the safekeepers, but it allows some
+optimizations and sanity checks and is probably generally useful for
+the system as whole. The VCL values stored in the safekeepers can lag
+behind the VCL computed by the primary.
+
+
+Primary node Normal operation
+-----------------------------
+
+1. Generate some WAL.
+
+2. Send the WAL to all the safekeepers that you can reach.
+
+3. As soon as a quorum of safekeepers have acknowledged that they have
+   received and durably stored the WAL up to that LSN, update local VCL
+   value in memory, and acknowledge commits to the clients.
+
+4. Send the new VCL to all the safekeepers that were part of the quorum.
+   (Optional)
+
+
+Primary Crash recovery
+----------------------
+
+When a new Primary node starts up, before it can generate any new WAL
+it needs to contact a majority of the WAL safekeepers to compute the
+VCL. Remember that there is a Magic STONITH fairy that ensures that
+only one node can be doing this at a time.
+
+1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you
+   can reach. This is the Winner safekeeper, and its LSN becomes the new VCL.
+
+2. Update the other safekeepers you can reach, by copying all the WAL
+   from the Winner, starting from each safekeeper's old VCL point. Any old
+   In-Flight WAL from previous Epoch is truncated away.
+
+3. Increment Epoch, and send the new Epoch to the quorum of
+   safekeepers.  (This ensures that if any of the safekeepers that we
+   could not reach later come back online, they will be considered as
+   older than this in any future recovery)
+
+You can now start generating new WAL, starting from the newly-computed
+VCL.
+
+Optimizations
+-------------
+
+As described, the Primary node sends all the WAL to all the WAL safekeepers. That
+can be a lot of network traffic. Instead of sending the WAL directly from Primary,
+some safekeepers can be daisy-chained off other safekeepers, or there can be a
+broadcast mechanism among them. There should still be a direct connection from
+each safekeeper to the Primary for the acknowledgments though.
+
+Similarly, the responsibility for archiving WAL to S3 can be delegated to one of
+the safekeepers, to reduce the load on the primary.
+
+
+Magic STONITH fairy
+-------------------
+
+Now that we have a system that works as long as only one primary node is running at a time, how
+do we ensure that?
+
+1. Use etcd to grant a lease on a key. The primary node is only allowed to operate as primary
+   when it's holding a valid lease. If the primary node dies, the lease expires after a timeout
+   period, and a new node is allowed to become the primary.
+
+2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you
+   cannot do this safely. In practice, it would probably be OK if you make the lease times and
+   timeouts long enough. This has the advantage that we don't need to introduce a new
+   component to the architecture.
+
+3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The
+   next chapter describes this option.
+
+
+Built-in Paxos
+--------------
+
+The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes
+as both Proposers and Learners.
+
+Each WAL safekeeper holds an Epoch value in addition to the VCL and
+the WAL. Each request by the primary to safekeep WAL is accompanied by
+an Epoch value. If a safekeeper receives a request with Epoch that
+doesn't match its current Accepted Epoch, it must ignore (NACK) it.
+(In different Paxos papers, Epochs are called "terms" or "round
+numbers")
+
+When a node wants to become the primary, it generates a new Epoch
+value that is higher than any previously observed Epoch value, and
+globally unique.
+
+
+Accepted Epoch: 555                VCL                   LSN
+                                     |                     |
+                                     V                     V
+.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
+Archived WAL       Completed WAL          In-flight WAL
+
+
+Primary node startup:
+
+1. Contact all WAL safekeepers that you can reach (if you cannot
+   connect to a quorum of them, you can give up immediately). Find the
+   latest Epoch among them.
+
+2. Generate a new globally unique Epoch, greater than the latest Epoch
+   found in previous step.
+
+2. Send the new Epoch in a Prepare message to a quorum of
+   safekeepers. (PAXOS Prepare message)
+
+3. Each safekeeper responds with a Promise. If a safekeeper has
+   already made a promise with a higher Epoch, it doesn't respond (or
+   responds with a NACK). After making a promise, the safekeeper stops
+   responding to any write requests with earlier Epoch.
+
+4. Once you have received a majority of promises, you know that the
+   VCL cannot advance on the old Epoch anymore. This effectively kills
+   any old primary server.
+
+5. Find the highest written LSN among the quorum of safekeepers (these
+   can be included in the Promise messages already). This is the new
+   VCL.  If a new node starts the election process after this point,
+   it will compute the same or higher VCL.
+
+6. Copy the WAL from the safekeeper with the highest LSN to the other
+   safekeepers in the quorum, using the new Epoch. (PAXOS Accept
+   phase)
+
+7. You can now start generating new WAL starting from the VCL. If
+   another process starts the election process after this point and
+   gains control of a majority of the safekeepers, we will no longer
+   be able to advance the VCL.
+
--- a/docs/rfcs/005-zenith_local.md
+++ b/docs/rfcs/005-zenith_local.md
@@ -0,0 +1,103 @@
+# Zenith local
+
+Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together.  Your comments on both parts are very welcome.
+
+#### Why do we need it?
+- For distribution - this easy to use binary will help us to build adoption among developers.
+- For internal use - to test all components together.
+
+In my understanding, we consider it to be just a mock-up version of zenith-cloud.
+> Question: How much should we care about durability and security issues for a local setup?
+
+
+#### Why is it better than a simple local postgres?
+
+- Easy one-line setup. As simple as `cargo install zenith && zenith start`
+
+- Quick and cheap creation of compute nodes over the same storage.
+> Question: How can we describe a use-case for this feature?
+
+- Zenith-local can work with S3 directly. 
+
+- Push and pull images (snapshots) to remote S3 to exchange data with other users.
+
+- Quick and cheap snapshot checkouts to switch back and forth in the database history.
+> Question: Do we want it in the very first release? This feature seems quite complicated.
+
+#### Distribution:
+
+Ideally, just one binary that incorporates all elements we need.
+> Question: Let's discuss pros and cons of having a separate package with modified PostgreSQL.
+
+#### Components:
+
+- **zenith-CLI** - interface for end-users.  Turns commands to REST requests and handles responses to show them in a user-friendly way.  
+CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
+WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli
+
+- **zenith-console** - WEB UI with same functionality as CLI.
+>Note: not for the first release.
+
+- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
+    > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local.
+
+- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation).
+> Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server?
+
+WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src
+
+- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith.
+> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)?
+> Question: Do we use it together with local page store or they are interchangeable?
+
+WIP code is ???
+
+- **zenith-safekeeper** - receives WAL from postgres, stores it durably, and acknowledges to Postgres that the "sync" has succeeded.
+> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system.
+
+WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper
+
+- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database.
+ 
+ WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node
+
+#### REST API:
+
+Service endpoint: `http://localhost:3000`
+
+Resources:
+- /storages - Where data lives: zenith-pageserver or zenith-s3
+- /pgs - Postgres - zenith-computenode
+- /snapshots - snapshots **TODO**
+
+>Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all?
+
+Methods and their mapping to CLI:
+
+- /storages - zenith-pageserver or zenith-s3
+
+CLI  | REST API
+------------- | -------------
+storage attach -n name --type [native\s3]  --path=[datadir\URL] | PUT  -d { "name": "name", "type": "native", "path": "/tmp" } /storages
+storage detach -n name | DELETE /storages/:storage_name 
+storage list | GET /storages
+storage show -n name | GET /storages/:storage_name 
+
+
+- /pgs - zenith-computenode
+
+CLI  | REST API
+------------- | -------------
+pg create -n name --s storage_name | PUT  -d { "name": "name", "storage_name": "storage_name" } /pgs
+pg destroy -n name | DELETE /pgs/:pg_name 
+pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"}  /pgs/:pg_name /actions
+pg stop -n name | POST  -d {"action": "stop"}  /pgs/:pg_name /actions
+pg promote -n name | POST  -d {"action": "promote"}  /pgs/:pg_name /actions
+pg list | GET /pgs
+pg show -n name | GET /pgs/:pg_name 
+
+- /snapshots **TODO**
+
+CLI  | REST API
+------------- | -------------
+
--- a/docs/rfcs/006-laptop-cli-v2-CLI.md
+++ b/docs/rfcs/006-laptop-cli-v2-CLI.md
@@ -0,0 +1,64 @@
+Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog".
+
+# CLI v2 (after chatting with Carl)
+
+Zenith introduces the notion of a repository.
+
+```bash
+zenith init
+zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory
+```
+
+Once you have a cluster catalog you can explore it
+
+```bash
+zenith log -- returns a list of commits
+zenith status -- returns if there are changes in the catalog that can be committed
+zenith commit -- commits the changes and generates a new commit hash
+zenith branch experimental <hash> -- creates a branch called experimental based on a given commit hash
+```
+
+To make changes in the catalog you need to run compute nodes
+
+```bash
+-- here is how you start a compute node
+zenith start /home/pipedpiper/northwind:main -- starts a compute instance
+zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
+-- you can start a compute node against any hash or branch
+zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
+-- you can start a compute node against any hash or branch
+zenith start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
+
+-- After running some DML you can run 
+-- zenith status and see how there are two WAL streams one on top of 
+-- the main branch
+zenith status 
+-- and another on top of the experimental branch
+zenith status -b experimental
+
+-- you can commit each branch separately
+zenith commit main
+-- or
+zenith commit -c /home/pipedpiper/northwind:experimental
+```
+
+Starting compute instances against cloud environments
+
+```bash
+-- you can start a compute instance against the cloud environment
+-- in this case all of the changes will be streamed into the cloud
+zenith start https://zenith:tech/pipedpiper/northwind:main
+zenith start https://zenith:tech/pipedpiper/northwind:main
+zenith status -c https://zenith:tech/pipedpiper/northwind:main
+zenith commit -c https://zenith:tech/pipedpiper/northwind:main
+zenith branch -c https://zenith:tech/pipedpiper/northwind:<hash> experimental
+```
+
+Pushing data into the cloud
+
+```bash
+-- pull all the commits from the cloud
+zenith pull
+-- push all the commits to the cloud
+zenith push
+```
--- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md
+++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md
@@ -0,0 +1,140 @@
+# Repository format
+
+A Zenith repository is similar to a traditional PostgreSQL backup
+archive, like a WAL-G bucket or pgbarman backup catalogue. It holds
+multiple versions of a PostgreSQL database cluster.
+
+The distinguishing feature is that you can launch a Zenith Postgres
+server directly against a branch in the repository, without having to
+"restore" it first. Also, Zenith manages the storage automatically,
+there is no separation between full and incremental backups nor WAL
+archive. Zenith relies heavily on the WAL, and uses concepts similar
+to incremental backups and WAL archiving internally, but it is hidden
+from the user.
+
+## Directory structure, version 1
+
+This first version is pretty straightforward but not very
+efficient. Just something to get us started.
+
+The repository directory looks like this:
+
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
+    
+    .zenith/refs/branches/mybranch
+    .zenith/refs/tags/foo
+    .zenith/refs/tags/bar
+    
+    .zenith/datadirs/<timeline uuid>
+
+### Timelines
+
+A timeline is similar to PostgreSQL's timeline, but is identified by a
+UUID instead of a 32-bit timeline Id.  For user convenience, it can be
+given a name that refers to the UUID (called a branch).
+
+All WAL is generated on a timeline. You can launch a read-only node
+against a tag or arbitrary LSN on a timeline, but in order to write,
+you need to create a timeline.
+
+Each timeline is stored in a directory under .zenith/timelines. It
+consists of a WAL archive, containing all the WAL in the standard
+PostgreSQL format, under the wal/ subdirectory.
+
+The 'snapshots/' subdirectory contains "base backups" of the data
+directory at different LSNs. Each snapshot is simply a copy of the
+Postgres data directory.
+
+When a new timeline is forked from a previous timeline, the ancestor
+timeline's UUID is stored in the 'history' file.
+
+### Refs
+
+There are two kinds of named objects in the repository: branches and
+tags.  A branch is a human-friendly name for a timeline UUID, and a
+tag is a human-friendly name for a specific LSN on a timeline
+(timeline UUID + LSN).  Like in git, these are just for user
+convenience; you can also use timeline UUIDs and LSNs directly.
+
+Refs do have one additional purpose though: naming a timeline or LSN
+prevents it from being automatically garbage collected.
+
+The refs directory contains a small text file for each tag/branch. It
+contains the UUID of the timeline (and LSN, for tags).
+
+### Datadirs
+
+.zenith/datadirs contains PostgreSQL data directories. You can launch
+a Postgres instance on one of them with:
+
+```
+  postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
+```
+
+All the actual data is kept in the timeline directories, under
+.zenith/timelines. The data directories are only needed for active
+PostgreSQL instances. After an instance is stopped, the data directory
+can be safely removed. "zenith start" will recreate it quickly from
+the data in .zenith/timelines, if it's missing.
+
+## Version 2
+
+The format described above isn't very different from a traditional
+daily base backup + WAL archive configuration. The main difference is
+the nicer naming of branches and tags.
+
+That's not very efficient. For performance, we need something like
+incremental backups that don't require making a full copy of all
+data. So only store modified files or pages. And instead of having to
+replay all WAL from the last snapshot, "slice" the WAL into
+per-relation WAL files and only recover what's needed when a table is
+accessed.
+
+In version 2, the file format in the "snapshots" subdirectory gets
+more advanced. The exact format is TODO. But it should support:
+- storing WAL records of individual relations/pages
+- storing a delta from an older snapshot
+- compression
+
+
+## Operations
+
+### Garbage collection
+
+When you run "zenith gc", old timelines that are no longer needed are
+removed. That involves collecting the list of "unreachable" objects,
+starting from the named branches and tags.
+
+Also, if enough WAL has been generated on a timeline since last
+snapshot, a new snapshot or delta is created.
+
+### zenith push/pull
+
+Compare the tags and branches on both servers, and copy missing ones.
+For each branch, compare the timeline it points to in both servers. If
+one is behind the other, copy the missing parts.
+
+FIXME: how do you prevent confusion if you have two clones of the same
+repository, launch an instance on the same branch in both clones, and
+later try to push/pull between them? Perhaps create a new timeline
+every time you start up an instance? Then you would detect that the
+timelines have diverged. That would match with the "epoch" concept
+that we have in the WAL safekeeper.
+
+### zenith checkout/commit
+
+In this format, there is no concept of a "working tree", and hence no
+concept of checking out or committing. All modifications are done on
+a branch or a timeline. As soon as you launch a server, the changes are
+appended to the timeline.
+
+You can easily fork off a temporary timeline to emulate a "working tree".
+You can later remove it and have it garbage collected, or to "commit",
+re-point the branch to the new timeline.
+
+If we want to have a worktree and "zenith checkout/commit" concept, we can
+emulate that with a temporary timeline. Create the temporary timeline at
+"zenith checkout", and have "zenith commit" modify the branch to point to
+the new timeline.
--- a/docs/rfcs/007-serverless-on-laptop.md
+++ b/docs/rfcs/007-serverless-on-laptop.md
@@ -0,0 +1,93 @@
+How it works now
+----------------
+
+1. Create repository, start page server on it
+
+```
+$ zenith init
+...
+created main branch
+new zenith repository was created in .zenith
+
+$ zenith pageserver start
+Starting pageserver at '127.0.0.1:64000' in .zenith
+Page server started
+```
+
+2. Create a branch, and start a Postgres instance on it
+
+```
+$ zenith branch heikki main
+branching at end of WAL: 0/15ECF68
+
+$ zenith pg create heikki
+Initializing Postgres on timeline 76cf9279915be7797095241638e64644...
+Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432
+
+$ zenith pg start pg1
+Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki'
+waiting for server to start.... done
+server started
+```
+
+
+3. Connect to it and run queries
+
+```
+$ psql "dbname=postgres port=55432"
+psql (14devel)
+Type "help" for help.
+
+postgres=# 
+```
+
+
+Proposal: Serverless on your Laptop
+-----------------------------------
+
+We've been talking about doing the "pg create" step automatically at
+"pg start", to eliminate that step. What if we go further, go
+serverless on your laptop, so that the workflow becomes just:
+
+1. Create repository, start page server on it (same as before)
+
+```
+$ zenith init
+...
+created main branch
+new zenith repository was created in .zenith
+
+$ zenith pageserver start
+Starting pageserver at '127.0.0.1:64000' in .zenith
+Page server started
+```
+
+2. Create branch
+
+```
+$ zenith branch heikki main
+branching at end of WAL: 0/15ECF68
+```
+
+3. Connect to it:
+
+```
+$ psql "dbname=postgres port=5432 branch=heikki"
+psql (14devel)
+Type "help" for help.
+
+postgres=# 
+```
+
+
+The trick behind the scenes is that when you launch the page server,
+it starts to listen on port 5432. When you connect to it with psql, it
+looks at the 'branch' parameter that you passed in the connection
+string. It automatically performs the "pg create" and "pg start" steps
+for that branch, and then forwards the connection to the Postgres
+instance that it launched. After you disconnect, if there are no more
+active connections to the server running on the branch, it can
+automatically shut it down again.
+
+This is how serverless would work in the cloud. We can do it on your
+laptop, too.
--- a/docs/rfcs/008-push-pull.md
+++ b/docs/rfcs/008-push-pull.md
@@ -0,0 +1,66 @@
+# Push and pull between pageservers
+
+Here is a proposal about implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3 but that would depend on the exact storage format so we don't touch that in this proposal.
+
+## Origin management
+
+The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that).
+
+```
+zenith origin add <name> <connection_uri>
+zenith origin list
+zenith origin remove <name>
+```
+
+The connection URI is a string of the form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport.
+
+Behind the scenes, these commands may update a toml file inside the .zenith directory.
+
+## Push
+
+### Pushing branch
+
+```
+zenith push mybranch cloudserver # push to eponymous branch in cloudserver
+zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
+```
+
+Exact mechanics would be slightly different in the following situations:
+
+1) Destination branch does not exist.
+
+    That is the simplest scenario. We can just create an empty branch (or timeline in internal terminology) and transfer all the pages/records that we have in our timeline. Right now each timeline is quite independent of other timelines so I suggest skipping any checks that there is a common ancestor and just fill it with data. Later when CoW timelines will land to the pageserver we may add that check and decide whether this timeline belongs to this pageserver repository or not [*].
+
+    The exact mechanics may be the following:
+
+    * CLI asks local pageserver to perform push and hands over connection uri: `perform_push <branch_name> <uri>`.
+    * local pageserver connects to the remote pageserver and runs `branch_push <branch_name> <timeline_id>`
+        Handler for branch_push would create destination timeline and switch connection to copyboth mode.
+    * Sending pageserver may start iterator on that timeline and send all the records as copy messages.
+
+2) Destination branch exists and latest_valid_lsn is less than ours.
+
+    In this case, we need to send missing records. To do that we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send ones that are newer than remote LSN. Later we probably should add a sparse bitmap that would track changed pages to avoid full scan.
+
+3) Destination branch exists and latest_valid_lsn is bigger than ours.
+
+    In this case, we can't push to that branch. We can only pull.
+
+### Pulling branch
+
+Here we need to handle the same three cases, but also keep in mind that local pageserver can be behind NAT and we can't trivially re-use pushing by asking remote to 'perform_push' to our address. So we would need a new set of commands:
+
+* CLI calls `perform_pull <branch_name> <uri>` on local pageserver.
+* local pageserver calls `branch_pull <branch_name> <timeline_id>` on remote pageserver.
+* remote pageserver sends records in our direction
+
+But despite the different set of commands, the code that performs iteration over records and the receiving code that inserts those records can be the same for both pull and push.
+
+
+
+[*] It looks to me that there are two different possible approaches to handling unrelated timelines:
+
+1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not.
+2) Transparently create and manage several repositories in one pageserver.
+
+But that is the topic for a separate RFC/discussion.
--- a/docs/rfcs/009-snapshot-first-storage-cli.md
+++ b/docs/rfcs/009-snapshot-first-storage-cli.md
@@ -0,0 +1,56 @@
+While working on export/import commands, I understood that they fit really well into "snapshot-first design".
+
+We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files.
+
+Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith.
+
+So here is an attempt to design a consistent CLI for different usage scenarios:
+
+#### 1. Start empty pageserver.
+That is what we have now.
+Init empty pageserver using `initdb` in temporary directory.
+
+`--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines object storage type, all other parameters are passed via env variables. Inspired by WAL-G style naming : https://wal-g.readthedocs.io/STORAGES/.
+
+Save`storage_dest` and other parameters in config. 
+Push snapshots to `storage_dest` in background.
+
+```
+zenith init --storage_dest=S3_PREFIX
+zenith start
+```
+
+#### 2. Restart pageserver (manually or crash-recovery).
+Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`. 
+Push snapshots to `storage_dest` in background.
+
+```
+zenith start
+```
+
+#### 3. Import.
+Start pageserver from existing snapshot.
+Path to snapshot provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
+Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time operation.
+Save`storage_dest` parameters in config. 
+Push snapshots to `storage_dest` in background.
+```
+//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
+zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
+zenith start
+```
+How to pass credentials needed for `snapshot_path`?
+
+#### 4. Export.
+Manually push snapshot to `snapshot_path` which differs from `storage_dest` 
+Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
+```
+zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
+```
+
+#### Notes and questions
+- walkeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
+- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
+- We can think of better names for all options.
+- Export to plain postgres format will be useless, if we are not 100% compatible on page level.
+I can recall at least one such difference - PD_WAL_LOGGED flag in pages.
--- a/docs/rfcs/009-snapshot-first-storage-pitr.md
+++ b/docs/rfcs/009-snapshot-first-storage-pitr.md
@@ -0,0 +1,227 @@
+# Preface
+
+GetPage@LSN can be called with older LSNs, and the page server needs
+to be able to reconstruct older page versions. That's needed for
+having read-only replicas that lag behind the primary, or that are
+"anchored" at an older LSN, and internally in the page server when you
+branch at an older point in time. How do you do that?
+
+For now, I'm not considering incremental snapshots at all. I don't
+think that changes things. So whenever you create a snapshot or a
+snapshot file, it contains an image of all the pages, there is no need
+to look at an older snapshot file.
+
+Also, I'm imagining that this works on a per-relation basis, so that
+each snapshot file contains data for one relation. A "relation" is a
+fuzzy concept - it could actually be one 1 GB relation segment. Or it
+could include all the different "forks" of a relation, or you could
+treat each fork as a separate relation for storage purpose. And once
+the "non-relational" work is finished, a "relation" could
+actually mean some other versioned object kept in the PostgreSQL data
+directory. Let's ignore that for now.
+
+# Eric's RFC:
+
+Every now and then, you create a "snapshot". It means that you create
+a new snapshot file for each relation that was modified after the last
+snapshot, and write out the contents of the relation as it is/was at the
+snapshot LSN. Write-ahead log is stored separately in S3 by the WAL
+safekeeping service, in the original PostgreSQL WAL file format.
+
+    SNAPSHOT @100       WAL
+       .                 |
+       .                 |
+       .                 |
+       .                 |
+    SNAPSHOT @200        |
+       .                 |
+       .                 |
+       .                 |
+       .                 |
+    SNAPSHOT @300        |
+       .                 |
+       .                 V
+    IN-MEMORY @400
+
+If a GetPage@LSN request comes from the primary, you return the latest
+page from the in-memory layer. If there is no trace of the page in
+memory, it means that it hasn't been modified since the last snapshot,
+so you return the page from the latest snapshot, at LSN 300 in the
+above example.
+
+PITR is implemented using the original WAL files:
+
+If a GetPage@LSN request comes from a read replica with LSN 250, you
+read the image of the page from the snapshot at LSN 200, and you also
+scan the WAL between 200 and 250, and apply all WAL records for the
+requested page, to reconstruct it at LSN 250.
+
+Scanning the WAL naively for every GetPage@LSN request would be
+expensive, so in practice you'd construct an in-memory data structure
+of all the WAL between 200 and 250 once that allows quickly looking up
+records for a given page.
+
+## Problems/questions
+
+I think you'll need to store the list of snapshot LSNs on each
+timeline somewhere.
+
+If the latest snapshot of a relation is at LSN 100, and you request a
+page at LSN 1000000, how do you know if there are some modifications
+to it between 100 and 1000000 that you need to replay? You can scan
+all the WAL between 100 and 1000000, but that would be expensive.
+
+You can skip that, if you know that a snapshot was taken e.g. at LSN
+999900. Then you know that the fact that there is no snapshot file at
+999900 means that the relation hasn't been modified between
+100-999900.  Then you only need to scan the WAL between 999900 and
+1000000. However, there is no trace of a snapshot happening at LSN
+999900 in the snapshot file for this relation, so you need to get
+that information from somewhere else.
+
+Where do you get that information from? Perhaps you can scan all the
+other relations, and if you see a snapshot file for *any* relation at
+LSN 999900, you know that if there were modifications to this
+relation, there would be a newer snapshot file for it, too. In other
+words, the list of snapshots that have been taken can be constructed
+by scanning all relations and computing the union of all snapshot LSNs
+that you see for any relation. But that's expensive so at least you
+should keep that in memory, after computing it once. Also, if you rely
+on that, it's not possible to have snapshots at different intervals
+for different files. That seems limiting.
+
+Another option is to explicitly store a list of snapshot LSNs in a
+separate metadata file.
+
+
+# Current implementation in the 'layered_repo' branch:
+
+We store snapshot files like in the RFC, but each snapshot file also
+contains all the WAL in the range of LSNs, so that you don't need to
+fetch the WAL separately from S3. So you have "layers" like this:
+
+    SNAPSHOT+WAL 100-200
+          |
+          |
+          |
+          |
+    SNAPSHOT+WAL 200-300
+          |
+          |
+          |
+          |
+    IN-MEMORY 300-
+
+Each "snapshot+WAL" is a file that contains a snapshot - i.e. full
+copy of each page in the relation, at the *start* LSN. In addition to
+that, it contains all the WAL applicable to the relation from the
+start LSN to the end LSN. With that, you can reconstruct any page
+version in the range that the file covers.
+
+
+## Problems/questions
+
+I can see one potential performance issue here, compared to the RFC.
+Let's focus on a single relation for now. Imagine that you start from
+an empty relation, and you receive WAL from 100 to 200, containing
+a bunch of inserts and updates to the relation. You now have all that
+WAL in memory:
+
+    memory:  WAL from 100-200
+
+We decide that it's time to materialize that to a snapshot file on
+disk.  We materialize full image of the relation as it was at LSN 100
+to the snapshot file, and include all of the WAL. Since the relation
+was initially empty, the "image" at the beginning of the range is empty
+too.
+
+So now you have one file on disk:
+
+    SNAPSHOT+WAL 100-200
+
+It contains a full image of the relation at LSN 100 and all WAL
+between 100-200. (It's actually stored as a serialized BTreeMap of
+page versions, with the page images and WAL records all stored
+together in the same BtreeMap. But for this story, that's not
+important.)
+
+We now receive more WAL updating the relation, up to LSN 300. We
+decide it's time to materialize a new snapshot file, and we now have
+two files:
+
+    SNAPSHOT+WAL 100-200
+    SNAPSHOT+WAL 200-300
+
+Note that the latest "full snapshot" that we store on disk always lags
+behind by one snapshot cycle. The first file contains a full image of
+the relation at LSN 100, the second at LSN 200. When we have received
+WAL up to LSN 300, we write a materialized image at LSN 200. That
+seems a bit silly. In the design per your RFC, you would write
+snapshots at LSNs 200 and 300, instead. That seems better.
+
+
+
+# Third option (not implemented yet)
+
+Store snapshot files like in the RFC, but also store per-relation
+WAL files that contain WAL in a range of LSNs for that relation.
+
+    SNAPSHOT @100   WAL 100-200
+       .                 |
+       .                 |
+       .                 |
+       .                 |
+    SNAPSHOT @200   WAL 200-300
+       .                 |
+       .                 |
+       .                 |
+       .                 |
+    SNAPSHOT @300
+       .
+       .
+    IN-MEMORY 300-
+
+
+This could be the best of both worlds. The snapshot files would be
+independent of the PostgreSQL WAL format. When it's time to write
+snapshot file @300, you write a full image of the relation at LSN 300,
+and you write the WAL that you had accumulated between 200 and 300 to
+a separate file. That way, you don't "lag behind" for one snapshot
+cycle like in the current implementation. But you still have the WAL
+for a particular relation readily available alongside the snapshot
+files, and you don't need to track what snapshot LSNs exist
+separately.
+
+(If we wanted to minimize the number of files, you could include the
+snapshot @300 and the WAL between 200 and 300 in the same file, but I
+feel it's probably better to keep them separate)
+
+
+
+# Further thoughts
+
+There's no fundamental reason why the LSNs of the snapshot files and the
+ranges of the WAL files would need to line up. So this would be possible
+too:
+
+    SNAPSHOT @100   WAL 100-150
+       .                 |
+       .                 |
+       .            WAL 150-250
+       .                 |
+    SNAPSHOT @200        |
+       .                 |
+       .            WAL 250-400
+       .                 |
+       .                 |
+    SNAPSHOT @300        |
+       .                 |
+       .                 |
+    IN-MEMORY 300-
+
+I'm not sure what the benefit of this would be. You could materialize
+additional snapshot files in the middle of a range covered by a WAL
+file, maybe? Might be useful to speed up access when you create a new
+branch in the middle of an LSN range or if there's some other reason
+to believe that a particular LSN is "interesting" and there will be
+a lot of requests using it.
--- a/docs/rfcs/009-snapshot-first-storage.md
+++ b/docs/rfcs/009-snapshot-first-storage.md
@@ -0,0 +1,148 @@
+# Snapshot-first storage architecture
+
+Goals:
+- Long-term storage of database pages.
+- Easy snapshots; simple snapshot and branch management.
+- Allow cloud-based snapshot/branch management.
+- Allow cloud-centric branching; decouple branch state from running pageserver.
+- Allow customer ownership of data via s3 permissions.
+- Provide same or better performance for typical workloads, vs plain postgres.
+
+Non-goals:
+- Service database reads from s3 (reads should be serviced from the pageserver cache).
+- Keep every version of every page / Implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot).
+
+## Principle of operation
+
+The database “lives in s3”. This means that all of the long term page storage is in s3, and the “live database”-- the version that lives in the pageserver-- is a set of “dirty pages” that haven’t yet been written back to s3.
+
+In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere.
+
+The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not.
+
+It’s expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn’t seem very useful right now.
+
+Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling.
+
+Objects in s3 are immutable snapshots, never to be modified once written (only deleted).
+
+Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It’s expected that most snapshots are incremental to keep storage costs low.
+
+It’s expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between 30 seconds and 15 minutes, depending on cost/performance balance.
+
+No-longer needed snapshots can be “squashed”-- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots.
+
+A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica).
+
+WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.)
+
+## Pageserver operation
+
+To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed.
+
+To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down.
+
+It’s assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch.
+
+The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot.
+
+The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. As before, the page at LSN 300 can be discarded when the LSN 400 page is written (regardless of whether the LSN 200 snapshot has completed.)
+
+If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space using an awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches.
+
+The pageserver may store dirty pages to memory or to local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions.
+
+The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow.
+
+The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. write-heavy workload causes long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that’s still not good enough, we could look at other options (e.g. redundant pageserver or an EBS page journal).
+
+A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot.
+
+## Cloud snapshot manager operation
+
+Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent):
+- Create/delete/clone/rename a database
+- Create a new branch (possibly from a historical snapshot)
+- Start/stop the pageserver/safekeeper on a branch
+- List databases/branches/snapshots that are visible to this user account
+
+Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries.
+
+This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries.
+
+## Snapshot names, deletion and concurrency
+
+There may be race conditions between operations-- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It’s assumed that concurrent delete won’t disrupt a read in flight, but it may be possible for some process to read B’s header, and then discover on the next operation that B is gone.
+
+For this reason, any attempted read should attempt a fallback procedure (list objects; search list for an equivalent object) if an attempted read fails.  This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values.  `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded.
+
+## Branching
+
+A user may request a new branch from the cloud user interface. There is a sequence of things that needs to happen:
+- If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch.
+- Cloud software should create the new branch, by generating a new (random) unique branch identifier, and creating a placeholder snapshot object.
+    - The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages.
+    - The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data.
+- If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice.
+
+Some of these steps could be combined into the pageserver, but that process would not be possible under all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same.
+
+## Long-term file format
+
+Snapshot files (and any other object stored in s3) must be readable by future software versions.
+
+It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management.
+
+Files should contain the following metadata, in addition to the set of pages:
+- The version of the file format.
+- A unique identifier for this branch (should be worldwide-unique and unchanging).
+- Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging).
+- For incremental snapshots, the identifier of the predecessor snapshot. For new branches, this will be the parent snapshot (the point at which history diverges).
+- The location of the predecessor branch snapshot, if different from this branch’s location.
+- The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0.
+- The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle).
+- A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity.
+
+A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database.
+
+Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or treat them as debugging information only.
+
+## S3 semantics, and other kinds of storage
+
+For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don’t edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket.
+
+Some users may want to use a local or network filesystem in place of s3. This isn’t prohibited but it’s not a priority, either.
+
+Alternate implementations of s3 should be supported, including Google Cloud Storage.
+
+Azure Blob Storage should be supported. We assume (without evidence) that it’s semantically equivalent to s3 for this purpose.
+
+The properties of s3 that we depend on are:
+- list objects
+- streaming read of entire object
+- read byte range from object
+- streaming write new object (may use multipart upload for better reliability)
+- delete object (that should not disrupt an already-started read).
+
+Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, corrupt, or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully.
+
+## Notes
+
+Possible simplifications, for a first draft implementation:
+- Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later.
+- Don’t worry about the details of the squashing process yet.
+- Don’t implement cloud metadata service; try to make everything work using basic s3 list-objects and reads.
+- Don’t implement rename, delete at first.
+- Don’t implement public/private, just use s3 permissions.
+- Don’t worry about sharing history yet-- each user has their own bucket and a full copy of all data.
+- Don’t worry about history that spans multiple buckets.
+- Don’t worry about s3 regions.
+- Don’t support user-writeable s3 buckets; users get only read-only access at most.
+
+Open questions:
+- How important is point-in-time recovery? When should we add this? How should it work?
+- Should snapshot files use compression?
+- Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they’re created.
+- Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy?
+- When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver?
+- How can pageserver software upgrade be done with minimal downtime?
--- a/docs/rfcs/010-storage_details.md
+++ b/docs/rfcs/010-storage_details.md
@@ -0,0 +1,144 @@
+# Storage details
+
+Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details.
+
+## Overview
+
+![storage](images/storage.jpeg)
+
+### MemStore
+
+MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of PageIndex that holds references to WAL records or pages, PageStore that stores recently materialized pages, and WalStore that stores recently received WAL.
+
+### PageIndex
+
+PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean some information that is needed to access that data, e.g. file_id and offset):
+
+* PageStoreRef -- page offset in the PageStore
+* LocalStoreRef -- snapshot_id and page offset inside of that snapshot
+* WalStoreRef -- offset (and size optionally) of WalRecord in WalStore
+
+PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots we actually can avoid storing references to the full snapshot and calculate page offsets based on relation sizes metadata in the full snapshot (assuming that full snapshot stores pages sorted by page number). However, I would suggest embracing page compression from the beginning and treat all pages as variable-sized.
+
+We assume that PageIndex is a few orders of magnitude smaller than the addressed data, hence it should fit in memory. We also don't care about crash tolerance, as we can rebuild it from snapshot metadata and WAL records from WalStore and/or Safekeeper.
+
+### WalStore
+
+WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. On top of that, we can add some fixed-size cache that would keep some amount of segments in memory.
+
+For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would be also responsible for the recent WAL pushdown to S3 (and Safekeeper may just delete WAL that was confirmed as S3-durable by the page server).
+
+### PageStore
+
+PageStore is storage for recently materialized pages (or, in other words, a cache of getPage results). It can also be implemented as a file-based queue with some memory cache on top of it.
+
+There are a few possible options for PageStore:
+
+a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation.
+
+b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During page overwrite, we would also need to change PageStoreRef back to WalStoreRef in PageIndex.
+
+I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on single page version rule) and cut off that whole set when snapshot creation is complete.
+
+With option b) we can also treat PageStore as an incomplete incremental snapshot.
+
+### LocalStore
+
+LocalStore keeps the latest full snapshot and set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold.
+
+## Granularity
+
+By granularity, I mean a set of pages that goes into a certain full snapshot. The following things should be taken into account:
+
+* can we shard big databases between page servers?
+* how much time will we spend applying WAL to access certain pages with older LSN's?
+* how many files do we create for a single database?
+
+I can think of the following options here:
+
+1. whole database goes to one full snapshot.
+    * +: we never create a lot of files for one database
+    * +: the approach is quite straightforward, moving data around is simple
+    * -: can not be sharded
+    * -: long recovery -- we always need to recover the whole database
+2. table segment is the unit of snapshotting
+    * +: straightforward for sharding
+    * +: individual segment can be quickly recovered with sliced WAL
+    * -: full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend an eternity in directory scans, and the amount of metadata for sharding can also be quite big.
+3. range-partitioned snapshots -- snapshot includes all pages between [BuffTagLo, BuffTagHi] mixing different relations, databases, and potentially clusters (albeit from one tenant only). When full snapshot outgrows a certain limit (could be also a few gigabytes) we split the snapshot in two during the next full snapshot write. That approach would also require pages sorted by BuffTag inside our snapshots.
+    * +: addresses all mentioned issues
+    * -: harder to implement
+
+I think it is okay to start with table segment granularity and just check how we will perform in cases of lots of small tables, and check whether there is any way besides option 3 to deal with it.
+
+Both PageStore and WalStore should be "sharded" by this granularity level.
+
+## Security
+
+We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their S3 buckets credentials.
+
+Also, S3 backups are usually encrypted by per-tenant private keys. I'm not sure in what threat model such encryption would improve something (taking into account per-tenant IAM keys), but it seems that everybody is doing that (both AMZN and YNDX). Most likely that comes as a requirement about "cold backups" by some certification procedure.
+
+## Dynamics
+
+### WAL stream handling
+
+When a new WAL record is received we need to parse BufferTags in that record and insert them in PageIndex with WalStoreRef as a value.
+
+### getPage queries
+
+Look up the page in PageIndex. If the value is a page reference then just respond with that page. If the referenced value is a WAL record then find the most recent page with the same BuffTag (that is why we need ordering in PageIndex); recover it by applying WAL records; save it in PageStore; respond with that page.
+
+### Starting page server without local data
+
+* build set of latest full snapshots and incremental snapshots on top of them
+* load all their metadata into PageIndex
+* Safekeeper should connect soon and we can ask for a WAL stream starting from the latest incremental snapshot
+* for databases that are connected to us through the Safekeeper we can start loading the set of the latest snapshots or we can do that lazily based on getPage request (I'd better avoid doing that lazily for now without some access stats from the previous run and just transfer all data for active database from S3 to LocalStore).
+
+### Starting page server with local data (aka restart or reboot)
+
+* check that local snapshot files are consistent with S3
+
+### Snapshot creation
+
+Track size of future snapshots based on info in MemStore and when it exceeds some threshold (taking into account our granularity level) create a new incremental snapshot. Always emit incremental snapshots from MemStore.
+
+To create a new snapshot we need to walk through WalStore to get the list of all changed pages, sort it, and get the latest versions of those pages from PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream to avoid parsing WAL during snapshot creation.
+
+Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) by merging the previous full snapshot with several incremental snapshots.
+
+### S3 pushdown
+
+When we have several full snapshots GC can push the old one with its increments to S3.
+
+### Branch creation
+
+Create a new timeline and replay sliced WAL up to a requested point. When the page is not in PageIndex ask the parent timeline about a page. Relation sizes are tricky.
+
+## File formats
+
+As far as I understand Bookfile/Aversion addresses versioning and serialization parts.
+
+As for exact data that should go to snapshots I think it is the following for each snapshot:
+
+* format version number
+* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end of the file, etc.) -- it is up to a reader to decide what to do if some keys are missing or some unknown keys are present. If we add something backward compatible to the file we can keep the version number.
+* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile
+* array of [(BuffTag, LSN), corresponding offset in file] for the WAL records
+* pages, one by one
+* WAL records, one by one
+
+It is also important to be able to load metadata quickly since it would be one of the main factors impacting the time of page server start. E.g. if we would store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))).
+
+1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when relation_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small).
+2) It makes sense to keep ToC at the beginning of the file to avoid extra seeks to locate it. Doesn't matter too much with the local files but matters on S3 -- if we are accessing a lot of ~1Gb files with the size of metadata ~ 1Mb then the time to transfer this metadata would be comparable with access latency itself (which is about a half of a second). So by slurping metadata with one read of file header instead of N reads we can improve the speed of page server start by this N factor.
+
+I think both of those optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines.
+
+Also, there were some discussions about how to embed WAL in incremental snapshots. So far the following ideas were mentioned:
+1. snapshot lsn=200, includes WAL in range 200-300
+2. snapshot lsn=200, includes WAL in range 100-200
+3. data snapshots are separated from WAL snapshots
+
+Both options 2 and 3 look good. I'm inclined towards option 3 as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep data snapshot until the next full snapshot, but we may push WAL snapshot to S3 just when they appeared if there are no replicas).
--- a/docs/rfcs/011-retention-policy.md
+++ b/docs/rfcs/011-retention-policy.md
@@ -0,0 +1,91 @@
+# User-visible timeline history
+
+The user can specify a retention policy. The retention policy is
+presented to the user as a PITR period and snapshots. The PITR period
+is the amount of recent history that needs to be retained, as minutes,
+hours, or days. Within that period, you can create a branch or
+snapshot at any point in time, open a compute node, and start running
+queries. Internally, a PITR period is represented as a range of LSNs.
+
+The user can also create snapshots. A snapshot is a point in time,
+internally represented by an LSN. The user gives the snapshot a name.
+
+The user can also specify an interval, at which the system creates
+snapshots automatically. For example, create a snapshot every night at
+2 AM. After some user-specified time, old automatically created
+snapshots are removed.
+
+                     Snapshot       Snapshot
+         PITR        "Monday"       "Tuesday"        PITR
+    ----######----------+-------------+-------------######>
+
+If there are multiple branches, you can specify different policies for
+different branches.
+
+The PITR period and user-visible snapshots together define the
+retention policy.
+
+NOTE: As presented here, this is probably overly flexible. In reality,
+we want to keep the user interface simple. Only allow a PITR period at
+the tip of a branch, for example. But that doesn't make much
+difference to the internals.
+
+
+# Retention policy behind the scenes
+
+The retention policy consists of points (for snapshots) and ranges
+(for PITR periods).
+
+The system must be able to reconstruct any page within the retention
+policy. Other page versions can be garbage collected away. We have a
+lot of flexibility on when to perform the garbage collection and how
+aggressive it is.
+
+
+# Base images and WAL slices
+
+The page versions are stored in two kinds of files: base images and
+WAL slices. A base image contains a dump of all the pages of one
+relation at a specific LSN. A WAL slice contains all the WAL in an LSN
+range.
+
+
+    |
+    |
+    |
+    | --Base img @100   +
+    |                   |
+    |                   | WAL slice
+    |                   | 100-200
+    |                   |
+    | --Base img @200   +
+    |                   |
+    |                   | WAL slice
+    |                   | 200-300
+    |                   |
+    |                   +
+    |
+    V
+
+
+To recover a page e.g. at LSN 150, you need the base image at LSN 100,
+and the WAL slice 100-200.
+
+All of this works at a per-relation or per-relation-segment basis. If
+a relation is updated very frequently, we create base images and WAL
+slices for it more quickly. For a relation that's updated
+infrequently, we hold the recent WAL for that relation longer, and
+only write it out when we need to release the disk space occupied by
+the original WAL. (We need a backstop like that, because until all the
+WAL/base images have been durably copied to S3, we must keep the
+original WAL for that period somewhere, in the WAL service or in S3.)
+
+
+# Branching
+
+Internally, branch points are also "retention points", in addition to
+the user-visible snapshots. If a branch has been forked off at LSN
+100, we need to be able to reconstruct any page on the parent branch
+at that LSN, because it is needed by the child branch. If a page is
+modified in the child, we don't need to keep that in the parent
+anymore, though.
--- a/docs/rfcs/012-background-tasks.md
+++ b/docs/rfcs/012-background-tasks.md
@@ -0,0 +1,38 @@
+# Eviction
+
+ Write out in-memory layer to disk, into a delta layer.
+
+- To release memory
+- To make it possible to advance disk_consistent_lsn and allow the WAL
+  service to release some WAL.
+
+- Triggered if we are short on memory
+- Or if the oldest in-memory layer is so old that it's holding back
+  the WAL service from removing old WAL
+
+# Materialization
+
+Create a new image layer of a segment, by performing WAL redo
+
+- To reduce the amount of WAL that needs to be replayed on a GetPage request.
+- To allow garbage collection of old layers
+
+- Triggered by distance to last full image of a page
+
+# Coalescing
+
+Replace N consecutive layers of a segment with one larger layer.
+
+- To reduce the number of small files that needs to be uploaded to S3
+
+
+# Bundling
+
+Zip together multiple small files belonging to different segments.
+
+- To reduce the number of small files that needs to be uploaded to S3
+
+
+# Garbage collection
+
+Remove a layer that's older than the GC horizon, and isn't needed anymore.
--- a/docs/rfcs/013-term-history.md
+++ b/docs/rfcs/013-term-history.md
@@ -0,0 +1,147 @@
+# What
+
+Currently, apart from WAL safekeeper persistently stores only two logical clock
+counter (aka term) values, sourced from the same sequence. The first is bumped
+whenever safekeeper gives vote to proposer (or acknowledges already elected one)
+and e.g. prevents electing two proposers with the same term -- it is actually
+called `term` in the code. The second, called `epoch`, reflects progress of log
+receival and this might lag behind `term`; safekeeper switches to epoch `n` when
+it has received all committed log records from all `< n` terms. This roughly
+corresponds to what is proposed in
+
+https://github.com/zenithdb/rfcs/pull/3/files
+
+
+This makes our biggest difference from Raft. In Raft, every log record is
+stamped with term in which it was generated; while we essentially store in
+`epoch` only the term of the highest record on this safekeeper -- when we know
+it -- because during recovery generally we don't, and `epoch` is bumped directly
+to the term of the proposer who performs the recovery when it is finished. It is
+not immediately obvious that this simplification is safe. I thought and I still
+think it is; model checking confirmed that. However, some details now make me
+believe it is better to keep full term switching history (which is equivalent to
+knowing term of each record).
+
+# Why
+
+Without knowing full history (list of <term, LSN> pairs) of terms it is hard to
+determine the exact divergence point, and if we don't perform truncation at that
+point safety becomes questionable. Consider the following history, with
+safekeepers A, B, C, D, E. n_m means record created by proposer in term n with
+LSN m; (t=x, e=y) means safekeeper currently has term x and epoch y.
+
+1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only
+on A.
+
+<pre>
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=1, e=1) 1.1
+D(t=1, e=1) 1.1
+E(t=1, e=1) 1.1
+</pre>
+
+2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD:
+
+<pre>
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=2, e=2) 1.1 2.2 2.3
+D(t=2, e=2) 1.1 2.2 2.3
+E(t=2, e=1) 1.1
+</pre>
+
+
+3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D:
+
+<pre>
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+</pre>
+
+
+Now, A gets back and P3 starts recovering it. How should it proceed? There are
+two options.
+
+## Don't try to find divergence point at all
+
+...start sending WAL conservatively since the horizon (1.1), and truncate
+obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is
+reached, i.e. 2.3 transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes.
+
+Then the following is possible:
+
+4) P3 moves one record 2.2 to A.
+
+<pre>
+A(t=1, e=1) 1.1 <b>2.2</b> 1.3 1.4
+B(t=1, e=1) 1.1 1.2
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+</pre>
+
+Now log of A is basically corrupted. Moreover, since ABE are all in epoch 1 and
+A's log is the longest one, they can elect P4 who will commit such log.
+
+Note that this particular history couldn't happen if we forbid to *create* new
+records in term n until majority of safekeepers switch to it. It would force CDE
+to switch to 2 before 2.2 is created, and A could never become donor while his
+log is corrupted. Generally with this additional barrier I believe the algorithm
+becomes safe, but
+ - I don't like this kind of artificial barrier;
+ - I also feel somewhat uncomfortable about even temporarily having intentionally
+   corrupted WAL;
+ - I'd still model check the idea.
+
+## Find divergence point and truncate at it
+
+Then step 4 would delete 1.3 1.4 on A, and we are ok. The question is, how do we
+do that? Without term switching history we have to resort to sending again since
+the horizon and memcmp'ing records, which is inefficient and ugly. Or we can
+maintain full history and determine truncation point by comparing 'wrong' and
+'right' histories -- much like pg_rewind does -- and perform truncation + start
+streaming right there.
+
+# Proposal
+
+- Add term history as array of <term, LSN> pairs to safekeeper controlfile.
+- Return it to proposer with VoteResponse so 1) proposer can tell it to other
+  nodes and 2) determine personal streaming starting point. However, since we
+  don't append WAL and update controlfile atomically, let's first always update
+  controlfile but send only the history of what we really have (up to highest
+  term in history where begin_lsn >= end of wal; this highest term replaces
+  current `epoch`). We also send end of wal as we do now to determine the donor.
+- Create ProposerAnnouncement message which proposer sends before starting
+  streaming. It announces proposer as elected and
+  1) Truncates wrong part of WAL on safekeeper
+     (divergence point is already calculated at proposer, but can be
+     cross-verified here).
+  2) Communicates the 'right' history of its term (taken from donor). Seems
+     better to immediately put the history in the controlfile,
+	 though safekeeper might not have full WAL for previous terms in it --
+	 this way is simpler, and we can't update WAL and controlfile atomically anyway.
+
+	 This also constitutes analogue of current epoch bump for those safekeepers
+     which don't need recovery, which is important for sync-safekeepers (bump
+     epoch without waiting records from new term).
+- After ProposerAnnouncement proposer streams WAL since calculated starting
+  point -- only what is missing.
+
+
+pros/cons:
+ (more) clear safety of WAL truncation -- we get very close to Raft
+ no unnecessary data sending (faster recovery for not-oldest-safekeepers, matters
+   only for 5+ nodes)
+ adds some observability at safekeepers
+
+- complexity, but not that much
+
+
+# Misc
+
+- During model checking I did truncation on first locally non existent or
+  different record -- analogue of 'memcmp' variant described above.
--- a/docs/rfcs/README.md
+++ b/docs/rfcs/README.md
@@ -0,0 +1,95 @@
+This directory contains Request for Comments documents, or RFCs, for
+features or concepts that have been proposed. Alternative names:
+technical design doc, ERD, one-pager
+
+To make a new proposal, create a new text file in this directory and
+open a Pull Request with it. That gives others a chance and a forum
+to comment and discuss the design.
+
+When a feature is implemented and the code changes are committed, also
+include the corresponding RFC in this directory.
+
+Some of the RFCs in this directory have been implemented in some form
+or another, while others are on the roadmap, while still others are
+just obsolete and forgotten about. So read them with a grain of salt,
+but hopefully even the ones that don't reflect reality give useful
+context information.
+
+## What
+
+We use Tech Design RFC’s to summarize what we are planning to
+implement in our system. These RFCs should be created for large or not
+obvious technical tasks, e.g. changes of the architecture or bigger
+tasks that could take over a week, changes that touch multiple
+components or their interaction. RFCs should fit into a couple of
+pages, but could be longer on occasion.
+
+## Why
+
+We’re using RFCs to enable early review and collaboration, reduce
+uncertainties, risk and save time during the implementation phase that
+follows the Tech Design RFC.
+
+Tech Design RFCs also aim to avoid bus factor and are an additional
+measure to keep more peers up to date & familiar with our design and
+architecture.
+
+This is a crucial part for ensuring collaboration across timezones and
+setting up for success a distributed team that works on complex
+topics.
+
+## Prior art
+
+- Rust: [https://github.com/rust-lang/rfcs/blob/master/0000-template.md](https://github.com/rust-lang/rfcs/blob/master/0000-template.md)
+- React.js: [https://github.com/reactjs/rfcs/blob/main/0000-template.md](https://github.com/reactjs/rfcs/blob/main/0000-template.md)
+- Google fuchsia: [https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE](https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE)
+- Apache: [https://cwiki.apache.org/confluence/display/GEODE/RFC+Template](https://cwiki.apache.org/confluence/display/GEODE/RFC+Template) / [https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process](https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process)
+
+## How
+
+RFC lifecycle:
+
+- Should be submitted in a pull request with the full RFC text in a committed markdown file and copy of the Summary and Motivation sections also included in the PR body.
+- RFC should be published for review before most of the actual code is written. This isn’t a strict rule, don’t hesitate to experiment and build a POC in parallel with writing an RFC.
+- Add labels to the PR in the same manner as you do Issues. Example TBD
+- Request the review from your peers. Reviewing the RFCs from your peers is a priority, same as reviewing the actual code.
+- The Tech Design RFC should evolve based on the feedback received and further during the development phase if problems are discovered with the taken approach
+- RFCs stop evolving once the consensus is found or the proposal is implemented and merged.
+- RFCs are not intended as a documentation that’s kept up to date **after** the implementation is finished. Do not update the Tech Design RFC when merged functionality evolves later on. In such situation a new RFC may be appropriate.
+
+### RFC template
+
+Note, a lot of the sections are marked as ‘if relevant’. They are included into the template as a reminder and to help inspiration.
+
+```
+# Name
+Created on ..
+Implemented on ..
+
+## Summary
+
+## Motivation
+
+## Non Goals (if relevant)
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+## Proposed implementation
+
+### Reliability, failure modes and corner cases (if relevant)
+
+### Interaction/Sequence diagram (if relevant)
+
+### Scalability (if relevant)
+
+### Security implications (if relevant)
+
+### Unresolved questions (if relevant)
+
+## Alternative implementation (if relevant)
+
+## Pros/cons of proposed approaches (if relevant)
+
+## Definition of Done (if relevant)
+
+```
--- a/docs/rfcs/images/storage.jpeg
+++ b/docs/rfcs/images/storage.jpeg
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -87,31 +87,29 @@ so manual installation of dependencies is not recommended.
 A single virtual environment with all dependencies is described in the single `Pipfile`.

 ### Prerequisites
- Install Python 3.7 (the minimal supported version)
-    - Later version (e.g. 3.8) is ok if you don't write Python code
-    - You can install Python 3.7 separately, e.g.:
+- Install Python 3.7 (the minimal supported version) or greater.
+    - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected.
+    - If you have some trouble with other version you can resolve it by installing Python 3.7 separately, via pyenv or via system package manager e.g.:
      ```bash
      # In Ubuntu
      sudo add-apt-repository ppa:deadsnakes/ppa
      sudo apt update
      sudo apt install python3.7
      ```
- Install `pipenv`
-    - Exact version of `pipenv` is not important, you can use Debian/Ubuntu package `pipenv`.
- Install dependencies via either
-  * `pipenv --python 3.7 install --dev` if you will write Python code, or
-  * `pipenv install` if you only want to run Python scripts and don't have Python 3.7.
+- Install `poetry`
+    - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation).
+- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7, so if you have a different version some linting tools can yield different results locally vs in the CI.

-Run `pipenv shell` to activate the virtual environment.
-Alternatively, use `pipenv run` to run a single command in the venv, e.g. `pipenv run pytest`.
+Run `poetry shell` to activate the virtual environment.
+Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

 ### Obligatory checks
 We force code formatting via `yapf` and type hints via `mypy`.
 Run the following commands in the repository's root (next to `setup.cfg`):

 ```bash
-pipenv run yapf -ri .  # All code is reformatted
-pipenv run mypy .  # Ensure there are no typing errors
+poetry run yapf -ri .  # All code is reformatted
+poetry run mypy .  # Ensure there are no typing errors
 ```

 **WARNING**: do not run `mypy` from a directory other than the root of the repository.
@@ -123,17 +121,6 @@ Also consider:
 * Adding more type hints to your code to avoid `Any`.

 ### Changing dependencies
-You have to update `Pipfile.lock` if you have changed `Pipfile`:
+To add new package or change an existing one you can use `poetry add` or `poetry update` or edit `pyproject.toml` manually. Do not forget to run `poetry lock` in the latter case.

-```bash
-pipenv --python 3.7 install --dev  # Re-create venv for Python 3.7 and install recent pipenv inside
-pipenv run pipenv --version  # Should be at least 2021.5.29
-pipenv run pipenv lock  # Regenerate Pipfile.lock
-```
-
-As the minimal supported version is Python 3.7 and we use it in CI,
-you have to use a Python 3.7 environment when updating `Pipfile.lock`.
-Otherwise some back-compatibility packages will be missing.
-
-It is also important to run recent `pipenv`.
-Older versions remove markers from `Pipfile.lock`.
+More details are available in poetry's [documentation](https://python-poetry.org/docs/).
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -1,11 +1,10 @@
 [package]
 name = "pageserver"
 version = "0.1.0"
-authors = ["Stas Kelvich <stas@zenith.tech>"]
-edition = "2018"
+edition = "2021"

 [dependencies]
-bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" }
+bookfile = { git = "https://github.com/neondatabase/bookfile.git", branch="main" }
 chrono = "0.4.19"
 rand = "0.8.3"
 regex = "1.4.5"
@@ -15,24 +14,25 @@ futures = "0.3.13"
 hyper = "0.14"
 lazy_static = "1.4.0"
 log = "0.4.14"
-clap = "2.33.0"
+clap = "3.0"
 daemonize = "0.4.1"
+itertools = "0.10.3"
 tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
-postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
-tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
+postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
+tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
 tokio-stream = "0.1.8"
-routerify = "2"
 anyhow = { version = "1.0", features = ["backtrace"] }
 crc32c = "0.6.0"
 thiserror = "1.0"
-hex = { version = "0.4.3", features = ["serde"] }
 tar = "0.4.33"
 humantime = "2.1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
-toml_edit = { version = "0.12", features = ["easy"] }
+serde_with = "1.12.0"
+
+toml_edit = { version = "0.13", features = ["easy"] }
 scopeguard = "1.1.0"
 async-trait = "0.1"
 const_format = "0.2.21"
@@ -42,8 +42,8 @@ signal-hook = "0.3.10"
 url = "2"
 nix = "0.23"
 once_cell = "1.8.0"
-parking_lot = "0.11.2"
 crossbeam-utils = "0.8.5"
+fail = "0.5.0"

 rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] }
 async-compression = {version = "0.3", features = ["zstd", "tokio"]}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,8 +10,9 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{Context, Result};
+use anyhow::{anyhow, Context, Result};
 use bytes::{BufMut, BytesMut};
+use itertools::Itertools;
 use log::*;
 use std::fmt::Write as FmtWrite;
 use std::io;
@@ -34,9 +35,11 @@ pub struct Basebackup<'a> {
    timeline: &'a Arc<dyn Timeline>,
    pub lsn: Lsn,
    prev_record_lsn: Lsn,
+    full_backup: bool,
 }

-// Create basebackup with non-rel data in it. Omit relational data.
+// Create basebackup with non-rel data in it.
+// Only include relational data if 'full_backup' is true.
 //
 // Currently we use empty lsn in two cases:
 //  * During the basebackup right after timeline creation
@@ -48,6 +51,8 @@ impl<'a> Basebackup<'a> {
        write: &'a mut dyn Write,
        timeline: &'a Arc<dyn Timeline>,
        req_lsn: Option<Lsn>,
+        prev_lsn: Option<Lsn>,
+        full_backup: bool,
    ) -> Result<Basebackup<'a>> {
        // Compute postgres doesn't have any previous WAL files, but the first
        // record that it's going to write needs to include the LSN of the
@@ -82,16 +87,27 @@ impl<'a> Basebackup<'a> {
            (end_of_timeline.prev, end_of_timeline.last)
        };

+        // Consolidate the derived and the provided prev_lsn values
+        let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
+            if backup_prev != Lsn(0) {
+                anyhow::ensure!(backup_prev == provided_prev_lsn)
+            }
+            provided_prev_lsn
+        } else {
+            backup_prev
+        };
+
        info!(
-            "taking basebackup lsn={}, prev_lsn={}",
-            backup_lsn, backup_prev
+            "taking basebackup lsn={}, prev_lsn={} (full_backup={})",
+            backup_lsn, prev_lsn, full_backup
        );

        Ok(Basebackup {
            ar: Builder::new(write),
            timeline,
            lsn: backup_lsn,
-            prev_record_lsn: backup_prev,
+            prev_record_lsn: prev_lsn,
+            full_backup,
        })
    }

@@ -130,6 +146,14 @@ impl<'a> Basebackup<'a> {
            }
        }

+        // Gather relational files if we are doing a full backup.
+        if self.full_backup {
+            let all_rels = self.timeline.list_rels(0, 0, self.lsn)?;
+            for rel in all_rels {
+                self.add_rel(rel)?;
+            }
+        }
+
        // Generate pg_control and bootstrap WAL segment.
        self.add_pgcontrol_file()?;
        self.ar.finish()?;
@@ -137,6 +161,51 @@ impl<'a> Basebackup<'a> {
        Ok(())
    }

+    fn add_rel(&mut self, rel: RelishTag) -> anyhow::Result<()> {
+        let tag = match rel {
+            RelishTag::Relation(tag) => tag,
+            _ => {
+                return Err(anyhow!("expected RelishTag::Rel, got {:?}", rel));
+            }
+        };
+
+        // Function that adds relation segment data to archive
+        let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
+            let file_name = tag.to_segfile_name(segment_index as u32);
+            let header = new_tar_header(&file_name, data.len() as u64)?;
+            self.ar.append(&header, data.as_slice())?;
+            Ok(())
+        };
+
+        let nblocks = match self.timeline.get_relish_size(rel, self.lsn)? {
+            Some(nblocks) => nblocks,
+            None => {
+                warn!("rel {} is truncated in timeline", tag);
+                return Ok(());
+            }
+        };
+
+        // If the relation is empty, create an empty file
+        if nblocks == 0 {
+            add_file(0, &vec![])?;
+            return Ok(());
+        }
+
+        // Add a file for each chunk of blocks (aka segment)
+        let chunks = (0..nblocks).chunks(pg_constants::RELSEG_SIZE as usize);
+        for (seg, blocks) in chunks.into_iter().enumerate() {
+            let mut segment_data: Vec<u8> = vec![];
+            for blknum in blocks {
+                let img = self.timeline.get_page_at_lsn(rel, blknum, self.lsn)?;
+                segment_data.extend_from_slice(&img[..]);
+            }
+
+            add_file(seg, &segment_data)?;
+        }
+
+        Ok(())
+    }
+
    //
    // Generate SLRU segment files from repository.
    //
--- a/pageserver/src/bin/dump_layerfile.rs
+++ b/pageserver/src/bin/dump_layerfile.rs
@@ -13,7 +13,7 @@ fn main() -> Result<()> {
        .about("Dump contents of one layer file, for debugging")
        .version(GIT_VERSION)
        .arg(
-            Arg::with_name("path")
+            Arg::new("path")
                .help("Path to file to dump")
                .required(true)
                .index(1),
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,7 +2,14 @@

 use std::{env, path::Path, str::FromStr};
 use tracing::*;
-use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};
+use zenith_utils::{
+    auth::JwtAuth,
+    logging,
+    postgres_backend::AuthType,
+    tcp_listener,
+    zid::{ZTenantId, ZTimelineId},
+    GIT_VERSION,
+};

 use anyhow::{bail, Context, Result};

@@ -10,11 +17,13 @@ use clap::{App, Arg};
 use daemonize::Daemonize;

 use pageserver::{
-    branches,
    config::{defaults::*, PageServerConf},
-    http, page_cache, page_service, remote_storage, tenant_mgr, thread_mgr,
+    http, page_cache, page_service,
+    remote_storage::{self, SyncStartupData},
+    repository::TimelineSyncStatusUpdate,
+    tenant_mgr, thread_mgr,
    thread_mgr::ThreadKind,
-    virtual_file, LOG_FILE_NAME,
+    timelines, virtual_file, LOG_FILE_NAME,
 };
 use zenith_utils::http::endpoint;
 use zenith_utils::postgres_backend;
@@ -27,41 +36,48 @@ fn main() -> Result<()> {
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(GIT_VERSION)
        .arg(
-            Arg::with_name("daemonize")
-                .short("d")
+            Arg::new("daemonize")
+                .short('d')
                .long("daemonize")
                .takes_value(false)
                .help("Run in the background"),
        )
        .arg(
-            Arg::with_name("init")
+            Arg::new("init")
                .long("init")
                .takes_value(false)
-                .help("Initialize pageserver repo"),
+                .help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"),
        )
        .arg(
-            Arg::with_name("workdir")
-                .short("D")
+            Arg::new("workdir")
+                .short('D')
                .long("workdir")
                .takes_value(true)
                .help("Working directory for the pageserver"),
        )
        .arg(
-            Arg::with_name("create-tenant")
+            Arg::new("create-tenant")
                .long("create-tenant")
                .takes_value(true)
                .help("Create tenant during init")
                .requires("init"),
        )
+        .arg(
+            Arg::new("initial-timeline-id")
+                .long("initial-timeline-id")
+                .takes_value(true)
+                .help("Use a specific timeline id during init and tenant creation")
+                .requires("create-tenant"),
+        )
        // See `settings.md` for more details on the extra configuration patameters pageserver can process
        .arg(
-            Arg::with_name("config-override")
-                .short("c")
+            Arg::new("config-override")
+                .short('c')
                .takes_value(true)
                .number_of_values(1)
-                .multiple(true)
+                .multiple_occurrences(true)
                .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
-                Any option has to be a valid toml document, example: `-c \"foo='hey'\"` `-c \"foo={value=1}\"`"),
+                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
        )
        .get_matches();

@@ -72,7 +88,16 @@ fn main() -> Result<()> {
    let cfg_file_path = workdir.join("pageserver.toml");

    let init = arg_matches.is_present("init");
-    let create_tenant = arg_matches.value_of("create-tenant");
+    let create_tenant = arg_matches
+        .value_of("create-tenant")
+        .map(ZTenantId::from_str)
+        .transpose()
+        .context("Failed to parse tenant id from the arguments")?;
+    let initial_timeline_id = arg_matches
+        .value_of("initial-timeline-id")
+        .map(ZTimelineId::from_str)
+        .transpose()
+        .context("Failed to parse timeline id from the arguments")?;

    // Set CWD to workdir for non-daemon modes
    env::set_current_dir(&workdir).with_context(|| {
@@ -115,7 +140,14 @@ fn main() -> Result<()> {
                    option_line
                )
            })?;
+
            for (key, item) in doc.iter() {
+                if key == "id" {
+                    anyhow::ensure!(
+                        init,
+                        "node id can only be set during pageserver init and cannot be overridden"
+                    );
+                }
                toml.insert(key, item.clone());
            }
        }
@@ -136,7 +168,8 @@ fn main() -> Result<()> {

    // Create repo and exit if init was requested
    if init {
-        branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
+        timelines::init_pageserver(conf, create_tenant, initial_timeline_id)
+            .context("Failed to init pageserver")?;
        // write the config file
        std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
            format!(
@@ -197,11 +230,47 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
    }

    let signals = signals::install_shutdown_handlers()?;
-    let sync_startup = remote_storage::start_local_timeline_sync(conf)
+
+    // Initialize repositories with locally available timelines.
+    // Timelines that are only partially available locally (remote storage has more data than this pageserver)
+    // are scheduled for download and added to the repository once download is completed.
+    let SyncStartupData {
+        remote_index,
+        local_timeline_init_statuses,
+    } = remote_storage::start_local_timeline_sync(conf)
        .context("Failed to set up local files sync with external storage")?;

-    // Initialize tenant manager.
-    tenant_mgr::set_timeline_states(conf, sync_startup.initial_timeline_states);
+    for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
+        // initialize local tenant
+        let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index);
+        for (timeline_id, init_status) in local_timeline_init_statuses {
+            match init_status {
+                remote_storage::LocalTimelineInitStatus::LocallyComplete => {
+                    debug!("timeline {} for tenant {} is locally complete, registering it in repository", timeline_id, tenant_id);
+                    // Let's fail here loudly to be on the safe side.
+                    // XXX: It may be a better api to actually distinguish between repository startup
+                    //   and processing of newly downloaded timelines.
+                    repo.apply_timeline_remote_sync_status_update(
+                        timeline_id,
+                        TimelineSyncStatusUpdate::Downloaded,
+                    )
+                    .with_context(|| {
+                        format!(
+                            "Failed to bootstrap timeline {} for tenant {}",
+                            timeline_id, tenant_id
+                        )
+                    })?
+                }
+                remote_storage::LocalTimelineInitStatus::NeedsSync => {
+                    debug!(
+                        "timeline {} for tenant {} needs sync, \
+                         so skipped for adding into repository until sync is finished",
+                        timeline_id, tenant_id
+                    );
+                }
+            }
+        }
+    }

    // initialize authentication for incoming connections
    let auth = match &conf.auth_type {
@@ -223,7 +292,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
        None,
        "http_endpoint_thread",
        move || {
-            let router = http::make_router(conf, auth_cloned);
+            let router = http::make_router(conf, auth_cloned, remote_index);
            endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
        },
    )?;
--- a/pageserver/src/bin/pageserver_zst.rs
+++ b/pageserver/src/bin/pageserver_zst.rs
@@ -0,0 +1,334 @@
+//! A CLI helper to deal with remote storage (S3, usually) blobs as archives.
+//! See [`compression`] for more details about the archives.
+
+use std::{collections::BTreeSet, path::Path};
+
+use anyhow::{bail, ensure, Context};
+use clap::{App, Arg};
+use pageserver::{
+    layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
+    remote_storage::compression,
+};
+use tokio::{fs, io};
+use zenith_utils::GIT_VERSION;
+
+const LIST_SUBCOMMAND: &str = "list";
+const ARCHIVE_ARG_NAME: &str = "archive";
+
+const EXTRACT_SUBCOMMAND: &str = "extract";
+const TARGET_DIRECTORY_ARG_NAME: &str = "target_directory";
+
+const CREATE_SUBCOMMAND: &str = "create";
+const SOURCE_DIRECTORY_ARG_NAME: &str = "source_directory";
+
+/// Parses the command line and runs the requested archive subcommand.
+#[tokio::main(flavor = "current_thread")]
+async fn main() -> anyhow::Result<()> {
+    let arg_matches = App::new("pageserver zst blob [un]compressor utility")
+        .version(GIT_VERSION)
+        .subcommands(vec![
+            App::new(LIST_SUBCOMMAND)
+                .about("List the archive contents")
+                .arg(
+                    Arg::new(ARCHIVE_ARG_NAME)
+                        .required(true)
+                        .takes_value(true)
+                        .help("An archive to list the contents of"),
+                ),
+            App::new(EXTRACT_SUBCOMMAND)
+                .about("Extracts the archive into the directory")
+                .arg(
+                    Arg::new(ARCHIVE_ARG_NAME)
+                        .required(true)
+                        .takes_value(true)
+                        .help("An archive to extract"),
+                )
+                .arg(
+                    Arg::new(TARGET_DIRECTORY_ARG_NAME)
+                        .required(false)
+                        .takes_value(true)
+                        .help("A directory to extract the archive into. Optional, will use the current directory if not specified"),
+                ),
+            App::new(CREATE_SUBCOMMAND)
+                .about("Creates an archive with the contents of a directory (only the first level files are taken, metadata file has to be present in the same directory)")
+                .arg(
+                    Arg::new(SOURCE_DIRECTORY_ARG_NAME)
+                        .required(true)
+                        .takes_value(true)
+                        .help("A directory to use for creating the archive"),
+                )
+                .arg(
+                    Arg::new(TARGET_DIRECTORY_ARG_NAME)
+                        .required(false)
+                        .takes_value(true)
+                        .help("A directory to create the archive in. Optional, will use the current directory if not specified"),
+                ),
+        ])
+        .get_matches();
+
+    let subcommand_name = arg_matches
+        .subcommand_name()
+        .context("No subcommand specified")?;
+
+    let subcommand_matches = match arg_matches.subcommand_matches(subcommand_name) {
+        Some(matches) => matches,
+        None => bail!(
+            "No subcommand arguments were recognized for subcommand '{}'",
+            subcommand_name
+        ),
+    };
+
+    // Looked up lazily: `list` declares no target directory argument, and clap may
+    // panic (debug builds) when queried for an argument a subcommand does not define.
+    let target_dir = || {
+        Path::new(subcommand_matches.value_of(TARGET_DIRECTORY_ARG_NAME).unwrap_or("./"))
+    };
+
+    match subcommand_name {
+        LIST_SUBCOMMAND => {
+            let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
+                Some(archive) => Path::new(archive),
+                None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
+            };
+            list_archive(archive).await
+        }
+        EXTRACT_SUBCOMMAND => {
+            let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
+                Some(archive) => Path::new(archive),
+                None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
+            };
+            extract_archive(archive, target_dir()).await
+        }
+        CREATE_SUBCOMMAND => {
+            let source_dir = match subcommand_matches.value_of(SOURCE_DIRECTORY_ARG_NAME) {
+                Some(source) => Path::new(source),
+                None => bail!("No '{}' argument is specified", SOURCE_DIRECTORY_ARG_NAME),
+            };
+            create_archive(source_dir, target_dir()).await
+        }
+        unknown => bail!("Unknown subcommand {}", unknown),
+    }
+}
+
+/// Prints the path and uncompressed size of every file recorded in the archive header.
+async fn list_archive(archive: &Path) -> anyhow::Result<()> {
+    let archive = archive.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the archive path '{}'",
+            archive.display()
+        )
+    })?;
+    ensure!(
+        archive.is_file(),
+        "Path '{}' is not an archive file",
+        archive.display()
+    );
+    println!("Listing an archive at path '{}'", archive.display());
+    let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
+        Some(name) => name,
+        None => bail!(
+            "Failed to get the archive name from the path '{}'",
+            archive.display()
+        ),
+    };
+
+    // Only the header is needed, so stream the file instead of loading it whole.
+    let mut archive_file = fs::File::open(&archive)
+        .await
+        .context("Failed to open the archive file")?;
+    let header = compression::read_archive_header(archive_name, &mut archive_file)
+        .await
+        .context("Failed to read the archive header")?;
+
+    let empty_path = Path::new("");
+    println!("-------------------------------");
+    let longest_path_in_archive = header
+        .files
+        .iter()
+        .filter_map(|file| Some(file.subpath.as_path(empty_path).to_str()?.len()))
+        .max()
+        .unwrap_or_default()
+        .max(METADATA_FILE_NAME.len());
+
+    for regular_file in &header.files {
+        println!(
+            "File: {:width$} uncompressed size: {} bytes",
+            regular_file.subpath.as_path(empty_path).display(),
+            regular_file.size,
+            width = longest_path_in_archive,
+        )
+    }
+    println!(
+        "File: {:width$} uncompressed size: {} bytes",
+        METADATA_FILE_NAME,
+        header.metadata_file_size,
+        width = longest_path_in_archive,
+    );
+    println!("-------------------------------");
+
+    Ok(())
+}
+
+/// Extracts the archive into `target_dir`, which is created if missing and must be empty.
+async fn extract_archive(archive: &Path, target_dir: &Path) -> anyhow::Result<()> {
+    let archive = archive.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the archive path '{}'",
+            archive.display()
+        )
+    })?;
+    ensure!(
+        archive.is_file(),
+        "Path '{}' is not an archive file",
+        archive.display()
+    );
+    let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
+        Some(name) => name,
+        None => bail!(
+            "Failed to get the archive name from the path '{}'",
+            archive.display()
+        ),
+    };
+
+    if !target_dir.exists() {
+        fs::create_dir_all(target_dir).await.with_context(|| {
+            format!(
+                "Failed to create the target dir at path '{}'",
+                target_dir.display()
+            )
+        })?;
+    }
+    let target_dir = target_dir.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the target dir path '{}'",
+            target_dir.display()
+        )
+    })?;
+    ensure!(
+        target_dir.is_dir(),
+        "Path '{}' is not a directory",
+        target_dir.display()
+    );
+    let mut dir_contents = fs::read_dir(&target_dir)
+        .await
+        .context("Failed to list the target directory contents")?;
+    let dir_entry = dir_contents
+        .next_entry()
+        .await
+        .context("Failed to list the target directory contents")?;
+    ensure!(
+        dir_entry.is_none(),
+        "Target directory '{}' is not empty",
+        target_dir.display()
+    );
+
+    println!(
+        "Extracting an archive at path '{}' into directory '{}'",
+        archive.display(),
+        target_dir.display()
+    );
+    let mut archive_file = fs::File::open(&archive).await.with_context(|| {
+        format!(
+            "Failed to open the archive file at path '{}'",
+            archive.display()
+        )
+    })?;
+    let header = compression::read_archive_header(archive_name, &mut archive_file)
+        .await
+        .context("Failed to read the archive header")?;
+    compression::uncompress_with_header(&BTreeSet::new(), &target_dir, header, &mut archive_file)
+        .await
+        .context("Failed to extract the archive")
+}
+
+/// Creates an archive in `target_dir` from the first-level files of `source_dir`.
+async fn create_archive(source_dir: &Path, target_dir: &Path) -> anyhow::Result<()> {
+    let source_dir = source_dir.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the source dir path '{}'",
+            source_dir.display()
+        )
+    })?;
+    ensure!(
+        source_dir.is_dir(),
+        "Path '{}' is not a directory",
+        source_dir.display()
+    );
+
+    if !target_dir.exists() {
+        fs::create_dir_all(target_dir).await.with_context(|| {
+            format!(
+                "Failed to create the target dir at path '{}'",
+                target_dir.display()
+            )
+        })?;
+    }
+    let target_dir = target_dir.canonicalize().with_context(|| {
+        format!(
+            "Failed to get the absolute path for the target dir path '{}'",
+            target_dir.display()
+        )
+    })?;
+    ensure!(
+        target_dir.is_dir(),
+        "Path '{}' is not a directory",
+        target_dir.display()
+    );
+
+    println!(
+        "Compressing directory '{}' and creating resulting archive in directory '{}'",
+        source_dir.display(),
+        target_dir.display()
+    );
+
+    let mut metadata_file_contents = None;
+    let mut files_to_archive = Vec::new();
+    let mut source_dir_contents = fs::read_dir(&source_dir)
+        .await
+        .context("Failed to read the source directory contents")?;
+
+    while let Some(source_dir_entry) = source_dir_contents
+        .next_entry()
+        .await
+        .context("Failed to read a source dir entry")?
+    {
+        let entry_path = source_dir_entry.path();
+        if entry_path.is_file() {
+            if entry_path.file_name().and_then(|name| name.to_str()) == Some(METADATA_FILE_NAME) {
+                let metadata_bytes = fs::read(entry_path)
+                    .await
+                    .context("Failed to read metadata file bytes in the source dir")?;
+                metadata_file_contents = Some(
+                    TimelineMetadata::from_bytes(&metadata_bytes)
+                        .context("Failed to parse metadata file contents in the source dir")?,
+                );
+            } else {
+                files_to_archive.push(entry_path);
+            }
+        }
+    }
+
+    let metadata = match metadata_file_contents {
+        Some(metadata) => metadata,
+        None => bail!(
+            "No metadata file found in the source dir '{}', cannot create the archive",
+            source_dir.display()
+        ),
+    };
+
+    let _ = compression::archive_files_as_stream(
+        &source_dir,
+        files_to_archive.iter(),
+        &metadata,
+        move |mut archive_streamer, archive_name| async move {
+            let archive_target = target_dir.join(&archive_name);
+            let mut archive_file = fs::File::create(&archive_target).await?;
+            io::copy(&mut archive_streamer, &mut archive_file).await?;
+            Ok(archive_target)
+        },
+    )
+    .await
+    .context("Failed to create an archive")?;
+
+    Ok(())
+}
--- a/pageserver/src/bin/update_metadata.rs
+++ b/pageserver/src/bin/update_metadata.rs
@@ -14,20 +14,20 @@ fn main() -> Result<()> {
        .about("Dump or update metadata file")
        .version(GIT_VERSION)
        .arg(
-            Arg::with_name("path")
+            Arg::new("path")
                .help("Path to metadata file")
                .required(true),
        )
        .arg(
-            Arg::with_name("disk_lsn")
-                .short("d")
+            Arg::new("disk_lsn")
+                .short('d')
                .long("disk_lsn")
                .takes_value(true)
                .help("Replace disk constistent lsn"),
        )
        .arg(
-            Arg::with_name("prev_lsn")
-                .short("p")
+            Arg::new("prev_lsn")
+                .short('p')
                .long("prev_lsn")
                .takes_value(true)
                .help("Previous record LSN"),
--- a/pageserver/src/branches.rs
+++ b/pageserver/src/branches.rs
@@ -1,430 +0,0 @@
-//!
-//! Branch management code
-//!
-// TODO: move all paths construction to conf impl
-//
-
-use anyhow::{anyhow, bail, Context, Result};
-use postgres_ffi::ControlFileData;
-use serde::{Deserialize, Serialize};
-use std::{
-    fs,
-    path::Path,
-    process::{Command, Stdio},
-    str::FromStr,
-    sync::Arc,
-};
-use tracing::*;
-
-use zenith_utils::crashsafe_dir;
-use zenith_utils::logging;
-use zenith_utils::lsn::Lsn;
-use zenith_utils::zid::{ZTenantId, ZTimelineId};
-
-use crate::walredo::WalRedoManager;
-use crate::CheckpointConfig;
-use crate::{config::PageServerConf, repository::Repository};
-use crate::{import_datadir, LOG_FILE_NAME};
-use crate::{repository::RepositoryTimeline, tenant_mgr};
-
-#[derive(Serialize, Deserialize, Clone)]
-pub struct BranchInfo {
-    pub name: String,
-    #[serde(with = "hex")]
-    pub timeline_id: ZTimelineId,
-    pub latest_valid_lsn: Lsn,
-    pub ancestor_id: Option<String>,
-    pub ancestor_lsn: Option<String>,
-    pub current_logical_size: usize,
-    pub current_logical_size_non_incremental: Option<usize>,
-}
-
-impl BranchInfo {
-    pub fn from_path<T: AsRef<Path>>(
-        path: T,
-        repo: &Arc<dyn Repository>,
-        include_non_incremental_logical_size: bool,
-    ) -> Result<Self> {
-        let path = path.as_ref();
-        let name = path.file_name().unwrap().to_string_lossy().to_string();
-        let timeline_id = std::fs::read_to_string(path)
-            .with_context(|| {
-                format!(
-                    "Failed to read branch file contents at path '{}'",
-                    path.display()
-                )
-            })?
-            .parse::<ZTimelineId>()?;
-
-        let timeline = match repo.get_timeline(timeline_id)? {
-            RepositoryTimeline::Local(local_entry) => local_entry,
-            RepositoryTimeline::Remote { .. } => {
-                bail!("Timeline {} is remote, no branches to display", timeline_id)
-            }
-        };
-
-        // we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
-        let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() {
-            Some(ancestor_id) => (
-                Some(ancestor_id.to_string()),
-                Some(timeline.get_ancestor_lsn().to_string()),
-            ),
-            None => (None, None),
-        };
-
-        // non incremental size calculation can be heavy, so let it be optional
-        // needed for tests to check size calculation
-        let current_logical_size_non_incremental = include_non_incremental_logical_size
-            .then(|| {
-                timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
-            })
-            .transpose()?;
-
-        Ok(BranchInfo {
-            name,
-            timeline_id,
-            latest_valid_lsn: timeline.get_last_record_lsn(),
-            ancestor_id,
-            ancestor_lsn,
-            current_logical_size: timeline.get_current_logical_size(),
-            current_logical_size_non_incremental,
-        })
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-pub struct PointInTime {
-    pub timelineid: ZTimelineId,
-    pub lsn: Lsn,
-}
-
-pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
-    // Initialize logger
-    // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
-    let _log_file = logging::init(LOG_FILE_NAME, true)?;
-
-    // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
-    // process during repository initialization.
-    //
-    // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
-    // initdb in the background, and it kept running even after the "zenith init" had exited.
-    // In tests, we started the  page server immediately after that, so that initdb was still
-    // running in the background, and we failed to run initdb again in the same directory. This
-    // has been solved for the rapid init+start case now, but the general race condition remains
-    // if you restart the server quickly. The WAL redo manager doesn't use a separate thread
-    // anymore, but I think that could still happen.
-    let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {});
-
-    if let Some(tenantid) = create_tenant {
-        let tenantid = ZTenantId::from_str(tenantid)?;
-        println!("initializing tenantid {}", tenantid);
-        create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?;
-    }
-    crashsafe_dir::create_dir_all(conf.tenants_path())?;
-
-    println!("pageserver init succeeded");
-    Ok(())
-}
-
-pub fn create_repo(
-    conf: &'static PageServerConf,
-    tenantid: ZTenantId,
-    wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
-) -> Result<Arc<dyn Repository>> {
-    let repo_dir = conf.tenant_path(&tenantid);
-    if repo_dir.exists() {
-        bail!("repo for {} already exists", tenantid)
-    }
-
-    // top-level dir may exist if we are creating it through CLI
-    crashsafe_dir::create_dir_all(&repo_dir)
-        .with_context(|| format!("could not create directory {}", repo_dir.display()))?;
-
-    crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
-    crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
-    crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
-
-    info!("created directory structure in {}", repo_dir.display());
-
-    // create a new timeline directory
-    let timeline_id = ZTimelineId::generate();
-    let timelinedir = conf.timeline_path(&timeline_id, &tenantid);
-
-    crashsafe_dir::create_dir(&timelinedir)?;
-
-    let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
-        conf,
-        wal_redo_manager,
-        tenantid,
-        conf.remote_storage_config.is_some(),
-    ));
-
-    // Load data into pageserver
-    // TODO To implement zenith import we need to
-    //      move data loading out of create_repo()
-    bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?;
-
-    Ok(repo)
-}
-
-// Returns checkpoint LSN from controlfile
-fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
-    // Read control file to extract the LSN
-    let controlfile_path = path.join("global").join("pg_control");
-    let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
-    let lsn = controlfile.checkPoint;
-
-    Ok(Lsn(lsn))
-}
-
-// Create the cluster temporarily in 'initdbpath' directory inside the repository
-// to get bootstrap data for timeline initialization.
-//
-fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
-    info!("running initdb in {}... ", initdbpath.display());
-
-    let initdb_path = conf.pg_bin_dir().join("initdb");
-    let initdb_output = Command::new(initdb_path)
-        .args(&["-D", initdbpath.to_str().unwrap()])
-        .args(&["-U", &conf.superuser])
-        .args(&["-E", "utf8"])
-        .arg("--no-instructions")
-        // This is only used for a temporary installation that is deleted shortly after,
-        // so no need to fsync it
-        .arg("--no-sync")
-        .env_clear()
-        .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
-        .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
-        .stdout(Stdio::null())
-        .output()
-        .with_context(|| "failed to execute initdb")?;
-    if !initdb_output.status.success() {
-        anyhow::bail!(
-            "initdb failed: '{}'",
-            String::from_utf8_lossy(&initdb_output.stderr)
-        );
-    }
-
-    Ok(())
-}
-
-//
-// - run initdb to init temporary instance and get bootstrap data
-// - after initialization complete, remove the temp dir.
-//
-fn bootstrap_timeline(
-    conf: &'static PageServerConf,
-    tenantid: ZTenantId,
-    tli: ZTimelineId,
-    repo: &dyn Repository,
-) -> Result<()> {
-    let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
-
-    let initdb_path = conf.tenant_path(&tenantid).join("tmp");
-
-    // Init temporarily repo to get bootstrap data
-    run_initdb(conf, &initdb_path)?;
-    let pgdata_path = initdb_path;
-
-    let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
-
-    // Import the contents of the data directory at the initial checkpoint
-    // LSN, and any WAL after that.
-    // Initdb lsn will be equal to last_record_lsn which will be set after import.
-    // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
-    let timeline = repo.create_empty_timeline(tli, lsn)?;
-    import_datadir::import_timeline_from_postgres_datadir(
-        &pgdata_path,
-        timeline.writer().as_ref(),
-        lsn,
-    )?;
-    timeline.checkpoint(CheckpointConfig::Forced)?;
-
-    println!(
-        "created initial timeline {} timeline.lsn {}",
-        tli,
-        timeline.get_last_record_lsn()
-    );
-
-    let data = tli.to_string();
-    fs::write(conf.branch_path("main", &tenantid), data)?;
-    println!("created main branch");
-
-    // Remove temp dir. We don't need it anymore
-    fs::remove_dir_all(pgdata_path)?;
-
-    Ok(())
-}
-
-pub(crate) fn get_branches(
-    conf: &PageServerConf,
-    tenantid: &ZTenantId,
-    include_non_incremental_logical_size: bool,
-) -> Result<Vec<BranchInfo>> {
-    let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
-
-    // Each branch has a corresponding record (text file) in the refs/branches
-    // with timeline_id.
-    let branches_dir = conf.branches_path(tenantid);
-
-    std::fs::read_dir(&branches_dir)
-        .with_context(|| {
-            format!(
-                "Found no branches directory '{}' for tenant {}",
-                branches_dir.display(),
-                tenantid
-            )
-        })?
-        .map(|dir_entry_res| {
-            let dir_entry = dir_entry_res.with_context(|| {
-                format!(
-                    "Failed to list branches directory '{}' content for tenant {}",
-                    branches_dir.display(),
-                    tenantid
-                )
-            })?;
-            BranchInfo::from_path(
-                dir_entry.path(),
-                &repo,
-                include_non_incremental_logical_size,
-            )
-        })
-        .collect()
-}
-
-pub(crate) fn create_branch(
-    conf: &PageServerConf,
-    branchname: &str,
-    startpoint_str: &str,
-    tenantid: &ZTenantId,
-) -> Result<BranchInfo> {
-    let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
-
-    if conf.branch_path(branchname, tenantid).exists() {
-        anyhow::bail!("branch {} already exists", branchname);
-    }
-
-    let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
-    let timeline = repo
-        .get_timeline(startpoint.timelineid)?
-        .local_timeline()
-        .ok_or_else(|| anyhow!("Cannot branch off the timeline that's not present locally"))?;
-    if startpoint.lsn == Lsn(0) {
-        // Find end of WAL on the old timeline
-        let end_of_wal = timeline.get_last_record_lsn();
-        info!("branching at end of WAL: {}", end_of_wal);
-        startpoint.lsn = end_of_wal;
-    } else {
-        // Wait for the WAL to arrive and be processed on the parent branch up
-        // to the requested branch point. The repository code itself doesn't
-        // require it, but if we start to receive WAL on the new timeline,
-        // decoding the new WAL might need to look up previous pages, relation
-        // sizes etc. and that would get confused if the previous page versions
-        // are not in the repository yet.
-        timeline.wait_lsn(startpoint.lsn)?;
-    }
-    startpoint.lsn = startpoint.lsn.align();
-    if timeline.get_start_lsn() > startpoint.lsn {
-        anyhow::bail!(
-            "invalid startpoint {} for the branch {}: less than timeline start {}",
-            startpoint.lsn,
-            branchname,
-            timeline.get_start_lsn()
-        );
-    }
-
-    let new_timeline_id = ZTimelineId::generate();
-
-    // Forward entire timeline creation routine to repository
-    // backend, so it can do all needed initialization
-    repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?;
-
-    // Remember the human-readable branch name for the new timeline.
-    // FIXME: there's a race condition, if you create a branch with the same
-    // name concurrently.
-    let data = new_timeline_id.to_string();
-    fs::write(conf.branch_path(branchname, tenantid), data)?;
-
-    Ok(BranchInfo {
-        name: branchname.to_string(),
-        timeline_id: new_timeline_id,
-        latest_valid_lsn: startpoint.lsn,
-        ancestor_id: Some(startpoint.timelineid.to_string()),
-        ancestor_lsn: Some(startpoint.lsn.to_string()),
-        current_logical_size: 0,
-        current_logical_size_non_incremental: Some(0),
-    })
-}
-
-//
-// Parse user-given string that represents a point-in-time.
-//
-// We support multiple variants:
-//
-// Raw timeline id in hex, meaning the end of that timeline:
-//    bc62e7d612d0e6fe8f99a6dd2f281f9d
-//
-// A specific LSN on a timeline:
-//    bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
-//
-// Same, with a human-friendly branch name:
-//    main
-//    main@2/15D3DD8
-//
-// Human-friendly tag name:
-//    mytag
-//
-//
-fn parse_point_in_time(
-    conf: &PageServerConf,
-    s: &str,
-    tenantid: &ZTenantId,
-) -> Result<PointInTime> {
-    let mut strings = s.split('@');
-    let name = strings.next().unwrap();
-
-    let lsn: Option<Lsn>;
-    if let Some(lsnstr) = strings.next() {
-        lsn = Some(
-            Lsn::from_str(lsnstr).with_context(|| "invalid LSN in point-in-time specification")?,
-        );
-    } else {
-        lsn = None
-    }
-
-    // Check if it's a tag
-    if lsn.is_none() {
-        let tagpath = conf.tag_path(name, tenantid);
-        if tagpath.exists() {
-            let pointstr = fs::read_to_string(tagpath)?;
-
-            return parse_point_in_time(conf, &pointstr, tenantid);
-        }
-    }
-
-    // Check if it's a branch
-    // Check if it's branch @ LSN
-    let branchpath = conf.branch_path(name, tenantid);
-    if branchpath.exists() {
-        let pointstr = fs::read_to_string(branchpath)?;
-
-        let mut result = parse_point_in_time(conf, &pointstr, tenantid)?;
-
-        result.lsn = lsn.unwrap_or(Lsn(0));
-        return Ok(result);
-    }
-
-    // Check if it's a timelineid
-    // Check if it's timelineid @ LSN
-    if let Ok(timelineid) = ZTimelineId::from_str(name) {
-        let tlipath = conf.timeline_path(&timelineid, tenantid);
-        if tlipath.exists() {
-            return Ok(PointInTime {
-                timelineid,
-                lsn: lsn.unwrap_or(Lsn(0)),
-            });
-        }
-    }
-
-    bail!("could not parse point-in-time {}", s);
-}
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -4,11 +4,11 @@
 //! file, or on the command line.
 //! See also `settings.md` for better description on every parameter.

-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{bail, ensure, Context, Result};
 use toml_edit;
 use toml_edit::{Document, Item};
 use zenith_utils::postgres_backend::AuthType;
-use zenith_utils::zid::{ZTenantId, ZTimelineId};
+use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};

 use std::convert::TryInto;
 use std::env;
@@ -36,6 +36,9 @@ pub mod defaults {
    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
    pub const DEFAULT_GC_PERIOD: &str = "100 s";

+    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
+    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
+
    pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
    pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100;
    pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
@@ -59,6 +62,9 @@ pub mod defaults {
 #gc_period = '{DEFAULT_GC_PERIOD}'
 #gc_horizon = {DEFAULT_GC_HORIZON}

+#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
+#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
+
 #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}

 # initial superuser role name to use when creating a new tenant
@@ -72,6 +78,10 @@ pub mod defaults {

 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct PageServerConf {
+    // Identifier of this particular pageserver, so that e.g. safekeepers
+    // can reliably tell different pageservers apart
+    pub id: ZNodeId,
+
    /// Example (default): 127.0.0.1:64000
    pub listen_pg_addr: String,
    /// Example (default): 127.0.0.1:9898
@@ -85,6 +95,12 @@ pub struct PageServerConf {

    pub gc_horizon: u64,
    pub gc_period: Duration,
+
+    // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
+    pub wait_lsn_timeout: Duration,
+    // How long to wait for WAL redo to complete.
+    pub wal_redo_timeout: Duration,
+
    pub superuser: String,

    pub page_cache_size: usize,
@@ -106,6 +122,206 @@ pub struct PageServerConf {
    pub remote_storage_config: Option<RemoteStorageConfig>,
 }

+/// Whether a builder field has been given a value. A dedicated enum (instead of `Option`)
+/// states the intent and avoids confusion when the field type is itself an `Option`.
+pub enum BuilderValue<T> {
+    Set(T),
+    NotSet,
+}
+
+impl<T> BuilderValue<T> {
+    pub fn ok_or<E>(self, err: E) -> Result<T, E> {
+        match self {
+            Self::Set(v) => Ok(v),
+            Self::NotSet => Err(err),
+        }
+    }
+}
+
+/// Accumulates `PageServerConf` fields one at a time; `build()` turns it into the final config.
+struct PageServerConfigBuilder {
+    listen_pg_addr: BuilderValue<String>,
+
+    listen_http_addr: BuilderValue<String>,
+
+    checkpoint_distance: BuilderValue<u64>,
+    checkpoint_period: BuilderValue<Duration>,
+
+    gc_horizon: BuilderValue<u64>,
+    gc_period: BuilderValue<Duration>,
+
+    wait_lsn_timeout: BuilderValue<Duration>,
+    wal_redo_timeout: BuilderValue<Duration>,
+
+    superuser: BuilderValue<String>,
+
+    page_cache_size: BuilderValue<usize>,
+    max_file_descriptors: BuilderValue<usize>,
+
+    workdir: BuilderValue<PathBuf>,
+
+    pg_distrib_dir: BuilderValue<PathBuf>,
+
+    auth_type: BuilderValue<AuthType>,
+
+    // The next two are optional settings: the inner `Option` is the configured value itself.
+    auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
+    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
+
+    id: BuilderValue<ZNodeId>,
+}
+
+impl Default for PageServerConfigBuilder {
+    fn default() -> Self {
+        use self::BuilderValue::*;
+        use defaults::*;
+        Self {
+            listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
+            listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
+            checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE),
+            checkpoint_period: Set(humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)
+                .expect("cannot parse default checkpoint period")),
+            gc_horizon: Set(DEFAULT_GC_HORIZON),
+            gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD)
+                .expect("cannot parse default gc period")),
+            wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
+                .expect("cannot parse default wait lsn timeout")),
+            wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
+                .expect("cannot parse default wal redo timeout")),
+            superuser: Set(DEFAULT_SUPERUSER.to_string()),
+            page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE),
+            max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS),
+            workdir: Set(PathBuf::new()), // placeholder; `parse_and_validate` sets the real one
+            pg_distrib_dir: Set(env::current_dir()
+                .expect("cannot access current directory")
+                .join("tmp_install")),
+            auth_type: Set(AuthType::Trust),
+            auth_validation_public_key_path: Set(None),
+            remote_storage_config: Set(None),
+            id: NotSet, // no default: `build()` fails with "missing id" unless one is supplied
+        }
+    }
+}
+
+impl PageServerConfigBuilder {
+    // Setters: each call marks the field as explicitly `Set`, replacing the default
+    // (if any) that `Default::default()` installed for it.
+
+    pub fn listen_pg_addr(&mut self, listen_pg_addr: String) {
+        self.listen_pg_addr = BuilderValue::Set(listen_pg_addr)
+    }
+
+    pub fn listen_http_addr(&mut self, listen_http_addr: String) {
+        self.listen_http_addr = BuilderValue::Set(listen_http_addr)
+    }
+
+    pub fn checkpoint_distance(&mut self, checkpoint_distance: u64) {
+        self.checkpoint_distance = BuilderValue::Set(checkpoint_distance)
+    }
+
+    pub fn checkpoint_period(&mut self, checkpoint_period: Duration) {
+        self.checkpoint_period = BuilderValue::Set(checkpoint_period)
+    }
+
+    pub fn gc_horizon(&mut self, gc_horizon: u64) {
+        self.gc_horizon = BuilderValue::Set(gc_horizon)
+    }
+
+    pub fn gc_period(&mut self, gc_period: Duration) {
+        self.gc_period = BuilderValue::Set(gc_period)
+    }
+
+    pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
+        self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
+    }
+
+    pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) {
+        self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout)
+    }
+
+    pub fn superuser(&mut self, superuser: String) {
+        self.superuser = BuilderValue::Set(superuser)
+    }
+
+    pub fn page_cache_size(&mut self, page_cache_size: usize) {
+        self.page_cache_size = BuilderValue::Set(page_cache_size)
+    }
+
+    pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) {
+        self.max_file_descriptors = BuilderValue::Set(max_file_descriptors)
+    }
+
+    pub fn workdir(&mut self, workdir: PathBuf) {
+        self.workdir = BuilderValue::Set(workdir)
+    }
+
+    pub fn pg_distrib_dir(&mut self, pg_distrib_dir: PathBuf) {
+        self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
+    }
+
+    pub fn auth_type(&mut self, auth_type: AuthType) {
+        self.auth_type = BuilderValue::Set(auth_type)
+    }
+
+    /// Passing `None` records "no key path" as an explicit choice (the field is still `Set`).
+    pub fn auth_validation_public_key_path(
+        &mut self,
+        auth_validation_public_key_path: Option<PathBuf>,
+    ) {
+        self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path)
+    }
+
+    /// Passing `None` records "no remote storage" as an explicit choice (still `Set`).
+    pub fn remote_storage_config(&mut self, remote_storage_config: Option<RemoteStorageConfig>) {
+        self.remote_storage_config = BuilderValue::Set(remote_storage_config)
+    }
+
+    /// The node id is the only field without a default, so it must be set before `build()`.
+    pub fn id(&mut self, node_id: ZNodeId) {
+        self.id = BuilderValue::Set(node_id)
+    }
+
+    /// Consumes the builder and assembles the final `PageServerConf`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `missing <field>` error for the first field, in the order listed in the
+    /// struct literal below, that is still `NotSet`.
+    pub fn build(self) -> Result<PageServerConf> {
+        // Unwraps a single field. The error value is only constructed on the failure
+        // path, so a successful build creates no throwaway errors.
+        fn required<T>(value: BuilderValue<T>, name: &str) -> Result<T> {
+            match value {
+                BuilderValue::Set(v) => Ok(v),
+                BuilderValue::NotSet => anyhow::bail!("missing {}", name),
+            }
+        }
+
+        Ok(PageServerConf {
+            listen_pg_addr: required(self.listen_pg_addr, "listen_pg_addr")?,
+            listen_http_addr: required(self.listen_http_addr, "listen_http_addr")?,
+            checkpoint_distance: required(self.checkpoint_distance, "checkpoint_distance")?,
+            checkpoint_period: required(self.checkpoint_period, "checkpoint_period")?,
+            gc_horizon: required(self.gc_horizon, "gc_horizon")?,
+            gc_period: required(self.gc_period, "gc_period")?,
+            wait_lsn_timeout: required(self.wait_lsn_timeout, "wait_lsn_timeout")?,
+            wal_redo_timeout: required(self.wal_redo_timeout, "wal_redo_timeout")?,
+            superuser: required(self.superuser, "superuser")?,
+            page_cache_size: required(self.page_cache_size, "page_cache_size")?,
+            max_file_descriptors: required(self.max_file_descriptors, "max_file_descriptors")?,
+            workdir: required(self.workdir, "workdir")?,
+            pg_distrib_dir: required(self.pg_distrib_dir, "pg_distrib_dir")?,
+            auth_type: required(self.auth_type, "auth_type")?,
+            auth_validation_public_key_path: required(
+                self.auth_validation_public_key_path,
+                "auth_validation_public_key_path",
+            )?,
+            remote_storage_config: required(self.remote_storage_config, "remote_storage_config")?,
+            id: required(self.id, "id")?,
+        })
+    }
+}
+
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
@@ -144,6 +360,13 @@ pub struct S3Config {
    pub access_key_id: Option<String>,
    /// "Password" to use when connecting to bucket.
    pub secret_access_key: Option<String>,
+    /// A base URL to send S3 requests to.
+    /// By default, the endpoint is derived from a region name, assuming it's
+    /// an AWS S3 region name, erroring on wrong region name.
+    /// Endpoint provides a way to support other S3 flavors and their regions.
+    ///
+    /// Example: `http://127.0.0.1:5000`
+    pub endpoint: Option<String>,
 }

 impl std::fmt::Debug for S3Config {
@@ -169,22 +392,6 @@ impl PageServerConf {
        self.tenants_path().join(tenantid.to_string())
    }

-    pub fn tags_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenant_path(tenantid).join("refs").join("tags")
-    }
-
-    pub fn tag_path(&self, tag_name: &str, tenantid: &ZTenantId) -> PathBuf {
-        self.tags_path(tenantid).join(tag_name)
-    }
-
-    pub fn branches_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenant_path(tenantid).join("refs").join("branches")
-    }
-
-    pub fn branch_path(&self, branch_name: &str, tenantid: &ZTenantId) -> PathBuf {
-        self.branches_path(tenantid).join(branch_name)
-    }
-
    pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
        self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
    }
@@ -193,10 +400,6 @@ impl PageServerConf {
        self.timelines_path(tenantid).join(timelineid.to_string())
    }

-    pub fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
-        self.timeline_path(timelineid, tenantid).join("ancestor")
-    }
-
    //
    // Postgres distribution paths
    //
@@ -214,57 +417,41 @@ impl PageServerConf {
    ///
    /// This leaves any options not present in the file in the built-in defaults.
    pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result<Self> {
-        use defaults::*;
-
-        let mut conf = PageServerConf {
-            workdir: workdir.to_path_buf(),
-
-            listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(),
-            listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(),
-            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_period: humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)?,
-            gc_horizon: DEFAULT_GC_HORIZON,
-            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)?,
-            page_cache_size: DEFAULT_PAGE_CACHE_SIZE,
-            max_file_descriptors: DEFAULT_MAX_FILE_DESCRIPTORS,
-
-            pg_distrib_dir: PathBuf::new(),
-            auth_validation_public_key_path: None,
-            auth_type: AuthType::Trust,
-
-            remote_storage_config: None,
-
-            superuser: DEFAULT_SUPERUSER.to_string(),
-        };
+        let mut builder = PageServerConfigBuilder::default();
+        builder.workdir(workdir.to_owned());

        for (key, item) in toml.iter() {
            match key {
-                "listen_pg_addr" => conf.listen_pg_addr = parse_toml_string(key, item)?,
-                "listen_http_addr" => conf.listen_http_addr = parse_toml_string(key, item)?,
-                "checkpoint_distance" => conf.checkpoint_distance = parse_toml_u64(key, item)?,
-                "checkpoint_period" => conf.checkpoint_period = parse_toml_duration(key, item)?,
-                "gc_horizon" => conf.gc_horizon = parse_toml_u64(key, item)?,
-                "gc_period" => conf.gc_period = parse_toml_duration(key, item)?,
-                "initial_superuser_name" => conf.superuser = parse_toml_string(key, item)?,
-                "page_cache_size" => conf.page_cache_size = parse_toml_u64(key, item)? as usize,
+                "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
+                "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
+                "checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?),
+                "checkpoint_period" => builder.checkpoint_period(parse_toml_duration(key, item)?),
+                "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?),
+                "gc_period" => builder.gc_period(parse_toml_duration(key, item)?),
+                "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
+                "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
+                "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
+                "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize),
                "max_file_descriptors" => {
-                    conf.max_file_descriptors = parse_toml_u64(key, item)? as usize
+                    builder.max_file_descriptors(parse_toml_u64(key, item)? as usize)
                }
                "pg_distrib_dir" => {
-                    conf.pg_distrib_dir = PathBuf::from(parse_toml_string(key, item)?)
+                    builder.pg_distrib_dir(PathBuf::from(parse_toml_string(key, item)?))
                }
-                "auth_validation_public_key_path" => {
-                    conf.auth_validation_public_key_path =
-                        Some(PathBuf::from(parse_toml_string(key, item)?))
-                }
-                "auth_type" => conf.auth_type = parse_toml_auth_type(key, item)?,
+                "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
+                    PathBuf::from(parse_toml_string(key, item)?),
+                )),
+                "auth_type" => builder.auth_type(parse_toml_auth_type(key, item)?),
                "remote_storage" => {
-                    conf.remote_storage_config = Some(Self::parse_remote_storage_config(item)?)
+                    builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?))
                }
+                "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
                _ => bail!("unrecognized pageserver option '{}'", key),
            }
        }

+        let mut conf = builder.build().context("invalid config")?;
+
        if conf.auth_type == AuthType::ZenithJWT {
            let auth_validation_public_key_path = conf
                .auth_validation_public_key_path
@@ -278,9 +465,6 @@ impl PageServerConf {
            );
        }

-        if conf.pg_distrib_dir == PathBuf::new() {
-            conf.pg_distrib_dir = env::current_dir()?.join("tmp_install")
-        };
        if !conf.pg_distrib_dir.join("bin/postgres").exists() {
            bail!(
                "Can't find postgres binary at {}",
@@ -306,9 +490,7 @@ impl PageServerConf {
                })
                .ok()
                .and_then(NonZeroUsize::new)
-                .ok_or_else(|| {
-                    anyhow!("'max_concurrent_sync' must be a non-zero positive integer")
-                })?
+                .context("'max_concurrent_sync' must be a non-zero positive integer")?
        } else {
            NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap()
        };
@@ -321,7 +503,7 @@ impl PageServerConf {
                })
                .ok()
                .and_then(NonZeroU32::new)
-                .ok_or_else(|| anyhow!("'max_sync_errors' must be a non-zero positive integer"))?
+                .context("'max_sync_errors' must be a non-zero positive integer")?
        } else {
            NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap()
        };
@@ -351,6 +533,10 @@ impl PageServerConf {
                    .get("prefix_in_bucket")
                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
                    .transpose()?,
+                endpoint: toml
+                    .get("endpoint")
+                    .map(|endpoint| parse_toml_string("endpoint", endpoint))
+                    .transpose()?,
            }),
            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
                parse_toml_string("local_path", local_path)?,
@@ -373,10 +559,13 @@ impl PageServerConf {
    #[cfg(test)]
    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
        PageServerConf {
+            id: ZNodeId(0),
            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
            checkpoint_period: Duration::from_secs(10),
            gc_horizon: defaults::DEFAULT_GC_HORIZON,
            gc_period: Duration::from_secs(10),
+            wait_lsn_timeout: Duration::from_secs(60),
+            wal_redo_timeout: Duration::from_secs(60),
            page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
            max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
@@ -396,7 +585,7 @@ impl PageServerConf {
 fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
    let s = item
        .as_str()
-        .ok_or_else(|| anyhow!("configure option {} is not a string", name))?;
+        .with_context(|| format!("configure option {} is not a string", name))?;
    Ok(s.to_string())
 }

@@ -405,7 +594,7 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
    // for our use, though.
    let i: i64 = item
        .as_integer()
-        .ok_or_else(|| anyhow!("configure option {} is not an integer", name))?;
+        .with_context(|| format!("configure option {} is not an integer", name))?;
    if i < 0 {
        bail!("configure option {} cannot be negative", name);
    }
@@ -415,7 +604,7 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
 fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
    let s = item
        .as_str()
-        .ok_or_else(|| anyhow!("configure option {} is not a string", name))?;
+        .with_context(|| format!("configure option {} is not a string", name))?;

    Ok(humantime::parse_duration(s)?)
 }
@@ -423,7 +612,7 @@ fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
 fn parse_toml_auth_type(name: &str, item: &Item) -> Result<AuthType> {
    let v = item
        .as_str()
-        .ok_or_else(|| anyhow!("configure option {} is not a string", name))?;
+        .with_context(|| format!("configure option {} is not a string", name))?;
    AuthType::from_str(v)
 }

@@ -447,20 +636,24 @@ checkpoint_period = '111 s'
 gc_period = '222 s'
 gc_horizon = 222

+wait_lsn_timeout = '111 s'
+wal_redo_timeout = '111 s'
+
 page_cache_size = 444
 max_file_descriptors = 333

 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'zzzz'
+id = 10

-    "#;
+"#;

    #[test]
    fn parse_defaults() -> anyhow::Result<()> {
        let tempdir = tempdir()?;
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
        // we have to create dummy pathes to overcome the validation errors
-        let config_string = format!("pg_distrib_dir='{}'", pg_distrib_dir.display());
+        let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display());
        let toml = config_string.parse()?;

        let parsed_config =
@@ -471,12 +664,15 @@ initial_superuser_name = 'zzzz'
        assert_eq!(
            parsed_config,
            PageServerConf {
+                id: ZNodeId(10),
                listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
                listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
                checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
                checkpoint_period: humantime::parse_duration(defaults::DEFAULT_CHECKPOINT_PERIOD)?,
                gc_horizon: defaults::DEFAULT_GC_HORIZON,
                gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?,
+                wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
+                wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
                superuser: defaults::DEFAULT_SUPERUSER.to_string(),
                page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
                max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
@@ -512,12 +708,15 @@ initial_superuser_name = 'zzzz'
        assert_eq!(
            parsed_config,
            PageServerConf {
+                id: ZNodeId(10),
                listen_pg_addr: "127.0.0.1:64000".to_string(),
                listen_http_addr: "127.0.0.1:9898".to_string(),
                checkpoint_distance: 111,
                checkpoint_period: Duration::from_secs(111),
                gc_horizon: 222,
                gc_period: Duration::from_secs(222),
+                wait_lsn_timeout: Duration::from_secs(111),
+                wal_redo_timeout: Duration::from_secs(111),
                superuser: "zzzz".to_string(),
                page_cache_size: 444,
                max_file_descriptors: 333,
@@ -599,6 +798,7 @@ pg_distrib_dir='{}'
        let prefix_in_bucket = "test_prefix".to_string();
        let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string();
        let secret_access_key = "SOMEsEcReTsd292v".to_string();
+        let endpoint = "http://localhost:5000".to_string();
        let max_concurrent_sync = NonZeroUsize::new(111).unwrap();
        let max_sync_errors = NonZeroU32::new(222).unwrap();

@@ -611,12 +811,13 @@ bucket_name = '{}'
 bucket_region = '{}'
 prefix_in_bucket = '{}'
 access_key_id = '{}'
-secret_access_key = '{}'"#,
-                max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key
+secret_access_key = '{}'
+endpoint = '{}'"#,
+                max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
            ),
            format!(
-                "remote_storage={{max_concurrent_sync={}, max_sync_errors={}, bucket_name='{}', bucket_region='{}', prefix_in_bucket='{}', access_key_id='{}', secret_access_key='{}'}}",
-                max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key
+                "remote_storage={{max_concurrent_sync={}, max_sync_errors={}, bucket_name='{}', bucket_region='{}', prefix_in_bucket='{}', access_key_id='{}', secret_access_key='{}', endpoint='{}'}}",
+                max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
            ),
        ];

@@ -650,7 +851,8 @@ pg_distrib_dir='{}'
                        bucket_region: bucket_region.clone(),
                        access_key_id: Some(access_key_id.clone()),
                        secret_access_key: Some(secret_access_key.clone()),
-                        prefix_in_bucket: Some(prefix_in_bucket.clone())
+                        prefix_in_bucket: Some(prefix_in_bucket.clone()),
+                        endpoint: Some(endpoint.clone())
                    }),
                },
                "Remote storage config should correctly parse the S3 config"
--- a/pageserver/src/http/models.rs
+++ b/pageserver/src/http/models.rs
@@ -1,17 +1,188 @@
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use zenith_utils::{
+    lsn::Lsn,
+    zid::{ZNodeId, ZTenantId, ZTimelineId},
+};

-use crate::ZTenantId;
+use crate::timelines::{LocalTimelineInfo, TimelineInfo};

+#[serde_as]
 #[derive(Serialize, Deserialize)]
-pub struct BranchCreateRequest {
-    #[serde(with = "hex")]
-    pub tenant_id: ZTenantId,
-    pub name: String,
-    pub start_point: String,
+pub struct TimelineCreateRequest {
+    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub new_timeline_id: Option<ZTimelineId>,
+    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub ancestor_timeline_id: Option<ZTimelineId>,
+    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub ancestor_start_lsn: Option<Lsn>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateRequest {
-    #[serde(with = "hex")]
-    pub tenant_id: ZTenantId,
+    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub new_tenant_id: Option<ZTenantId>,
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+#[serde(transparent)] // (de)serialized as the bare id string, not as a wrapper object
+pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId);
+
+#[derive(Clone)]
+pub enum TimelineInfoV1 {
+    Local { // produced from a `TimelineInfo` whenever its `local` part is present
+        timeline_id: ZTimelineId,
+        tenant_id: ZTenantId,
+        last_record_lsn: Lsn,
+        prev_record_lsn: Option<Lsn>,
+        ancestor_timeline_id: Option<ZTimelineId>,
+        ancestor_lsn: Option<Lsn>,
+        disk_consistent_lsn: Lsn,
+        current_logical_size: Option<usize>,
+        current_logical_size_non_incremental: Option<usize>,
+    },
+    Remote { // produced from a `TimelineInfo` that has only a `remote` part
+        timeline_id: ZTimelineId,
+        tenant_id: ZTenantId,
+        disk_consistent_lsn: Lsn, // the remote consistent LSN, or `Lsn(0)` when that is absent
+    },
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub struct TimelineInfoResponseV1 {
+    pub kind: String, // "Local" or "Remote"; `TryFrom` into `TimelineInfoV1` rejects anything else
+    #[serde_as(as = "DisplayFromStr")]
+    timeline_id: ZTimelineId,
+    #[serde_as(as = "DisplayFromStr")]
+    tenant_id: ZTenantId,
+    #[serde_as(as = "DisplayFromStr")]
+    disk_consistent_lsn: Lsn,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    last_record_lsn: Option<Lsn>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    prev_record_lsn: Option<Lsn>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    ancestor_timeline_id: Option<ZTimelineId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    ancestor_lsn: Option<Lsn>,
+    current_logical_size: Option<usize>,
+    current_logical_size_non_incremental: Option<usize>,
+}
+
+impl From<TimelineInfoV1> for TimelineInfoResponseV1 {
+    fn from(other: TimelineInfoV1) -> Self {
+        match other {
+            TimelineInfoV1::Local {
+                timeline_id,
+                tenant_id,
+                last_record_lsn,
+                prev_record_lsn,
+                ancestor_timeline_id,
+                ancestor_lsn,
+                disk_consistent_lsn,
+                current_logical_size,
+                current_logical_size_non_incremental,
+            } => TimelineInfoResponseV1 {
+                kind: "Local".to_owned(), // must stay in sync with the strings matched in `TryFrom`
+                timeline_id,
+                tenant_id,
+                disk_consistent_lsn,
+                last_record_lsn: Some(last_record_lsn),
+                prev_record_lsn,
+                ancestor_timeline_id,
+                ancestor_lsn,
+                current_logical_size,
+                current_logical_size_non_incremental,
+            },
+            TimelineInfoV1::Remote {
+                timeline_id,
+                tenant_id,
+                disk_consistent_lsn,
+            } => TimelineInfoResponseV1 {
+                kind: "Remote".to_owned(), // must stay in sync with the strings matched in `TryFrom`
+                timeline_id,
+                tenant_id,
+                disk_consistent_lsn,
+                last_record_lsn: None, // `Remote` carries none of the remaining fields
+                prev_record_lsn: None,
+                ancestor_timeline_id: None,
+                ancestor_lsn: None,
+                current_logical_size: None,
+                current_logical_size_non_incremental: None,
+            },
+        }
+    }
+}
+
+impl TryFrom<TimelineInfoResponseV1> for TimelineInfoV1 {
+    type Error = anyhow::Error;
+    /// Errors on an unknown `kind`, or on a "Local" entry without `last_record_lsn`.
+    fn try_from(other: TimelineInfoResponseV1) -> anyhow::Result<Self> {
+        Ok(match other.kind.as_str() {
+            "Local" => TimelineInfoV1::Local {
+                timeline_id: other.timeline_id,
+                tenant_id: other.tenant_id,
+                last_record_lsn: other.last_record_lsn.ok_or_else(|| {
+                    anyhow::anyhow!("Local timeline should have last_record_lsn")
+                })?,
+                prev_record_lsn: other.prev_record_lsn,
+                ancestor_timeline_id: other.ancestor_timeline_id,
+                ancestor_lsn: other.ancestor_lsn,
+                disk_consistent_lsn: other.disk_consistent_lsn,
+                current_logical_size: other.current_logical_size,
+                current_logical_size_non_incremental: other.current_logical_size_non_incremental,
+            },
+            "Remote" => TimelineInfoV1::Remote {
+                timeline_id: other.timeline_id,
+                tenant_id: other.tenant_id,
+                disk_consistent_lsn: other.disk_consistent_lsn,
+            },
+            unknown => anyhow::bail!("Unknown timeline kind: {}", unknown),
+        })
+    }
+}
+
+/// Builds the `Local` variant from the ids plus the fields copied out of `local`.
+fn from_local(
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,
+    local: &LocalTimelineInfo,
+) -> TimelineInfoV1 {
+    TimelineInfoV1::Local {
+        timeline_id,
+        tenant_id,
+        last_record_lsn: local.last_record_lsn,
+        prev_record_lsn: local.prev_record_lsn,
+        ancestor_timeline_id: local.ancestor_timeline_id.map(ZTimelineId::from),
+        ancestor_lsn: local.ancestor_lsn,
+        disk_consistent_lsn: local.disk_consistent_lsn,
+        current_logical_size: local.current_logical_size,
+        current_logical_size_non_incremental: local.current_logical_size_non_incremental,
+    }
+}
+
+impl From<TimelineInfo> for TimelineInfoV1 {
+    fn from(t: TimelineInfo) -> Self {
+        match (t.local.as_ref(), t.remote.as_ref()) {
+            // Local info wins whether or not remote info is also present.
+            (Some(local), _) => from_local(t.tenant_id, t.timeline_id, local),
+            (None, Some(remote)) => TimelineInfoV1::Remote {
+                timeline_id: t.timeline_id,
+                tenant_id: t.tenant_id,
+                disk_consistent_lsn: remote.remote_consistent_lsn.unwrap_or(Lsn(0)),
+            },
+            (None, None) => unreachable!(), // assumes at least one side is always set; TODO confirm
+        }
+    }
+}
+
+#[derive(Serialize)]
+pub struct StatusResponse {
+    pub id: ZNodeId,
 }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -17,7 +17,12 @@ paths:
            application/json:
              schema:
                type: object
-  /v1/timeline/{tenant_id}:
+                required:
+                - id
+                properties:
+                  id:
+                    type: integer
+  /v1/tenant/{tenant_id}/timeline:
    parameters:
      - name: tenant_id
        in: path
@@ -25,19 +30,22 @@ paths:
        schema:
          type: string
          format: hex
+      - name: include-non-incremental-logical-size
+        in: query
+        schema:
+          type: string
+          description: Controls calculation of current_logical_size_non_incremental
    get:
-      description: List tenant timelines
+      description: Get timelines for tenant
      responses:
        "200":
-          description: array of brief timeline descriptions
+          description: TimelineInfo
          content:
            application/json:
              schema:
                type: array
                items:
-                  # currently, just a timeline id string, but when remote index gets to be accessed
-                  # remote/local timeline field would be added at least
-                  type: string
+                  $ref: "#/components/schemas/TimelineInfo"
        "400":
          description: Error when no tenant id found in path
          content:
@@ -62,7 +70,7 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-  /v1/timeline/{tenant_id}/{timeline_id}:
+  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
    parameters:
      - name: tenant_id
        in: path
@@ -76,8 +84,13 @@ paths:
        schema:
          type: string
          format: hex
+      - name: include-non-incremental-logical-size
+        in: query
+        schema:
+          type: string
+          description: Controls calculation of current_logical_size_non_incremental
    get:
-      description: Get timeline info for tenant's remote timeline
+      description: Get info about the timeline
      responses:
        "200":
          description: TimelineInfo
@@ -86,7 +99,7 @@ paths:
              schema:
                $ref: "#/components/schemas/TimelineInfo"
        "400":
-          description: Error when no tenant id found in path or no branch name
+          description: Error when no tenant id found in path or no timeline id
          content:
            application/json:
              schema:
@@ -109,7 +122,7 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-  /v1/branch/{tenant_id}:
+  /v1/tenant/{tenant_id}/timeline/:
    parameters:
      - name: tenant_id
        in: path
@@ -117,128 +130,33 @@ paths:
        schema:
          type: string
          format: hex
-      - name: include-non-incremental-logical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_logical_size_non_incremental
-    get:
-      description: Get branches for tenant
-      responses:
-        "200":
-          description: BranchInfo
-          content:
-            application/json:
-              schema:
-                type: array
-                items:
-                  $ref: "#/components/schemas/BranchInfo"
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-  /v1/branch/{tenant_id}/{branch_name}:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-      - name: branch_name
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: include-non-incremental-logical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_logical_size_non_incremental
-    get:
-      description: Get branches for tenant
-      responses:
-        "200":
-          description: BranchInfo
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/BranchInfo"
-        "400":
-          description: Error when no tenant id found in path or no branch name
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-  /v1/branch/:
    post:
-      description: Create branch
+      description: |
+        Create a timeline. Returns the new timeline's info on success.\
+        If no new timeline id is specified in parameters, one is generated. It's an error to recreate the same timeline.
      requestBody:
        content:
          application/json:
            schema:
              type: object
-              required:
-                - "tenant_id"
-                - "name"
-                - "start_point"
              properties:
-                tenant_id:
+                new_timeline_id:
                  type: string
                  format: hex
-                name:
+                ancestor_timeline_id:
                  type: string
-                start_point:
+                  format: hex
+                ancestor_start_lsn:
                  type: string
      responses:
        "201":
-          description: BranchInfo
+          description: TimelineInfo
          content:
            application/json:
              schema:
-                type: array
-                items:
-                  $ref: "#/components/schemas/BranchInfo"
+                $ref: "#/components/schemas/TimelineInfo"
        "400":
-          description: Malformed branch create request
+          description: Malformed timeline create request
          content:
            application/json:
              schema:
@@ -255,6 +173,12 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ForbiddenError"
+        "409":
+          description: Timeline already exists, creation skipped
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AlreadyExistsError"
        "500":
          description: Generic operation error
          content:
@@ -292,27 +216,26 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"
    post:
-      description: Create tenant
+      description: |
+        Create a tenant. Returns new tenant id on success.\
+        If no new tenant id is specified in parameters, one is generated. It's an error to recreate the same tenant.
      requestBody:
        content:
          application/json:
            schema:
              type: object
-              required:
-                - "tenant_id"
              properties:
-                tenant_id:
+                new_tenant_id:
                  type: string
                  format: hex
      responses:
        "201":
-          description: CREATED
+          description: New tenant created successfully
          content:
            application/json:
              schema:
-                type: array
-                items:
-                  type: string
+                type: string
+                format: hex
        "400":
          description: Malformed tenant create request
          content:
@@ -331,6 +254,12 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ForbiddenError"
+        "409":
+          description: Tenant already exists, creation skipped
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AlreadyExistsError"
        "500":
          description: Generic operation error
          content:
@@ -355,35 +284,11 @@ components:
          type: string
        state:
          type: string
-    BranchInfo:
-      type: object
-      required:
-        - name
-        - timeline_id
-        - latest_valid_lsn
-        - current_logical_size
-      properties:
-        name:
-          type: string
-        timeline_id:
-          type: string
-          format: hex
-        ancestor_id:
-          type: string
-        ancestor_lsn:
-          type: string
-        current_logical_size:
-          type: integer
-        current_logical_size_non_incremental:
-          type: integer
    TimelineInfo:
      type: object
      required:
        - timeline_id
        - tenant_id
-        - last_record_lsn
-        - prev_record_lsn
-        - start_lsn
        - disk_consistent_lsn
      properties:
        timeline_id:
@@ -392,19 +297,21 @@ components:
        tenant_id:
          type: string
          format: hex
-        ancestor_timeline_id:
-          type: string
-          format: hex
        last_record_lsn:
          type: string
        prev_record_lsn:
          type: string
-        start_lsn:
+        ancestor_timeline_id:
+          type: string
+          format: hex
+        ancestor_lsn:
          type: string
        disk_consistent_lsn:
          type: string
-        timeline_state:
-          type: string
+        current_logical_size:
+          type: integer
+        current_logical_size_non_incremental:
+          type: integer

    Error:
      type: object
@@ -420,6 +327,13 @@ components:
      properties:
        msg:
          type: string
+    AlreadyExistsError:
+      type: object
+      required:
+        - msg
+      properties:
+        msg:
+          type: string
    ForbiddenError:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,11 +1,9 @@
 use std::sync::Arc;

-use anyhow::{Context, Result};
-use hyper::header;
+use anyhow::Result;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
-use routerify::{ext::RequestExt, RouterBuilder};
-use serde::Serialize;
+use tokio::sync::RwLock;
 use tracing::*;
 use zenith_utils::auth::JwtAuth;
 use zenith_utils::http::endpoint::attach_openapi_ui;
@@ -16,27 +14,35 @@ use zenith_utils::http::{
    endpoint,
    error::HttpErrorBody,
    json::{json_request, json_response},
-    request::get_request_param,
    request::parse_request_param,
 };
-use zenith_utils::lsn::Lsn;
-use zenith_utils::zid::{opt_display_serde, ZTimelineId};
+use zenith_utils::http::{RequestExt, RouterBuilder};
+use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId};

-use super::models::BranchCreateRequest;
-use super::models::TenantCreateRequest;
-use crate::branches::BranchInfo;
-use crate::repository::TimelineSyncState;
-use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId};
+use super::models::{
+    StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest,
+    TimelineInfoResponseV1, TimelineInfoV1,
+};
+use crate::remote_storage::{schedule_timeline_download, RemoteTimelineIndex};
+use crate::timelines::{
+    extract_remote_timeline_info, LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo,
+};
+use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};

 #[derive(Debug)]
 struct State {
    conf: &'static PageServerConf,
    auth: Option<Arc<JwtAuth>>,
+    remote_index: Arc<RwLock<RemoteTimelineIndex>>,
    allowlist_routes: Vec<Uri>,
 }

 impl State {
-    fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
+    fn new(
+        conf: &'static PageServerConf,
+        auth: Option<Arc<JwtAuth>>,
+        remote_index: Arc<RwLock<RemoteTimelineIndex>>,
+    ) -> Self {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
            .iter()
            .map(|v| v.parse().unwrap())
@@ -45,6 +51,7 @@ impl State {
            conf,
            auth,
            allowlist_routes,
+            remote_index,
        }
    }
 }
@@ -63,31 +70,62 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
 }

 // healthcheck handler
-async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
-    Ok(Response::builder()
-        .status(StatusCode::OK)
-        .header(header::CONTENT_TYPE, "application/json")
-        .body(Body::from("{}"))
-        .map_err(ApiError::from_err)?)
+async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let config = get_config(&request);
+    Ok(json_response(
+        StatusCode::OK,
+        StatusResponse { id: config.id },
+    )?)
 }

-async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let request_data: BranchCreateRequest = json_request(&mut request).await?;
+async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    let request_data: TimelineCreateRequest = json_request(&mut request).await?;

-    check_permission(&request, Some(request_data.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;

-    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
-        branches::create_branch(
+    let new_timeline_info = tokio::task::spawn_blocking(move || {
+        let _enter = info_span!("/timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn).entered();
+        timelines::create_timeline(
            get_config(&request),
-            &request_data.name,
-            &request_data.start_point,
-            &request_data.tenant_id,
+            tenant_id,
+            request_data.new_timeline_id.map(ZTimelineId::from),
+            request_data.ancestor_timeline_id.map(ZTimelineId::from),
+            request_data.ancestor_start_lsn,
        )
    })
    .await
    .map_err(ApiError::from_err)??;
-    Ok(json_response(StatusCode::CREATED, response_data)?)
+
+    Ok(match new_timeline_info {
+        Some(info) => json_response(StatusCode::CREATED, info)?,
+        None => json_response(StatusCode::CONFLICT, ())?,
+    })
+}
+
+async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
+    let local_timeline_infos = tokio::task::spawn_blocking(move || {
+        let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
+        crate::timelines::get_local_timelines(tenant_id, include_non_incremental_logical_size)
+    })
+    .await
+    .map_err(ApiError::from_err)??;
+
+    let remote_index = get_state(&request).remote_index.read().await;
+    let mut response_data = Vec::with_capacity(local_timeline_infos.len());
+    for (timeline_id, local_timeline_info) in local_timeline_infos {
+        response_data.push(TimelineInfo {
+            tenant_id,
+            timeline_id,
+            local: Some(local_timeline_info),
+            remote: extract_remote_timeline_info(tenant_id, timeline_id, &remote_index),
+        })
+    }
+
+    Ok(json_response(StatusCode::OK, response_data)?)
 }

 // Gate non incremental logical size calculation behind a flag
@@ -105,146 +143,134 @@ fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
        .unwrap_or(false)
 }

-async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
-
-    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
-
-    check_permission(&request, Some(tenantid))?;
-
-    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("branch_list", tenant = %tenantid).entered();
-        crate::branches::get_branches(
-            get_config(&request),
-            &tenantid,
-            include_non_incremental_logical_size,
-        )
-    })
-    .await
-    .map_err(ApiError::from_err)??;
-    Ok(json_response(StatusCode::OK, response_data)?)
-}
-
-async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
-    let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
-    let conf = get_state(&request).conf;
-    let path = conf.branch_path(&branch_name, &tenantid);
-
-    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
-
-    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
-        let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-        BranchInfo::from_path(path, &repo, include_non_incremental_logical_size)
-    })
-    .await
-    .map_err(ApiError::from_err)??;
-
-    Ok(json_response(StatusCode::OK, response_data)?)
-}
-
-async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+// common part for v1 and v2 handlers
+async fn timeline_detail_common(request: Request<Body>) -> Result<TimelineInfo, ApiError> {
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let conf = get_state(&request).conf;
-    let timelines_dir = conf.timelines_path(&tenant_id);
+    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
+    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);

-    let mut timelines_dir_contents =
-        tokio::fs::read_dir(&timelines_dir).await.with_context(|| {
-            format!(
-                "Failed to list timelines dir '{}' contents",
-                timelines_dir.display()
-            )
-        })?;
+    let span = info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id);

-    let mut local_timelines = Vec::new();
-    while let Some(entry) = timelines_dir_contents.next_entry().await.with_context(|| {
-        format!(
-            "Failed to list timelines dir '{}' contents",
-            timelines_dir.display()
-        )
-    })? {
-        let entry_path = entry.path();
-        let entry_type = entry.file_type().await.with_context(|| {
-            format!(
-                "Failed to get file type of timeline dirs' entry '{}'",
-                entry_path.display()
-            )
-        })?;
+    let (local_timeline_info, span) = tokio::task::spawn_blocking(move || {
+        let entered = span.entered();
+        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
+        let local_timeline = {
+            repo.get_timeline(timeline_id)
+                .map(|timeline| {
+                    LocalTimelineInfo::from_repo_timeline(
+                        timeline,
+                        include_non_incremental_logical_size,
+                    )
+                })
+                .transpose()?
+        };
+        Ok::<_, anyhow::Error>((local_timeline, entered.exit()))
+    })
+    .await
+    .map_err(ApiError::from_err)??;

-        if entry_type.is_dir() {
-            match entry.file_name().to_string_lossy().parse::<ZTimelineId>() {
-                Ok(timeline_id) => local_timelines.push(timeline_id.to_string()),
-                Err(e) => error!(
-                    "Failed to get parse timeline id from timeline dirs' entry '{}': {}",
-                    entry_path.display(),
-                    e
-                ),
-            }
-        }
+    let remote_timeline_info = {
+        let remote_index_read = get_state(&request).remote_index.read().await;
+        remote_index_read
+            .timeline_entry(&ZTenantTimelineId {
+                tenant_id,
+                timeline_id,
+            })
+            .map(|remote_entry| RemoteTimelineInfo {
+                remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
+                awaits_download: remote_entry.get_awaits_download(),
+            })
+    };
+
+    let _enter = span.entered();
+
+    if local_timeline_info.is_none() && remote_timeline_info.is_none() {
+        return Err(ApiError::NotFound(
+            "Timeline is not found neither locally nor remotely".to_string(),
+        ));
    }

-    Ok(json_response(StatusCode::OK, local_timelines)?)
+    Ok(TimelineInfo {
+        tenant_id,
+        timeline_id,
+        local: local_timeline_info,
+        remote: remote_timeline_info,
+    })
 }

-#[derive(Debug, Serialize)]
-#[serde(tag = "type")]
-enum TimelineInfo {
-    Local {
-        #[serde(with = "hex")]
-        timeline_id: ZTimelineId,
-        #[serde(with = "hex")]
-        tenant_id: ZTenantId,
-        #[serde(with = "opt_display_serde")]
-        ancestor_timeline_id: Option<ZTimelineId>,
-        last_record_lsn: Lsn,
-        prev_record_lsn: Lsn,
-        start_lsn: Lsn,
-        disk_consistent_lsn: Lsn,
-        timeline_state: Option<TimelineSyncState>,
-    },
-    Remote {
-        #[serde(with = "hex")]
-        timeline_id: ZTimelineId,
-        #[serde(with = "hex")]
-        tenant_id: ZTenantId,
-    },
+// TODO remove when console adopts v2
+async fn timeline_detail_handler_v1(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let timeline_info = timeline_detail_common(request).await?;
+    Ok(json_response(
+        StatusCode::OK,
+        TimelineInfoResponseV1::from(TimelineInfoV1::from(timeline_info)),
+    )?)
 }

-async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_detail_handler_v2(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let timeline_info = timeline_detail_common(request).await?;
+
+    Ok(json_response(StatusCode::OK, timeline_info)?)
+}
+
+async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
+    let span = info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id);
+
+    let span = tokio::task::spawn_blocking(move || {
+        let entered = span.entered();
+        if tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).is_ok() {
+            anyhow::bail!("Timeline is already present locally")
+        };
+        Ok(entered.exit())
+    })
+    .await
+    .map_err(ApiError::from_err)??;
+
+    let mut remote_index_write = get_state(&request).remote_index.write().await;
+
+    let _enter = span.entered(); // entered guard cannot live across awaits (non Send)
+    let index_entry = remote_index_write
+        .timeline_entry_mut(&ZTenantTimelineId {
+            tenant_id,
+            timeline_id,
+        })
+        .ok_or_else(|| ApiError::BadRequest("Unknown remote timeline".to_string()))?;
+
+    if index_entry.get_awaits_download() {
+        return Err(ApiError::NotFound(
+            "Timeline download is already in progress".to_string(),
+        ));
+    }
+
+    index_entry.set_awaits_download(true);
+    schedule_timeline_download(tenant_id, timeline_id);
+
+    Ok(json_response(StatusCode::ACCEPTED, ())?)
+}
+
+async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;

-    let response_data = tokio::task::spawn_blocking(move || {
+    tokio::task::spawn_blocking(move || {
        let _enter =
-            info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)
+            info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
                .entered();
        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
-        Ok::<_, anyhow::Error>(match repo.get_timeline(timeline_id)?.local_timeline() {
-            None => TimelineInfo::Remote {
-                timeline_id,
-                tenant_id,
-            },
-            Some(timeline) => TimelineInfo::Local {
-                timeline_id,
-                tenant_id,
-                ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
-                disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
-                last_record_lsn: timeline.get_last_record_lsn(),
-                prev_record_lsn: timeline.get_prev_record_lsn(),
-                start_lsn: timeline.get_start_lsn(),
-                timeline_state: repo.get_timeline_state(timeline_id),
-            },
-        })
+        repo.detach_timeline(timeline_id)
    })
    .await
    .map_err(ApiError::from_err)??;

-    Ok(json_response(StatusCode::OK, response_data)?)
+    Ok(json_response(StatusCode::OK, ())?)
 }

 async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -266,14 +292,25 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    check_permission(&request, None)?;

    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    let remote_index = Arc::clone(&get_state(&request).remote_index);

-    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
-        tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
+    let target_tenant_id = request_data
+        .new_tenant_id
+        .map(ZTenantId::from)
+        .unwrap_or_else(ZTenantId::generate);
+
+    let new_tenant_id = tokio::task::spawn_blocking(move || {
+        let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
+
+        tenant_mgr::create_tenant_repository(get_config(&request), target_tenant_id, remote_index)
    })
    .await
    .map_err(ApiError::from_err)??;
-    Ok(json_response(StatusCode::CREATED, response_data)?)
+
+    Ok(match new_tenant_id {
+        Some(id) => json_response(StatusCode::CREATED, TenantCreateResponse(id))?,
+        None => json_response(StatusCode::CONFLICT, ())?,
+    })
 }

 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -286,6 +323,7 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
 pub fn make_router(
    conf: &'static PageServerConf,
    auth: Option<Arc<JwtAuth>>,
+    remote_index: Arc<RwLock<RemoteTimelineIndex>>,
 ) -> RouterBuilder<hyper::Body, ApiError> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -301,17 +339,27 @@ pub fn make_router(
    }

    router
-        .data(Arc::new(State::new(conf, auth)))
+        .data(Arc::new(State::new(conf, auth, remote_index)))
        .get("/v1/status", status_handler)
-        .get("/v1/timeline/:tenant_id", timeline_list_handler)
-        .get(
-            "/v1/timeline/:tenant_id/:timeline_id",
-            timeline_detail_handler,
-        )
-        .get("/v1/branch/:tenant_id", branch_list_handler)
-        .get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler)
-        .post("/v1/branch", branch_create_handler)
        .get("/v1/tenant", tenant_list_handler)
        .post("/v1/tenant", tenant_create_handler)
+        .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
+        .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id",
+            timeline_detail_handler_v1,
+        )
+        .get(
+            "/v2/tenant/:tenant_id/timeline/:timeline_id",
+            timeline_detail_handler_v2,
+        )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
+            timeline_attach_handler,
+        )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
+            timeline_detach_handler,
+        )
        .any(handler_404)
 }
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -7,7 +7,7 @@ use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::path::{Path, PathBuf};

-use anyhow::{anyhow, bail, ensure, Result};
+use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use tracing::*;

@@ -126,7 +126,7 @@ pub fn import_timeline_from_postgres_datadir(
    writer.advance_last_record_lsn(lsn);

    // We expect the Postgres server to be shut down cleanly.
-    let pg_control = pg_control.ok_or_else(|| anyhow!("pg_control file not found"))?;
+    let pg_control = pg_control.context("pg_control file not found")?;
    ensure!(
        pg_control.state == DBState_DB_SHUTDOWNED,
        "Postgres cluster was not shut down cleanly"
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -169,7 +169,7 @@ impl DeltaLayerInner {
        if let Some((_entry_lsn, entry)) = slice.last() {
            Ok(*entry)
        } else {
-            Err(anyhow::anyhow!("could not find seg size in delta layer"))
+            bail!("could not find seg size in delta layer")
        }
    }
 }
@@ -208,16 +208,15 @@ impl Layer for DeltaLayer {
        &self,
        blknum: SegmentBlk,
        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult> {
        let mut need_image = true;

        assert!((0..RELISH_SEG_SIZE).contains(&blknum));

-        match &cached_img_lsn {
-            Some(cached_lsn) if &self.end_lsn <= cached_lsn => {
-                return Ok(PageReconstructResult::Cached)
+        match &reconstruct_data.page_img {
+            Some((cached_lsn, _)) if &self.end_lsn <= cached_lsn => {
+                return Ok(PageReconstructResult::Complete)
            }
            _ => {}
        }
@@ -240,9 +239,9 @@ impl Layer for DeltaLayer {
                .iter()
                .rev();
            for ((_blknum, pv_lsn), blob_range) in iter {
-                match &cached_img_lsn {
-                    Some(cached_lsn) if pv_lsn <= cached_lsn => {
-                        return Ok(PageReconstructResult::Cached)
+                match &reconstruct_data.page_img {
+                    Some((cached_lsn, _)) if pv_lsn <= cached_lsn => {
+                        return Ok(PageReconstructResult::Complete)
                    }
                    _ => {}
                }
@@ -252,7 +251,7 @@ impl Layer for DeltaLayer {
                match pv {
                    PageVersion::Page(img) => {
                        // Found a page image, return it
-                        reconstruct_data.page_img = Some(img);
+                        reconstruct_data.page_img = Some((*pv_lsn, img));
                        need_image = false;
                        break;
                    }
--- a/pageserver/src/layered_repository/ephemeral_file.rs
+++ b/pageserver/src/layered_repository/ephemeral_file.rs
@@ -175,7 +175,10 @@ impl Write for EphemeralFile {
    }

    fn flush(&mut self) -> Result<(), std::io::Error> {
-        todo!()
+        // we don't need to flush data:
+        // * we either write input bytes or not, not keeping any intermediate data buffered
+        // * rust unix file `flush` impl does not flush things either, returning `Ok(())`
+        Ok(())
    }
 }

--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -145,14 +145,15 @@ impl Layer for ImageLayer {
        &self,
        blknum: SegmentBlk,
        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult> {
        assert!((0..RELISH_SEG_SIZE).contains(&blknum));
        assert!(lsn >= self.lsn);

-        match cached_img_lsn {
-            Some(cached_lsn) if self.lsn <= cached_lsn => return Ok(PageReconstructResult::Cached),
+        match reconstruct_data.page_img {
+            Some((cached_lsn, _)) if self.lsn <= cached_lsn => {
+                return Ok(PageReconstructResult::Complete)
+            }
            _ => {}
        }

@@ -173,7 +174,14 @@ impl Layer for ImageLayer {
                    .as_ref()
                    .unwrap()
                    .chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
-                chapter.read_exact_at(&mut buf, offset)?;
+
+                chapter.read_exact_at(&mut buf, offset).with_context(|| {
+                    format!(
+                        "failed to read page from data file {} at offset {}",
+                        self.filename().display(),
+                        offset
+                    )
+                })?;

                buf
            }
@@ -188,7 +196,7 @@ impl Layer for ImageLayer {
            }
        };

-        reconstruct_data.page_img = Some(Bytes::from(buf));
+        reconstruct_data.page_img = Some((self.lsn, Bytes::from(buf)));
        Ok(PageReconstructResult::Complete)
    }

--- a/pageserver/src/layered_repository/inmemory_layer.rs
+++ b/pageserver/src/layered_repository/inmemory_layer.rs
@@ -20,13 +20,15 @@ use crate::{ZTenantId, ZTimelineId};
 use anyhow::{ensure, Result};
 use bytes::Bytes;
 use log::*;
+use std::collections::HashMap;
+use std::io::Seek;
+use std::os::unix::fs::FileExt;
 use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
+use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;
 use zenith_utils::vec_map::VecMap;

-use super::page_versions::PageVersions;
-
 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
@@ -39,8 +41,20 @@ pub struct InMemoryLayer {
    ///
    start_lsn: Lsn,

-    /// LSN of the oldest page version stored in this layer
-    oldest_pending_lsn: Lsn,
+    ///
+    /// LSN of the oldest page version stored in this layer.
+    ///
+    /// This is different from 'start_lsn' in that we enforce that the 'start_lsn'
+    /// of a layer always matches the 'end_lsn' of its predecessor, even if there
+    /// are no page versions until at a later LSN. That way you can detect any
+    /// missing layer files more easily. 'oldest_lsn' is the first page version
+    /// actually stored in this layer. In the range between 'start_lsn' and
+    /// 'oldest_lsn', there are no changes to the segment.
+    /// 'oldest_lsn' is used to adjust 'disk_consistent_lsn' and that is why it should
+    /// point to the beginning of WAL record. This is the other difference with 'start_lsn'
+    /// which points to end of WAL record. This is why 'oldest_lsn' can be smaller than 'start_lsn'.
+    ///
+    oldest_lsn: Lsn,

    /// The above fields never change. The parts that do change are in 'inner',
    /// and protected by mutex.
@@ -59,11 +73,15 @@ pub struct InMemoryLayerInner {
    /// The drop LSN is recorded in [`end_lsn`].
    dropped: bool,

-    ///
-    /// All versions of all pages in the layer are are kept here.
-    /// Indexed by block number and LSN.
-    ///
-    page_versions: PageVersions,
+    /// The PageVersion structs are stored in a serialized format in this file.
+    /// Each serialized PageVersion is preceded by a 'u32' length field.
+    /// 'page_versions' map stores offsets into this file.
+    file: EphemeralFile,
+
+    /// Metadata about all versions of all pages in the layer is kept
+    /// here.  Indexed by block number and LSN. The value is an offset
+    /// into the ephemeral file where the page version is stored.
+    page_versions: HashMap<SegmentBlk, VecMap<Lsn, u64>>,

    ///
    /// `seg_sizes` tracks the size of the segment at different points in time.
@@ -73,6 +91,14 @@ pub struct InMemoryLayerInner {
    /// a non-blocky rel, 'seg_sizes' is not used and is always empty.
    ///
    seg_sizes: VecMap<Lsn, SegmentBlk>,
+
+    ///
+    /// LSN of the newest page version stored in this layer.
+    ///
+    /// The difference between 'end_lsn' and 'latest_lsn' is the same as between
+    /// 'start_lsn' and 'oldest_lsn'. See comments in 'oldest_lsn'.
+    ///
+    latest_lsn: Lsn,
 }

 impl InMemoryLayerInner {
@@ -91,6 +117,50 @@ impl InMemoryLayerInner {
            panic!("could not find seg size in in-memory layer");
        }
    }
+
+    ///
+    /// Read a page version from the ephemeral file.
+    ///
+    fn read_pv(&self, off: u64) -> Result<PageVersion> {
+        let mut buf = Vec::new();
+        self.read_pv_bytes(off, &mut buf)?;
+        Ok(PageVersion::des(&buf)?)
+    }
+
+    ///
+    /// Read a page version from the ephemeral file, as raw bytes, at
+    /// the given offset.  The bytes are read into 'buf', which is
+    /// expanded if necessary. Returns the size of the page version.
+    ///
+    fn read_pv_bytes(&self, off: u64, buf: &mut Vec<u8>) -> Result<usize> {
+        // read the 4-byte, native-endian length field stored at 'off'
+        let mut lenbuf = [0u8; 4];
+        self.file.read_exact_at(&mut lenbuf, off)?;
+        let len = u32::from_ne_bytes(lenbuf) as usize;
+        // Grow 'buf' if it is too small; it is never shrunk, so a reused buffer is not resized repeatedly.
+        if buf.len() < len {
+            buf.resize(len, 0);
+        }
+        self.file.read_exact_at(&mut buf[0..len], off + 4)?;
+        Ok(len)
+    }
+
+    /// Append 'pv' to the ephemeral file as a 4-byte length field followed by the
+    /// serialized page version. Returns the offset of the length field.
+    fn write_pv(&mut self, pv: &PageVersion) -> Result<u64> {
+        // remember starting position
+        let pos = self.file.stream_position()?;
+
+        // skip over the 'length' field for now; it is filled in below, once the size is known.
+        self.file.seek(std::io::SeekFrom::Start(pos + 4))?;
+        pv.ser_into(&mut self.file)?;
+
+        // write the 'length' field. It is only 4 bytes wide, so refuse to truncate.
+        let len = self.file.stream_position()? - pos - 4;
+        ensure!(len <= u64::from(u32::MAX), "page version too large: {} bytes", len);
+        self.file.write_all_at(&(len as u32).to_ne_bytes(), pos)?;
+        Ok(pos)
+    }
 }

 impl Layer for InMemoryLayer {
@@ -100,12 +170,11 @@ impl Layer for InMemoryLayer {
    fn filename(&self) -> PathBuf {
        let inner = self.inner.read().unwrap();

-        let end_lsn;
-        if let Some(drop_lsn) = inner.end_lsn {
-            end_lsn = drop_lsn;
+        let end_lsn = if let Some(drop_lsn) = inner.end_lsn {
+            drop_lsn
        } else {
-            end_lsn = Lsn(u64::MAX);
-        }
+            Lsn(u64::MAX)
+        };

        let delta_filename = DeltaFileName {
            seg: self.seg,
@@ -154,7 +223,6 @@ impl Layer for InMemoryLayer {
        &self,
        blknum: SegmentBlk,
        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult> {
        let mut need_image = true;
@@ -165,33 +233,31 @@ impl Layer for InMemoryLayer {
            let inner = self.inner.read().unwrap();

            // Scan the page versions backwards, starting from `lsn`.
-            let iter = inner
-                .page_versions
-                .get_block_lsn_range(blknum, ..=lsn)
-                .iter()
-                .rev();
-            for (entry_lsn, pos) in iter {
-                match &cached_img_lsn {
-                    Some(cached_lsn) if entry_lsn <= cached_lsn => {
-                        return Ok(PageReconstructResult::Cached)
+            if let Some(vec_map) = inner.page_versions.get(&blknum) {
+                let slice = vec_map.slice_range(..=lsn);
+                for (entry_lsn, pos) in slice.iter().rev() {
+                    match &reconstruct_data.page_img {
+                        Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
+                            return Ok(PageReconstructResult::Complete)
+                        }
+                        _ => {}
                    }
-                    _ => {}
-                }

-                let pv = inner.page_versions.read_pv(*pos)?;
-                match pv {
-                    PageVersion::Page(img) => {
-                        reconstruct_data.page_img = Some(img);
-                        need_image = false;
-                        break;
-                    }
-                    PageVersion::Wal(rec) => {
-                        reconstruct_data.records.push((*entry_lsn, rec.clone()));
-                        if rec.will_init() {
-                            // This WAL record initializes the page, so no need to go further back
+                    let pv = inner.read_pv(*pos)?;
+                    match pv {
+                        PageVersion::Page(img) => {
+                            reconstruct_data.page_img = Some((*entry_lsn, img));
                            need_image = false;
                            break;
                        }
+                        PageVersion::Wal(rec) => {
+                            reconstruct_data.records.push((*entry_lsn, rec.clone()));
+                            if rec.will_init() {
+                                // This WAL record initializes the page, so no need to go further back
+                                need_image = false;
+                                break;
+                            }
+                        }
                    }
                }
            }
@@ -297,14 +363,22 @@ impl Layer for InMemoryLayer {
            println!("seg_sizes {}: {}", k, v);
        }

-        for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) {
-            let pv = inner.page_versions.read_pv(pos)?;
-            let pv_description = match pv {
-                PageVersion::Page(_img) => "page",
-                PageVersion::Wal(_rec) => "wal",
-            };
+        // List the blocks in order
+        let mut page_versions: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> =
+            inner.page_versions.iter().collect();
+        page_versions.sort_by_key(|k| k.0);

-            println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
+        for (blknum, versions) in page_versions {
+            for (lsn, off) in versions.as_slice() {
+                let pv = inner.read_pv(*off);
+                let pv_description = match pv {
+                    Ok(PageVersion::Page(_img)) => "page",
+                    Ok(PageVersion::Wal(_rec)) => "wal",
+                    Err(_err) => "INVALID",
+                };
+
+                println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
+            }
        }

        Ok(())
@@ -319,8 +393,13 @@ pub struct LayersOnDisk {

 impl InMemoryLayer {
    /// Return the oldest page version that's stored in this layer
-    pub fn get_oldest_pending_lsn(&self) -> Lsn {
-        self.oldest_pending_lsn
+    pub fn get_oldest_lsn(&self) -> Lsn {
+        self.oldest_lsn
+    }
+
+    pub fn get_latest_lsn(&self) -> Lsn {
+        let inner = self.inner.read().unwrap();
+        inner.latest_lsn
    }

    ///
@@ -332,7 +411,7 @@ impl InMemoryLayer {
        tenantid: ZTenantId,
        seg: SegmentTag,
        start_lsn: Lsn,
-        oldest_pending_lsn: Lsn,
+        oldest_lsn: Lsn,
    ) -> Result<InMemoryLayer> {
        trace!(
            "initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
@@ -355,13 +434,15 @@ impl InMemoryLayer {
            tenantid,
            seg,
            start_lsn,
-            oldest_pending_lsn,
+            oldest_lsn,
            incremental: false,
            inner: RwLock::new(InMemoryLayerInner {
                end_lsn: None,
                dropped: false,
-                page_versions: PageVersions::new(file),
+                file,
+                page_versions: HashMap::new(),
                seg_sizes,
+                latest_lsn: oldest_lsn,
            }),
        })
    }
@@ -398,15 +479,21 @@ impl InMemoryLayer {
        let mut inner = self.inner.write().unwrap();

        inner.assert_writeable();
+        assert!(lsn >= inner.latest_lsn);
+        inner.latest_lsn = lsn;

-        let old = inner.page_versions.append_or_update_last(blknum, lsn, pv)?;
-
-        if old.is_some() {
-            // We already had an entry for this LSN. That's odd..
-            warn!(
-                "Page version of rel {} blk {} at {} already exists",
-                self.seg.rel, blknum, lsn
-            );
+        // Write the page version to the file, and remember its offset in 'page_versions'
+        {
+            let off = inner.write_pv(&pv)?;
+            let vec_map = inner.page_versions.entry(blknum).or_default();
+            let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
+            if old.is_some() {
+                // We already had an entry for this LSN. That's odd..
+                warn!(
+                    "Page version of rel {} blk {} at {} already exists",
+                    self.seg.rel, blknum, lsn
+                );
+            }
        }

        // Also update the relation size, if this extended the relation.
@@ -440,16 +527,19 @@ impl InMemoryLayer {
                        gapblknum,
                        blknum
                    );
-                    let old = inner
-                        .page_versions
-                        .append_or_update_last(gapblknum, lsn, zeropv)?;
-                    // We already had an entry for this LSN. That's odd..

-                    if old.is_some() {
-                        warn!(
-                            "Page version of seg {} blk {} at {} already exists",
-                            self.seg, blknum, lsn
-                        );
+                    // Write the page version to the file, and remember its offset in
+                    // 'page_versions'
+                    {
+                        let off = inner.write_pv(&zeropv)?;
+                        let vec_map = inner.page_versions.entry(gapblknum).or_default();
+                        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
+                        if old.is_some() {
+                            warn!(
+                                "Page version of seg {} blk {} at {} already exists",
+                                self.seg, gapblknum, lsn
+                            );
+                        }
                    }
                }

@@ -509,12 +599,11 @@ impl InMemoryLayer {
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
        start_lsn: Lsn,
-        oldest_pending_lsn: Lsn,
+        oldest_lsn: Lsn,
    ) -> Result<InMemoryLayer> {
        let seg = src.get_seg_tag();

-        assert!(oldest_pending_lsn.is_aligned());
-        assert!(oldest_pending_lsn >= start_lsn);
+        assert!(oldest_lsn.is_aligned());

        trace!(
            "initializing new InMemoryLayer for writing {} on timeline {} at {}",
@@ -538,13 +627,15 @@ impl InMemoryLayer {
            tenantid,
            seg,
            start_lsn,
-            oldest_pending_lsn,
+            oldest_lsn,
            incremental: true,
            inner: RwLock::new(InMemoryLayerInner {
                end_lsn: None,
                dropped: false,
-                page_versions: PageVersions::new(file),
+                file,
+                page_versions: HashMap::new(),
                seg_sizes,
+                latest_lsn: oldest_lsn,
            }),
        })
    }
@@ -571,8 +662,10 @@ impl InMemoryLayer {
                assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
            }

-            for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
-                assert!(lsn <= end_lsn);
+            for (_blk, vec_map) in inner.page_versions.iter() {
+                for (lsn, _pos) in vec_map.as_slice() {
+                    assert!(*lsn <= end_lsn);
+                }
            }
        }
    }
@@ -650,15 +743,19 @@ impl InMemoryLayer {
                self.is_dropped(),
            )?;

-            // Write all page versions
+            // Write all page versions, in block + LSN order
            let mut buf: Vec<u8> = Vec::new();

-            let page_versions_iter = inner
-                .page_versions
-                .ordered_page_version_iter(Some(delta_end_lsn));
-            for (blknum, lsn, pos) in page_versions_iter {
-                let len = inner.page_versions.read_pv_bytes(pos, &mut buf)?;
-                delta_layer_writer.put_page_version(blknum, lsn, &buf[..len])?;
+            let pv_iter = inner.page_versions.iter();
+            let mut pages: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> = pv_iter.collect();
+            pages.sort_by_key(|(blknum, _vec_map)| *blknum);
+            for (blknum, vec_map) in pages {
+                for (lsn, pos) in vec_map.as_slice() {
+                    if *lsn < delta_end_lsn {
+                        let len = inner.read_pv_bytes(*pos, &mut buf)?;
+                        delta_layer_writer.put_page_version(*blknum, *lsn, &buf[..len])?;
+                    }
+                }
            }

            // Create seg_sizes
--- a/pageserver/src/layered_repository/layer_map.rs
+++ b/pageserver/src/layered_repository/layer_map.rs
@@ -40,7 +40,7 @@ pub struct LayerMap {
    /// All the layers keyed by segment tag
    segs: HashMap<SegmentTag, SegEntry>,

-    /// All in-memory layers, ordered by 'oldest_pending_lsn' and generation
+    /// All in-memory layers, ordered by 'oldest_lsn' and generation
    /// of each layer. This allows easy access to the in-memory layer that
    /// contains the oldest WAL record.
    open_layers: BinaryHeap<OpenLayerEntry>,
@@ -83,16 +83,16 @@ impl LayerMap {

        let layer_id = segentry.update_open(Arc::clone(&layer));

-        let oldest_pending_lsn = layer.get_oldest_pending_lsn();
+        let oldest_lsn = layer.get_oldest_lsn();

-        // After a crash and restart, 'oldest_pending_lsn' of the oldest in-memory
+        // After a crash and restart, 'oldest_lsn' of the oldest in-memory
        // layer becomes the WAL streaming starting point, so it better not point
        // in the middle of a WAL record.
-        assert!(oldest_pending_lsn.is_aligned());
+        assert!(oldest_lsn.is_aligned());

        // Also add it to the binary heap
        let open_layer_entry = OpenLayerEntry {
-            oldest_pending_lsn: layer.get_oldest_pending_lsn(),
+            oldest_lsn: layer.get_oldest_lsn(),
            layer_id,
            generation: self.current_generation,
        };
@@ -191,9 +191,15 @@ impl LayerMap {
    ///
    /// This is used for garbage collection, to determine if an old layer can
    /// be deleted.
-    pub fn newer_image_layer_exists(&self, seg: SegmentTag, lsn: Lsn) -> bool {
+    /// We ignore image layers newer than disk_consistent_lsn because they will be removed at restart.
+    pub fn newer_image_layer_exists(
+        &self,
+        seg: SegmentTag,
+        lsn: Lsn,
+        disk_consistent_lsn: Lsn,
+    ) -> bool {
        if let Some(segentry) = self.segs.get(&seg) {
-            segentry.newer_image_layer_exists(lsn)
+            segentry.newer_image_layer_exists(lsn, disk_consistent_lsn)
        } else {
            false
        }
@@ -311,13 +317,18 @@ impl SegEntry {
        self.historic.search(lsn)
    }

-    pub fn newer_image_layer_exists(&self, lsn: Lsn) -> bool {
+    pub fn newer_image_layer_exists(&self, lsn: Lsn, disk_consistent_lsn: Lsn) -> bool {
        // We only check on-disk layers, because
        // in-memory layers are not durable

+        // The end-LSN is exclusive, while disk_consistent_lsn is
+        // inclusive. For example, if disk_consistent_lsn is 100, it is
+        // OK for a delta layer to have end LSN 101, but if the end LSN
+        // is 102, then it might not have been fully flushed to disk
+        // before crash.
        self.historic
            .iter_newer(lsn)
-            .any(|layer| !layer.is_incremental())
+            .any(|layer| !layer.is_incremental() && layer.get_end_lsn() <= disk_consistent_lsn + 1)
    }

    // Set new open layer for a SegEntry.
@@ -341,23 +352,23 @@ impl SegEntry {
 }

 /// Entry held in LayerMap::open_layers, with boilerplate comparison routines
-/// to implement a min-heap ordered by 'oldest_pending_lsn' and 'generation'
+/// to implement a min-heap ordered by 'oldest_lsn' and 'generation'
 ///
 /// The generation number associated with each entry can be used to distinguish
 /// recently-added entries (i.e after last call to increment_generation()) from older
-/// entries with the same 'oldest_pending_lsn'.
+/// entries with the same 'oldest_lsn'.
 struct OpenLayerEntry {
-    oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
+    oldest_lsn: Lsn, // copy of layer.get_oldest_lsn()
    generation: u64,
    layer_id: LayerId,
 }
 impl Ord for OpenLayerEntry {
    fn cmp(&self, other: &Self) -> Ordering {
        // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
-        // to get that. Entries with identical oldest_pending_lsn are ordered by generation
+        // to get that. Entries with identical oldest_lsn are ordered by generation
        other
-            .oldest_pending_lsn
-            .cmp(&self.oldest_pending_lsn)
+            .oldest_lsn
+            .cmp(&self.oldest_lsn)
            .then_with(|| other.generation.cmp(&self.generation))
    }
 }
@@ -426,7 +437,7 @@ mod tests {
        conf: &'static PageServerConf,
        segno: u32,
        start_lsn: Lsn,
-        oldest_pending_lsn: Lsn,
+        oldest_lsn: Lsn,
    ) -> Arc<InMemoryLayer> {
        Arc::new(
            InMemoryLayer::create(
@@ -438,7 +449,7 @@ mod tests {
                    segno,
                },
                start_lsn,
-                oldest_pending_lsn,
+                oldest_lsn,
            )
            .unwrap(),
        )
--- a/pageserver/src/layered_repository/page_versions.rs
+++ b/pageserver/src/layered_repository/page_versions.rs
@@ -1,268 +0,0 @@
-//!
-//! Data structure to ingest incoming WAL into an append-only file.
-//!
-//! - The file is considered temporary, and will be discarded on crash
-//! - based on a B-tree
-//!
-
-use std::os::unix::fs::FileExt;
-use std::{collections::HashMap, ops::RangeBounds, slice};
-
-use anyhow::Result;
-
-use std::cmp::min;
-use std::io::Seek;
-
-use zenith_utils::{lsn::Lsn, vec_map::VecMap};
-
-use super::storage_layer::PageVersion;
-use crate::layered_repository::ephemeral_file::EphemeralFile;
-
-use zenith_utils::bin_ser::BeSer;
-
-const EMPTY_SLICE: &[(Lsn, u64)] = &[];
-
-pub struct PageVersions {
-    map: HashMap<u32, VecMap<Lsn, u64>>,
-
-    /// The PageVersion structs are stored in a serialized format in this file.
-    /// Each serialized PageVersion is preceded by a 'u32' length field.
-    /// The 'map' stores offsets into this file.
-    file: EphemeralFile,
-}
-
-impl PageVersions {
-    pub fn new(file: EphemeralFile) -> PageVersions {
-        PageVersions {
-            map: HashMap::new(),
-            file,
-        }
-    }
-
-    pub fn append_or_update_last(
-        &mut self,
-        blknum: u32,
-        lsn: Lsn,
-        page_version: PageVersion,
-    ) -> Result<Option<u64>> {
-        // remember starting position
-        let pos = self.file.stream_position()?;
-
-        // make room for the 'length' field by writing zeros as a placeholder.
-        self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap();
-
-        page_version.ser_into(&mut self.file).unwrap();
-
-        // write the 'length' field.
-        let len = self.file.stream_position()? - pos - 4;
-        let lenbuf = u32::to_ne_bytes(len as u32);
-        self.file.write_all_at(&lenbuf, pos)?;
-
-        let map = self.map.entry(blknum).or_insert_with(VecMap::default);
-        Ok(map.append_or_update_last(lsn, pos as u64).unwrap().0)
-    }
-
-    /// Get all [`PageVersion`]s in a block
-    fn get_block_slice(&self, blknum: u32) -> &[(Lsn, u64)] {
-        self.map
-            .get(&blknum)
-            .map(VecMap::as_slice)
-            .unwrap_or(EMPTY_SLICE)
-    }
-
-    /// Get a range of [`PageVersions`] in a block
-    pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(&self, blknum: u32, range: R) -> &[(Lsn, u64)] {
-        self.map
-            .get(&blknum)
-            .map(|vec_map| vec_map.slice_range(range))
-            .unwrap_or(EMPTY_SLICE)
-    }
-
-    /// Iterate through [`PageVersion`]s in (block, lsn) order.
-    /// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
-    pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
-        let mut ordered_blocks: Vec<u32> = self.map.keys().cloned().collect();
-        ordered_blocks.sort_unstable();
-
-        let slice = ordered_blocks
-            .first()
-            .map(|&blknum| self.get_block_slice(blknum))
-            .unwrap_or(EMPTY_SLICE);
-
-        OrderedPageVersionIter {
-            page_versions: self,
-            ordered_blocks,
-            cur_block_idx: 0,
-            cutoff_lsn,
-            cur_slice_iter: slice.iter(),
-        }
-    }
-
-    ///
-    /// Read a page version.
-    ///
-    pub fn read_pv(&self, off: u64) -> Result<PageVersion> {
-        let mut buf = Vec::new();
-        self.read_pv_bytes(off, &mut buf)?;
-        Ok(PageVersion::des(&buf)?)
-    }
-
-    ///
-    /// Read a page version, as raw bytes, at the given offset. The bytes
-    /// are read into 'buf', which is expanded if necessary. Returns the
-    /// size of the page version.
-    ///
-    pub fn read_pv_bytes(&self, off: u64, buf: &mut Vec<u8>) -> Result<usize> {
-        // read length
-        let mut lenbuf = [0u8; 4];
-        self.file.read_exact_at(&mut lenbuf, off)?;
-        let len = u32::from_ne_bytes(lenbuf) as usize;
-
-        // Resize the buffer to fit the data, if needed.
-        //
-        // We don't shrink the buffer if it's larger than necessary. That avoids
-        // repeatedly shrinking and expanding when you reuse the same buffer to
-        // read multiple page versions. Expanding a Vec requires initializing the
-        // new bytes, which is a waste of time because we're immediately overwriting
-        // it, but there's no way to avoid it without resorting to unsafe code.
-        if buf.len() < len {
-            buf.resize(len, 0);
-        }
-        self.file.read_exact_at(&mut buf[0..len], off + 4)?;
-
-        Ok(len)
-    }
-}
-
-pub struct PageVersionReader<'a> {
-    file: &'a EphemeralFile,
-    pos: u64,
-    end_pos: u64,
-}
-
-impl<'a> std::io::Read for PageVersionReader<'a> {
-    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
-        let len = min(buf.len(), (self.end_pos - self.pos) as usize);
-        let n = self.file.read_at(&mut buf[..len], self.pos)?;
-        self.pos += n as u64;
-        Ok(n)
-    }
-}
-
-pub struct OrderedPageVersionIter<'a> {
-    page_versions: &'a PageVersions,
-
-    ordered_blocks: Vec<u32>,
-    cur_block_idx: usize,
-
-    cutoff_lsn: Option<Lsn>,
-
-    cur_slice_iter: slice::Iter<'a, (Lsn, u64)>,
-}
-
-impl OrderedPageVersionIter<'_> {
-    fn is_lsn_before_cutoff(&self, lsn: &Lsn) -> bool {
-        if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() {
-            lsn < cutoff_lsn
-        } else {
-            true
-        }
-    }
-}
-
-impl<'a> Iterator for OrderedPageVersionIter<'a> {
-    type Item = (u32, Lsn, u64);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        loop {
-            if let Some((lsn, pos)) = self.cur_slice_iter.next() {
-                if self.is_lsn_before_cutoff(lsn) {
-                    let blknum = self.ordered_blocks[self.cur_block_idx];
-                    return Some((blknum, *lsn, *pos));
-                }
-            }
-
-            let next_block_idx = self.cur_block_idx + 1;
-            let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?;
-            self.cur_block_idx = next_block_idx;
-            self.cur_slice_iter = self.page_versions.get_block_slice(blknum).iter();
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use bytes::Bytes;
-
-    use super::*;
-    use crate::config::PageServerConf;
-    use std::fs;
-    use std::str::FromStr;
-    use zenith_utils::zid::{ZTenantId, ZTimelineId};
-
-    fn repo_harness(test_name: &str) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId)> {
-        let repo_dir = PageServerConf::test_repo_dir(test_name);
-        let _ = fs::remove_dir_all(&repo_dir);
-        let conf = PageServerConf::dummy_conf(repo_dir);
-        // Make a static copy of the config. This can never be free'd, but that's
-        // OK in a test.
-        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
-
-        let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
-        let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
-        fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
-
-        Ok((conf, tenantid, timelineid))
-    }
-
-    #[test]
-    fn test_ordered_iter() -> Result<()> {
-        let (conf, tenantid, timelineid) = repo_harness("test_ordered_iter")?;
-
-        let file = EphemeralFile::create(conf, tenantid, timelineid)?;
-
-        let mut page_versions = PageVersions::new(file);
-
-        const BLOCKS: u32 = 1000;
-        const LSNS: u64 = 50;
-
-        let empty_page = Bytes::from_static(&[0u8; 8192]);
-        let empty_page_version = PageVersion::Page(empty_page);
-
-        for blknum in 0..BLOCKS {
-            for lsn in 0..LSNS {
-                let old = page_versions.append_or_update_last(
-                    blknum,
-                    Lsn(lsn),
-                    empty_page_version.clone(),
-                )?;
-                assert!(old.is_none());
-            }
-        }
-
-        let mut iter = page_versions.ordered_page_version_iter(None);
-        for blknum in 0..BLOCKS {
-            for lsn in 0..LSNS {
-                let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
-                assert_eq!(actual_blknum, blknum);
-                assert_eq!(Lsn(lsn), actual_lsn);
-            }
-        }
-        assert!(iter.next().is_none());
-        assert!(iter.next().is_none()); // should be robust against excessive next() calls
-
-        const CUTOFF_LSN: Lsn = Lsn(30);
-        let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN));
-        for blknum in 0..BLOCKS {
-            for lsn in 0..CUTOFF_LSN.0 {
-                let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
-                assert_eq!(actual_blknum, blknum);
-                assert_eq!(Lsn(lsn), actual_lsn);
-            }
-        }
-        assert!(iter.next().is_none());
-        assert!(iter.next().is_none()); // should be robust against excessive next() calls
-
-        Ok(())
-    }
-}
--- a/pageserver/src/layered_repository/storage_layer.rs
+++ b/pageserver/src/layered_repository/storage_layer.rs
@@ -71,15 +71,26 @@ pub enum PageVersion {
 }

 ///
-/// Data needed to reconstruct a page version
+/// Struct used to communicate across calls to 'get_page_reconstruct_data'.
 ///
-/// 'page_img' is the old base image of the page to start the WAL replay with.
-/// It can be None, if the first WAL record initializes the page (will_init)
-/// 'records' contains the records to apply over the base image.
+/// Before the first call to get_page_reconstruct_data, you can fill in 'page_img'
+/// if you have an older cached version of the page available. That can save
+/// work in 'get_page_reconstruct_data', as it can stop searching for page
+/// versions when all the WAL records going back to the cached image have been
+/// collected.
+///
+/// When get_page_reconstruct_data returns Complete, 'page_img' is set to an
+/// image of the page, or the oldest WAL record in 'records' is a will_init-type
+/// record that initializes the page without requiring a previous image.
+///
+/// If 'get_page_reconstruct_data' returns Continue, some 'records' may have
+/// been collected, but there are more records outside the current layer. Pass
+/// the same PageReconstructData struct in the next 'get_page_reconstruct_data'
+/// call, to collect more records.
 ///
 pub struct PageReconstructData {
    pub records: Vec<(Lsn, ZenithWalRecord)>,
-    pub page_img: Option<Bytes>,
+    pub page_img: Option<(Lsn, Bytes)>,
 }

 /// Return value from Layer::get_page_reconstruct_data
@@ -93,8 +104,6 @@ pub enum PageReconstructResult {
    /// the returned LSN. This is usually considered an error, but might be OK
    /// in some circumstances.
    Missing(Lsn),
-    /// Use the cached image at `cached_img_lsn` as the base image
-    Cached,
 }

 ///
@@ -138,19 +147,16 @@ pub trait Layer: Send + Sync {
    /// It is up to the caller to collect more data from previous layer and
    /// perform WAL redo, if necessary.
    ///
-    /// `cached_img_lsn` should be set to a cached page image's lsn < `lsn`.
-    /// This function will only return data after `cached_img_lsn`.
-    ///
    /// See PageReconstructResult for possible return values. The collected data
    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call. If this returns PageReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data'
-    /// to collect more data.
+    /// on first call, or a struct with a cached older image of the page if one
+    /// is available. If this returns PageReconstructResult::Continue, look up
+    /// the predecessor layer and call again with the same 'reconstruct_data' to
+    /// collect more data.
    fn get_page_reconstruct_data(
        &self,
        blknum: SegmentBlk,
        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult>;

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,5 +1,4 @@
 pub mod basebackup;
-pub mod branches;
 pub mod config;
 pub mod http;
 pub mod import_datadir;
@@ -12,6 +11,7 @@ pub mod repository;
 pub mod tenant_mgr;
 pub mod tenant_threads;
 pub mod thread_mgr;
+pub mod timelines;
 pub mod virtual_file;
 pub mod walingest;
 pub mod walreceiver;
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,7 +10,7 @@
 //     *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
 //

-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{bail, ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use lazy_static::lazy_static;
 use regex::Regex;
@@ -18,7 +18,7 @@ use std::io;
 use std::net::TcpListener;
 use std::str;
 use std::str::FromStr;
-use std::sync::Arc;
+use std::sync::{Arc, RwLockReadGuard};
 use tracing::*;
 use zenith_metrics::{register_histogram_vec, HistogramVec};
 use zenith_utils::auth::{self, JwtAuth};
@@ -27,13 +27,10 @@ use zenith_utils::lsn::Lsn;
 use zenith_utils::postgres_backend::is_socket_read_timed_out;
 use zenith_utils::postgres_backend::PostgresBackend;
 use zenith_utils::postgres_backend::{self, AuthType};
-use zenith_utils::pq_proto::{
-    BeMessage, FeMessage, RowDescriptor, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC,
-};
+use zenith_utils::pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC};
 use zenith_utils::zid::{ZTenantId, ZTimelineId};

 use crate::basebackup;
-use crate::branches;
 use crate::config::PageServerConf;
 use crate::relish::*;
 use crate::repository::Timeline;
@@ -301,7 +298,7 @@ lazy_static! {
    static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
        "pageserver_smgr_query_time",
        "Time spent on smgr query handling",
-        &["smgr_query_type"],
+        &["smgr_query_type", "tenant_id", "timeline_id"],
        TIME_BUCKETS.into()
    )
    .expect("failed to define a metric");
@@ -325,8 +322,8 @@ impl PageServerHandler {
        let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();

        // Check that the timeline exists
-        let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
-            .context("Cannot handle pagerequests for a remote timeline")?;
+        let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+            .context("Cannot load local timeline")?;

        /* switch client to COPYBOTH */
        pgb.write_message(&BeMessage::CopyBothResponse)?;
@@ -343,20 +340,22 @@ impl PageServerHandler {
                        };

                        let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
+                        let tenant_id = tenantid.to_string();
+                        let timeline_id = timelineid.to_string();

                        let response = match zenith_fe_msg {
                            PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
-                                .with_label_values(&["get_rel_exists"])
+                                .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id])
                                .observe_closure_duration(|| {
                                    self.handle_get_rel_exists_request(timeline.as_ref(), &req)
                                }),
                            PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
-                                .with_label_values(&["get_rel_size"])
+                                .with_label_values(&["get_rel_size", &tenant_id, &timeline_id])
                                .observe_closure_duration(|| {
                                    self.handle_get_nblocks_request(timeline.as_ref(), &req)
                                }),
                            PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
-                                .with_label_values(&["get_page_at_lsn"])
+                                .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id])
                                .observe_closure_duration(|| {
                                    self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
                                }),
@@ -398,7 +397,12 @@ impl PageServerHandler {
    /// In either case, if the page server hasn't received the WAL up to the
    /// requested LSN yet, we will wait for it to arrive. The return value is
    /// the LSN that should be used to look up the page versions.
-    fn wait_or_get_last_lsn(timeline: &dyn Timeline, lsn: Lsn, latest: bool) -> Result<Lsn> {
+    fn wait_or_get_last_lsn(
+        timeline: &dyn Timeline,
+        mut lsn: Lsn,
+        latest: bool,
+        latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
+    ) -> Result<Lsn> {
        if latest {
            // Latest page version was requested. If LSN is given, it is a hint
            // to the page server that there have been no modifications to the
@@ -419,22 +423,26 @@ impl PageServerHandler {
            // walsender completes the authentication and starts streaming the
            // WAL.
            if lsn <= last_record_lsn {
-                Ok(last_record_lsn)
+                lsn = last_record_lsn;
            } else {
                timeline.wait_lsn(lsn)?;
                // Since we waited for 'lsn' to arrive, that is now the last
                // record LSN. (Or close enough for our purposes; the
                // last-record LSN can advance immediately after we return
                // anyway)
-                Ok(lsn)
            }
        } else {
            if lsn == Lsn(0) {
                bail!("invalid LSN(0) in request");
            }
            timeline.wait_lsn(lsn)?;
-            Ok(lsn)
        }
+        ensure!(
+            lsn >= **latest_gc_cutoff_lsn,
+            "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+            lsn, **latest_gc_cutoff_lsn
+        );
+        Ok(lsn)
    }

    fn handle_get_rel_exists_request(
@@ -445,7 +453,8 @@ impl PageServerHandler {
        let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();

        let tag = RelishTag::Relation(req.rel);
-        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;

        let exists = timeline.get_rel_exists(tag, lsn)?;

@@ -461,7 +470,8 @@ impl PageServerHandler {
    ) -> Result<PagestreamBeMessage> {
        let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
        let tag = RelishTag::Relation(req.rel);
-        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;

        let n_blocks = timeline.get_relish_size(tag, lsn)?;

@@ -482,8 +492,16 @@ impl PageServerHandler {
        let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
            .entered();
        let tag = RelishTag::Relation(req.rel);
-        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
-
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
+        /*
+        // Add a 1s delay to some requests. The delay makes the requests
+        // hit the race condition from github issue #1047 more easily.
+        use rand::Rng;
+        if rand::thread_rng().gen::<u8>() < 5 {
+            std::thread::sleep(std::time::Duration::from_millis(1000));
+        }
+        */
        let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
@@ -496,17 +514,20 @@ impl PageServerHandler {
        pgb: &mut PostgresBackend,
        timelineid: ZTimelineId,
        lsn: Option<Lsn>,
+        prev_lsn: Option<Lsn>,
        tenantid: ZTenantId,
+        full_backup: bool,
    ) -> anyhow::Result<()> {
        let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
        let _enter = span.enter();

        // check that the timeline exists
-        let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
-            .context("Cannot handle basebackup request for a remote timeline")?;
+        let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+            .context("Cannot load local timeline")?;
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
            timeline
-                .check_lsn_is_in_scope(lsn)
+                .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                .context("invalid basebackup lsn")?;
        }

@@ -516,7 +537,9 @@ impl PageServerHandler {
        /* Send a tarball of the latest layer on the timeline */
        {
            let mut writer = CopyDataSink { pgb };
-            let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
+
+            let mut basebackup =
+                basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
            span.record("lsn", &basebackup.lsn.to_string().as_str());
            basebackup.send_tarball()?;
        }
@@ -616,7 +639,67 @@ impl postgres_backend::Handler for PageServerHandler {
            };

            // Check that the timeline exists
-            self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
+            self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        }
+        // return pair of prev_lsn and last_lsn
+        else if query_string.starts_with("get_last_record_rlsn ") {
+            let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
+            ensure!(
+                params.len() == 2,
+                "invalid param number for get_last_record_rlsn command"
+            );
+
+            let tenantid = ZTenantId::from_str(params[0])?;
+            let timelineid = ZTimelineId::from_str(params[1])?;
+
+            self.check_permission(Some(tenantid))?;
+            let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+                .context("Cannot load local timeline")?;
+
+            let end_of_timeline = timeline.get_last_record_rlsn();
+
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[
+                RowDescriptor::text_col(b"prev_lsn"),
+                RowDescriptor::text_col(b"last_lsn"),
+            ]))?
+            .write_message_noflush(&BeMessage::DataRow(&[
+                Some(end_of_timeline.prev.to_string().as_bytes()),
+                Some(end_of_timeline.last.to_string().as_bytes()),
+            ]))?
+            .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        }
+        // same as basebackup, but result includes relational data as well
+        else if query_string.starts_with("fullbackup ") {
+            let (_, params_raw) = query_string.split_at("fullbackup ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
+            ensure!(
+                params.len() >= 2,
+                "invalid param number for fullbackup command"
+            );
+
+            let tenantid = ZTenantId::from_str(params[0])?;
+            let timelineid = ZTimelineId::from_str(params[1])?;
+
+            // The caller is responsible for providing correct lsn and prev_lsn.
+            let lsn = if params.len() > 2 {
+                Some(Lsn::from_str(params[2])?)
+            } else {
+                None
+            };
+            let prev_lsn = if params.len() > 3 {
+                Some(Lsn::from_str(params[3])?)
+            } else {
+                None
+            };
+
+            self.check_permission(Some(tenantid))?;
+
+            // Send the backup; handle_basebackup_request also checks that the timeline exists
+            self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?;
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("callmemaybe ") {
            // callmemaybe <zenith tenantid as hex string> <zenith timelineid as hex string> <connstr>
@@ -624,7 +707,7 @@ impl postgres_backend::Handler for PageServerHandler {
            let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) ([[:xdigit:]]+) (.*)$").unwrap();
            let caps = re
                .captures(query_string)
-                .ok_or_else(|| anyhow!("invalid callmemaybe: '{}'", query_string))?;
+                .with_context(|| format!("invalid callmemaybe: '{}'", query_string))?;

            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
@@ -636,85 +719,27 @@ impl postgres_backend::Handler for PageServerHandler {
                info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();

            // Check that the timeline exists
-            tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
-                .context("Failed to fetch local timeline for callmemaybe requests")?;
+            tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+                .context("Cannot load local timeline")?;

            walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?;

            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("branch_create ") {
-            let err = || anyhow!("invalid branch_create: '{}'", query_string);
-
-            // branch_create <tenantid> <branchname> <startpoint>
-            // TODO lazy static
-            // TODO: escaping, to allow branch names with spaces
-            let re = Regex::new(r"^branch_create ([[:xdigit:]]+) (\S+) ([^\r\n\s;]+)[\r\n\s;]*;?$")
-                .unwrap();
-            let caps = re.captures(query_string).ok_or_else(err)?;
-
-            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
-            let branchname = caps.get(2).ok_or_else(err)?.as_str().to_owned();
-            let startpoint_str = caps.get(3).ok_or_else(err)?.as_str().to_owned();
-
-            self.check_permission(Some(tenantid))?;
-
-            let _enter =
-                info_span!("branch_create", name = %branchname, tenant = %tenantid).entered();
-
-            let branch =
-                branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
-            let branch = serde_json::to_vec(&branch)?;
-
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&BeMessage::DataRow(&[Some(&branch)]))?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("branch_list ") {
-            // branch_list <zenith tenantid as hex string>
-            let re = Regex::new(r"^branch_list ([[:xdigit:]]+)$").unwrap();
-            let caps = re
-                .captures(query_string)
-                .ok_or_else(|| anyhow!("invalid branch_list: '{}'", query_string))?;
-
-            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
-
-            // since these handlers for tenant/branch commands are deprecated (in favor of http based ones)
-            // just use false in place of include non incremental logical size
-            let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
-            let branches_buf = serde_json::to_vec(&branches)?;
-
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("tenant_list") {
-            let tenants = crate::tenant_mgr::list_tenants()?;
-            let tenants_buf = serde_json::to_vec(&tenants)?;
-
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&BeMessage::DataRow(&[Some(&tenants_buf)]))?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("tenant_create") {
-            let err = || anyhow!("invalid tenant_create: '{}'", query_string);
-
-            // tenant_create <tenantid>
-            let re = Regex::new(r"^tenant_create ([[:xdigit:]]+)$").unwrap();
-            let caps = re.captures(query_string).ok_or_else(err)?;
-
-            self.check_permission(None)?;
-
-            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
-
-            tenant_mgr::create_repository_for_tenant(self.conf, tenantid)?;
-
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("status") {
-            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
-                .write_message_noflush(&HELLO_WORLD_ROW)?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.to_ascii_lowercase().starts_with("set ") {
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
            // on connect
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.starts_with("failpoints ") {
+            let (_, failpoints) = query_string.split_at("failpoints ".len());
+            for failpoint in failpoints.split(';') {
+                if let Some((name, actions)) = failpoint.split_once('=') {
+                    info!("cfg failpoint: {} {}", name, actions);
+                    fail::cfg(name, actions).unwrap();
+                } else {
+                    bail!("Invalid failpoints format");
+                }
+            }
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("do_gc ") {
            // Run GC immediately on given timeline.
            // FIXME: This is just for tests. See test_runner/batch_others/test_gc.py.
@@ -728,7 +753,7 @@ impl postgres_backend::Handler for PageServerHandler {

            let caps = re
                .captures(query_string)
-                .ok_or_else(|| anyhow!("invalid do_gc: '{}'", query_string))?;
+                .with_context(|| format!("invalid do_gc: '{}'", query_string))?;

            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
@@ -812,13 +837,13 @@ impl postgres_backend::Handler for PageServerHandler {

            let caps = re
                .captures(query_string)
-                .ok_or_else(|| anyhow!("invalid checkpoint command: '{}'", query_string))?;
+                .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?;

            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;

-            let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
-                .context("Failed to fetch local timeline for checkpoint request")?;
+            let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+                .context("Cannot load local timeline")?;

            timeline.checkpoint(CheckpointConfig::Forced)?;
            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
--- a/pageserver/src/relish.rs
+++ b/pageserver/src/relish.rs
@@ -26,6 +26,7 @@
 use serde::{Deserialize, Serialize};
 use std::fmt;

+use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::forknumber_to_name;
 use postgres_ffi::{Oid, TransactionId};

@@ -170,6 +171,30 @@ impl fmt::Display for RelTag {
    }
 }

+impl RelTag {
+    pub fn to_segfile_name(&self, segno: u32) -> String {
+        let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID {
+            "global/".to_string()
+        } else {
+            format!("base/{}/", self.dbnode)
+        };
+        // The file name proper starts with the relnode; optional suffixes follow.
+        name += &self.relnode.to_string();
+        // Append "_<fork>" only when forknumber_to_name yields a name for this fork.
+        if let Some(fork_name) = forknumber_to_name(self.forknum) {
+            name += "_";
+            name += fork_name;
+        }
+        // Append ".<segno>" for every segment except segment 0.
+        if segno != 0 {
+            name += ".";
+            name += &segno.to_string();
+        }
+        // Result: "<dir>/<relnode>[_<fork>][.<segno>]", <dir> being "global" or "base/<dbnode>".
+        name
+    }
+}
+
 /// Display RelTag in the same format that's used in most PostgreSQL debug messages:
 ///
 /// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
--- a/pageserver/src/remote_storage.rs
+++ b/pageserver/src/remote_storage.rs
@@ -89,41 +89,38 @@ use std::{
    collections::HashMap,
    ffi, fs,
    path::{Path, PathBuf},
+    sync::Arc,
 };

 use anyhow::{bail, Context};
-use tokio::io;
+use tokio::{io, sync::RwLock};
 use tracing::{error, info};
-use zenith_utils::zid::{ZTenantId, ZTimelineId};
+use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};

+pub use self::storage_sync::index::{RemoteTimelineIndex, TimelineIndexEntry};
 pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download};
 use self::{local_fs::LocalFs, rust_s3::S3};
 use crate::{
    config::{PageServerConf, RemoteStorageKind},
    layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
-    repository::TimelineSyncState,
 };

-/// Any timeline has its own id and its own tenant it belongs to,
-/// the sync processes group timelines by both for simplicity.
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TimelineSyncId(ZTenantId, ZTimelineId);
+pub use storage_sync::compression;

-impl std::fmt::Display for TimelineSyncId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "(tenant: {}, timeline: {})", self.0, self.1)
-    }
+#[derive(Clone, Copy, Debug)]
+pub enum LocalTimelineInitStatus {
+    LocallyComplete,
+    NeedsSync,
 }

+type LocalTimelineInitStatuses = HashMap<ZTenantId, HashMap<ZTimelineId, LocalTimelineInitStatus>>;
+
 /// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
 /// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still,
 /// to simplify the received code.
 pub struct SyncStartupData {
-    /// A sync state, derived from initial comparison of local timeline files and the remote archives,
-    /// before any sync tasks are executed.
-    /// To reuse the local file scan logic, the timeline states are returned even if no sync loop get started during init:
-    /// in this case, no remote files exist and all local timelines with correct metadata files are considered ready.
-    pub initial_timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
+    pub remote_index: Arc<RwLock<RemoteTimelineIndex>>,
+    pub local_timeline_init_statuses: LocalTimelineInitStatuses,
 }

 /// Based on the config, initiates the remote storage connection and starts a separate thread
@@ -138,41 +135,43 @@ pub fn start_local_timeline_sync(

    match &config.remote_storage_config {
        Some(storage_config) => match &storage_config.storage {
-            RemoteStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
-                config,
-                local_timeline_files,
-                LocalFs::new(root.clone(), &config.workdir)?,
-                storage_config.max_concurrent_sync,
-                storage_config.max_sync_errors,
-            ),
-            RemoteStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
-                config,
-                local_timeline_files,
-                S3::new(s3_config, &config.workdir)?,
-                storage_config.max_concurrent_sync,
-                storage_config.max_sync_errors,
-            ),
+            RemoteStorageKind::LocalFs(root) => {
+                info!("Using fs root '{}' as a remote storage", root.display());
+                storage_sync::spawn_storage_sync_thread(
+                    config,
+                    local_timeline_files,
+                    LocalFs::new(root.clone(), &config.workdir)?,
+                    storage_config.max_concurrent_sync,
+                    storage_config.max_sync_errors,
+                )
+            },
+            RemoteStorageKind::AwsS3(s3_config) => {
+                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
+                    s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
+                storage_sync::spawn_storage_sync_thread(
+                    config,
+                    local_timeline_files,
+                    S3::new(s3_config, &config.workdir)?,
+                    storage_config.max_concurrent_sync,
+                    storage_config.max_sync_errors,
+                )
+            },
        }
        .context("Failed to spawn the storage sync thread"),
        None => {
            info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
-            let mut initial_timeline_states: HashMap<
-                ZTenantId,
-                HashMap<ZTimelineId, TimelineSyncState>,
-            > = HashMap::new();
-            for (TimelineSyncId(tenant_id, timeline_id), (timeline_metadata, _)) in
+            let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
+            for (ZTenantTimelineId { tenant_id, timeline_id }, _) in
                local_timeline_files
            {
-                initial_timeline_states
+                local_timeline_init_statuses
                    .entry(tenant_id)
                    .or_default()
-                    .insert(
-                        timeline_id,
-                        TimelineSyncState::Ready(timeline_metadata.disk_consistent_lsn()),
-                    );
+                    .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete);
            }
            Ok(SyncStartupData {
-                initial_timeline_states,
+                local_timeline_init_statuses,
+                remote_index: Arc::new(RwLock::new(RemoteTimelineIndex::empty())),
            })
        }
    }
@@ -180,7 +179,7 @@ pub fn start_local_timeline_sync(

 fn local_tenant_timeline_files(
    config: &'static PageServerConf,
-) -> anyhow::Result<HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)>> {
+) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>> {
    let mut local_tenant_timeline_files = HashMap::new();
    let tenants_dir = config.tenants_path();
    for tenants_dir_entry in fs::read_dir(&tenants_dir)
@@ -215,8 +214,9 @@ fn local_tenant_timeline_files(
 fn collect_timelines_for_tenant(
    config: &'static PageServerConf,
    tenant_path: &Path,
-) -> anyhow::Result<HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)>> {
-    let mut timelines: HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)> = HashMap::new();
+) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>> {
+    let mut timelines: HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)> =
+        HashMap::new();
    let tenant_id = tenant_path
        .file_name()
        .and_then(ffi::OsStr::to_str)
@@ -237,7 +237,10 @@ fn collect_timelines_for_tenant(
                match collect_timeline_files(&timeline_path) {
                    Ok((timeline_id, metadata, timeline_files)) => {
                        timelines.insert(
-                            TimelineSyncId(tenant_id, timeline_id),
+                            ZTenantTimelineId {
+                                tenant_id,
+                                timeline_id,
+                            },
                            (metadata, timeline_files),
                        );
                    }
--- a/pageserver/src/remote_storage/README.md
+++ b/pageserver/src/remote_storage/README.md
@@ -62,16 +62,3 @@ Based on previous evaluation, even `rusoto-s3` could be a better choice over thi

 So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage.
 Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives.
-
-* bracnhes implementaion could be improved
-
-Currently, there's a code to sync the branches along with the timeline files: on upload, every local branch files that are missing remotely are uploaded,
-on the timeline download, missing remote branch files are downlaoded.
-
-A branch is a per-tenant entity, yet a current implementaion requires synchronizing a timeline first to get the branch files locally.
-Currently, there's no other way to know about the remote branch files, neither the file contents is verified and updated.
-
-* no IT tests
-
-Automated S3 testing is lacking currently, due to no convenient way to enable backups during the tests.
-After it's fixed, benchmark runs should also be carried out to find bottlenecks.
--- a/pageserver/src/remote_storage/local_fs.rs
+++ b/pageserver/src/remote_storage/local_fs.rs
@@ -5,6 +5,7 @@
 //! volume is mounted to the local FS.

 use std::{
+    ffi::OsString,
    future::Future,
    path::{Path, PathBuf},
    pin::Pin,
@@ -73,7 +74,7 @@ impl RemoteStorage for LocalFs {
    }

    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
-        Ok(get_all_files(&self.root).await?.into_iter().collect())
+        get_all_files(&self.root).await
    }

    async fn upload(
@@ -83,11 +84,21 @@ impl RemoteStorage for LocalFs {
    ) -> anyhow::Result<()> {
        let target_file_path = self.resolve_in_storage(to)?;
        create_target_directory(&target_file_path).await?;
+        // Write to a temp file first and then rename it onto the target (a sort of durable
+        // rename, without fsyncs) to prevent partial uploads. This was really hit when a pageserver
+        // shutdown cancelled the upload and a partial file was left on the fs.
+        let mut temp_extension = target_file_path
+            .extension()
+            .unwrap_or_default()
+            .to_os_string();
+
+        temp_extension.push(OsString::from(".temp"));
+        let temp_file_path = target_file_path.with_extension(temp_extension);
        let mut destination = io::BufWriter::new(
            fs::OpenOptions::new()
                .write(true)
                .create(true)
-                .open(&target_file_path)
+                .open(&temp_file_path)
                .await
                .with_context(|| {
                    format!(
@@ -101,16 +112,26 @@ impl RemoteStorage for LocalFs {
            .await
            .with_context(|| {
                format!(
-                    "Failed to upload file to the local storage at '{}'",
+                    "Failed to upload file (write temp) to the local storage at '{}'",
+                    temp_file_path.display()
+                )
+            })?;
+
+        destination.flush().await.with_context(|| {
+            format!(
+                "Failed to upload (flush temp) file to the local storage at '{}'",
+                temp_file_path.display()
+            )
+        })?;
+
+        fs::rename(temp_file_path, &target_file_path)
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to upload (rename) file to the local storage at '{}'",
                    target_file_path.display()
                )
            })?;
-        destination.flush().await.with_context(|| {
-            format!(
-                "Failed to upload file to the local storage at '{}'",
-                target_file_path.display()
-            )
-        })?;
        Ok(())
    }

--- a/pageserver/src/remote_storage/rust_s3.rs
+++ b/pageserver/src/remote_storage/rust_s3.rs
@@ -9,6 +9,7 @@ use std::path::{Path, PathBuf};
 use anyhow::Context;
 use s3::{bucket::Bucket, creds::Credentials, region::Region};
 use tokio::io::{self, AsyncWriteExt};
+use tracing::debug;

 use crate::{
    config::S3Config,
@@ -58,10 +59,21 @@ pub struct S3 {
 impl S3 {
    /// Creates the storage, errors if incorrect AWS S3 configuration provided.
    pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
-        let region = aws_config
-            .bucket_region
-            .parse::<Region>()
-            .context("Failed to parse the s3 region from config")?;
+        debug!(
+            "Creating s3 remote storage around bucket {}",
+            aws_config.bucket_name
+        );
+        let region = match aws_config.endpoint.clone() {
+            Some(endpoint) => Region::Custom {
+                endpoint,
+                region: aws_config.bucket_region.clone(),
+            },
+            None => aws_config
+                .bucket_region
+                .parse::<Region>()
+                .context("Failed to parse the s3 region from config")?,
+        };
+
        let credentials = Credentials::new(
            aws_config.access_key_id.as_deref(),
            aws_config.secret_access_key.as_deref(),
--- a/pageserver/src/remote_storage/storage_sync.rs
+++ b/pageserver/src/remote_storage/storage_sync.rs
@@ -14,13 +14,6 @@
 //! Only GC removes local timeline files, the GC support is not added to sync currently,
 //! yet downloading extra files is not critically bad at this stage, GC can remove those again.
 //!
-//! Along the timeline files, branch files are uploaded and downloaded every time a corresponding sync task is processed.
-//! For simplicity, branch files are also treated as immutable: only missing files are uploaded or downloaded, no removals, amendments or file contents checks are done.
-//! Also, the branches are copied as separate files, with no extra compressions done.
-//! Despite branches information currently belonging to tenants, a tenants' timeline sync is required to upload or download the branch files, also, there's no way to know
-//! the branch sync state outside of the sync loop.
-//! This implementation is currently considered as temporary and is a subjec to change later.
-//!
 //! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via listing the remote storage contents.
 //! It's enough to poll the remote state once on startup only, due to agreement that the pageserver has
 //! an exclusive write access to the remote storage: new files appear in the storage only after the same
@@ -65,18 +58,18 @@
 //! Synchronization never removes any local from pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (metadata file updates; future checksum mismatch fixes).
 //! NOTE: No real contents or checksum check happens right now and is a subject to improve later.
 //!
-//! After the whole timeline is downloaded, [`crate::tenant_mgr::set_timeline_states`] function is used to update pageserver memory stage for the timeline processed.
-//! No extra branch registration is done.
+//! After the whole timeline is downloaded, [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function is used to update pageserver in-memory state for the timeline processed.
 //!
 //! When pageserver signals shutdown, current sync task gets finished and the loop exists.

-mod compression;
+/// Expose the module for a binary CLI tool that deals with the corresponding blobs.
+pub mod compression;
 mod download;
 pub mod index;
 mod upload;

 use std::{
-    collections::{BTreeSet, HashMap, HashSet, VecDeque},
+    collections::{BTreeSet, HashMap, VecDeque},
    num::{NonZeroU32, NonZeroUsize},
    path::{Path, PathBuf},
    sync::Arc,
@@ -86,7 +79,6 @@ use anyhow::{bail, Context};
 use futures::stream::{FuturesUnordered, StreamExt};
 use lazy_static::lazy_static;
 use tokio::{
-    fs,
    runtime::Runtime,
    sync::{
        mpsc::{self, UnboundedReceiver},
@@ -100,19 +92,26 @@ use self::{
    compression::ArchiveHeader,
    download::{download_timeline, DownloadedTimeline},
    index::{
-        ArchiveDescription, ArchiveId, RelativePath, RemoteTimeline, RemoteTimelineIndex,
-        TimelineIndexEntry,
+        ArchiveDescription, ArchiveId, RemoteTimeline, RemoteTimelineIndex, TimelineIndexEntry,
+        TimelineIndexEntryInner,
    },
    upload::upload_timeline_checkpoint,
 };
-use super::{RemoteStorage, SyncStartupData, TimelineSyncId};
+use super::{
+    LocalTimelineInitStatus, LocalTimelineInitStatuses, RemoteStorage, SyncStartupData,
+    ZTenantTimelineId,
+};
 use crate::{
    config::PageServerConf, layered_repository::metadata::TimelineMetadata,
-    remote_storage::storage_sync::compression::read_archive_header, repository::TimelineSyncState,
-    tenant_mgr::set_timeline_states, thread_mgr, thread_mgr::ThreadKind,
+    remote_storage::storage_sync::compression::read_archive_header,
+    repository::TimelineSyncStatusUpdate, tenant_mgr::apply_timeline_sync_status_updates,
+    thread_mgr, thread_mgr::ThreadKind,
 };

-use zenith_metrics::{register_histogram_vec, register_int_gauge, HistogramVec, IntGauge};
+use zenith_metrics::{
+    register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter,
+    IntGauge,
+};
 use zenith_utils::zid::{ZTenantId, ZTimelineId};

 lazy_static! {
@@ -121,6 +120,11 @@ lazy_static! {
        "Number of storage sync items left in the queue"
    )
    .expect("failed to register pageserver remote storage remaining sync items int gauge");
+    static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!(
+        "pageserver_remote_storage_fatal_task_failures",
+        "Number of critically failed tasks"
+    )
+    .expect("failed to register pageserver remote storage fatal task failures int counter");
    static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
        "pageserver_remote_storage_image_sync_time",
        "Time took to synchronize (download or upload) a whole pageserver image. \
@@ -242,13 +246,13 @@ mod sync_queue {
 /// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled.
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
 pub struct SyncTask {
-    sync_id: TimelineSyncId,
+    sync_id: ZTenantTimelineId,
    retries: u32,
    kind: SyncKind,
 }

 impl SyncTask {
-    fn new(sync_id: TimelineSyncId, retries: u32, kind: SyncKind) -> Self {
+    fn new(sync_id: ZTenantTimelineId, retries: u32, kind: SyncKind) -> Self {
        Self {
            sync_id,
            retries,
@@ -307,7 +311,10 @@ pub fn schedule_timeline_checkpoint_upload(
    }

    if !sync_queue::push(SyncTask::new(
-        TimelineSyncId(tenant_id, timeline_id),
+        ZTenantTimelineId {
+            tenant_id,
+            timeline_id,
+        },
        0,
        SyncKind::Upload(NewCheckpoint { layers, metadata }),
    )) {
@@ -333,8 +340,15 @@ pub fn schedule_timeline_checkpoint_upload(
 ///
 /// Ensure that the loop is started otherwise the task is never processed.
 pub fn schedule_timeline_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) {
+    debug!(
+        "Scheduling timeline download for tenant {}, timeline {}",
+        tenant_id, timeline_id
+    );
    sync_queue::push(SyncTask::new(
-        TimelineSyncId(tenant_id, timeline_id),
+        ZTenantTimelineId {
+            tenant_id,
+            timeline_id,
+        },
        0,
        SyncKind::Download(TimelineDownload {
            files_to_skip: Arc::new(BTreeSet::new()),
@@ -350,7 +364,7 @@ pub(super) fn spawn_storage_sync_thread<
    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
 >(
    conf: &'static PageServerConf,
-    local_timeline_files: HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)>,
+    local_timeline_files: HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>,
    storage: S,
    max_concurrent_sync: NonZeroUsize,
    max_sync_errors: NonZeroU32,
@@ -378,10 +392,13 @@ pub(super) fn spawn_storage_sync_thread<
                None
            }
        });
-    let remote_index = RemoteTimelineIndex::try_parse_descriptions_from_paths(conf, download_paths);
-
-    let initial_timeline_states = schedule_first_sync_tasks(&remote_index, local_timeline_files);
+    let mut remote_index =
+        RemoteTimelineIndex::try_parse_descriptions_from_paths(conf, download_paths);

+    let local_timeline_init_statuses =
+        schedule_first_sync_tasks(&mut remote_index, local_timeline_files);
+    let remote_index = Arc::new(RwLock::new(remote_index));
+    let remote_index_cloned = Arc::clone(&remote_index);
    thread_mgr::spawn(
        ThreadKind::StorageSync,
        None,
@@ -392,7 +409,7 @@ pub(super) fn spawn_storage_sync_thread<
                runtime,
                conf,
                receiver,
-                remote_index,
+                remote_index_cloned,
                storage,
                max_concurrent_sync,
                max_sync_errors,
@@ -401,12 +418,13 @@ pub(super) fn spawn_storage_sync_thread<
    )
    .context("Failed to spawn remote storage sync thread")?;
    Ok(SyncStartupData {
-        initial_timeline_states,
+        remote_index,
+        local_timeline_init_statuses,
    })
 }

 enum LoopStep {
-    NewStates(HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>),
+    SyncStatusUpdates(HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>),
    Shutdown,
 }

@@ -418,13 +436,14 @@ fn storage_sync_loop<
    runtime: Runtime,
    conf: &'static PageServerConf,
    mut receiver: UnboundedReceiver<SyncTask>,
-    index: RemoteTimelineIndex,
+    index: Arc<RwLock<RemoteTimelineIndex>>,
    storage: S,
    max_concurrent_sync: NonZeroUsize,
    max_sync_errors: NonZeroU32,
 ) -> anyhow::Result<()> {
-    let remote_assets = Arc::new((storage, RwLock::new(index)));
+    let remote_assets = Arc::new((storage, Arc::clone(&index)));
    loop {
+        let index = Arc::clone(&index);
        let loop_step = runtime.block_on(async {
            tokio::select! {
                new_timeline_states = loop_step(
@@ -434,15 +453,15 @@ fn storage_sync_loop<
                    max_concurrent_sync,
                    max_sync_errors,
                )
-                .instrument(debug_span!("storage_sync_loop_step")) => LoopStep::NewStates(new_timeline_states),
+                .instrument(debug_span!("storage_sync_loop_step")) => LoopStep::SyncStatusUpdates(new_timeline_states),
                _ = thread_mgr::shutdown_watcher() => LoopStep::Shutdown,
            }
        });

        match loop_step {
-            LoopStep::NewStates(new_timeline_states) => {
+            LoopStep::SyncStatusUpdates(new_timeline_states) => {
                // Batch timeline download registration to ensure that the external registration code won't block any running tasks before.
-                set_timeline_states(conf, new_timeline_states);
+                apply_timeline_sync_status_updates(conf, index, new_timeline_states);
                debug!("Sync loop step completed");
            }
            LoopStep::Shutdown => {
@@ -461,10 +480,10 @@ async fn loop_step<
 >(
    conf: &'static PageServerConf,
    receiver: &mut UnboundedReceiver<SyncTask>,
-    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
+    remote_assets: Arc<(S, Arc<RwLock<RemoteTimelineIndex>>)>,
    max_concurrent_sync: NonZeroUsize,
    max_sync_errors: NonZeroU32,
-) -> HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>> {
+) -> HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>> {
    let max_concurrent_sync = max_concurrent_sync.get();
    let mut next_tasks = BTreeSet::new();

@@ -506,7 +525,7 @@ async fn loop_step<
                Err(e) => {
                    error!(
                        "Failed to process storage sync task for tenant {}, timeline {}: {:?}",
-                        sync_id.0, sync_id.1, e
+                        sync_id.tenant_id, sync_id.timeline_id, e
                    );
                    None
                }
@@ -515,12 +534,17 @@ async fn loop_step<
        })
        .collect::<FuturesUnordered<_>>();

-    let mut new_timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>> =
-        HashMap::with_capacity(max_concurrent_sync);
+    let mut new_timeline_states: HashMap<
+        ZTenantId,
+        HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
+    > = HashMap::with_capacity(max_concurrent_sync);
    while let Some((sync_id, state_update)) = task_batch.next().await {
        debug!("Finished storage sync task for sync id {}", sync_id);
        if let Some(state_update) = state_update {
-            let TimelineSyncId(tenant_id, timeline_id) = sync_id;
+            let ZTenantTimelineId {
+                tenant_id,
+                timeline_id,
+            } = sync_id;
            new_timeline_states
                .entry(tenant_id)
                .or_default()
@@ -536,24 +560,19 @@ async fn process_task<
    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
 >(
    conf: &'static PageServerConf,
-    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
+    remote_assets: Arc<(S, Arc<RwLock<RemoteTimelineIndex>>)>,
    task: SyncTask,
    max_sync_errors: NonZeroU32,
-) -> Option<TimelineSyncState> {
+) -> Option<TimelineSyncStatusUpdate> {
    if task.retries > max_sync_errors.get() {
        error!(
            "Evicting task {:?} that failed {} times, exceeding the error threshold",
            task.kind, task.retries
        );
-        return Some(TimelineSyncState::Evicted(
-            remote_assets
-                .as_ref()
-                .1
-                .read()
-                .await
-                .timeline_entry(&task.sync_id)
-                .and_then(TimelineIndexEntry::disk_consistent_lsn),
-        ));
+        FATAL_TASK_FAILURES.inc();
+        // FIXME (rodionov) this can potentially leave holes in timeline uploads
+        //    planned to be fixed as part of https://github.com/zenithdb/zenith/issues/977
+        return None;
    }

    if task.retries > 0 {
@@ -565,6 +584,8 @@ async fn process_task<
        tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
    }

+    let remote_index = Arc::clone(&remote_assets.1);
+
    let sync_start = Instant::now();
    let sync_name = task.kind.sync_name();
    match task.kind {
@@ -581,19 +602,25 @@ async fn process_task<
            match download_result {
                DownloadedTimeline::Abort => {
                    register_sync_status(sync_start, sync_name, None);
+                    remote_index
+                        .write()
+                        .await
+                        .set_awaits_download(&task.sync_id, false)
+                        .expect("timeline should be present in remote index");
                    None
                }
-                DownloadedTimeline::FailedAndRescheduled {
-                    disk_consistent_lsn,
-                } => {
+                DownloadedTimeline::FailedAndRescheduled => {
                    register_sync_status(sync_start, sync_name, Some(false));
-                    Some(TimelineSyncState::AwaitsDownload(disk_consistent_lsn))
+                    None
                }
-                DownloadedTimeline::Successful {
-                    disk_consistent_lsn,
-                } => {
+                DownloadedTimeline::Successful => {
                    register_sync_status(sync_start, sync_name, Some(true));
-                    Some(TimelineSyncState::Ready(disk_consistent_lsn))
+                    remote_index
+                        .write()
+                        .await
+                        .set_awaits_download(&task.sync_id, false)
+                        .expect("timeline should be present in remote index");
+                    Some(TimelineSyncStatusUpdate::Downloaded)
                }
            }
        }
@@ -613,42 +640,45 @@ async fn process_task<
 }

 fn schedule_first_sync_tasks(
-    index: &RemoteTimelineIndex,
-    local_timeline_files: HashMap<TimelineSyncId, (TimelineMetadata, Vec<PathBuf>)>,
-) -> HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>> {
-    let mut initial_timeline_statuses: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>> =
-        HashMap::new();
+    index: &mut RemoteTimelineIndex,
+    local_timeline_files: HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>,
+) -> LocalTimelineInitStatuses {
+    let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();

    let mut new_sync_tasks =
        VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len()));

    for (sync_id, (local_metadata, local_files)) in local_timeline_files {
-        let local_disk_consistent_lsn = local_metadata.disk_consistent_lsn();
-
-        let TimelineSyncId(tenant_id, timeline_id) = sync_id;
-        match index.timeline_entry(&sync_id) {
+        let ZTenantTimelineId {
+            tenant_id,
+            timeline_id,
+        } = sync_id;
+        match index.timeline_entry_mut(&sync_id) {
            Some(index_entry) => {
-                let timeline_status = compare_local_and_remote_timeline(
+                let (timeline_status, awaits_download) = compare_local_and_remote_timeline(
                    &mut new_sync_tasks,
                    sync_id,
                    local_metadata,
                    local_files,
                    index_entry,
                );
-                match timeline_status {
-                    Some(timeline_status) => {
-                        initial_timeline_statuses
-                            .entry(tenant_id)
-                            .or_default()
-                            .insert(timeline_id, timeline_status);
-                    }
-                    None => error!(
-                        "Failed to compare local and remote timeline for task {}",
-                        sync_id
-                    ),
+                let was_there = local_timeline_init_statuses
+                    .entry(tenant_id)
+                    .or_default()
+                    .insert(timeline_id, timeline_status);
+
+                if was_there.is_some() {
+                    // defensive check
+                    warn!(
+                        "Overwriting timeline init sync status. Status {:?} Timeline {}",
+                        timeline_status, timeline_id
+                    );
                }
+                index_entry.set_awaits_download(awaits_download);
            }
            None => {
+                // TODO (rodionov) does this mean that we've crashed during tenant creation?
+                //  is it safe to upload this checkpoint? could it be half broken?
                new_sync_tasks.push_back(SyncTask::new(
                    sync_id,
                    0,
@@ -657,65 +687,41 @@ fn schedule_first_sync_tasks(
                        metadata: local_metadata,
                    }),
                ));
-                initial_timeline_statuses
+                local_timeline_init_statuses
                    .entry(tenant_id)
                    .or_default()
-                    .insert(
-                        timeline_id,
-                        TimelineSyncState::Ready(local_disk_consistent_lsn),
-                    );
+                    .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete);
            }
        }
    }

-    let unprocessed_remote_ids = |remote_id: &TimelineSyncId| {
-        initial_timeline_statuses
-            .get(&remote_id.0)
-            .and_then(|timelines| timelines.get(&remote_id.1))
-            .is_none()
-    };
-    for unprocessed_remote_id in index
-        .all_sync_ids()
-        .filter(unprocessed_remote_ids)
-        .collect::<Vec<_>>()
-    {
-        let TimelineSyncId(cloud_only_tenant_id, cloud_only_timeline_id) = unprocessed_remote_id;
-        match index
-            .timeline_entry(&unprocessed_remote_id)
-            .and_then(TimelineIndexEntry::disk_consistent_lsn)
-        {
-            Some(remote_disk_consistent_lsn) => {
-                initial_timeline_statuses
-                    .entry(cloud_only_tenant_id)
-                    .or_default()
-                    .insert(
-                        cloud_only_timeline_id,
-                        TimelineSyncState::CloudOnly(remote_disk_consistent_lsn),
-                    );
-            }
-            None => error!(
-                "Failed to find disk consistent LSN for remote timeline {}",
-                unprocessed_remote_id
-            ),
-        }
-    }
-
    new_sync_tasks.into_iter().for_each(|task| {
        sync_queue::push(task);
    });
-    initial_timeline_statuses
+    local_timeline_init_statuses
 }

 fn compare_local_and_remote_timeline(
    new_sync_tasks: &mut VecDeque<SyncTask>,
-    sync_id: TimelineSyncId,
+    sync_id: ZTenantTimelineId,
    local_metadata: TimelineMetadata,
    local_files: Vec<PathBuf>,
    remote_entry: &TimelineIndexEntry,
-) -> Option<TimelineSyncState> {
+) -> (LocalTimelineInitStatus, bool) {
    let local_lsn = local_metadata.disk_consistent_lsn();
    let uploads = remote_entry.uploaded_checkpoints();

+    let mut initial_timeline_status = LocalTimelineInitStatus::LocallyComplete;
+
+    let mut awaits_download = false;
+    // TODO probably here we need more sophisticated logic,
+    //   if more data is available remotely can we just download what's there?
+    //   without trying to upload something. It may be tricky, needs further investigation.
+    //   For now it looks strange that we can request upload
+    //   and download for the same timeline simultaneously.
+    //   (upload needs to be only for previously unsynced files, not whole timeline dir).
+    //   If one of the tasks fails they will be reordered in the queue which can lead
+    //   to timeline being stuck in evicted state
    if !uploads.contains(&local_lsn) {
        new_sync_tasks.push_back(SyncTask::new(
            sync_id,
@@ -725,6 +731,7 @@ fn compare_local_and_remote_timeline(
                metadata: local_metadata,
            }),
        ));
+        // Note that status here doesn't change.
    }

    let uploads_count = uploads.len();
@@ -733,7 +740,7 @@ fn compare_local_and_remote_timeline(
        .filter(|upload_lsn| upload_lsn <= &local_lsn)
        .map(ArchiveId)
        .collect();
-    Some(if archives_to_skip.len() != uploads_count {
+    if archives_to_skip.len() != uploads_count {
        new_sync_tasks.push_back(SyncTask::new(
            sync_id,
            0,
@@ -742,10 +749,12 @@ fn compare_local_and_remote_timeline(
                archives_to_skip,
            }),
        ));
-        TimelineSyncState::AwaitsDownload(remote_entry.disk_consistent_lsn()?)
-    } else {
-        TimelineSyncState::Ready(remote_entry.disk_consistent_lsn().unwrap_or(local_lsn))
-    })
+        initial_timeline_status = LocalTimelineInitStatus::NeedsSync;
+        awaits_download = true;
+        // we do not need to manipulate the remote consistent lsn here
+        // because it will be updated when the sync is completed
+    }
+    (initial_timeline_status, awaits_download)
 }

 fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option<bool>) {
@@ -759,21 +768,23 @@ fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Optio
    .observe(secs_elapsed)
 }

-async fn update_index_description<
+async fn fetch_full_index<
    P: Send + Sync + 'static,
    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
 >(
-    (storage, index): &(S, RwLock<RemoteTimelineIndex>),
+    (storage, index): &(S, Arc<RwLock<RemoteTimelineIndex>>),
    timeline_dir: &Path,
-    id: TimelineSyncId,
+    id: ZTenantTimelineId,
 ) -> anyhow::Result<RemoteTimeline> {
-    let mut index_write = index.write().await;
-    let full_index = match index_write.timeline_entry(&id) {
+    let index_read = index.read().await;
+    let full_index = match index_read.timeline_entry(&id).map(|e| e.inner()) {
        None => bail!("Timeline not found for sync id {}", id),
-        Some(TimelineIndexEntry::Full(_)) => bail!("Index is already populated for sync id {}", id),
-        Some(TimelineIndexEntry::Description(description)) => {
+        Some(TimelineIndexEntryInner::Full(_)) => {
+            bail!("Index is already populated for sync id {}", id)
+        }
+        Some(TimelineIndexEntryInner::Description(description)) => {
            let mut archive_header_downloads = FuturesUnordered::new();
-            for (&archive_id, description) in description {
+            for (archive_id, description) in description {
                archive_header_downloads.push(async move {
                    let header = download_archive_header(storage, timeline_dir, description)
                        .await
@@ -785,18 +796,22 @@ async fn update_index_description<
            let mut full_index = RemoteTimeline::empty();
            while let Some(header_data) = archive_header_downloads.next().await {
                match header_data {
-                        Ok((archive_id, header_size, header)) => full_index.update_archive_contents(archive_id.0, header, header_size),
-                        Err((e, archive_id)) => bail!(
-                            "Failed to download archive header for tenant {}, timeline {}, archive for Lsn {}: {}",
-                            id.0, id.1, archive_id.0,
-                            e
-                        ),
-                    }
+                    Ok((archive_id, header_size, header)) => full_index.update_archive_contents(archive_id.0, header, header_size),
+                    Err((e, archive_id)) => bail!(
+                        "Failed to download archive header for tenant {}, timeline {}, archive for Lsn {}: {}",
+                        id.tenant_id, id.timeline_id, archive_id.0,
+                        e
+                    ),
+                }
            }
            full_index
        }
    };
-    index_write.add_timeline_entry(id, TimelineIndexEntry::Full(full_index.clone()));
+    drop(index_read); // tokio rw lock is not upgradeable
+    let mut index_write = index.write().await;
+    index_write
+        .upgrade_timeline_entry(&id, full_index.clone())
+        .context("cannot upgrade timeline entry in remote index")?;
    Ok(full_index)
 }

@@ -823,28 +838,6 @@ async fn download_archive_header<
    Ok(header)
 }

-async fn tenant_branch_files(
-    conf: &'static PageServerConf,
-    tenant_id: ZTenantId,
-) -> anyhow::Result<HashSet<RelativePath>> {
-    let branches_dir = conf.branches_path(&tenant_id);
-    if !branches_dir.exists() {
-        return Ok(HashSet::new());
-    }
-
-    let mut branch_entries = fs::read_dir(&branches_dir)
-        .await
-        .context("Failed to list tenant branches dir contents")?;
-
-    let mut branch_files = HashSet::new();
-    while let Some(branch_entry) = branch_entries.next_entry().await? {
-        if branch_entry.file_type().await?.is_file() {
-            branch_files.insert(RelativePath::new(&branches_dir, branch_entry.path())?);
-        }
-    }
-    Ok(branch_files)
-}
-
 #[cfg(test)]
 mod test_utils {
    use std::{
@@ -862,11 +855,11 @@ mod test_utils {
    #[track_caller]
    pub async fn ensure_correct_timeline_upload(
        harness: &RepoHarness,
-        remote_assets: Arc<(LocalFs, RwLock<RemoteTimelineIndex>)>,
+        remote_assets: Arc<(LocalFs, Arc<RwLock<RemoteTimelineIndex>>)>,
        timeline_id: ZTimelineId,
        new_upload: NewCheckpoint,
    ) {
-        let sync_id = TimelineSyncId(harness.tenant_id, timeline_id);
+        let sync_id = ZTenantTimelineId::new(harness.tenant_id, timeline_id);
        upload_timeline_checkpoint(
            harness.conf,
            Arc::clone(&remote_assets),
@@ -921,11 +914,14 @@ mod test_utils {
    }

    pub async fn expect_timeline(
-        index: &RwLock<RemoteTimelineIndex>,
-        sync_id: TimelineSyncId,
+        index: &Arc<RwLock<RemoteTimelineIndex>>,
+        sync_id: ZTenantTimelineId,
    ) -> RemoteTimeline {
-        if let Some(TimelineIndexEntry::Full(remote_timeline)) =
-            index.read().await.timeline_entry(&sync_id)
+        if let Some(TimelineIndexEntryInner::Full(remote_timeline)) = index
+            .read()
+            .await
+            .timeline_entry(&sync_id)
+            .map(|e| e.inner())
        {
            remote_timeline.clone()
        } else {
@@ -938,7 +934,7 @@ mod test_utils {

    #[track_caller]
    pub async fn assert_index_descriptions(
-        index: &RwLock<RemoteTimelineIndex>,
+        index: &Arc<RwLock<RemoteTimelineIndex>>,
        expected_index_with_descriptions: RemoteTimelineIndex,
    ) {
        let index_read = index.read().await;
@@ -951,30 +947,9 @@ mod test_utils {
            "Index contains unexpected sync ids"
        );

-        let mut actual_branches = BTreeMap::new();
-        let mut expected_branches = BTreeMap::new();
        let mut actual_timeline_entries = BTreeMap::new();
        let mut expected_timeline_entries = BTreeMap::new();
        for sync_id in actual_sync_ids {
-            actual_branches.insert(
-                sync_id.1,
-                index_read
-                    .branch_files(sync_id.0)
-                    .into_iter()
-                    .flat_map(|branch_paths| branch_paths.iter())
-                    .cloned()
-                    .collect::<BTreeSet<_>>(),
-            );
-            expected_branches.insert(
-                sync_id.1,
-                expected_index_with_descriptions
-                    .branch_files(sync_id.0)
-                    .into_iter()
-                    .flat_map(|branch_paths| branch_paths.iter())
-                    .cloned()
-                    .collect::<BTreeSet<_>>(),
-            );
-
            actual_timeline_entries.insert(
                sync_id,
                index_read.timeline_entry(&sync_id).unwrap().clone(),
@@ -989,11 +964,6 @@ mod test_utils {
        }
        drop(index_read);

-        assert_eq!(
-            actual_branches, expected_branches,
-            "Index contains unexpected branches"
-        );
-
        for (sync_id, actual_timeline_entry) in actual_timeline_entries {
            let expected_timeline_description = expected_timeline_entries
                .remove(&sync_id)
@@ -1003,26 +973,26 @@ mod test_utils {
                        sync_id
                    )
                });
-            let expected_timeline_description = match expected_timeline_description {
-                TimelineIndexEntry::Description(description) => description,
-                TimelineIndexEntry::Full(_) => panic!("Expected index entry for sync id {} is a full entry, while a description was expected", sync_id),
+            let expected_timeline_description = match expected_timeline_description.inner() {
+                TimelineIndexEntryInner::Description(description) => description,
+                TimelineIndexEntryInner::Full(_) => panic!("Expected index entry for sync id {} is a full entry, while a description was expected", sync_id),
            };

-            match actual_timeline_entry {
-                TimelineIndexEntry::Description(actual_descriptions) => {
+            match actual_timeline_entry.inner() {
+                TimelineIndexEntryInner::Description(description) => {
                    assert_eq!(
-                        actual_descriptions, expected_timeline_description,
+                        description, expected_timeline_description,
                        "Index contains unexpected descriptions entry for sync id {}",
                        sync_id
                    )
                }
-                TimelineIndexEntry::Full(actual_full_entry) => {
+                TimelineIndexEntryInner::Full(remote_timeline) => {
                    let expected_lsns = expected_timeline_description
                        .values()
                        .map(|description| description.disk_consistent_lsn)
                        .collect::<BTreeSet<_>>();
                    assert_eq!(
-                        actual_full_entry.checkpoints().collect::<BTreeSet<_>>(),
+                        remote_timeline.checkpoints().collect::<BTreeSet<_>>(),
                        expected_lsns,
                        "Timeline {} should have the same checkpoints uploaded",
                        sync_id,
--- a/pageserver/src/remote_storage/storage_sync/compression.rs
+++ b/pageserver/src/remote_storage/storage_sync/compression.rs
@@ -34,7 +34,7 @@ use std::{
    sync::Arc,
 };

-use anyhow::{anyhow, bail, ensure, Context};
+use anyhow::{bail, ensure, Context};
 use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
 use serde::{Deserialize, Serialize};
 use tokio::{
@@ -211,16 +211,18 @@ pub async fn read_archive_header<A: io::AsyncRead + Send + Sync + Unpin>(
 pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> {
    let archive_name = archive_path
        .file_name()
-        .ok_or_else(|| anyhow!("Archive '{}' has no file name", archive_path.display()))?
+        .with_context(|| format!("Archive '{}' has no file name", archive_path.display()))?
        .to_string_lossy();
    let (lsn_str, header_size_str) =
-        archive_name.rsplit_once(ARCHIVE_EXTENSION).ok_or_else(|| {
-            anyhow!(
-                "Archive '{}' has incorrect extension, expected to contain '{}'",
-                archive_path.display(),
-                ARCHIVE_EXTENSION
-            )
-        })?;
+        archive_name
+            .rsplit_once(ARCHIVE_EXTENSION)
+            .with_context(|| {
+                format!(
+                    "Archive '{}' has incorrect extension, expected to contain '{}'",
+                    archive_path.display(),
+                    ARCHIVE_EXTENSION
+                )
+            })?;
    let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| {
        format!(
            "Archive '{}' has an invalid disk consistent lsn in its extension",
@@ -246,7 +248,7 @@ fn archive_name(disk_consistent_lsn: Lsn, header_size: u64) -> String {
    archive_name
 }

-async fn uncompress_with_header(
+pub async fn uncompress_with_header(
    files_to_skip: &BTreeSet<PathBuf>,
    destination_dir: &Path,
    header: ArchiveHeader,
@@ -374,7 +376,7 @@ async fn write_archive_contents(
    }
    let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input)
        .await
-        .with_context(|| "Failed to add metadata into the archive")?;
+        .context("Failed to add metadata into the archive")?;
    ensure!(
        header.metadata_file_size == metadata_bytes_written,
        "Metadata file was written to the archive incompletely",
--- a/pageserver/src/remote_storage/storage_sync/download.rs
+++ b/pageserver/src/remote_storage/storage_sync/download.rs
@@ -1,23 +1,21 @@
 //! Timeline synchrnonization logic to put files from archives on remote storage into pageserver's local directory.
-//! Currently, tenant branch files are also downloaded, but this does not appear final.

 use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};

-use anyhow::{anyhow, ensure, Context};
-use futures::{stream::FuturesUnordered, StreamExt};
+use anyhow::{ensure, Context};
 use tokio::{fs, sync::RwLock};
 use tracing::{debug, error, trace, warn};
-use zenith_utils::{lsn::Lsn, zid::ZTenantId};
+use zenith_utils::zid::ZTenantId;

 use crate::{
    config::PageServerConf,
    layered_repository::metadata::{metadata_path, TimelineMetadata},
    remote_storage::{
        storage_sync::{
-            compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files,
-            update_index_description, SyncKind, SyncTask,
+            compression, fetch_full_index, index::TimelineIndexEntryInner, sync_queue, SyncKind,
+            SyncTask,
        },
-        RemoteStorage, TimelineSyncId,
+        RemoteStorage, ZTenantTimelineId,
    },
 };

@@ -32,18 +30,16 @@ pub(super) enum DownloadedTimeline {
    Abort,
    /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
    /// Initial download failed due to some error, the download task is rescheduled for another retry.
-    FailedAndRescheduled { disk_consistent_lsn: Lsn },
+    FailedAndRescheduled,
    /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
    /// Initial download successful.
-    Successful { disk_consistent_lsn: Lsn },
+    Successful,
 }

 /// Attempts to download and uncompress files from all remote archives for the timeline given.
 /// Timeline files that already exist locally are skipped during the download, but the local metadata file is
 /// updated in the end of every checkpoint archive extraction.
 ///
-/// Before any archives are considered, the branch files are checked locally and remotely, all remote-only files are downloaded.
-///
 /// On an error, bumps the retries count and reschedules the download, with updated archive skip list
 /// (for any new successful archive downloads and extractions).
 pub(super) async fn download_timeline<
@@ -51,27 +47,37 @@ pub(super) async fn download_timeline<
    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
 >(
    conf: &'static PageServerConf,
-    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
-    sync_id: TimelineSyncId,
+    remote_assets: Arc<(S, Arc<RwLock<RemoteTimelineIndex>>)>,
+    sync_id: ZTenantTimelineId,
    mut download: TimelineDownload,
    retries: u32,
 ) -> DownloadedTimeline {
    debug!("Downloading layers for sync id {}", sync_id);

-    let TimelineSyncId(tenant_id, timeline_id) = sync_id;
-    let index_read = remote_assets.1.read().await;
+    let ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    } = sync_id;
+    let index = &remote_assets.1;
+
+    let index_read = index.read().await;
    let remote_timeline = match index_read.timeline_entry(&sync_id) {
        None => {
-            error!("Cannot download: no timeline is present in the index for given ids");
+            error!("Cannot download: no timeline is present in the index for given id");
            return DownloadedTimeline::Abort;
        }
-        Some(index_entry) => match index_entry {
-            TimelineIndexEntry::Full(remote_timeline) => Cow::Borrowed(remote_timeline),
-            TimelineIndexEntry::Description(_) => {
+
+        Some(index_entry) => match index_entry.inner() {
+            TimelineIndexEntryInner::Full(remote_timeline) => Cow::Borrowed(remote_timeline),
+            TimelineIndexEntryInner::Description(_) => {
+                // `awaits_download` is deliberately not checked here: it is fine to call
+                // this function while the download is in progress, because that is
+                // the very same download rather than a concurrent one.
+
                let remote_disk_consistent_lsn = index_entry.disk_consistent_lsn();
                drop(index_read);
                debug!("Found timeline description for the given ids, downloading the full index");
-                match update_index_description(
+                match fetch_full_index(
                    remote_assets.as_ref(),
                    &conf.timeline_path(&timeline_id, &tenant_id),
                    sync_id,
@@ -81,16 +87,15 @@ pub(super) async fn download_timeline<
                    Ok(remote_timeline) => Cow::Owned(remote_timeline),
                    Err(e) => {
                        error!("Failed to download full timeline index: {:?}", e);
+
                        return match remote_disk_consistent_lsn {
-                            Some(disk_consistent_lsn) => {
+                            Some(_) => {
                                sync_queue::push(SyncTask::new(
                                    sync_id,
                                    retries,
                                    SyncKind::Download(download),
                                ));
-                                DownloadedTimeline::FailedAndRescheduled {
-                                    disk_consistent_lsn,
-                                }
+                                DownloadedTimeline::FailedAndRescheduled
                            }
                            None => {
                                error!("Cannot download: no disk consistent Lsn is present for the index entry");
@@ -102,29 +107,11 @@ pub(super) async fn download_timeline<
            }
        },
    };
-    let disk_consistent_lsn = match remote_timeline.checkpoints().max() {
-        Some(lsn) => lsn,
-        None => {
-            debug!("Cannot download: no disk consistent Lsn is present for the remote timeline");
-            return DownloadedTimeline::Abort;
-        }
+    if remote_timeline.checkpoints().max().is_none() {
+        debug!("Cannot download: no disk consistent Lsn is present for the remote timeline");
+        return DownloadedTimeline::Abort;
    };

-    if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.0).await {
-        error!(
-            "Failed to download missing branches for sync id {}: {:?}",
-            sync_id, e
-        );
-        sync_queue::push(SyncTask::new(
-            sync_id,
-            retries,
-            SyncKind::Download(download),
-        ));
-        return DownloadedTimeline::FailedAndRescheduled {
-            disk_consistent_lsn,
-        };
-    }
-
    debug!("Downloading timeline archives");
    let archives_to_download = remote_timeline
        .checkpoints()
@@ -141,7 +128,7 @@ pub(super) async fn download_timeline<
            conf,
            sync_id,
            Arc::clone(&remote_assets),
-            remote_timeline.as_ref(),
+            &remote_timeline,
            archive_id,
            Arc::clone(&download.files_to_skip),
        )
@@ -158,9 +145,7 @@ pub(super) async fn download_timeline<
                    retries,
                    SyncKind::Download(download),
                ));
-                return DownloadedTimeline::FailedAndRescheduled {
-                    disk_consistent_lsn,
-                };
+                return DownloadedTimeline::FailedAndRescheduled;
            }
            Ok(()) => {
                debug!("Successfully downloaded archive {:?}", archive_id);
@@ -170,9 +155,7 @@ pub(super) async fn download_timeline<
    }

    debug!("Finished downloading all timeline's archives");
-    DownloadedTimeline::Successful {
-        disk_consistent_lsn,
-    }
+    DownloadedTimeline::Successful
 }

 async fn try_download_archive<
@@ -180,8 +163,11 @@ async fn try_download_archive<
    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
 >(
    conf: &'static PageServerConf,
-    TimelineSyncId(tenant_id, timeline_id): TimelineSyncId,
-    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
+    ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    }: ZTenantTimelineId,
+    remote_assets: Arc<(S, Arc<RwLock<RemoteTimelineIndex>>)>,
    remote_timeline: &RemoteTimeline,
    archive_id: ArchiveId,
    files_to_skip: Arc<BTreeSet<PathBuf>>,
@@ -189,7 +175,7 @@ async fn try_download_archive<
    debug!("Downloading archive {:?}", archive_id);
    let archive_to_download = remote_timeline
        .archive_data(archive_id)
-        .ok_or_else(|| anyhow!("Archive {:?} not found in remote storage", archive_id))?;
+        .with_context(|| format!("Archive {:?} not found in remote storage", archive_id))?;
    let (archive_header, header_size) = remote_timeline
        .restore_header(archive_id)
        .context("Failed to restore header when downloading an archive")?;
@@ -243,82 +229,6 @@ async fn read_local_metadata(
        .context("Failed to read local metadata files bytes")?)
 }

-async fn download_missing_branches<
-    P: std::fmt::Debug + Send + Sync + 'static,
-    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
->(
-    conf: &'static PageServerConf,
-    (storage, index): &(S, RwLock<RemoteTimelineIndex>),
-    tenant_id: ZTenantId,
-) -> anyhow::Result<()> {
-    let local_branches = tenant_branch_files(conf, tenant_id)
-        .await
-        .context("Failed to list local branch files for the tenant")?;
-    let local_branches_dir = conf.branches_path(&tenant_id);
-    if !local_branches_dir.exists() {
-        fs::create_dir_all(&local_branches_dir)
-            .await
-            .with_context(|| {
-                format!(
-                    "Failed to create local branches directory at path '{}'",
-                    local_branches_dir.display()
-                )
-            })?;
-    }
-
-    if let Some(remote_branches) = index.read().await.branch_files(tenant_id) {
-        let mut remote_only_branches_downloads = remote_branches
-            .difference(&local_branches)
-            .map(|remote_only_branch| async move {
-                let branches_dir = conf.branches_path(&tenant_id);
-                let remote_branch_path = remote_only_branch.as_path(&branches_dir);
-                let storage_path =
-                    storage.storage_path(&remote_branch_path).with_context(|| {
-                        format!(
-                            "Failed to derive a storage path for branch with local path '{}'",
-                            remote_branch_path.display()
-                        )
-                    })?;
-                let mut target_file = fs::OpenOptions::new()
-                    .write(true)
-                    .create_new(true)
-                    .open(&remote_branch_path)
-                    .await
-                    .with_context(|| {
-                        format!(
-                            "Failed to create local branch file at '{}'",
-                            remote_branch_path.display()
-                        )
-                    })?;
-                storage
-                    .download(&storage_path, &mut target_file)
-                    .await
-                    .with_context(|| {
-                        format!(
-                            "Failed to download branch file from the remote path {:?}",
-                            storage_path
-                        )
-                    })?;
-                Ok::<_, anyhow::Error>(())
-            })
-            .collect::<FuturesUnordered<_>>();
-
-        let mut branch_downloads_failed = false;
-        while let Some(download_result) = remote_only_branches_downloads.next().await {
-            if let Err(e) = download_result {
-                branch_downloads_failed = true;
-                error!("Failed to download a branch file: {:?}", e);
-            }
-        }
-        ensure!(
-            !branch_downloads_failed,
-            "Failed to download all branch files"
-        );
-    }
-
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
    use std::collections::BTreeSet;
@@ -343,15 +253,17 @@ mod tests {
    #[tokio::test]
    async fn test_download_timeline() -> anyhow::Result<()> {
        let repo_harness = RepoHarness::create("test_download_timeline")?;
-        let sync_id = TimelineSyncId(repo_harness.tenant_id, TIMELINE_ID);
+        let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
        let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
-        let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
-            repo_harness.conf,
-            storage
-                .list()
-                .await?
-                .into_iter()
-                .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+        let index = Arc::new(RwLock::new(
+            RemoteTimelineIndex::try_parse_descriptions_from_paths(
+                repo_harness.conf,
+                storage
+                    .list()
+                    .await?
+                    .into_iter()
+                    .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+            ),
        ));
        let remote_assets = Arc::new((storage, index));
        let storage = &remote_assets.0;
--- a/pageserver/src/remote_storage/storage_sync/index.rs
+++ b/pageserver/src/remote_storage/storage_sync/index.rs
@@ -5,13 +5,13 @@
 //! This way in the future, the index could be restored fast from its serialized stored form.

 use std::{
-    collections::{BTreeMap, BTreeSet, HashMap, HashSet},
+    collections::{BTreeMap, BTreeSet, HashMap},
    path::{Path, PathBuf},
 };

-use anyhow::{anyhow, bail, ensure, Context};
+use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
-use tracing::debug;
+use tracing::*;
 use zenith_utils::{
    lsn::Lsn,
    zid::{ZTenantId, ZTimelineId},
@@ -22,7 +22,7 @@ use crate::{
    layered_repository::TIMELINES_SEGMENT_NAME,
    remote_storage::{
        storage_sync::compression::{parse_archive_name, FileEntry},
-        TimelineSyncId,
+        ZTenantTimelineId,
    },
 };

@@ -49,14 +49,19 @@ impl RelativePath {
 }

 /// An index to track tenant files that exist on the remote storage.
-/// Currently, timeline archives and branch files are tracked.
+/// Currently, only timeline archive files are tracked.
 #[derive(Debug, Clone)]
 pub struct RemoteTimelineIndex {
-    branch_files: HashMap<ZTenantId, HashSet<RelativePath>>,
-    timeline_files: HashMap<TimelineSyncId, TimelineIndexEntry>,
+    timeline_entries: HashMap<ZTenantTimelineId, TimelineIndexEntry>,
 }

 impl RemoteTimelineIndex {
+    pub fn empty() -> Self {
+        Self {
+            timeline_entries: HashMap::new(),
+        }
+    }
+
    /// Attempts to parse file paths (not checking the file contents) and find files
    /// that can be tracked wiht the index.
    /// On parse falures, logs the error and continues, so empty index can be created from not suitable paths.
@@ -64,10 +69,7 @@ impl RemoteTimelineIndex {
        conf: &'static PageServerConf,
        paths: impl Iterator<Item = P>,
    ) -> Self {
-        let mut index = Self {
-            branch_files: HashMap::new(),
-            timeline_files: HashMap::new(),
-        };
+        let mut index = Self::empty();
        for path in paths {
            if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) {
                debug!(
@@ -80,49 +82,101 @@ impl RemoteTimelineIndex {
        index
    }

-    pub fn timeline_entry(&self, id: &TimelineSyncId) -> Option<&TimelineIndexEntry> {
-        self.timeline_files.get(id)
+    pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&TimelineIndexEntry> {
+        self.timeline_entries.get(id)
    }

-    pub fn timeline_entry_mut(&mut self, id: &TimelineSyncId) -> Option<&mut TimelineIndexEntry> {
-        self.timeline_files.get_mut(id)
+    pub fn timeline_entry_mut(
+        &mut self,
+        id: &ZTenantTimelineId,
+    ) -> Option<&mut TimelineIndexEntry> {
+        self.timeline_entries.get_mut(id)
    }

-    pub fn add_timeline_entry(&mut self, id: TimelineSyncId, entry: TimelineIndexEntry) {
-        self.timeline_files.insert(id, entry);
+    pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: TimelineIndexEntry) {
+        self.timeline_entries.insert(id, entry);
    }

-    pub fn all_sync_ids(&self) -> impl Iterator<Item = TimelineSyncId> + '_ {
-        self.timeline_files.keys().copied()
+    /// Replaces a description-only entry for `id` with the full `remote_timeline` data.
+    /// Errors if `id` is not in the index or its entry is not a description.
+    pub fn upgrade_timeline_entry(
+        &mut self,
+        id: &ZTenantTimelineId,
+        remote_timeline: RemoteTimeline,
+    ) -> anyhow::Result<()> {
+        let entry = self
+            .timeline_entries
+            .get_mut(id)
+            .context("timeline is unexpectedly missing from remote index")?;
+        if !matches!(entry.inner, TimelineIndexEntryInner::Description(_)) {
+            bail!("timeline entry is not a description entry")
+        }
+        entry.inner = TimelineIndexEntryInner::Full(remote_timeline);
+        Ok(())
     }

-    pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) {
-        self.branch_files
-            .entry(tenant_id)
-            .or_insert_with(HashSet::new)
-            .insert(path);
+    pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
+        self.timeline_entries.keys().copied()
    }

-    pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet<RelativePath>> {
-        self.branch_files.get(&tenant_id)
+    pub fn set_awaits_download(
+        &mut self,
+        id: &ZTenantTimelineId,
+        awaits_download: bool,
+    ) -> anyhow::Result<()> {
+        self.timeline_entry_mut(id)
+            .ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))?
+            .set_awaits_download(awaits_download);
+        Ok(())
    }
 }

+#[derive(Debug, Clone, PartialEq, Eq, Default)]
+pub struct DescriptionTimelineIndexEntry {
+    pub description: BTreeMap<ArchiveId, ArchiveDescription>,
+    pub awaits_download: bool,
+}
+
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub enum TimelineIndexEntry {
-    /// An archive found on the remote storage, but not yet downloaded, only a metadata from its storage path is available, without archive contents.
+pub struct FullTimelineIndexEntry {
+    pub remote_timeline: RemoteTimeline,
+    pub awaits_download: bool,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TimelineIndexEntryInner {
    Description(BTreeMap<ArchiveId, ArchiveDescription>),
-    /// Full archive metadata, including the file list, parsed from the archive header.
    Full(RemoteTimeline),
 }

+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct TimelineIndexEntry {
+    inner: TimelineIndexEntryInner,
+    awaits_download: bool,
+}
+
 impl TimelineIndexEntry {
+    pub fn new(inner: TimelineIndexEntryInner, awaits_download: bool) -> Self {
+        Self {
+            inner,
+            awaits_download,
+        }
+    }
+
+    pub fn inner(&self) -> &TimelineIndexEntryInner {
+        &self.inner
+    }
+
+    pub fn inner_mut(&mut self) -> &mut TimelineIndexEntryInner {
+        &mut self.inner
+    }
+
    pub fn uploaded_checkpoints(&self) -> BTreeSet<Lsn> {
-        match self {
-            Self::Description(description) => {
+        match &self.inner {
+            TimelineIndexEntryInner::Description(description) => {
                description.keys().map(|archive_id| archive_id.0).collect()
            }
-            Self::Full(remote_timeline) => remote_timeline
+            TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline
                .checkpoint_archives
                .keys()
                .map(|archive_id| archive_id.0)
@@ -132,17 +186,25 @@ impl TimelineIndexEntry {

    /// Gets latest uploaded checkpoint's disk consisten Lsn for the corresponding timeline.
    pub fn disk_consistent_lsn(&self) -> Option<Lsn> {
-        match self {
-            Self::Description(description) => {
+        match &self.inner {
+            TimelineIndexEntryInner::Description(description) => {
                description.keys().map(|archive_id| archive_id.0).max()
            }
-            Self::Full(remote_timeline) => remote_timeline
+            TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline
                .checkpoint_archives
                .keys()
                .map(|archive_id| archive_id.0)
                .max(),
        }
    }
+
+    pub fn get_awaits_download(&self) -> bool {
+        self.awaits_download
+    }
+
+    pub fn set_awaits_download(&mut self, awaits_download: bool) {
+        self.awaits_download = awaits_download;
+    }
 }

 /// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing.
@@ -214,7 +276,7 @@ impl RemoteTimeline {
        let archive = self
            .checkpoint_archives
            .get(&archive_id)
-            .ok_or_else(|| anyhow!("Archive {:?} not found", archive_id))?;
+            .with_context(|| format!("Archive {:?} not found", archive_id))?;

        let mut header_files = Vec::with_capacity(archive.files.len());
        for (expected_archive_position, archive_file) in archive.files.iter().enumerate() {
@@ -226,11 +288,10 @@ impl RemoteTimeline {
                archive_id,
            );

-            let timeline_file = self.timeline_files.get(archive_file).ok_or_else(|| {
-                anyhow!(
+            let timeline_file = self.timeline_files.get(archive_file).with_context(|| {
+                format!(
                    "File with id {:?} not found for archive {:?}",
-                    archive_file,
-                    archive_id
+                    archive_file, archive_id
                )
            })?;
            header_files.push(timeline_file.clone());
@@ -299,30 +360,19 @@ fn try_parse_index_entry(
        })?
        .iter()
        .next()
-        .ok_or_else(|| anyhow!("Found no tenant id in path '{}'", path.display()))?
+        .with_context(|| format!("Found no tenant id in path '{}'", path.display()))?
        .to_string_lossy()
        .parse::<ZTenantId>()
        .with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;

-    let branches_path = conf.branches_path(&tenant_id);
    let timelines_path = conf.timelines_path(&tenant_id);
-    match (
-        RelativePath::new(&branches_path, &path),
-        path.strip_prefix(&timelines_path),
-    ) {
-        (Ok(_), Ok(_)) => bail!(
-            "Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes",
-            path.display(),
-            branches_path.display(),
-            timelines_path.display()
-        ),
-        (Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry),
-        (Err(_), Ok(timelines_subpath)) => {
+    match path.strip_prefix(&timelines_path) {
+        Ok(timelines_subpath) => {
            let mut segments = timelines_subpath.iter();
            let timeline_id = segments
                .next()
-                .ok_or_else(|| {
-                    anyhow!(
+                .with_context(|| {
+                    format!(
                        "{} directory of tenant {} (path '{}') is not an index entry",
                        TIMELINES_SEGMENT_NAME,
                        tenant_id,
@@ -345,18 +395,23 @@ fn try_parse_index_entry(

            let archive_name = path
                .file_name()
-                .ok_or_else(|| anyhow!("Archive '{}' has no file name", path.display()))?
+                .with_context(|| format!("Archive '{}' has no file name", path.display()))?
                .to_string_lossy()
                .to_string();

-            let sync_id = TimelineSyncId(tenant_id, timeline_id);
-            let timeline_index_entry = index
-                .timeline_files
-                .entry(sync_id)
-                .or_insert_with(|| TimelineIndexEntry::Description(BTreeMap::new()));
-            match timeline_index_entry {
-                TimelineIndexEntry::Description(descriptions) => {
-                    descriptions.insert(
+            let sync_id = ZTenantTimelineId {
+                tenant_id,
+                timeline_id,
+            };
+            let timeline_index_entry = index.timeline_entries.entry(sync_id).or_insert_with(|| {
+                TimelineIndexEntry::new(
+                    TimelineIndexEntryInner::Description(BTreeMap::default()),
+                    false,
+                )
+            });
+            match timeline_index_entry.inner_mut() {
+                TimelineIndexEntryInner::Description(description) => {
+                    description.insert(
                        ArchiveId(disk_consistent_lsn),
                        ArchiveDescription {
                            header_size,
@@ -365,16 +420,15 @@ fn try_parse_index_entry(
                        },
                    );
                }
-                TimelineIndexEntry::Full(_) => {
+                TimelineIndexEntryInner::Full(_) => {
                    bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id)
                }
            }
        }
-        (Err(branches_error), Err(timelines_strip_error)) => {
+        Err(timelines_strip_error) => {
            bail!(
-                "Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'",
+                "Path '{}' is not an archive entry '{}'",
                path.display(),
-                branches_error,
                timelines_strip_error,
            )
        }
--- a/pageserver/src/remote_storage/storage_sync/upload.rs
+++ b/pageserver/src/remote_storage/storage_sync/upload.rs
@@ -1,23 +1,20 @@
 //! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.
-//! Currently, tenant branch files are also uploaded, but this does not appear final.

 use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};

-use anyhow::{ensure, Context};
-use futures::{stream::FuturesUnordered, StreamExt};
-use tokio::{fs, sync::RwLock};
+use anyhow::ensure;
+use tokio::sync::RwLock;
 use tracing::{debug, error, warn};
-use zenith_utils::zid::ZTenantId;

 use crate::{
    config::PageServerConf,
    remote_storage::{
        storage_sync::{
-            compression,
-            index::{RemoteTimeline, TimelineIndexEntry},
-            sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask,
+            compression, fetch_full_index,
+            index::{RemoteTimeline, TimelineIndexEntry, TimelineIndexEntryInner},
+            sync_queue, SyncKind, SyncTask,
        },
-        RemoteStorage, TimelineSyncId,
+        RemoteStorage, ZTenantTimelineId,
    },
 };

@@ -26,8 +23,6 @@ use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoin
 /// Attempts to compress and upload given checkpoint files.
 /// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
 ///
-/// Before the checkpoint files are uploaded, branch files are uploaded, if any local ones are missing remotely.
-///
 /// On an error, bumps the retries count and reschedules the entire task.
 /// On success, populates index data with new downloads.
 pub(super) async fn upload_timeline_checkpoint<
@@ -35,50 +30,43 @@ pub(super) async fn upload_timeline_checkpoint<
    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
 >(
    config: &'static PageServerConf,
-    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
-    sync_id: TimelineSyncId,
+    remote_assets: Arc<(S, Arc<RwLock<RemoteTimelineIndex>>)>,
+    sync_id: ZTenantTimelineId,
    new_checkpoint: NewCheckpoint,
    retries: u32,
 ) -> Option<bool> {
    debug!("Uploading checkpoint for sync id {}", sync_id);
-    if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.0).await {
-        error!(
-            "Failed to upload missing branches for sync id {}: {:?}",
-            sync_id, e
-        );
-        sync_queue::push(SyncTask::new(
-            sync_id,
-            retries,
-            SyncKind::Upload(new_checkpoint),
-        ));
-        return Some(false);
-    }
    let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn();

    let index = &remote_assets.1;

-    let TimelineSyncId(tenant_id, timeline_id) = sync_id;
+    let ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    } = sync_id;
    let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);

    let index_read = index.read().await;
    let remote_timeline = match index_read.timeline_entry(&sync_id) {
        None => None,
-        Some(TimelineIndexEntry::Full(remote_timeline)) => Some(Cow::Borrowed(remote_timeline)),
-        Some(TimelineIndexEntry::Description(_)) => {
-            debug!("Found timeline description for the given ids, downloading the full index");
-            match update_index_description(remote_assets.as_ref(), &timeline_dir, sync_id).await {
-                Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)),
-                Err(e) => {
-                    error!("Failed to download full timeline index: {:?}", e);
-                    sync_queue::push(SyncTask::new(
-                        sync_id,
-                        retries,
-                        SyncKind::Upload(new_checkpoint),
-                    ));
-                    return Some(false);
+        Some(entry) => match entry.inner() {
+            TimelineIndexEntryInner::Full(remote_timeline) => Some(Cow::Borrowed(remote_timeline)),
+            TimelineIndexEntryInner::Description(_) => {
+                debug!("Found timeline description for the given ids, downloading the full index");
+                match fetch_full_index(remote_assets.as_ref(), &timeline_dir, sync_id).await {
+                    Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)),
+                    Err(e) => {
+                        error!("Failed to download full timeline index: {:?}", e);
+                        sync_queue::push(SyncTask::new(
+                            sync_id,
+                            retries,
+                            SyncKind::Upload(new_checkpoint),
+                        ));
+                        return Some(false);
+                    }
                }
            }
-        }
+        },
    };

    let already_contains_upload_lsn = remote_timeline
@@ -109,22 +97,40 @@ pub(super) async fn upload_timeline_checkpoint<
    {
        Ok((archive_header, header_size)) => {
            let mut index_write = index.write().await;
-            match index_write.timeline_entry_mut(&sync_id) {
-                Some(TimelineIndexEntry::Full(remote_timeline)) => {
-                    remote_timeline.update_archive_contents(
-                        new_checkpoint.metadata.disk_consistent_lsn(),
-                        archive_header,
-                        header_size,
-                    );
-                }
-                None | Some(TimelineIndexEntry::Description(_)) => {
+            match index_write
+                .timeline_entry_mut(&sync_id)
+                .map(|e| e.inner_mut())
+            {
+                None => {
                    let mut new_timeline = RemoteTimeline::empty();
                    new_timeline.update_archive_contents(
                        new_checkpoint.metadata.disk_consistent_lsn(),
                        archive_header,
                        header_size,
                    );
-                    index_write.add_timeline_entry(sync_id, TimelineIndexEntry::Full(new_timeline));
+                    index_write.add_timeline_entry(
+                        sync_id,
+                        TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false),
+                    )
+                }
+                Some(TimelineIndexEntryInner::Full(remote_timeline)) => {
+                    remote_timeline.update_archive_contents(
+                        new_checkpoint.metadata.disk_consistent_lsn(),
+                        archive_header,
+                        header_size,
+                    );
+                }
+                Some(TimelineIndexEntryInner::Description(_)) => {
+                    let mut new_timeline = RemoteTimeline::empty();
+                    new_timeline.update_archive_contents(
+                        new_checkpoint.metadata.disk_consistent_lsn(),
+                        archive_header,
+                        header_size,
+                    );
+                    index_write.add_timeline_entry(
+                        sync_id,
+                        TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false),
+                    )
                }
            }
            debug!("Checkpoint uploaded successfully");
@@ -150,12 +156,15 @@ async fn try_upload_checkpoint<
    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
 >(
    config: &'static PageServerConf,
-    remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
-    sync_id: TimelineSyncId,
+    remote_assets: Arc<(S, Arc<RwLock<RemoteTimelineIndex>>)>,
+    sync_id: ZTenantTimelineId,
    new_checkpoint: &NewCheckpoint,
    files_to_skip: BTreeSet<PathBuf>,
 ) -> anyhow::Result<(ArchiveHeader, u64)> {
-    let TimelineSyncId(tenant_id, timeline_id) = sync_id;
+    let ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    } = sync_id;
    let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);

    let files_to_upload = new_checkpoint
@@ -194,76 +203,6 @@ async fn try_upload_checkpoint<
    .map(|(header, header_size, _)| (header, header_size))
 }

-async fn upload_missing_branches<
-    P: std::fmt::Debug + Send + Sync + 'static,
-    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
->(
-    config: &'static PageServerConf,
-    (storage, index): &(S, RwLock<RemoteTimelineIndex>),
-    tenant_id: ZTenantId,
-) -> anyhow::Result<()> {
-    let local_branches = tenant_branch_files(config, tenant_id)
-        .await
-        .context("Failed to list local branch files for the tenant")?;
-    let index_read = index.read().await;
-    let remote_branches = index_read
-        .branch_files(tenant_id)
-        .cloned()
-        .unwrap_or_default();
-    drop(index_read);
-
-    let mut branch_uploads = local_branches
-        .difference(&remote_branches)
-        .map(|local_only_branch| async move {
-            let local_branch_path = local_only_branch.as_path(&config.branches_path(&tenant_id));
-            let storage_path = storage.storage_path(&local_branch_path).with_context(|| {
-                format!(
-                    "Failed to derive a storage path for branch with local path '{}'",
-                    local_branch_path.display()
-                )
-            })?;
-            let local_branch_file = fs::OpenOptions::new()
-                .read(true)
-                .open(&local_branch_path)
-                .await
-                .with_context(|| {
-                    format!(
-                        "Failed to open local branch file {} for reading",
-                        local_branch_path.display()
-                    )
-                })?;
-            storage
-                .upload(local_branch_file, &storage_path)
-                .await
-                .with_context(|| {
-                    format!(
-                        "Failed to upload branch file to the remote path {:?}",
-                        storage_path
-                    )
-                })?;
-            Ok::<_, anyhow::Error>(local_only_branch)
-        })
-        .collect::<FuturesUnordered<_>>();
-
-    let mut branch_uploads_failed = false;
-    while let Some(upload_result) = branch_uploads.next().await {
-        match upload_result {
-            Ok(local_only_branch) => index
-                .write()
-                .await
-                .add_branch_file(tenant_id, local_only_branch.clone()),
-            Err(e) => {
-                error!("Failed to upload branch file: {:?}", e);
-                branch_uploads_failed = true;
-            }
-        }
-    }
-
-    ensure!(!branch_uploads_failed, "Failed to upload all branch files");
-
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
    use tempfile::tempdir;
@@ -288,15 +227,17 @@ mod tests {
    #[tokio::test]
    async fn reupload_timeline() -> anyhow::Result<()> {
        let repo_harness = RepoHarness::create("reupload_timeline")?;
-        let sync_id = TimelineSyncId(repo_harness.tenant_id, TIMELINE_ID);
+        let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
        let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
-        let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
-            repo_harness.conf,
-            storage
-                .list()
-                .await?
-                .into_iter()
-                .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+        let index = Arc::new(RwLock::new(
+            RemoteTimelineIndex::try_parse_descriptions_from_paths(
+                repo_harness.conf,
+                storage
+                    .list()
+                    .await?
+                    .into_iter()
+                    .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+            ),
        ));
        let remote_assets = Arc::new((storage, index));
        let index = &remote_assets.1;
@@ -484,15 +425,17 @@ mod tests {
    #[tokio::test]
    async fn reupload_timeline_rejected() -> anyhow::Result<()> {
        let repo_harness = RepoHarness::create("reupload_timeline_rejected")?;
-        let sync_id = TimelineSyncId(repo_harness.tenant_id, TIMELINE_ID);
+        let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
        let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
-        let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
-            repo_harness.conf,
-            storage
-                .list()
-                .await?
-                .into_iter()
-                .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+        let index = Arc::new(RwLock::new(
+            RemoteTimelineIndex::try_parse_descriptions_from_paths(
+                repo_harness.conf,
+                storage
+                    .list()
+                    .await?
+                    .into_iter()
+                    .map(|storage_path| storage.local_path(&storage_path).unwrap()),
+            ),
        ));
        let remote_assets = Arc::new((storage, index));
        let storage = &remote_assets.0;
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -1,4 +1,6 @@
+use crate::layered_repository::metadata::TimelineMetadata;
 use crate::relish::*;
+use crate::remote_storage::RemoteTimelineIndex;
 use crate::walrecord::MultiXactMember;
 use crate::CheckpointConfig;
 use anyhow::Result;
@@ -6,8 +8,9 @@ use bytes::Bytes;
 use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
+use std::fmt::Display;
 use std::ops::{AddAssign, Deref};
-use std::sync::Arc;
+use std::sync::{Arc, RwLockReadGuard};
 use std::time::Duration;
 use zenith_utils::lsn::{Lsn, RecordLsn};
 use zenith_utils::zid::ZTimelineId;
@@ -15,24 +18,43 @@ use zenith_utils::zid::ZTimelineId;
 /// Block number within a relish. This matches PostgreSQL's BlockNumber type.
 pub type BlockNumber = u32;

+#[derive(Clone, Copy, Debug)]
+pub enum TimelineSyncStatusUpdate {
+    Uploaded,
+    Downloaded,
+}
+
+impl Display for TimelineSyncStatusUpdate {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let s = match self {
+            TimelineSyncStatusUpdate::Uploaded => "Uploaded",
+            TimelineSyncStatusUpdate::Downloaded => "Downloaded",
+        };
+        f.write_str(s)
+    }
+}
 ///
 /// A repository corresponds to one .zenith directory. One repository holds multiple
 /// timelines, forked off from the same initial call to 'initdb'.
 pub trait Repository: Send + Sync {
-    /// Updates timeline based on the new sync state, received from the remote storage synchronization.
+    /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
    /// See [`crate::remote_storage`] for more details about the synchronization.
-    fn set_timeline_state(
+    fn apply_timeline_remote_sync_status_update(
        &self,
        timeline_id: ZTimelineId,
-        new_state: TimelineSyncState,
+        timeline_sync_status_update: TimelineSyncStatusUpdate,
    ) -> Result<()>;

-    /// Gets current synchronization state of the timeline.
-    /// See [`crate::remote_storage`] for more details about the synchronization.
-    fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option<TimelineSyncState>;
-
    /// Get Timeline handle for given zenith timeline ID.
-    fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline>;
+    /// This function is idempotent. It doesn't change internal state in any way.
+    fn get_timeline(&self, timelineid: ZTimelineId) -> Option<RepositoryTimeline>;
+
+    /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded.
+    fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
+
+    /// Lists timelines the repository contains.
+    /// Up to repository's implementation to omit certain timelines that are not considered ready for use.
+    fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)>;

    /// Create a new, empty timeline. The caller is responsible for loading data into it
    /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
@@ -64,65 +86,47 @@ pub trait Repository: Send + Sync {
    /// perform one checkpoint iteration, flushing in-memory data on disk.
    /// this function is periodically called by checkponter thread.
    fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
+
+    /// detaches locally available timeline by stopping all threads and removing all the data.
+    fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
+
+    // Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
+    fn get_remote_index(&self) -> &tokio::sync::RwLock<RemoteTimelineIndex>;
 }

 /// A timeline, that belongs to the current repository.
 pub enum RepositoryTimeline {
    /// Timeline, with its files present locally in pageserver's working directory.
    /// Loaded into pageserver's memory and ready to be used.
-    Local(Arc<dyn Timeline>),
-    /// Timeline, found on the pageserver's remote storage, but not yet downloaded locally.
-    Remote {
-        id: ZTimelineId,
-        /// metadata contents of the latest successfully uploaded checkpoint
-        disk_consistent_lsn: Lsn,
+    Loaded(Arc<dyn Timeline>),
+
+    /// All the data is available locally, but not loaded into memory, so loading has to be done before actually using the timeline
+    Unloaded {
+        // It is ok to keep metadata here, because it is not changed when timeline is unloaded.
+        // FIXME can s3 sync actually change it? It can change it when timeline is in awaiting download state.
+        //  but we currently do not download something for the timeline once it is local (even if there are new checkpoints) is it correct?
+        // also it is not that good to keep TimelineMetadata here, because it is layered repo implementation detail
+        metadata: TimelineMetadata,
    },
 }

-impl RepositoryTimeline {
-    pub fn local_timeline(&self) -> Option<Arc<dyn Timeline>> {
-        if let Self::Local(local_timeline) = self {
-            Some(Arc::clone(local_timeline))
-        } else {
-            None
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub enum LocalTimelineState {
+    // timeline is loaded into memory (with layer map and all the bits),
+    Loaded,
+    // timeline is on disk locally and ready to be loaded into memory.
+    Unloaded,
+}
+
+impl<'a> From<&'a RepositoryTimeline> for LocalTimelineState {
+    fn from(local_timeline_entry: &'a RepositoryTimeline) -> Self {
+        match local_timeline_entry {
+            RepositoryTimeline::Loaded(_) => LocalTimelineState::Loaded,
+            RepositoryTimeline::Unloaded { .. } => LocalTimelineState::Unloaded,
        }
    }
 }

-/// A state of the timeline synchronization with the remote storage.
-/// Contains `disk_consistent_lsn` of the corresponding remote timeline (latest checkpoint's disk_consistent_lsn).
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
-pub enum TimelineSyncState {
-    /// No further downloads from the remote storage are needed.
-    /// The timeline state is up-to-date or ahead of the remote storage one,
-    /// ready to be used in any pageserver operation.
-    Ready(Lsn),
-    /// Timeline is scheduled for downloading, but its current local state is not up to date with the remote storage.
-    /// The timeline is not ready to be used in any pageserver operations, otherwise it might diverge its local state from the remote version,
-    /// making it impossible to sync it further.
-    AwaitsDownload(Lsn),
-    /// Timeline was not in the pageserver's local working directory, but was found on the remote storage, ready to be downloaded.
-    /// Cannot be used in any pageserver operations due to complete absence locally.
-    CloudOnly(Lsn),
-    /// Timeline was evicted from the pageserver's local working directory due to conflicting remote and local states or too many errors during the synchronization.
-    /// Such timelines cannot have their state synchronized further and may not have the data about remote timeline's disk_consistent_lsn, since eviction may happen
-    /// due to errors before the remote timeline contents is known.
-    Evicted(Option<Lsn>),
-}
-
-impl TimelineSyncState {
-    pub fn remote_disk_consistent_lsn(&self) -> Option<Lsn> {
-        Some(match self {
-            TimelineSyncState::Evicted(None) => return None,
-            TimelineSyncState::Ready(lsn) => lsn,
-            TimelineSyncState::AwaitsDownload(lsn) => lsn,
-            TimelineSyncState::CloudOnly(lsn) => lsn,
-            TimelineSyncState::Evicted(Some(lsn)) => lsn,
-        })
-        .copied()
-    }
-}
-
 ///
 /// Result of performing GC
 ///
@@ -182,6 +186,9 @@ pub trait Timeline: Send + Sync {
    ///
    fn wait_lsn(&self, lsn: Lsn) -> Result<()>;

+    /// Lock and get timeline's GC cutoff
+    fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<Lsn>;
+
    /// Look up given page version.
    fn get_page_at_lsn(&self, tag: RelishTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes>;

@@ -215,10 +222,12 @@ pub trait Timeline: Send + Sync {

    /// Atomically get both last and prev.
    fn get_last_record_rlsn(&self) -> RecordLsn;
+
    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
    fn get_last_record_lsn(&self) -> Lsn;
+
    fn get_prev_record_lsn(&self) -> Lsn;
-    fn get_start_lsn(&self) -> Lsn;
+
    fn get_disk_consistent_lsn(&self) -> Lsn;

    /// Mutate the timeline with a [`TimelineWriter`].
@@ -233,7 +242,11 @@ pub trait Timeline: Send + Sync {

    ///
    /// Check that it is valid to request operations with that lsn.
-    fn check_lsn_is_in_scope(&self, lsn: Lsn) -> Result<()>;
+    fn check_lsn_is_in_scope(
+        &self,
+        lsn: Lsn,
+        latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
+    ) -> Result<()>;

    /// Retrieve current logical size of the timeline
    ///
@@ -242,7 +255,7 @@ pub trait Timeline: Send + Sync {
    fn get_current_logical_size(&self) -> usize;

    /// Does the same as get_current_logical_size but counted on demand.
-    /// Used in tests to ensure thet incremental and non incremental variants match.
+    /// Used in tests to ensure that incremental and non incremental variants match.
    fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;

    /// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline.
@@ -295,8 +308,12 @@ pub enum ZenithWalRecord {
    /// Native PostgreSQL WAL record
    Postgres { will_init: bool, rec: Bytes },

-    /// Set bits in heap visibility map. (heap blkno, flag bits to clear)
-    ClearVisibilityMapFlags { heap_blkno: u32, flags: u8 },
+    /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
+    ClearVisibilityMapFlags {
+        new_heap_blkno: Option<u32>,
+        old_heap_blkno: Option<u32>,
+        flags: u8,
+    },
    /// Mark transaction IDs as committed on a CLOG page
    ClogSetCommitted { xids: Vec<TransactionId> },
    /// Mark transaction IDs as aborted on a CLOG page
@@ -333,7 +350,7 @@ pub mod repo_harness {

    use crate::{
        config::PageServerConf,
-        layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
+        layered_repository::LayeredRepository,
        walredo::{WalRedoError, WalRedoManager},
    };

@@ -366,7 +383,6 @@ pub mod repo_harness {
            let repo_dir = PageServerConf::test_repo_dir(test_name);
            let _ = fs::remove_dir_all(&repo_dir);
            fs::create_dir_all(&repo_dir)?;
-            fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;

            let conf = PageServerConf::dummy_conf(repo_dir);
            // Make a static copy of the config. This can never be free'd, but that's
@@ -375,20 +391,45 @@ pub mod repo_harness {

            let tenant_id = ZTenantId::generate();
            fs::create_dir_all(conf.tenant_path(&tenant_id))?;
-            fs::create_dir_all(conf.branches_path(&tenant_id))?;
+            fs::create_dir_all(conf.timelines_path(&tenant_id))?;

            Ok(Self { conf, tenant_id })
        }

        pub fn load(&self) -> Box<dyn Repository> {
+            self.try_load().expect("failed to load test repo")
+        }
+
+        pub fn try_load(&self) -> Result<Box<dyn Repository>> {
            let walredo_mgr = Arc::new(TestRedoManager);

-            Box::new(LayeredRepository::new(
+            let repo = Box::new(LayeredRepository::new(
                self.conf,
                walredo_mgr,
                self.tenant_id,
+                Arc::new(tokio::sync::RwLock::new(RemoteTimelineIndex::empty())),
                false,
-            ))
+            ));
+            // populate repo with locally available timelines
+            for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id))
+                .expect("should be able to read timelines dir")
+            {
+                let timeline_dir_entry = timeline_dir_entry.unwrap();
+                let timeline_id: ZTimelineId = timeline_dir_entry
+                    .path()
+                    .file_name()
+                    .unwrap()
+                    .to_string_lossy()
+                    .parse()
+                    .unwrap();
+
+                repo.apply_timeline_remote_sync_status_update(
+                    timeline_id,
+                    TimelineSyncStatusUpdate::Downloaded,
+                )?;
+            }
+
+            Ok(repo)
        }

        pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
@@ -432,8 +473,6 @@ pub mod repo_harness {
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
-    use crate::layered_repository::metadata::METADATA_FILE_NAME;
-
    use super::repo_harness::*;
    use super::*;
    use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
@@ -731,8 +770,8 @@ mod tests {

        let mut lsn = 0x10;
        for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
-            let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
            lsn += 0x10;
+            let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
            writer.put_page_image(TESTREL_A, blknum as BlockNumber, Lsn(lsn), img)?;
        }
        writer.advance_last_record_lsn(Lsn(lsn));
@@ -809,10 +848,9 @@ mod tests {

        // Create a branch, check that the relation is visible there
        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
-        let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
-            Some(timeline) => timeline,
-            None => panic!("Should have a local timeline"),
-        };
+        let newtline = repo
+            .get_timeline_load(NEW_TIMELINE_ID)
+            .expect("Should have a local timeline");
        let new_writer = newtline.writer();

        assert!(newtline
@@ -870,10 +908,9 @@ mod tests {

        // Branch the history, modify relation differently on the new timeline
        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
-        let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
-            Some(timeline) => timeline,
-            None => panic!("Should have a local timeline"),
-        };
+        let newtline = repo
+            .get_timeline_load(NEW_TIMELINE_ID)
+            .expect("Should have a local timeline");
        let new_writer = newtline.writer();

        new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
@@ -985,7 +1022,7 @@ mod tests {
                    .source()
                    .unwrap()
                    .to_string()
-                    .contains("is earlier than initdb lsn"));
+                    .contains("is earlier than latest GC horizon"));
            }
        }

@@ -1002,12 +1039,11 @@ mod tests {
        make_some_layers(&tline, Lsn(0x20))?;

        repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
-
+        let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
+        assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
        match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) {
            Ok(_) => panic!("request for page should have failed"),
-            Err(err) => assert!(err
-                .to_string()
-                .contains("tried to request a page version that was garbage collected")),
+            Err(err) => assert!(err.to_string().contains("not found at")),
        }
        Ok(())
    }
@@ -1021,11 +1057,9 @@ mod tests {
        make_some_layers(&tline, Lsn(0x20))?;

        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
-        let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
-            Some(timeline) => timeline,
-            None => panic!("Should have a local timeline"),
-        };
-
+        let newtline = repo
+            .get_timeline_load(NEW_TIMELINE_ID)
+            .expect("Should have a local timeline");
        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
        repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
        assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok());
@@ -1042,10 +1076,9 @@ mod tests {
        make_some_layers(&tline, Lsn(0x20))?;

        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
-        let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
-            Some(timeline) => timeline,
-            None => panic!("Should have a local timeline"),
-        };
+        let newtline = repo
+            .get_timeline_load(NEW_TIMELINE_ID)
+            .expect("Should have a local timeline");

        make_some_layers(&newtline, Lsn(0x60))?;

@@ -1120,138 +1153,78 @@ mod tests {
    }

    #[test]
-    fn corrupt_metadata() -> Result<()> {
-        const TEST_NAME: &str = "corrupt_metadata";
+    fn timeline_load() -> Result<()> {
+        const TEST_NAME: &str = "timeline_load";
        let harness = RepoHarness::create(TEST_NAME)?;
+        {
+            let repo = harness.load();
+            let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
+            make_some_layers(&tline, Lsn(0x8000))?;
+            tline.checkpoint(CheckpointConfig::Forced)?;
+        }
+
        let repo = harness.load();
+        let tline = repo
+            .get_timeline(TIMELINE_ID)
+            .expect("cannot load timeline");
+        assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));

-        repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        drop(repo);
+        assert!(repo.get_timeline_load(TIMELINE_ID).is_ok());

-        let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
-
-        assert!(metadata_path.is_file());
-
-        let mut metadata_bytes = std::fs::read(&metadata_path)?;
-        assert_eq!(metadata_bytes.len(), 512);
-        metadata_bytes[512 - 4 - 2] ^= 1;
-        std::fs::write(metadata_path, metadata_bytes)?;
-
-        let new_repo = harness.load();
-        let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
-        assert_eq!(err.to_string(), "failed to load metadata");
-        assert_eq!(
-            err.source().unwrap().to_string(),
-            "metadata checksum mismatch"
-        );
+        let tline = repo
+            .get_timeline(TIMELINE_ID)
+            .expect("cannot load timeline");
+        assert!(matches!(tline, RepositoryTimeline::Loaded(_)));

        Ok(())
    }

    #[test]
-    fn future_layerfiles() -> Result<()> {
-        const TEST_NAME: &str = "future_layerfiles";
+    fn timeline_load_with_ancestor() -> Result<()> {
+        const TEST_NAME: &str = "timeline_load_with_ancestor";
        let harness = RepoHarness::create(TEST_NAME)?;
+        // create two timelines
+        {
+            let repo = harness.load();
+            let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
+
+            make_some_layers(&tline, Lsn(0x20))?;
+            tline.checkpoint(CheckpointConfig::Forced)?;
+
+            repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
+
+            let newtline = repo
+                .get_timeline_load(NEW_TIMELINE_ID)
+                .expect("Should have a local timeline");
+
+            make_some_layers(&newtline, Lsn(0x60))?;
+            tline.checkpoint(CheckpointConfig::Forced)?;
+        }
+
+        // check that both of them are initially unloaded
        let repo = harness.load();
+        {
+            let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
+            assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));

-        // Create a timeline with disk_consistent_lsn = 8000
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
-        let writer = tline.writer();
-        writer.advance_last_record_lsn(Lsn(0x8000));
-        drop(writer);
-        repo.checkpoint_iteration(CheckpointConfig::Forced)?;
-        drop(repo);
-
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-
-        let make_empty_file = |filename: &str| -> std::io::Result<()> {
-            let path = timeline_path.join(filename);
-
-            assert!(!path.exists());
-            std::fs::write(&path, &[])?;
-
-            Ok(())
-        };
-
-        // Helper function to check that a relation file exists, and a corresponding
-        // <filename>.0.old file does not.
-        let assert_exists = |filename: &str| {
-            let path = timeline_path.join(filename);
-            assert!(path.exists(), "file {} was removed", filename);
-
-            // Check that there is no .old file
-            let backup_path = timeline_path.join(format!("{}.0.old", filename));
-            assert!(
-                !backup_path.exists(),
-                "unexpected backup file {}",
-                backup_path.display()
-            );
-        };
-
-        // Helper function to check that a relation file does *not* exists, and a corresponding
-        // <filename>.<num>.old file does.
-        let assert_is_renamed = |filename: &str, num: u32| {
-            let path = timeline_path.join(filename);
-            assert!(
-                !path.exists(),
-                "file {} was not removed as expected",
-                filename
-            );
-
-            let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
-            assert!(
-                backup_path.exists(),
-                "backup file {} was not created",
-                backup_path.display()
-            );
-        };
-
-        // These files are considered to be in the future and will be renamed out
-        // of the way
-        let future_filenames = vec![
-            format!("pg_control_0_{:016X}", 0x8001),
-            format!("pg_control_0_{:016X}_{:016X}", 0x8001, 0x8008),
-        ];
-        // But these are not:
-        let past_filenames = vec![
-            format!("pg_control_0_{:016X}", 0x8000),
-            format!("pg_control_0_{:016X}_{:016X}", 0x7000, 0x8001),
-        ];
-
-        for filename in future_filenames.iter().chain(past_filenames.iter()) {
-            make_empty_file(filename)?;
+            let tline = repo
+                .get_timeline(NEW_TIMELINE_ID)
+                .expect("cannot get timeline");
+            assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
        }
+        // load only child timeline
+        let _ = repo
+            .get_timeline_load(NEW_TIMELINE_ID)
+            .expect("cannot load timeline");

-        // Load the timeline. This will cause the files in the "future" to be renamed
-        // away.
-        let new_repo = harness.load();
-        new_repo.get_timeline(TIMELINE_ID).unwrap();
-        drop(new_repo);
+        // check that both, child and ancestor are loaded
+        let tline = repo
+            .get_timeline(NEW_TIMELINE_ID)
+            .expect("cannot get timeline");
+        assert!(matches!(tline, RepositoryTimeline::Loaded(_)));

-        for filename in future_filenames.iter() {
-            assert_is_renamed(filename, 0);
-        }
-        for filename in past_filenames.iter() {
-            assert_exists(filename);
-        }
-
-        // Create the future files again, and load again. They should be renamed to
-        // *.1.old this time.
-        for filename in future_filenames.iter() {
-            make_empty_file(filename)?;
-        }
-
-        let new_repo = harness.load();
-        new_repo.get_timeline(TIMELINE_ID).unwrap();
-        drop(new_repo);
-
-        for filename in future_filenames.iter() {
-            assert_is_renamed(filename, 0);
-            assert_is_renamed(filename, 1);
-        }
-        for filename in past_filenames.iter() {
-            assert_exists(filename);
-        }
+        let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
+        assert!(matches!(tline, RepositoryTimeline::Loaded(_)));

        Ok(())
    }
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -1,19 +1,23 @@
 //! This module acts as a switchboard to access different repositories managed by this
 //! page server.

-use crate::branches;
 use crate::config::PageServerConf;
 use crate::layered_repository::LayeredRepository;
-use crate::repository::{Repository, Timeline, TimelineSyncState};
+use crate::remote_storage::RemoteTimelineIndex;
+use crate::repository::{Repository, Timeline, TimelineSyncStatusUpdate};
 use crate::thread_mgr;
 use crate::thread_mgr::ThreadKind;
+use crate::timelines;
+use crate::timelines::CreateRepo;
 use crate::walredo::PostgresRedoManager;
 use crate::CheckpointConfig;
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{Context, Result};
 use lazy_static::lazy_static;
 use log::*;
 use serde::{Deserialize, Serialize};
-use std::collections::{hash_map, HashMap};
+use serde_with::{serde_as, DisplayFromStr};
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
 use std::fmt;
 use std::sync::{Arc, Mutex, MutexGuard};
 use zenith_utils::zid::{ZTenantId, ZTimelineId};
@@ -57,78 +61,67 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
    TENANTS.lock().unwrap()
 }

-/// Updates tenants' repositories, changing their timelines state in memory.
-pub fn set_timeline_states(
+// Sets up wal redo manager and repository for tenant. Reduces code duplication.
+// Used during pageserver startup, or when new tenant is attached to pageserver.
+pub fn load_local_repo(
    conf: &'static PageServerConf,
-    timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
-) {
-    if timeline_states.is_empty() {
-        debug!("no timeline state updates to perform");
-        return;
-    }
-
-    info!("Updating states for {} timelines", timeline_states.len());
-    trace!("States: {:?}", timeline_states);
-
+    tenant_id: ZTenantId,
+    remote_index: &Arc<tokio::sync::RwLock<RemoteTimelineIndex>>,
+) -> Arc<dyn Repository> {
    let mut m = access_tenants();
-    for (tenant_id, timeline_states) in timeline_states {
-        let tenant = m.entry(tenant_id).or_insert_with(|| {
-            // Set up a WAL redo manager, for applying WAL records.
-            let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
+    let tenant = m.entry(tenant_id).or_insert_with(|| {
+        // Set up a WAL redo manager, for applying WAL records.
+        let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);

-            // Set up an object repository, for actual data storage.
-            let repo: Arc<dyn Repository> = Arc::new(LayeredRepository::new(
-                conf,
-                Arc::new(walredo_mgr),
-                tenant_id,
-                conf.remote_storage_config.is_some(),
-            ));
-            Tenant {
-                state: TenantState::Idle,
-                repo,
-            }
-        });
-        if let Err(e) = put_timelines_into_tenant(tenant, tenant_id, timeline_states) {
-            error!(
-                "Failed to update timeline states for tenant {}: {:?}",
-                tenant_id, e
-            );
+        // Set up an object repository, for actual data storage.
+        let repo: Arc<dyn Repository> = Arc::new(LayeredRepository::new(
+            conf,
+            Arc::new(walredo_mgr),
+            tenant_id,
+            Arc::clone(remote_index),
+            conf.remote_storage_config.is_some(),
+        ));
+        Tenant {
+            state: TenantState::Idle,
+            repo,
        }
-    }
+    });
+    Arc::clone(&tenant.repo)
 }

-fn put_timelines_into_tenant(
-    tenant: &mut Tenant,
-    tenant_id: ZTenantId,
-    timeline_states: HashMap<ZTimelineId, TimelineSyncState>,
-) -> anyhow::Result<()> {
-    for (timeline_id, timeline_state) in timeline_states {
-        // If the timeline is being put into any other state than Ready,
-        // stop any threads operating on it.
-        //
-        // FIXME: This is racy. A page service thread could just get
-        // handle on the Timeline, before we call set_timeline_state()
-        if !matches!(timeline_state, TimelineSyncState::Ready(_)) {
-            thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id));
-
-            // Should we run a final checkpoint to flush all the data to
-            // disk? Doesn't seem necessary; all of the states other than
-            // Ready imply that the data on local disk is corrupt or incomplete,
-            // and we don't want to flush that to disk.
-        }
-
-        tenant
-            .repo
-            .set_timeline_state(timeline_id, timeline_state)
-            .with_context(|| {
-                format!(
-                    "Failed to update timeline {} state to {:?}",
-                    timeline_id, timeline_state
-                )
-            })?;
+/// Updates tenants' repositories, changing their timelines state in memory.
+pub fn apply_timeline_sync_status_updates(
+    conf: &'static PageServerConf,
+    remote_index: Arc<tokio::sync::RwLock<RemoteTimelineIndex>>,
+    sync_status_updates: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>,
+) {
+    if sync_status_updates.is_empty() {
+        debug!("no sync status updates to apply");
+        return;
    }
+    info!(
+        "Applying sync status updates for {} timelines",
+        sync_status_updates.len()
+    );
+    trace!("Sync status updates: {:?}", sync_status_updates);

-    Ok(())
+    for (tenant_id, tenant_timelines_sync_status_updates) in sync_status_updates {
+        let repo = load_local_repo(conf, tenant_id, &remote_index);
+
+        for (timeline_id, timeline_sync_status_update) in tenant_timelines_sync_status_updates {
+            match repo.apply_timeline_remote_sync_status_update(timeline_id, timeline_sync_status_update)
+            {
+                Ok(_) => debug!(
+                    "successfully applied timeline sync status update: {} -> {}",
+                    timeline_id, timeline_sync_status_update
+                ),
+                Err(e) => error!(
+                    "Failed to apply timeline sync status update for tenant {}. timeline {} update {} Error: {:#}",
+                    tenant_id, timeline_id, timeline_sync_status_update, e
+                ),
+            }
+        }
+    }
 }

 ///
@@ -176,24 +169,33 @@ pub fn shutdown_all_tenants() {
    }
 }

-pub fn create_repository_for_tenant(
+pub fn create_tenant_repository(
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
-) -> Result<()> {
-    let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
-    let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
-
+    remote_index: Arc<tokio::sync::RwLock<RemoteTimelineIndex>>,
+) -> Result<Option<ZTenantId>> {
    match access_tenants().entry(tenantid) {
-        hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid),
-        hash_map::Entry::Vacant(v) => {
+        Entry::Occupied(_) => {
+            debug!("tenant {} already exists", tenantid);
+            Ok(None)
+        }
+        Entry::Vacant(v) => {
+            let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
+            let repo = timelines::create_repo(
+                conf,
+                tenantid,
+                CreateRepo::Real {
+                    wal_redo_manager,
+                    remote_index,
+                },
+            )?;
            v.insert(Tenant {
                state: TenantState::Idle,
                repo,
            });
+            Ok(Some(tenantid))
        }
    }
-
-    Ok(())
 }

 pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
@@ -208,7 +210,7 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Re
    let mut m = access_tenants();
    let tenant = m
        .get_mut(&tenantid)
-        .ok_or_else(|| anyhow!("Tenant not found for id {}", tenantid))?;
+        .with_context(|| format!("Tenant not found for id {}", tenantid))?;

    info!("activating tenant {}", tenantid);

@@ -251,24 +253,25 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Reposito
    let m = access_tenants();
    let tenant = m
        .get(&tenantid)
-        .ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid))?;
+        .with_context(|| format!("Tenant {} not found", tenantid))?;

    Ok(Arc::clone(&tenant.repo))
 }

-pub fn get_timeline_for_tenant(
+// Retrieve timeline for tenant. Load it into memory if it is not already loaded
+pub fn get_timeline_for_tenant_load(
    tenantid: ZTenantId,
    timelineid: ZTimelineId,
 ) -> Result<Arc<dyn Timeline>> {
    get_repository_for_tenant(tenantid)?
-        .get_timeline(timelineid)?
-        .local_timeline()
-        .ok_or_else(|| anyhow!("cannot fetch timeline {}", timelineid))
+        .get_timeline_load(timelineid)
+        .with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid))
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    #[serde(with = "hex")]
+    #[serde_as(as = "DisplayFromStr")]
    pub id: ZTenantId,
    pub state: TenantState,
 }
--- a/pageserver/src/timelines.rs
+++ b/pageserver/src/timelines.rs
@@ -0,0 +1,408 @@
+//!
+//! Timeline management code
+//
+
+use anyhow::{bail, Context, Result};
+use postgres_ffi::ControlFileData;
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use std::{
+    fs,
+    path::Path,
+    process::{Command, Stdio},
+    sync::Arc,
+};
+use tracing::*;
+
+use zenith_utils::lsn::Lsn;
+use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
+use zenith_utils::{crashsafe_dir, logging};
+
+use crate::{
+    config::PageServerConf,
+    layered_repository::metadata::TimelineMetadata,
+    remote_storage::RemoteTimelineIndex,
+    repository::{LocalTimelineState, Repository},
+};
+use crate::{import_datadir, LOG_FILE_NAME};
+use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager};
+use crate::{repository::RepositoryTimeline, tenant_mgr};
+use crate::{repository::Timeline, CheckpointConfig};
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct LocalTimelineInfo {
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub ancestor_timeline_id: Option<ZTimelineId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub ancestor_lsn: Option<Lsn>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub last_record_lsn: Lsn,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub prev_record_lsn: Option<Lsn>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub disk_consistent_lsn: Lsn,
+    pub current_logical_size: Option<usize>, // is None when timeline is Unloaded
+    pub current_logical_size_non_incremental: Option<usize>,
+    pub timeline_state: LocalTimelineState,
+}
+
+impl LocalTimelineInfo {
+    pub fn from_loaded_timeline(
+        timeline: &dyn Timeline,
+        include_non_incremental_logical_size: bool,
+    ) -> anyhow::Result<Self> {
+        let last_record_lsn = timeline.get_last_record_lsn();
+        let info = LocalTimelineInfo {
+            ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
+            ancestor_lsn: {
+                match timeline.get_ancestor_lsn() {
+                    Lsn(0) => None,
+                    lsn @ Lsn(_) => Some(lsn),
+                }
+            },
+            disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
+            last_record_lsn,
+            prev_record_lsn: Some(timeline.get_prev_record_lsn()),
+            timeline_state: LocalTimelineState::Loaded,
+            current_logical_size: Some(timeline.get_current_logical_size()),
+            current_logical_size_non_incremental: if include_non_incremental_logical_size {
+                Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?)
+            } else {
+                None
+            },
+        };
+        Ok(info)
+    }
+
+    pub fn from_unloaded_timeline(metadata: &TimelineMetadata) -> Self {
+        LocalTimelineInfo {
+            ancestor_timeline_id: metadata.ancestor_timeline(),
+            ancestor_lsn: {
+                match metadata.ancestor_lsn() {
+                    Lsn(0) => None,
+                    lsn @ Lsn(_) => Some(lsn),
+                }
+            },
+            disk_consistent_lsn: metadata.disk_consistent_lsn(),
+            last_record_lsn: metadata.disk_consistent_lsn(),
+            prev_record_lsn: metadata.prev_record_lsn(),
+            timeline_state: LocalTimelineState::Unloaded,
+            current_logical_size: None,
+            current_logical_size_non_incremental: None,
+        }
+    }
+
+    pub fn from_repo_timeline(
+        repo_timeline: RepositoryTimeline,
+        include_non_incremental_logical_size: bool,
+    ) -> anyhow::Result<Self> {
+        match repo_timeline {
+            RepositoryTimeline::Loaded(timeline) => {
+                Self::from_loaded_timeline(timeline.as_ref(), include_non_incremental_logical_size)
+            }
+            RepositoryTimeline::Unloaded { metadata } => {
+                Ok(Self::from_unloaded_timeline(&metadata))
+            }
+        }
+    }
+}
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct RemoteTimelineInfo {
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub remote_consistent_lsn: Option<Lsn>,
+    pub awaits_download: bool,
+}
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct TimelineInfo {
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: ZTenantId,
+    #[serde_as(as = "DisplayFromStr")]
+    pub timeline_id: ZTimelineId,
+    pub local: Option<LocalTimelineInfo>,
+    pub remote: Option<RemoteTimelineInfo>,
+}
+
+pub fn extract_remote_timeline_info(
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,
+    remote_index: &RemoteTimelineIndex,
+) -> Option<RemoteTimelineInfo> {
+    remote_index
+        .timeline_entry(&ZTenantTimelineId {
+            tenant_id,
+            timeline_id,
+        })
+        .map(|remote_entry| RemoteTimelineInfo {
+            remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
+            awaits_download: remote_entry.get_awaits_download(),
+        })
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct PointInTime {
+    pub timeline_id: ZTimelineId,
+    pub lsn: Lsn,
+}
+
+pub fn init_pageserver(
+    conf: &'static PageServerConf,
+    create_tenant: Option<ZTenantId>,
+    initial_timeline_id: Option<ZTimelineId>,
+) -> anyhow::Result<()> {
+    // Initialize logger
+    // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
+    let _log_file = logging::init(LOG_FILE_NAME, true)?;
+
+    crashsafe_dir::create_dir_all(conf.tenants_path())?;
+
+    if let Some(tenant_id) = create_tenant {
+        println!("initializing tenantid {}", tenant_id);
+        let repo =
+            create_repo(conf, tenant_id, CreateRepo::Dummy).context("failed to create repo")?;
+        let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
+        bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())
+            .context("failed to create initial timeline")?;
+        println!("initial timeline {} created", new_timeline_id)
+    } else if initial_timeline_id.is_some() {
+        println!("Ignoring initial timeline parameter, due to no tenant id to create given");
+    }
+
+    println!("pageserver init succeeded");
+    Ok(())
+}
+
+pub enum CreateRepo {
+    Real {
+        wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
+        remote_index: Arc<tokio::sync::RwLock<RemoteTimelineIndex>>,
+    },
+    Dummy,
+}
+
+pub fn create_repo(
+    conf: &'static PageServerConf,
+    tenant_id: ZTenantId,
+    create_repo: CreateRepo,
+) -> Result<Arc<dyn Repository>> {
+    let (wal_redo_manager, remote_index) = match create_repo {
+        CreateRepo::Real {
+            wal_redo_manager,
+            remote_index,
+        } => (wal_redo_manager, remote_index),
+        CreateRepo::Dummy => {
+            // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
+            // process during repository initialization.
+            //
+            // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
+            // initdb in the background, and it kept running even after the "zenith init" had exited.
+            // In tests, we started the page server immediately after that, so that initdb was still
+            // running in the background, and we failed to run initdb again in the same directory. This
+            // has been solved for the rapid init+start case now, but the general race condition remains
+            // if you restart the server quickly. The WAL redo manager doesn't use a separate thread
+            // anymore, but I think that could still happen.
+            let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {});
+
+            let remote_index = Arc::new(tokio::sync::RwLock::new(RemoteTimelineIndex::empty()));
+            (wal_redo_manager as _, remote_index)
+        }
+    };
+
+    let repo_dir = conf.tenant_path(&tenant_id);
+    if repo_dir.exists() {
+        bail!("tenant {} directory already exists", tenant_id);
+    }
+
+    // top-level dir may exist if we are creating it through CLI
+    crashsafe_dir::create_dir_all(&repo_dir)
+        .with_context(|| format!("could not create directory {}", repo_dir.display()))?;
+    crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?;
+    info!("created directory structure in {}", repo_dir.display());
+
+    Ok(Arc::new(LayeredRepository::new(
+        conf,
+        wal_redo_manager,
+        tenant_id,
+        remote_index,
+        conf.remote_storage_config.is_some(),
+    )))
+}
+
+// Returns checkpoint LSN from controlfile
+fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
+    // Read control file to extract the LSN
+    let controlfile_path = path.join("global").join("pg_control");
+    let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
+    let lsn = controlfile.checkPoint;
+
+    Ok(Lsn(lsn))
+}
+
+// Create the cluster temporarily in 'initdbpath' directory inside the repository
+// to get bootstrap data for timeline initialization.
+//
+fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
+    info!("running initdb in {}... ", initdbpath.display());
+
+    let initdb_path = conf.pg_bin_dir().join("initdb");
+    let initdb_output = Command::new(initdb_path)
+        .args(&["-D", initdbpath.to_str().unwrap()])
+        .args(&["-U", &conf.superuser])
+        .args(&["-E", "utf8"])
+        .arg("--no-instructions")
+        // This is only used for a temporary installation that is deleted shortly after,
+        // so no need to fsync it
+        .arg("--no-sync")
+        .env_clear()
+        .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+        .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+        .stdout(Stdio::null())
+        .output()
+        .context("failed to execute initdb")?;
+    if !initdb_output.status.success() {
+        bail!(
+            "initdb failed: '{}'",
+            String::from_utf8_lossy(&initdb_output.stderr)
+        );
+    }
+
+    Ok(())
+}
+
+//
+// - run initdb to init temporary instance and get bootstrap data
+// - after initialization completes, remove the temp dir.
+//
+fn bootstrap_timeline(
+    conf: &'static PageServerConf,
+    tenantid: ZTenantId,
+    tli: ZTimelineId,
+    repo: &dyn Repository,
+) -> Result<Arc<dyn Timeline>> {
+    let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
+
+    let initdb_path = conf.tenant_path(&tenantid).join("tmp");
+
+    // Init temporary repo to get bootstrap data
+    run_initdb(conf, &initdb_path)?;
+    let pgdata_path = initdb_path;
+
+    let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
+
+    // Import the contents of the data directory at the initial checkpoint
+    // LSN, and any WAL after that.
+    // Initdb lsn will be equal to last_record_lsn which will be set after import.
+    // Because we know it upfront, avoid having an option or dummy zero value by passing it to create_empty_timeline.
+    let timeline = repo.create_empty_timeline(tli, lsn)?;
+    import_datadir::import_timeline_from_postgres_datadir(
+        &pgdata_path,
+        timeline.writer().as_ref(),
+        lsn,
+    )?;
+    timeline.checkpoint(CheckpointConfig::Forced)?;
+
+    println!(
+        "created initial timeline {} timeline.lsn {}",
+        tli,
+        timeline.get_last_record_lsn()
+    );
+
+    // Remove temp dir. We don't need it anymore
+    fs::remove_dir_all(pgdata_path)?;
+
+    Ok(timeline)
+}
+
+pub(crate) fn get_local_timelines(
+    tenant_id: ZTenantId,
+    include_non_incremental_logical_size: bool,
+) -> Result<Vec<(ZTimelineId, LocalTimelineInfo)>> {
+    let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
+        .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
+    let repo_timelines = repo.list_timelines();
+
+    let mut local_timeline_info = Vec::with_capacity(repo_timelines.len());
+    for (timeline_id, repository_timeline) in repo_timelines {
+        local_timeline_info.push((
+            timeline_id,
+            LocalTimelineInfo::from_repo_timeline(
+                repository_timeline,
+                include_non_incremental_logical_size,
+            )?,
+        ))
+    }
+    Ok(local_timeline_info)
+}
+
+pub(crate) fn create_timeline(
+    conf: &'static PageServerConf,
+    tenant_id: ZTenantId,
+    new_timeline_id: Option<ZTimelineId>,
+    ancestor_timeline_id: Option<ZTimelineId>,
+    ancestor_start_lsn: Option<Lsn>,
+) -> Result<Option<TimelineInfo>> {
+    let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
+    let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
+
+    if conf.timeline_path(&new_timeline_id, &tenant_id).exists() {
+        debug!("timeline {} already exists", new_timeline_id);
+        return Ok(None);
+    }
+
+    let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));
+
+    let new_timeline_info = match ancestor_timeline_id {
+        Some(ancestor_timeline_id) => {
+            let ancestor_timeline = repo
+                .get_timeline_load(ancestor_timeline_id)
+                .context("Cannot branch off the timeline that's not present locally")?;
+
+            if start_lsn == Lsn(0) {
+                // Find end of WAL on the old timeline
+                let end_of_wal = ancestor_timeline.get_last_record_lsn();
+                info!("branching at end of WAL: {}", end_of_wal);
+                start_lsn = end_of_wal;
+            } else {
+                // Wait for the WAL to arrive and be processed on the parent branch up
+                // to the requested branch point. The repository code itself doesn't
+                // require it, but if we start to receive WAL on the new timeline,
+                // decoding the new WAL might need to look up previous pages, relation
+                // sizes etc. and that would get confused if the previous page versions
+                // are not in the repository yet.
+                ancestor_timeline.wait_lsn(start_lsn)?;
+            }
+            start_lsn = start_lsn.align();
+
+            let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
+            if ancestor_ancestor_lsn > start_lsn {
+                // can we safely just branch from the ancestor instead?
+                anyhow::bail!(
+                    "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
+                    start_lsn,
+                    ancestor_timeline_id,
+                    ancestor_ancestor_lsn,
+                );
+            }
+            repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
+            // load the timeline into memory
+            let loaded_timeline = repo.get_timeline_load(new_timeline_id)?;
+            LocalTimelineInfo::from_loaded_timeline(loaded_timeline.as_ref(), false)
+                .context("cannot fill timeline info")?
+        }
+        None => {
+            let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
+            LocalTimelineInfo::from_loaded_timeline(new_timeline.as_ref(), false)
+                .context("cannot fill timeline info")?
+        }
+    };
+    Ok(Some(TimelineInfo {
+        tenant_id,
+        timeline_id: new_timeline_id,
+        local: Some(new_timeline_info),
+        remote: None,
+    }))
+}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -332,8 +332,11 @@ impl VirtualFile {
        // TODO: We could downgrade the locks to read mode before calling
        // 'func', to allow a little bit more concurrency, but the standard
        // library RwLock doesn't allow downgrading without releasing the lock,
-        // and that doesn't seem worth the trouble. (parking_lot RwLock would
-        // allow it)
+        // and that doesn't seem worth the trouble.
+        //
+        // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and
+        // may deadlock on subsequent read calls.
+        // Simply replacing all `RwLock` in the project causes deadlocks, so use it sparingly.
        let result = STORAGE_IO_TIME
            .with_label_values(&[op, &self.tenantid, &self.timelineid])
            .observe_closure_duration(|| func(&file));
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -349,49 +349,25 @@ impl WalIngest {
        decoded: &mut DecodedWALRecord,
    ) -> Result<()> {
        // Handle VM bit updates that are implicitly part of heap records.
+
+        // First, look at the record to determine which VM bits need
+        // to be cleared. If either of these variables is set, we
+        // need to clear the corresponding bits in the visibility map.
+        let mut new_heap_blkno: Option<u32> = None;
+        let mut old_heap_blkno: Option<u32> = None;
        if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
            let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
            if info == pg_constants::XLOG_HEAP_INSERT {
                let xlrec = XlHeapInsert::decode(buf);
                assert_eq!(0, buf.remaining());
-                if (xlrec.flags
-                    & (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
-                        | pg_constants::XLH_INSERT_ALL_FROZEN_SET))
-                    != 0
-                {
-                    timeline.put_wal_record(
-                        lsn,
-                        RelishTag::Relation(RelTag {
-                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
-                            spcnode: decoded.blocks[0].rnode_spcnode,
-                            dbnode: decoded.blocks[0].rnode_dbnode,
-                            relnode: decoded.blocks[0].rnode_relnode,
-                        }),
-                        decoded.blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
-                        ZenithWalRecord::ClearVisibilityMapFlags {
-                            heap_blkno: decoded.blocks[0].blkno,
-                            flags: pg_constants::VISIBILITYMAP_VALID_BITS,
-                        },
-                    )?;
+                if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                    new_heap_blkno = Some(decoded.blocks[0].blkno);
                }
            } else if info == pg_constants::XLOG_HEAP_DELETE {
                let xlrec = XlHeapDelete::decode(buf);
                assert_eq!(0, buf.remaining());
                if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
-                    timeline.put_wal_record(
-                        lsn,
-                        RelishTag::Relation(RelTag {
-                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
-                            spcnode: decoded.blocks[0].rnode_spcnode,
-                            dbnode: decoded.blocks[0].rnode_dbnode,
-                            relnode: decoded.blocks[0].rnode_relnode,
-                        }),
-                        decoded.blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
-                        ZenithWalRecord::ClearVisibilityMapFlags {
-                            heap_blkno: decoded.blocks[0].blkno,
-                            flags: pg_constants::VISIBILITYMAP_VALID_BITS,
-                        },
-                    )?;
+                    new_heap_blkno = Some(decoded.blocks[0].blkno);
                }
            } else if info == pg_constants::XLOG_HEAP_UPDATE
                || info == pg_constants::XLOG_HEAP_HOT_UPDATE
@@ -400,39 +376,15 @@ impl WalIngest {
                // the size of tuple data is inferred from the size of the record.
                // we can't validate the remaining number of bytes without parsing
                // the tuple data.
-                if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
-                    timeline.put_wal_record(
-                        lsn,
-                        RelishTag::Relation(RelTag {
-                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
-                            spcnode: decoded.blocks[0].rnode_spcnode,
-                            dbnode: decoded.blocks[0].rnode_dbnode,
-                            relnode: decoded.blocks[0].rnode_relnode,
-                        }),
-                        decoded.blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
-                        ZenithWalRecord::ClearVisibilityMapFlags {
-                            heap_blkno: decoded.blocks[0].blkno,
-                            flags: pg_constants::VISIBILITYMAP_VALID_BITS,
-                        },
-                    )?;
+                if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
+                    old_heap_blkno = Some(decoded.blocks[0].blkno);
                }
-                if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
-                    && decoded.blocks.len() > 1
-                {
-                    timeline.put_wal_record(
-                        lsn,
-                        RelishTag::Relation(RelTag {
-                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
-                            spcnode: decoded.blocks[1].rnode_spcnode,
-                            dbnode: decoded.blocks[1].rnode_dbnode,
-                            relnode: decoded.blocks[1].rnode_relnode,
-                        }),
-                        decoded.blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
-                        ZenithWalRecord::ClearVisibilityMapFlags {
-                            heap_blkno: decoded.blocks[1].blkno,
-                            flags: pg_constants::VISIBILITYMAP_VALID_BITS,
-                        },
-                    )?;
+                if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                    // PostgreSQL only sets XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a non-HOT
+                    // update where the new tuple goes to a different page than the old one;
+                    // otherwise only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is set.
+                    // NOTE(review): the replaced code used blocks[0] for the new page; confirm.
+                    new_heap_blkno = Some(decoded.blocks[1].blkno);
                }
            }
        } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -448,23 +400,60 @@ impl WalIngest {
                };
                assert_eq!(offset_array_len, buf.remaining());

-                // FIXME: why also ALL_FROZEN_SET?
-                if (xlrec.flags
-                    & (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
-                        | pg_constants::XLH_INSERT_ALL_FROZEN_SET))
-                    != 0
-                {
+                if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                    new_heap_blkno = Some(decoded.blocks[0].blkno);
+                }
+            }
+        }
+        // FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED?
+
+        // Clear the VM bits if required.
+        if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
+            let vm_relish = RelishTag::Relation(RelTag {
+                forknum: pg_constants::VISIBILITYMAP_FORKNUM,
+                spcnode: decoded.blocks[0].rnode_spcnode,
+                dbnode: decoded.blocks[0].rnode_dbnode,
+                relnode: decoded.blocks[0].rnode_relnode,
+            });
+
+            let new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
+            let old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
+            if new_vm_blk == old_vm_blk {
+                // An UPDATE record that needs to clear the bits for both old and the
+                // new page, both of which reside on the same VM page.
+                timeline.put_wal_record(
+                    lsn,
+                    vm_relish,
+                    new_vm_blk.unwrap(),
+                    ZenithWalRecord::ClearVisibilityMapFlags {
+                        new_heap_blkno,
+                        old_heap_blkno,
+                        flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                    },
+                )?;
+            } else {
+                // Clear VM bits for one heap page, or for two pages that reside on
+                // different VM pages.
+                if let Some(new_vm_blk) = new_vm_blk {
                    timeline.put_wal_record(
                        lsn,
-                        RelishTag::Relation(RelTag {
-                            forknum: pg_constants::VISIBILITYMAP_FORKNUM,
-                            spcnode: decoded.blocks[0].rnode_spcnode,
-                            dbnode: decoded.blocks[0].rnode_dbnode,
-                            relnode: decoded.blocks[0].rnode_relnode,
-                        }),
-                        decoded.blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
+                        vm_relish,
+                        new_vm_blk,
                        ZenithWalRecord::ClearVisibilityMapFlags {
-                            heap_blkno: decoded.blocks[0].blkno,
+                            new_heap_blkno,
+                            old_heap_blkno: None,
+                            flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                        },
+                    )?;
+                }
+                if let Some(old_vm_blk) = old_vm_blk {
+                    timeline.put_wal_record(
+                        lsn,
+                        vm_relish,
+                        old_vm_blk,
+                        ZenithWalRecord::ClearVisibilityMapFlags {
+                            new_heap_blkno: None,
+                            old_heap_blkno,
                            flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                        },
                    )?;
@@ -472,8 +461,6 @@ impl WalIngest {
            }
        }

-        // FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED?
-
        Ok(())
    }

--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -11,14 +11,16 @@ use crate::thread_mgr;
 use crate::thread_mgr::ThreadKind;
 use crate::walingest::WalIngest;
 use anyhow::{bail, Context, Error, Result};
+use bytes::BytesMut;
+use fail::fail_point;
 use lazy_static::lazy_static;
-use parking_lot::Mutex;
 use postgres_ffi::waldecoder::*;
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
 use std::cell::Cell;
 use std::collections::HashMap;
 use std::str::FromStr;
+use std::sync::Mutex;
 use std::thread_local;
 use std::time::SystemTime;
 use tokio::pin;
@@ -27,7 +29,9 @@ use tokio_postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
 use tokio_stream::StreamExt;
 use tracing::*;
 use zenith_utils::lsn::Lsn;
+use zenith_utils::pq_proto::ZenithFeedback;
 use zenith_utils::zid::ZTenantId;
+use zenith_utils::zid::ZTenantTimelineId;
 use zenith_utils::zid::ZTimelineId;

 //
@@ -50,7 +54,7 @@ thread_local! {
 }

 fn drop_wal_receiver(tenantid: ZTenantId, timelineid: ZTimelineId) {
-    let mut receivers = WAL_RECEIVERS.lock();
+    let mut receivers = WAL_RECEIVERS.lock().unwrap();
    receivers.remove(&(tenantid, timelineid));
 }

@@ -61,10 +65,11 @@ pub fn launch_wal_receiver(
    timelineid: ZTimelineId,
    wal_producer_connstr: &str,
 ) -> Result<()> {
-    let mut receivers = WAL_RECEIVERS.lock();
+    let mut receivers = WAL_RECEIVERS.lock().unwrap();

    match receivers.get_mut(&(tenantid, timelineid)) {
        Some(receiver) => {
+            info!("wal receiver already running, updating connection string");
            receiver.wal_producer_connstr = wal_producer_connstr.into();
        }
        None => {
@@ -93,7 +98,7 @@ pub fn launch_wal_receiver(

 // Look up current WAL producer connection string in the hash table
 fn get_wal_producer_connstr(tenantid: ZTenantId, timelineid: ZTimelineId) -> String {
-    let receivers = WAL_RECEIVERS.lock();
+    let receivers = WAL_RECEIVERS.lock().unwrap();

    receivers
        .get(&(tenantid, timelineid))
@@ -107,18 +112,18 @@ fn get_wal_producer_connstr(tenantid: ZTenantId, timelineid: ZTimelineId) -> Str
 //
 fn thread_main(
    conf: &'static PageServerConf,
-    tenantid: ZTenantId,
-    timelineid: ZTimelineId,
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,
 ) -> Result<()> {
-    let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
+    let _enter = info_span!("WAL receiver", timeline = %timeline_id, tenant = %tenant_id).entered();
    info!("WAL receiver thread started");

    // Look up the current WAL producer address
-    let wal_producer_connstr = get_wal_producer_connstr(tenantid, timelineid);
+    let wal_producer_connstr = get_wal_producer_connstr(tenant_id, timeline_id);

    // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
    // and start streaming WAL from it.
-    let res = walreceiver_main(conf, tenantid, timelineid, &wal_producer_connstr);
+    let res = walreceiver_main(conf, tenant_id, timeline_id, &wal_producer_connstr);

    // TODO cleanup info messages
    if let Err(e) = res {
@@ -126,20 +131,20 @@ fn thread_main(
    } else {
        info!(
            "walreceiver disconnected tenant {}, timelineid {}",
-            tenantid, timelineid
+            tenant_id, timeline_id
        );
    }

    // Drop it from list of active WAL_RECEIVERS
    // so that next callmemaybe request launched a new thread
-    drop_wal_receiver(tenantid, timelineid);
+    drop_wal_receiver(tenant_id, timeline_id);
    Ok(())
 }

 fn walreceiver_main(
    _conf: &PageServerConf,
-    tenantid: ZTenantId,
-    timelineid: ZTimelineId,
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,
    wal_producer_connstr: &str,
 ) -> Result<(), Error> {
    // Connect to the database in replication mode.
@@ -158,7 +163,7 @@ fn walreceiver_main(
    // This is from tokio-postgres docs, but it is a bit weird in our case because we extensively use block_on
    runtime.spawn(async move {
        if let Err(e) = connection.await {
-            eprintln!("connection error: {}", e);
+            error!("connection error: {}", e);
        }
    });

@@ -178,13 +183,16 @@ fn walreceiver_main(
    let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
    let mut caught_up = false;

-    let timeline =
-        tenant_mgr::get_timeline_for_tenant(tenantid, timelineid).with_context(|| {
-            format!(
-                "Can not start the walrecever for a remote tenant {}, timeline {}",
-                tenantid, timelineid,
-            )
-        })?;
+    let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
+        .with_context(|| format!("no repository found for tenant {}", tenant_id))?;
+    let timeline = repo.get_timeline_load(timeline_id).with_context(|| {
+        format!(
+            "local timeline {} not found for tenant {}",
+            timeline_id, tenant_id
+        )
+    })?;
+
+    let remote_index = repo.get_remote_index();

    //
    // Start streaming the WAL, from where we left off previously.
@@ -252,6 +260,8 @@ fn walreceiver_main(
                    let writer = timeline.writer();
                    walingest.ingest_record(writer.as_ref(), recdata, lsn)?;

+                    fail_point!("walreceiver-after-ingest");
+
                    last_rec_lsn = lsn;
                }

@@ -286,26 +296,47 @@ fn walreceiver_main(
        };

        if let Some(last_lsn) = status_update {
-            let last_lsn = PgLsn::from(u64::from(last_lsn));
-            let timeline_synced_disk_consistent_lsn =
-                tenant_mgr::get_repository_for_tenant(tenantid)?
-                    .get_timeline_state(timelineid)
-                    .and_then(|state| state.remote_disk_consistent_lsn())
-                    .unwrap_or(Lsn(0));
+            let timeline_remote_consistent_lsn = runtime.block_on(async {
+                remote_index
+                    .read()
+                    .await
+                    // here we either do not have this timeline in remote index
+                    // or there were no checkpoints for it yet
+                    .timeline_entry(&ZTenantTimelineId {
+                        tenant_id,
+                        timeline_id,
+                    })
+                    .and_then(|e| e.disk_consistent_lsn())
+                    .unwrap_or(Lsn(0)) // no checkpoint was uploaded
+            });

            // The last LSN we processed. It is not guaranteed to survive pageserver crash.
-            let write_lsn = last_lsn;
+            let write_lsn = u64::from(last_lsn);
            // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
-            let flush_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
+            let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
            // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
            // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
-            let apply_lsn = PgLsn::from(u64::from(timeline_synced_disk_consistent_lsn));
+            let apply_lsn = u64::from(timeline_remote_consistent_lsn);
            let ts = SystemTime::now();
-            const NO_REPLY: u8 = 0;
+
+            // Send zenith feedback message.
+            // Regular standby_status_update fields are put into this message.
+            let zenith_status_update = ZenithFeedback {
+                current_timeline_size: timeline.get_current_logical_size() as u64,
+                ps_writelsn: write_lsn,
+                ps_flushlsn: flush_lsn,
+                ps_applylsn: apply_lsn,
+                ps_replytime: ts,
+            };
+
+            debug!("zenith_status_update {:?}", zenith_status_update);
+
+            let mut data = BytesMut::new();
+            zenith_status_update.serialize(&mut data)?;
            runtime.block_on(
                physical_stream
                    .as_mut()
-                    .standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY),
+                    .zenith_status_update(data.len() as u64, &data),
            )?;
        }
    }
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -268,12 +268,11 @@ impl XlXactParsedRecord {
        let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
        // The record starts with time of commit/abort
        let xact_time = buf.get_i64_le();
-        let xinfo;
-        if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
-            xinfo = buf.get_u32_le();
+        let xinfo = if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
+            buf.get_u32_le()
        } else {
-            xinfo = 0;
-        }
+            0
+        };
        let db_id;
        let ts_id;
        if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
@@ -502,7 +501,6 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
            0..=pg_constants::XLR_MAX_BLOCK_ID => {
                /* XLogRecordBlockHeader */
                let mut blk = DecodedBkpBlock::new();
-                let fork_flags: u8;

                if block_id <= max_block_id {
                    // TODO
@@ -515,7 +513,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                }
                max_block_id = block_id;

-                fork_flags = buf.get_u8();
+                let fork_flags: u8 = buf.get_u8();
                blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
                blk.flags = fork_flags;
                blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0;
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -102,8 +102,6 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
    }
 }

-static TIMEOUT: Duration = Duration::from_secs(20);
-
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
@@ -221,7 +219,14 @@ impl WalRedoManager for PostgresRedoManager {
                let result = if batch_zenith {
                    self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..i])
                } else {
-                    self.apply_batch_postgres(rel, blknum, lsn, img, &records[batch_start..i])
+                    self.apply_batch_postgres(
+                        rel,
+                        blknum,
+                        lsn,
+                        img,
+                        &records[batch_start..i],
+                        self.conf.wal_redo_timeout,
+                    )
                };
                img = Some(result?);

@@ -233,7 +238,14 @@ impl WalRedoManager for PostgresRedoManager {
        if batch_zenith {
            self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..])
        } else {
-            self.apply_batch_postgres(rel, blknum, lsn, img, &records[batch_start..])
+            self.apply_batch_postgres(
+                rel,
+                blknum,
+                lsn,
+                img,
+                &records[batch_start..],
+                self.conf.wal_redo_timeout,
+            )
        }
    }
 }
@@ -261,6 +273,7 @@ impl PostgresRedoManager {
        lsn: Lsn,
        base_img: Option<Bytes>,
        records: &[(Lsn, ZenithWalRecord)],
+        wal_redo_timeout: Duration,
    ) -> Result<Bytes, WalRedoError> {
        let start_time = Instant::now();

@@ -281,7 +294,7 @@ impl PostgresRedoManager {
        let result = if let RelishTag::Relation(rel) = rel {
            // Relational WAL records are applied using wal-redo-postgres
            let buf_tag = BufferTag { rel, blknum };
-            apply_result = process.apply_wal_records(buf_tag, base_img, records);
+            apply_result = process.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout);

            apply_result.map_err(WalRedoError::IoError)
        } else {
@@ -363,25 +376,44 @@ impl PostgresRedoManager {
                will_init: _,
                rec: _,
            } => panic!("tried to pass postgres wal record to zenith WAL redo"),
-            ZenithWalRecord::ClearVisibilityMapFlags { heap_blkno, flags } => {
-                // Calculate the VM block and offset that corresponds to the heap block.
-                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(*heap_blkno);
-                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(*heap_blkno);
-                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(*heap_blkno);
-
-                // Check that we're modifying the correct VM block.
+            ZenithWalRecord::ClearVisibilityMapFlags {
+                new_heap_blkno,
+                old_heap_blkno,
+                flags,
+            } => {
+                // sanity check that this is modifying the correct relish
                assert!(
                    check_forknum(&rel, pg_constants::VISIBILITYMAP_FORKNUM),
                    "ClearVisibilityMapFlags record on unexpected rel {:?}",
                    rel
                );
-                assert!(map_block == blknum);
+                if let Some(heap_blkno) = *new_heap_blkno {
+                    // Calculate the VM block and offset that corresponds to the heap block.
+                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
+                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
+                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);

-                // equivalent to PageGetContents(page)
-                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+                    // Check that we're modifying the correct VM block.
+                    assert!(map_block == blknum);

-                let mask: u8 = flags << map_offset;
-                map[map_byte as usize] &= !mask;
+                    // equivalent to PageGetContents(page)
+                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+
+                    map[map_byte as usize] &= !(flags << map_offset);
+                }
+
+                // Repeat for 'old_heap_blkno', if any
+                if let Some(heap_blkno) = *old_heap_blkno {
+                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
+                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
+                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
+
+                    assert!(map_block == blknum);
+
+                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+
+                    map[map_byte as usize] &= !(flags << map_offset);
+                }
            }
            // Non-relational WAL records are handled here, with custom code that has the
            // same effects as the corresponding Postgres WAL redo function.
@@ -584,6 +616,7 @@ impl PostgresRedoProcess {
        tag: BufferTag,
        base_img: Option<Bytes>,
        records: &[(Lsn, ZenithWalRecord)],
+        wal_redo_timeout: Duration,
    ) -> Result<Bytes, std::io::Error> {
        // Serialize all the messages to send the WAL redo process first.
        //
@@ -634,7 +667,7 @@ impl PostgresRedoProcess {
            // If we have more data to write, wake up if 'stdin' becomes writeable or
            // we have data to read. Otherwise only wake up if there's data to read.
            let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
-            let n = nix::poll::poll(&mut pollfds[0..nfds], TIMEOUT.as_millis() as i32)?;
+            let n = nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32)?;

            if n == 0 {
                return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
--- a/poetry.lock
+++ b/poetry.lock
--- a/postgres_ffi/Cargo.toml
+++ b/postgres_ffi/Cargo.toml
@@ -1,10 +1,7 @@
 [package]
 name = "postgres_ffi"
 version = "0.1.0"
-authors = ["Heikki Linnakangas <heikki@zenith.tech>"]
-edition = "2018"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+edition = "2021"

 [dependencies]
 chrono = "0.4.19"
--- a/postgres_ffi/src/xlog_utils.rs
+++ b/postgres_ffi/src/xlog_utils.rs
@@ -51,6 +51,13 @@ pub type TimeLineID = u32;
 pub type TimestampTz = i64;
 pub type XLogSegNo = u64;

+/// Interval of checkpointing the metadata file. We should store the metadata file to enforce
+/// the invariant that checkpoint.nextXid is larger than any XID in WAL.
+/// But flushing the checkpoint file for each transaction seems to be too expensive,
+/// so XID_CHECKPOINT_INTERVAL is used to round nextXid up, so that a metadata
+/// checkpoint is performed only once per XID_CHECKPOINT_INTERVAL transactions.
+/// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
+/// in order to let the CLOG_TRUNCATE mechanism correctly extend CLOG.
 const XID_CHECKPOINT_INTERVAL: u32 = 1024;

 #[allow(non_snake_case)]
@@ -125,6 +132,8 @@ pub fn get_current_timestamp() -> TimestampTz {
    }
 }

+/// Return the offset of the last valid record in segment `segno`, scanning
+/// from `start_offset`. Returns `start_offset` if no records are found.
 fn find_end_of_wal_segment(
    data_dir: &Path,
    segno: XLogSegNo,
@@ -140,7 +149,7 @@ fn find_end_of_wal_segment(
    let mut rec_offs: usize = 0;
    let mut buf = [0u8; XLOG_BLCKSZ];
    let file_name = XLogFileName(tli, segno, wal_seg_size);
-    let mut last_valid_rec_pos: usize = 0;
+    let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
    let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
    file.seek(SeekFrom::Start(offs as u64))?;
    let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];
@@ -400,9 +409,13 @@ impl CheckPoint {
    ///
    /// Returns 'true' if the XID was updated.
    pub fn update_next_xid(&mut self, xid: u32) -> bool {
-        let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
+        // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparound.
+        let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
+        // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
+        // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
+        new_xid =
+            new_xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
        let full_xid = self.nextXid.value;
-        let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
        let old_xid = full_xid as u32;
        if new_xid.wrapping_sub(old_xid) as i32 > 0 {
            let mut epoch = full_xid >> 32;
@@ -520,4 +533,34 @@ mod tests {
        println!("wal_end={}, tli={}", wal_end, tli);
        assert_eq!(wal_end, waldump_wal_end);
    }
+
+    /// Check the math in update_next_xid
+    ///
+    /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
+    /// currently 1024.
+    #[test]
+    pub fn test_update_next_xid() {
+        let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
+        let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
+
+        checkpoint.nextXid = FullTransactionId { value: 10 };
+        assert_eq!(checkpoint.nextXid.value, 10);
+
+        // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
+        // boundary
+        checkpoint.update_next_xid(100);
+        assert_eq!(checkpoint.nextXid.value, 1024);
+
+        // No change
+        checkpoint.update_next_xid(500);
+        assert_eq!(checkpoint.nextXid.value, 1024);
+        checkpoint.update_next_xid(1023);
+        assert_eq!(checkpoint.nextXid.value, 1024);
+
+        // The function computes the *next* XID, given the highest XID seen so
+        // far. So when we pass 1024, the nextXid gets bumped up to the next
+        // XID_CHECKPOINT_INTERVAL boundary.
+        checkpoint.update_next_xid(1024);
+        assert_eq!(checkpoint.nextXid.value, 2048);
+    }
 }
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -38,7 +38,7 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:


 def yapf(fix_inplace: bool) -> str:
-    cmd = "pipenv run yapf --recursive"
+    cmd = "poetry run yapf --recursive"
    if fix_inplace:
        cmd += " --in-place"
    else:
@@ -47,7 +47,7 @@ def yapf(fix_inplace: bool) -> str:


 def mypy() -> str:
-    return "pipenv run mypy"
+    return "poetry run mypy"


 def get_commit_files() -> List[str]:
@@ -72,7 +72,7 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
            print("Please inspect the output below and run make fmt to fix automatically.")
        if suffix == ".py":
            print("If the output is empty, ensure that you've installed Python tooling by\n"
-                  "running 'pipenv install --dev' in the current directory (no root needed)")
+                  "running './scripts/pysync' in the current directory (no root needed)")
        print()
        print(res.stdout.decode())
        exit(1)
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -1,28 +1,35 @@
 [package]
 name = "proxy"
 version = "0.1.0"
-authors = ["Stas Kelvich <stas.kelvich@gmail.com>"]
-edition = "2018"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+edition = "2021"

 [dependencies]
 anyhow = "1.0"
 bytes = { version = "1.0.1", features = ['serde'] }
-lazy_static = "1.4.0"
-md5 = "0.7.0"
-rand = "0.8.3"
+clap = "3.0"
+fail = "0.5.0"
+futures = "0.3.13"
+hashbrown = "0.11.2"
 hex = "0.4.3"
 hyper = "0.14"
-routerify = "2"
+lazy_static = "1.4.0"
+md5 = "0.7.0"
 parking_lot = "0.11.2"
+pin-project-lite = "0.2.7"
+rand = "0.8.3"
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+rustls = "0.19.1"
+scopeguard = "1.1.0"
 serde = "1"
 serde_json = "1"
+thiserror = "1.0"
 tokio = { version = "1.11", features = ["macros"] }
-tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
-clap = "2.33.0"
-rustls = "0.19.1"
-reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
+tokio-rustls = "0.22.0"

 zenith_utils = { path = "../zenith_utils" }
 zenith_metrics = { path = "../zenith_metrics" }
+
+[dev-dependencies]
+tokio-postgres-rustls = "0.8.0"
+rcgen = "0.8.14"
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -0,0 +1,233 @@
+use crate::compute::DatabaseInfo;
+use crate::config::ProxyConfig;
+use crate::cplane_api::{self, CPlaneApi};
+use crate::error::UserFacingError;
+use crate::stream::PqStream;
+use crate::waiters;
+use std::collections::HashMap;
+use thiserror::Error;
+use tokio::io::{AsyncRead, AsyncWrite};
+use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
+
+/// Common authentication error; [`AuthError`] is the boxed wrapper around this enum.
+#[derive(Debug, Error)]
+pub enum AuthErrorImpl {
+    /// Authentication error reported by the console (waiter errors are also routed here).
+    #[error(transparent)]
+    Console(#[from] cplane_api::AuthError),
+
+    /// For passwords that couldn't be processed by [`parse_password`].
+    #[error("Malformed password message")]
+    MalformedPassword,
+
+    /// Errors produced by [`PqStream`].
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+}
+
+impl AuthErrorImpl {
+    pub fn auth_failed(msg: impl Into<String>) -> Self { // shorthand: console-level auth failure
+        AuthErrorImpl::Console(cplane_api::AuthError::auth_failed(msg))
+    }
+}
+
+impl From<waiters::RegisterError> for AuthErrorImpl { // lets `?` lift waiter registration errors
+    fn from(e: waiters::RegisterError) -> Self {
+        AuthErrorImpl::Console(cplane_api::AuthError::from(e)) // converted via the console error type
+    }
+}
+
+impl From<waiters::WaitError> for AuthErrorImpl { // lets `?` lift errors from awaiting a waiter
+    fn from(e: waiters::WaitError) -> Self {
+        AuthErrorImpl::Console(cplane_api::AuthError::from(e)) // converted via the console error type
+    }
+}
+
+#[derive(Debug, Error)] // public auth error; Display and source are forwarded to the inner enum
+#[error(transparent)]
+pub struct AuthError(Box<AuthErrorImpl>); // NOTE(review): presumably boxed to keep `Result` small
+
+impl<T> From<T> for AuthError // anything convertible into `AuthErrorImpl` also converts into `AuthError`
+where
+    AuthErrorImpl: From<T>,
+{
+    fn from(e: T) -> Self {
+        AuthError(Box::new(e.into())) // convert to the inner enum first, then box it
+    }
+}
+
+impl UserFacingError for AuthError {
+    fn to_string_client(&self) -> String { // client-facing message, chosen per inner variant
+        use AuthErrorImpl::*;
+        match self.0.as_ref() {
+            Console(e) => e.to_string_client(), // defer to the console error's own client text
+            MalformedPassword => self.to_string(), // reuse the `#[error]` Display text as-is
+            _ => "Internal error".to_string(), // remaining variants (currently `Io`): generic text
+        }
+    }
+}
+
+#[derive(Debug, Error)] // failure to build `ClientCredentials` from startup parameters
+pub enum ClientCredsParseError {
+    #[error("Parameter `{0}` is missing in startup packet")]
+    MissingKey(&'static str), // payload is the name of the absent parameter
+}
+
+impl UserFacingError for ClientCredsParseError {} // no overrides: relies on the trait's defaults
+
+/// Client credentials used for authentication, taken from the connection's startup parameters.
+#[derive(Debug, PartialEq, Eq)]
+pub struct ClientCredentials {
+    pub user: String, // value of the `user` parameter
+    pub dbname: String, // value of the `database` parameter
+}
+
+impl TryFrom<HashMap<String, String>> for ClientCredentials { // built from startup parameters
+    type Error = ClientCredsParseError;
+
+    fn try_from(mut value: HashMap<String, String>) -> Result<Self, Self::Error> {
+        let mut get_param = |key| { // move `key` out of the map, or fail with `MissingKey(key)`
+            value
+                .remove(key)
+                .ok_or(ClientCredsParseError::MissingKey(key))
+        };
+
+        let user = get_param("user")?;
+        let db = get_param("database")?;
+
+        Ok(Self { user, dbname: db }) // any other entries left in `value` are dropped here
+    }
+}
+
+impl ClientCredentials {
+    /// Authenticate the client with the flow selected by `config.router_config`.
+    pub async fn authenticate(
+        self,
+        config: &ProxyConfig,
+        client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    ) -> Result<DatabaseInfo, AuthError> {
+        fail::fail_point!("proxy-authenticate", |_| { // only compiled when failpoints are enabled
+            Err(AuthError::from(AuthErrorImpl::auth_failed("failpoint triggered")))
+        });
+
+        use crate::config::ClientAuthMethod::*;
+        use crate::config::RouterConfig::*;
+        match &config.router_config {
+            Static { host, port } => handle_static(host.clone(), *port, client, self).await,
+            Dynamic(Mixed) => {
+                if self.user.ends_with("@zenith") { // `@zenith` users take the password flow
+                    handle_existing_user(config, client, self).await
+                } else { // everyone else is sent through the link flow
+                    handle_new_user(config, client).await
+                }
+            }
+            Dynamic(Password) => handle_existing_user(config, client, self).await,
+            Dynamic(Link) => handle_new_user(config, client).await,
+        }
+    }
+}
+
+fn new_psql_session_id() -> String { // fresh session id: 8 random bytes, hex-encoded
+    hex::encode(rand::random::<[u8; 8]>())
+}
+
+async fn handle_static( // static routing: ask for a cleartext password and target `host:port`
+    host: String,
+    port: u16,
+    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    creds: ClientCredentials,
+) -> Result<DatabaseInfo, AuthError> {
+    client
+        .write_message(&Be::AuthenticationCleartextPassword)
+        .await?;
+
+    // Read client's password bytes
+    let msg = client.read_password_message().await?;
+    let cleartext_password = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?;
+
+    let db_info = DatabaseInfo { // `creds` is owned and unused afterwards, so its fields are moved
+        host,
+        port,
+        dbname: creds.dbname,
+        user: creds.user,
+        password: Some(cleartext_password.into()), // not checked here; carried in `DatabaseInfo`
+    };
+
+    client
+        .write_message_noflush(&Be::AuthenticationOk)?
+        .write_message_noflush(&BeParameterStatusMessage::encoding())?;
+
+    Ok(db_info)
+}
+
+async fn handle_existing_user( // password flow: MD5 challenge, answer checked through `CPlaneApi`
+    config: &ProxyConfig,
+    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    creds: ClientCredentials,
+) -> Result<DatabaseInfo, AuthError> {
+    let psql_session_id = new_psql_session_id();
+    let md5_salt = rand::random(); // fresh random salt for this challenge
+
+    client
+        .write_message(&Be::AuthenticationMD5Password(&md5_salt))
+        .await?;
+
+    // Read client's password hash
+    let msg = client.read_password_message().await?;
+    let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?;
+
+    let cplane = CPlaneApi::new(config.auth_endpoint.clone());
+    let db_info = cplane // forward credentials, response, salt and session id to the console API
+        .authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id)
+        .await?;
+
+    client // reached only if the call above returned `Ok`
+        .write_message_noflush(&Be::AuthenticationOk)?
+        .write_message_noflush(&BeParameterStatusMessage::encoding())?;
+
+    Ok(db_info)
+}
+
+async fn handle_new_user( // link flow: send a URL notice, then wait on a waiter for the result
+    config: &ProxyConfig,
+    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> Result<DatabaseInfo, AuthError> {
+    let psql_session_id = new_psql_session_id(); // used both in the link and as the waiter key
+    let greeting = hello_message(&config.redirect_uri, &psql_session_id);
+
+    let db_info = cplane_api::with_waiter(psql_session_id, |waiter| async {
+        // Give user a URL to spawn a new database (AuthenticationOk is sent before the notice)
+        client
+            .write_message_noflush(&Be::AuthenticationOk)?
+            .write_message_noflush(&BeParameterStatusMessage::encoding())?
+            .write_message(&Be::NoticeResponse(&greeting))
+            .await?;
+
+        // Wait for web console response (see `mgmt`)
+        waiter.await?.map_err(AuthErrorImpl::auth_failed) // an `Err` payload becomes an auth failure
+    })
+    .await?;
+
+    client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
+
+    Ok(db_info)
+}
+
+fn parse_password(bytes: &[u8]) -> Option<&str> { // `None` unless valid UTF-8 ending in a NUL byte
+    std::str::from_utf8(bytes).ok()?.strip_suffix('\0')
+}
+
+fn hello_message(redirect_uri: &str, session_id: &str) -> String { // greeting text for the link flow
+    format!(
+        concat![
+            "☀️  Welcome to Zenith!\n",
+            "To proceed with database creation, open the following link:\n\n",
+            "    {redirect_uri}{session_id}\n\n",
+            "It needs to be done once and we will send you '.pgpass' file,\n",
+            "which will allow you to access or create ",
+            "databases without opening your web browser."
+        ],
+        redirect_uri = redirect_uri, // the link is `redirect_uri` directly followed by `session_id`
+        session_id = session_id,
+    )
+}
--- a/Show More
+++ b/Show More