Add script export_import_betwen_pageservers.py to migrate projects between pageservers

Do not overwrite an existing image layer.
See GitHub issues #1594 and #1690. Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2026-07-26 15:30:42 +00:00 · 2022-07-05 15:27:31 +03:00 · 2022-07-05 14:45:31 +03:00 · 2022-07-05 12:22:58 +01:00 · 2022-07-05 10:55:03 +03:00 · 2022-07-05 02:06:40 -04:00
448 changed files with 66490 additions and 18367 deletions
--- a/.circleci/ansible/.gitignore
+++ b/.circleci/ansible/.gitignore
@@ -0,0 +1,4 @@
+zenith_install.tar.gz
+.zenith_current_version
+neon_install.tar.gz
+.neon_current_version
--- a/.circleci/ansible/ansible.cfg
+++ b/.circleci/ansible/ansible.cfg
@@ -0,0 +1,12 @@
+[defaults]
+
+localhost_warning = False
+host_key_checking = False
+timeout = 30
+
+[ssh_connection]
+ssh_args   = -F ./ansible.ssh.cfg
+# teleport doesn't support sftp yet: https://github.com/gravitational/teleport/issues/7127
+# and scp didn't work for me either
+transfer_method = piped
+pipelining = True
--- a/.circleci/ansible/ansible.ssh.cfg
+++ b/.circleci/ansible/ansible.ssh.cfg
@@ -0,0 +1,15 @@
+# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
+# (use pre 8.5 option name to cope with old ssh in CI)
+PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com
+
+Host tele.zenith.tech
+    User admin
+    Port 3023
+    StrictHostKeyChecking no
+    UserKnownHostsFile /dev/null
+
+Host * !tele.zenith.tech
+    User admin
+    StrictHostKeyChecking no
+    UserKnownHostsFile /dev/null
+    ProxyJump tele.zenith.tech
--- a/.circleci/ansible/deploy.yaml
+++ b/.circleci/ansible/deploy.yaml
@@ -0,0 +1,176 @@
+- name: Upload Neon binaries
+  hosts: storage
+  gather_facts: False
+  remote_user: admin
+
+  tasks:
+
+    - name: get latest version of Neon binaries
+      register: current_version_file
+      set_fact:
+        current_version: "{{ lookup('file', '.neon_current_version') | trim }}"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: inform about versions
+      debug: msg="Version to deploy - {{ current_version }}"
+      tags:
+      - pageserver
+      - safekeeper
+
+    - name: upload and extract Neon binaries to /usr/local
+      ansible.builtin.unarchive:
+        owner: root
+        group: root
+        src: neon_install.tar.gz
+        dest: /usr/local
+      become: true
+      tags:
+      - pageserver
+      - safekeeper
+      - binaries
+      - putbinaries
+
+- name: Deploy pageserver
+  hosts: pageservers
+  gather_facts: False
+  remote_user: admin
+
+  tasks:
+
+    - name: upload init script
+      when: console_mgmt_base_url is defined
+      ansible.builtin.template:
+        src: scripts/init_pageserver.sh
+        dest: /tmp/init_pageserver.sh
+        owner: root
+        group: root
+        mode: '0755'
+      become: true
+      tags:
+      - pageserver
+
+    - name: init pageserver
+      shell:
+        cmd: /tmp/init_pageserver.sh
+      args:
+        creates: "/storage/pageserver/data/tenants"
+      environment:
+        NEON_REPO_DIR: "/storage/pageserver/data"
+        LD_LIBRARY_PATH: "/usr/local/lib"
+      become: true
+      tags:
+      - pageserver
+
+    - name: update remote storage (s3) config
+      lineinfile:
+        path: /storage/pageserver/data/pageserver.toml
+        line: "{{ item }}"
+      loop:
+        - "[remote_storage]"
+        - "bucket_name = '{{ bucket_name }}'"
+        - "bucket_region = '{{ bucket_region }}'"
+        - "prefix_in_bucket = '{{ inventory_hostname }}'"
+      become: true
+      tags:
+      - pageserver
+
+    - name: upload systemd service definition
+      ansible.builtin.template:
+        src: systemd/pageserver.service
+        dest: /etc/systemd/system/pageserver.service
+        owner: root
+        group: root
+        mode: '0644'
+      become: true
+      tags:
+      - pageserver
+
+    - name: start systemd service
+      ansible.builtin.systemd:
+        daemon_reload: yes
+        name: pageserver
+        enabled: yes
+        state: restarted
+      become: true
+      tags:
+      - pageserver
+
+    - name: post version to console
+      when: console_mgmt_base_url is defined
+      shell:
+        cmd: |
+          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+          curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
+      tags:
+      - pageserver
+
+- name: Deploy safekeeper
+  hosts: safekeepers
+  gather_facts: False
+  remote_user: admin
+
+  tasks:
+
+    - name: upload init script
+      when: console_mgmt_base_url is defined
+      ansible.builtin.template:
+        src: scripts/init_safekeeper.sh
+        dest: /tmp/init_safekeeper.sh
+        owner: root
+        group: root
+        mode: '0755'
+      become: true
+      tags:
+      - safekeeper
+
+    - name: init safekeeper
+      shell:
+        cmd: /tmp/init_safekeeper.sh
+      args:
+        creates: "/storage/safekeeper/data/safekeeper.id"
+      environment:
+        NEON_REPO_DIR: "/storage/safekeeper/data"
+        LD_LIBRARY_PATH: "/usr/local/lib"
+      become: true
+      tags:
+      - safekeeper
+
+    # in the future safekeepers should discover pageservers by themselves,
+    # but currently use the first host of the pageservers inventory group
+    - name: set first pageserver var for safekeepers
+      set_fact:
+        first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}"
+      tags:
+      - safekeeper
+
+    - name: upload systemd service definition
+      ansible.builtin.template:
+        src: systemd/safekeeper.service
+        dest: /etc/systemd/system/safekeeper.service
+        owner: root
+        group: root
+        mode: '0644'
+      become: true
+      tags:
+      - safekeeper
+
+    - name: start systemd service
+      ansible.builtin.systemd:
+        daemon_reload: yes
+        name: safekeeper
+        enabled: yes
+        state: restarted
+      become: true
+      tags:
+      - safekeeper
+
+    - name: post version to console
+      when: console_mgmt_base_url is defined
+      shell:
+        cmd: |
+          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+          curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
+      tags:
+      - safekeeper
--- a/.circleci/ansible/get_binaries.sh
+++ b/.circleci/ansible/get_binaries.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# Downloads the latest neondatabase/neon docker image, extracts the Neon and
+# postgres binaries from it and packs them into neon_install.tar.gz.
+# The selected version is also written to .neon_current_version, which the
+# ansible playbooks read.
+#
+# Set RELEASE=true to pick the latest "release-N" tag instead of the latest
+# purely numeric (dev) tag.
+
+set -e
+
+RELEASE=${RELEASE:-false}
+
+# look at docker hub for latest tag for neon docker image
+if [ "${RELEASE}" = "true" ]; then
+    echo "search latest release tag"
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
+    if [ -z "${VERSION}" ]; then
+        echo "no any docker tags found, exiting..."
+        exit 1
+    else
+        TAG="release-${VERSION}"
+    fi
+else
+    echo "search latest dev tag"
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
+    if [ -z "${VERSION}" ]; then
+        echo "no any docker tags found, exiting..."
+        exit 1
+    else
+        TAG="${VERSION}"
+    fi
+fi
+
+echo "found ${VERSION}"
+
+# do initial cleanup
+rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
+mkdir neon_install
+
+# retrieve binaries from docker image
+echo "getting binaries from docker image"
+docker pull --quiet "neondatabase/neon:${TAG}"
+ID=$(docker create "neondatabase/neon:${TAG}")
+# remove the temporary container even if one of the steps below fails
+# (with set -e the script would otherwise exit before an explicit docker rm)
+trap 'docker rm -vf "${ID}"' EXIT
+docker cp "${ID}:/data/postgres_install.tar.gz" .
+tar -xzf postgres_install.tar.gz -C neon_install
+docker cp "${ID}:/usr/local/bin/pageserver" neon_install/bin/
+docker cp "${ID}:/usr/local/bin/safekeeper" neon_install/bin/
+docker cp "${ID}:/usr/local/bin/proxy" neon_install/bin/
+docker cp "${ID}:/usr/local/bin/postgres" neon_install/bin/
+
+# store version to file (for ansible playbooks) and create binaries tarball
+echo "${VERSION}" > neon_install/.neon_current_version
+echo "${VERSION}" > .neon_current_version
+tar -czf neon_install.tar.gz -C neon_install .
+
+# do final cleanup
+rm -rf neon_install postgres_install.tar.gz
--- a/.circleci/ansible/neon-stress.hosts
+++ b/.circleci/ansible/neon-stress.hosts
@@ -0,0 +1,19 @@
+[pageservers]
+neon-stress-ps-1 console_region_id=1
+neon-stress-ps-2 console_region_id=1
+
+[safekeepers]
+neon-stress-sk-1 console_region_id=1
+neon-stress-sk-2 console_region_id=1
+neon-stress-sk-3 console_region_id=1
+
+[storage:children]
+pageservers
+safekeepers
+
+[storage:vars]
+console_mgmt_base_url = http://neon-stress-console.local
+bucket_name           = neon-storage-ireland
+bucket_region         = eu-west-1
+etcd_endpoints        = etcd-stress.local:2379
+safekeeper_enable_s3_offload = false
--- a/.circleci/ansible/production.hosts
+++ b/.circleci/ansible/production.hosts
@@ -0,0 +1,19 @@
+[pageservers]
+#zenith-1-ps-1 console_region_id=1
+zenith-1-ps-2 console_region_id=1
+
+[safekeepers]
+zenith-1-sk-1 console_region_id=1
+zenith-1-sk-2 console_region_id=1
+zenith-1-sk-3 console_region_id=1
+
+[storage:children]
+pageservers
+safekeepers
+
+[storage:vars]
+env_name = prod-1
+console_mgmt_base_url = http://console-release.local
+bucket_name           = zenith-storage-oregon
+bucket_region         = us-west-2
+etcd_endpoints        = etcd-release.local:2379
--- a/.circleci/ansible/scripts/init_pageserver.sh
+++ b/.circleci/ansible/scripts/init_pageserver.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+
+# Registers this host as a pageserver in the management console (unless it is
+# already registered) and initializes the pageserver data directory.
+# NOTE: this file is rendered by Ansible's template module before upload, so
+# the double-curly-brace placeholders below are Jinja2, not shell.
+
+# get instance id from meta-data service
+INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+
+# store fqdn hostname in var
+HOST=$(hostname -f)
+
+
+cat <<EOF | tee /tmp/payload
+{
+  "version": 1,
+  "host": "${HOST}",
+  "port": 6400,
+  "region_id": {{ console_region_id }},
+  "instance_id": "${INSTANCE_ID}",
+  "http_host": "${HOST}",
+  "http_port": 9898
+}
+EOF
+
+# check if pageserver already registered or not
+if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/pageservers/${INSTANCE_ID} -o /dev/null; then
+
+    # not registered, so register it now
+    ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/pageservers -d@/tmp/payload | jq -r '.ID')
+
+    # jq prints the literal string "null" when the response has no .ID field,
+    # and ID is empty when the request itself failed; do not initialize the
+    # pageserver with a bogus id in either case
+    if [ -z "${ID}" ] || [ "${ID}" = "null" ]; then
+        echo "failed to register pageserver in console" >&2
+        exit 1
+    fi
+
+    # init pageserver
+    sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
+fi
--- a/.circleci/ansible/scripts/init_safekeeper.sh
+++ b/.circleci/ansible/scripts/init_safekeeper.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+
+# Registers this host as a safekeeper in the management console (unless it is
+# already registered) and initializes the safekeeper data directory.
+# NOTE: this file is rendered by Ansible's template module before upload, so
+# the double-curly-brace placeholders below are Jinja2, not shell.
+
+# get instance id from meta-data service
+INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+
+# store fqdn hostname in var
+HOST=$(hostname -f)
+
+
+cat <<EOF | tee /tmp/payload
+{
+  "version": 1,
+  "host": "${HOST}",
+  "port": 6500,
+  "region_id": {{ console_region_id }},
+  "instance_id": "${INSTANCE_ID}",
+  "http_host": "${HOST}",
+  "http_port": 7676
+}
+EOF
+
+# check if safekeeper already registered or not
+if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/safekeepers/${INSTANCE_ID} -o /dev/null; then
+
+    # not registered, so register it now
+    ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers -d@/tmp/payload | jq -r '.ID')
+
+    # jq prints the literal string "null" when the response has no .ID field,
+    # and ID is empty when the request itself failed; do not initialize the
+    # safekeeper with a bogus id in either case
+    if [ -z "${ID}" ] || [ "${ID}" = "null" ]; then
+        echo "failed to register safekeeper in console" >&2
+        exit 1
+    fi
+
+    # init safekeeper (ID is quoted so it is always passed as a single argument)
+    sudo -u safekeeper /usr/local/bin/safekeeper --id "${ID}" --init -D /storage/safekeeper/data
+fi
--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
@@ -0,0 +1,20 @@
+[pageservers]
+#zenith-us-stage-ps-1 console_region_id=27
+zenith-us-stage-ps-2 console_region_id=27
+zenith-us-stage-ps-3 console_region_id=27
+
+[safekeepers]
+zenith-us-stage-sk-4 console_region_id=27
+zenith-us-stage-sk-5 console_region_id=27
+zenith-us-stage-sk-6 console_region_id=27
+
+[storage:children]
+pageservers
+safekeepers
+
+[storage:vars]
+env_name = us-stage
+console_mgmt_base_url = http://console-staging.local
+bucket_name           = zenith-staging-storage-us-east-1
+bucket_region         = us-east-1
+etcd_endpoints        = etcd-staging.local:2379
--- a/.circleci/ansible/systemd/pageserver.service
+++ b/.circleci/ansible/systemd/pageserver.service
@@ -0,0 +1,19 @@
+[Unit]
+Description=Zenith pageserver
+After=network.target auditd.service
+
+[Service]
+Type=simple
+User=pageserver
+# NEON_REPO_DIR matches the data directory passed via -D in ExecStart below
+Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver/data LD_LIBRARY_PATH=/usr/local/lib
+ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
+ExecReload=/bin/kill -HUP $MAINPID
+KillMode=mixed
+KillSignal=SIGINT
+Restart=on-failure
+TimeoutSec=10
+LimitNOFILE=30000000
+
+[Install]
+WantedBy=multi-user.target
--- a/.circleci/ansible/systemd/safekeeper.service
+++ b/.circleci/ansible/systemd/safekeeper.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Zenith safekeeper
+After=network.target auditd.service
+
+[Service]
+Type=simple
+User=safekeeper
+Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
+ExecReload=/bin/kill -HUP $MAINPID
+KillMode=mixed
+KillSignal=SIGINT
+Restart=on-failure
+TimeoutSec=10
+LimitNOFILE=30000000
+
+[Install]
+WantedBy=multi-user.target
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,54 +1,43 @@
 version: 2.1

-orbs:
-  python: circleci/python@1.4.0
-
 executors:
-  zenith-build-executor:
+  neon-xlarge-executor:
    resource_class: xlarge
    docker:
-      - image: cimg/rust:1.52.1
+      # NB: when changed, do not forget to update rust image tag in all Dockerfiles
+      - image: zimg/rust:1.58
+  neon-executor:
+    docker:
+      - image: zimg/rust:1.58

 jobs:
-  check-codestyle:
-    executor: zenith-build-executor
-    steps:
-      - checkout
-
-      - run:
-          name: rustfmt
-          when: always
-          command: |
-            cargo fmt --all -- --check
-
  # A job to build postgres
  build-postgres:
-    executor: zenith-build-executor
+    executor: neon-xlarge-executor
+    parameters:
+      build_type:
+        type: enum
+        enum: ["debug", "release"]
+    environment:
+      BUILD_TYPE: << parameters.build_type >>
    steps:
        # Checkout the git repo (circleci doesn't have a flag to enable submodules here)
      - checkout

        # Grab the postgres git revision to build a cache key.
+        # Append makefile as it could change the way postgres is built.
        # Note this works even though the submodule hasn't been checkout out yet.
      - run:
          name: Get postgres cache key
          command: |
-            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
+              git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
+              cat Makefile >> /tmp/cache-key-postgres

      - restore_cache:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
-
-        # FIXME We could cache our own docker container, instead of installing packages every time.
-      - run:
-          name: apt install dependencies
-          command: |
-            if [ ! -e tmp_install/bin/postgres ]; then
-              sudo apt update
-              sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
-            fi
+            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}

        # Build postgres if the restore_cache didn't find a build.
        # `make` can't figure out whether the cache is valid, since
@@ -59,44 +48,44 @@ jobs:
            if [ ! -e tmp_install/bin/postgres ]; then
              # "depth 1" saves some time by not cloning the whole repo
              git submodule update --init --depth 1
-              make postgres
+              # bail out on any warnings
+              COPT='-Werror' mold -run make postgres -j$(nproc)
            fi

      - save_cache:
          name: Save postgres cache
-          key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
+          key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
          paths:
            - tmp_install

-  # A job to build zenith rust code
-  build-zenith:
-    executor: zenith-build-executor
+  # A job to build Neon rust code
+  build-neon:
+    executor: neon-xlarge-executor
    parameters:
      build_type:
        type: enum
        enum: ["debug", "release"]
+    environment:
+      BUILD_TYPE: << parameters.build_type >>
    steps:
-      - run:
-          name: apt install dependencies
-          command: |
-            sudo apt update
-            sudo apt install libssl-dev clang
-
        # Checkout the git repo (without submodules)
      - checkout

        # Grab the postgres git revision to build a cache key.
+        # Append makefile as it could change the way postgres is built.
        # Note this works even though the submodule hasn't been checkout out yet.
      - run:
          name: Get postgres cache key
          command: |
            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
+            cat Makefile >> /tmp/cache-key-postgres
+

      - restore_cache:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
+            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}

      - restore_cache:
          name: Restore rust cache
@@ -104,81 +93,114 @@ jobs:
            # Require an exact match. While an out of date cache might speed up the build,
            # there's no way to clean out old packages, so the cache grows every time something
            # changes.
-            - v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+            - v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}

        # Build the rust code, including test binaries
      - run:
          name: Rust build << parameters.build_type >>
          command: |
-            export CARGO_INCREMENTAL=0
-            BUILD_TYPE="<< parameters.build_type >>"
            if [[ $BUILD_TYPE == "debug" ]]; then
-              echo "Build in debug mode"
-              cargo build --bins --tests
+              CARGO_FLAGS=
            elif [[ $BUILD_TYPE == "release" ]]; then
-              echo "Build in release mode"
-              cargo build --release --bins --tests
+              CARGO_FLAGS="--release --features profiling"
            fi

+            export CARGO_INCREMENTAL=0
+            export CACHEPOT_BUCKET=zenith-rust-cachepot
+            export RUSTC_WRAPPER=cachepot
+            export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
+            export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
+            mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
+            cachepot -s
+
      - save_cache:
          name: Save rust cache
-          key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+          key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
          paths:
            - ~/.cargo/registry
            - ~/.cargo/git
            - target

-        # Run style checks
-        # has to run separately from cargo fmt section
-        # since needs to run with dependencies
-      - run:
-          name: clippy
-          command: |
-            ./run_clippy.sh
-
        # Run rust unit tests
-      - run: cargo test
+      - run:
+          name: cargo test
+          command: |
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              CARGO_FLAGS=
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              CARGO_FLAGS=--release
+            fi
+
+            cargo test $CARGO_FLAGS

        # Install the rust binaries, for use by test jobs
-        # `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
-        # FIXME: this is a really silly way to install; maybe we should just output
-        # a tarball as an artifact? Or a .deb package?
      - run:
-          name: cargo install
+          name: Install rust binaries
          command: |
-            export CARGO_INCREMENTAL=0
-            BUILD_TYPE="<< parameters.build_type >>"
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              echo "Install debug mode"
-              CARGO_FLAGS="--debug"
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              echo "Install release mode"
-              # The default is release mode; there is no --release flag.
-              CARGO_FLAGS=""
-            fi
-            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
-            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
-            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith
+            binaries=$(
+              cargo metadata --format-version=1 --no-deps |
+              jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
+            )
+
+            test_exe_paths=$(
+              cargo test --message-format=json --no-run |
+              jq -r '.executable | select(. != null)'
+            )
+
+            mkdir -p /tmp/zenith/bin
+            mkdir -p /tmp/zenith/test_bin
+            mkdir -p /tmp/zenith/etc
+
+            # Install target binaries
+            for bin in $binaries; do
+              SRC=target/$BUILD_TYPE/$bin
+              DST=/tmp/zenith/bin/$bin
+              cp $SRC $DST
+            done

        # Install the postgres binaries, for use by test jobs
-        # FIXME: this is a silly way to do "install"; maybe just output a standard
-        # postgres package, whatever the favored form is (tarball? .deb package?)
-        # Note that pg_regress needs some build artifacts that probably aren't
-        # in the usual package...?
      - run:
-          name: postgres install
+          name: Install postgres binaries
          command: |
            cp -a tmp_install /tmp/zenith/pg_install

-        # Save the rust output binaries for other jobs in this workflow.
+      # Save rust binaries for other jobs in the workflow
      - persist_to_workspace:
          root: /tmp/zenith
          paths:
            - "*"

+  check-codestyle-python:
+    executor: neon-executor
+    steps:
+      - checkout
+      - restore_cache:
+          keys:
+            - v2-python-deps-{{ checksum "poetry.lock" }}
+      - run:
+          name: Install deps
+          command: ./scripts/pysync
+      - save_cache:
+          key: v2-python-deps-{{ checksum "poetry.lock" }}
+          paths:
+            - /home/circleci/.cache/pypoetry/virtualenvs
+      - run:
+          name: Print versions
+          when: always
+          command: |
+              poetry run python --version
+              poetry show
+      - run:
+          name: Run yapf to ensure code format
+          when: always
+          command: poetry run yapf --recursive --diff .
+      - run:
+          name: Run mypy to check types
+          when: always
+          command: poetry run mypy .
+
  run-pytest:
-    #description: "Run pytest"
-    executor: python/default
+    executor: neon-executor
    parameters:
      # pytest args to specify the tests to run.
      #
@@ -204,6 +226,11 @@ jobs:
      run_in_parallel:
        type: boolean
        default: true
+      save_perf_report:
+        type: boolean
+        default: false
+    environment:
+      BUILD_TYPE: << parameters.build_type >>
    steps:
      - attach_workspace:
          at: /tmp/zenith
@@ -212,21 +239,35 @@ jobs:
          condition: << parameters.needs_postgres_source >>
          steps:
            - run: git submodule update --init --depth 1
+      - restore_cache:
+          keys:
+            - v2-python-deps-{{ checksum "poetry.lock" }}
      - run:
-          name: Install pipenv & deps
-          working_directory: test_runner
-          command: |
-            pip install pipenv
-            pipenv install
+          name: Install deps
+          command: ./scripts/pysync
+      - save_cache:
+          key: v2-python-deps-{{ checksum "poetry.lock" }}
+          paths:
+            - /home/circleci/.cache/pypoetry/virtualenvs
      - run:
          name: Run pytest
-          working_directory: test_runner
+          # pytest doesn't output test logs in real time, so CI job may fail with
+          # `Too long with no output` error, if a test is running for a long time.
+          # In that case, tests should have internal timeouts that are less than
+          # no_output_timeout, specified here.
+          no_output_timeout: 10m
          environment:
-            - ZENITH_BIN: /tmp/zenith/bin
+            - NEON_BIN: /tmp/zenith/bin
            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
            - TEST_OUTPUT: /tmp/test_output
+            # this variable will be embedded in perf test report
+            # and is needed to distinguish different environments
+            - PLATFORM: zenith-local-ci
          command: |
-            TEST_SELECTION="<< parameters.test_selection >>"
+            PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
+            rm -rf $PERF_REPORT_DIR
+
+            TEST_SELECTION="test_runner/<< parameters.test_selection >>"
            EXTRA_PARAMS="<< parameters.extra_params >>"
            if [ -z "$TEST_SELECTION" ]; then
              echo "test_selection must be set"
@@ -234,18 +275,40 @@ jobs:
            fi
            if << parameters.run_in_parallel >>; then
              EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
-            fi;
+            fi
+            if << parameters.save_perf_report >>; then
+              if [[ $CIRCLE_BRANCH == "main" ]]; then
+                mkdir -p "$PERF_REPORT_DIR"
+                EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
+              fi
+            fi
+
+            export GITHUB_SHA=$CIRCLE_SHA1
+
            # Run the tests.
            #
            # The junit.xml file allows CircleCI to display more fine-grained test information
            # in its "Tests" tab in the results page.
-            # -s prevents pytest from capturing output, which helps to see
-            # what's going on if the test hangs
            # --verbose prints name of each test (helpful when there are
            # multiple tests in one file)
            # -rA prints summary in the end
            # -n4 uses four processes to run tests via pytest-xdist
-            pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
+            # -s (which stops pytest from capturing output) is deliberately not passed: tests run
+            # in parallel, so uncaptured logs from different tests would be interleaved
+            ./scripts/pytest \
+              --junitxml=$TEST_OUTPUT/junit.xml \
+              --tb=short \
+              --verbose \
+              -m "not remote_cluster" \
+              -rA $TEST_SELECTION $EXTRA_PARAMS
+
+            if << parameters.save_perf_report >>; then
+              if [[ $CIRCLE_BRANCH == "main" ]]; then
+                export REPORT_FROM="$PERF_REPORT_DIR"
+                export REPORT_TO=local
+                scripts/generate_and_push_perf_report.sh
+              fi
+            fi
      - run:
          # CircleCI artifacts are preserved one file at a time, so skipping
          # this step isn't a good idea. If you want to extract the
@@ -254,15 +317,20 @@ jobs:
          when: always
          command: |
            du -sh /tmp/test_output/*
-            find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
+            find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
            du -sh /tmp/test_output/*
      - store_artifacts:
          path: /tmp/test_output
      # The store_test_results step tells CircleCI where to find the junit.xml file.
      - store_test_results:
          path: /tmp/test_output
+      # Save data (if any)
+      - persist_to_workspace:
+          root: /tmp/zenith
+          paths:
+            - "*"

-  # Build zenithdb/zenith:latest image and push it to Docker hub
+  # Build neondatabase/neon:latest image and push it to Docker hub
  docker-image:
    docker:
      - image: cimg/base:2021.04
@@ -276,66 +344,276 @@ jobs:
      - run:
          name: Build and push Docker image
          command: |
-            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
-            docker build -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
+            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
+            DOCKER_TAG=$(git log --oneline|wc -l)
+            docker build \
+              --pull \
+              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
+              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
+              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
+              --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
+            docker push neondatabase/neon:${DOCKER_TAG}
+            docker push neondatabase/neon:latest

-  # Trigger a new remote CI job
-  remote-ci-trigger:
+  # Build neondatabase/compute-node:latest image and push it to Docker hub
+  docker-image-compute:
    docker:
      - image: cimg/base:2021.04
-    parameters:
-      remote_repo:
-        type: string
-    environment:
-      REMOTE_REPO: << parameters.remote_repo >>
    steps:
+      - checkout
+      - setup_remote_docker:
+          docker_layer_caching: true
      - run:
-          name: Set PR's status to pending
+          name: Build and push compute-tools Docker image
          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-
-            curl -f -X POST \
-            https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-            -H "Accept: application/vnd.github.v3+json" \
-            --user "$CI_ACCESS_TOKEN" \
-            --data \
-              "{
-                \"state\": \"pending\",
-                \"context\": \"zenith-remote-ci\",
-                \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-              }"
+            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
+            docker build \
+              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
+              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
+              --tag neondatabase/compute-tools:local \
+              --tag neondatabase/compute-tools:latest \
+              -f Dockerfile.compute-tools .
+            # Only push :latest image
+            docker push neondatabase/compute-tools:latest
      - run:
-          name: Request a remote CI test
+          name: Init postgres submodule
+          command: git submodule update --init --depth 1
+      - run:
+          name: Build and push compute-node Docker image
          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
+            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
+            DOCKER_TAG=$(git log --oneline|wc -l)
+            docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
+              --tag neondatabase/compute-node:latest vendor/postgres \
+              --build-arg COMPUTE_TOOLS_TAG=local
+            docker push neondatabase/compute-node:${DOCKER_TAG}
+            docker push neondatabase/compute-node:latest

-            curl -f -X POST \
-            https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-            -H "Accept: application/vnd.github.v3+json" \
-            --user "$CI_ACCESS_TOKEN" \
-            --data \
-              "{
-                \"ref\": \"main\",
-                \"inputs\": {
-                  \"ci_job_name\": \"zenith-remote-ci\",
-                  \"commit_hash\": \"$CIRCLE_SHA1\",
-                  \"remote_repo\": \"$LOCAL_REPO\",
-                  \"zenith_image_branch\": \"$CIRCLE_BRANCH\"
-                }
-              }"
+  # Build production neondatabase/neon:release image and push it to Docker hub
+  docker-image-release:
+    docker:
+      - image: cimg/base:2021.04
+    steps:
+      - checkout
+      - setup_remote_docker:
+          docker_layer_caching: true
+      - run:
+          name: Init postgres submodule
+          command: git submodule update --init --depth 1
+      - run:
+          name: Build and push Docker image
+          command: |
+            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
+            DOCKER_TAG="release-$(git log --oneline|wc -l)"
+            docker build \
+              --pull \
+              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
+              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
+              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
+              --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
+            docker push neondatabase/neon:${DOCKER_TAG}
+            docker push neondatabase/neon:release
+
+  # Build production neondatabase/compute-node:release image and push it to Docker hub
+  docker-image-compute-release:
+    docker:
+      - image: cimg/base:2021.04
+    steps:
+      - checkout
+      - setup_remote_docker:
+          docker_layer_caching: true
+      - run:
+          name: Build and push compute-tools Docker image
+          command: |
+            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
+            docker build \
+              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
+              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
+              --tag neondatabase/compute-tools:release \
+              --tag neondatabase/compute-tools:local \
+              -f Dockerfile.compute-tools .
+            # Only push :release image
+            docker push neondatabase/compute-tools:release
+      - run:
+          name: Init postgres submodule
+          command: git submodule update --init --depth 1
+      - run:
+          name: Build and push compute-node Docker image
+          command: |
+            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
+            DOCKER_TAG="release-$(git log --oneline|wc -l)"
+            docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
+              --tag neondatabase/compute-node:release vendor/postgres \
+              --build-arg COMPUTE_TOOLS_TAG=local
+            docker push neondatabase/compute-node:${DOCKER_TAG}
+            docker push neondatabase/compute-node:release
+
+  deploy-staging:
+    docker:
+      - image: cimg/python:3.10
+    steps:
+      - checkout
+      - setup_remote_docker
+      - run:
+          name: Setup ansible
+          command: |
+            pip install --progress-bar off --user ansible boto3
+      - run:
+          name: Redeploy
+          command: |
+            cd "$(pwd)/.circleci/ansible"
+
+            ./get_binaries.sh
+
+            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
+            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+            chmod 0600 ssh-key
+            ssh-add ssh-key
+            rm -f ssh-key ssh-key-cert.pub
+
+            ansible-playbook deploy.yaml -i staging.hosts
+            rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-staging-proxy:
+    docker:
+      - image: cimg/base:2021.04
+    environment:
+      KUBECONFIG: .kubeconfig
+    steps:
+      - checkout
+      - run:
+          name: Store kubeconfig file
+          command: |
+            echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
+            chmod 0600 ${KUBECONFIG}
+      - run:
+          name: Setup helm v3
+          command: |
+            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+            helm repo add neondatabase https://neondatabase.github.io/helm-charts
+      - run:
+          name: Re-deploy proxy
+          command: |
+            DOCKER_TAG=$(git log --oneline|wc -l)
+            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
+
+  deploy-neon-stress:
+    docker:
+      - image: cimg/python:3.10
+    steps:
+      - checkout
+      - setup_remote_docker
+      - run:
+          name: Setup ansible
+          command: |
+            pip install --progress-bar off --user ansible boto3
+      - run:
+          name: Redeploy
+          command: |
+            cd "$(pwd)/.circleci/ansible"
+
+            ./get_binaries.sh
+
+            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
+            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+            chmod 0600 ssh-key
+            ssh-add ssh-key
+            rm -f ssh-key ssh-key-cert.pub
+
+            ansible-playbook deploy.yaml -i neon-stress.hosts
+            rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-neon-stress-proxy:
+    docker:
+      - image: cimg/base:2021.04
+    environment:
+      KUBECONFIG: .kubeconfig
+    steps:
+      - checkout
+      - run:
+          name: Store kubeconfig file
+          command: |
+            echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
+            chmod 0600 ${KUBECONFIG}
+      - run:
+          name: Setup helm v3
+          command: |
+            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+            helm repo add neondatabase https://neondatabase.github.io/helm-charts
+      - run:
+          name: Re-deploy proxy
+          command: |
+            DOCKER_TAG=$(git log --oneline|wc -l)
+            helm upgrade neon-stress-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+            helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
+
+  deploy-release:
+    docker:
+      - image: cimg/python:3.10
+    steps:
+      - checkout
+      - setup_remote_docker
+      - run:
+          name: Setup ansible
+          command: |
+            pip install --progress-bar off --user ansible boto3
+      - run:
+          name: Redeploy
+          command: |
+            cd "$(pwd)/.circleci/ansible"
+
+            RELEASE=true ./get_binaries.sh
+
+            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
+            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+            chmod 0600 ssh-key
+            ssh-add ssh-key
+            rm -f ssh-key ssh-key-cert.pub
+
+            ansible-playbook deploy.yaml -i production.hosts
+            rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-release-proxy:
+    docker:
+      - image: cimg/base:2021.04
+    environment:
+      KUBECONFIG: .kubeconfig
+    steps:
+      - checkout
+      - run:
+          name: Store kubeconfig file
+          command: |
+            echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
+            chmod 0600 ${KUBECONFIG}
+      - run:
+          name: Setup helm v3
+          command: |
+            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+            helm repo add neondatabase https://neondatabase.github.io/helm-charts
+      - run:
+          name: Re-deploy proxy
+          command: |
+            DOCKER_TAG="release-$(git log --oneline|wc -l)"
+            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait

 workflows:
  build_and_test:
    jobs:
-      - check-codestyle
-      - build-postgres
-      - build-zenith:
-          name: build-zenith-<< matrix.build_type >>
+      - check-codestyle-python
+      - build-postgres:
+          name: build-postgres-<< matrix.build_type >>
+          matrix:
+            parameters:
+              build_type: ["debug", "release"]
+      - build-neon:
+          name: build-neon-<< matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          requires:
-            - build-postgres
+            - build-postgres-<< matrix.build_type >>
      - run-pytest:
          name: pg_regress-tests-<< matrix.build_type >>
          matrix:
@@ -344,7 +622,7 @@ workflows:
          test_selection: batch_pg_regress
          needs_postgres_source: true
          requires:
-            - build-zenith-<< matrix.build_type >>
+            - build-neon-<< matrix.build_type >>
      - run-pytest:
          name: other-tests-<< matrix.build_type >>
          matrix:
@@ -352,14 +630,16 @@ workflows:
              build_type: ["debug", "release"]
          test_selection: batch_others
          requires:
-            - build-zenith-<< matrix.build_type >>
+            - build-neon-<< matrix.build_type >>
      - run-pytest:
          name: benchmarks
+          context: PERF_TEST_RESULT_CONNSTR
          build_type: release
          test_selection: performance
          run_in_parallel: false
+          save_perf_report: true
          requires:
-            - build-zenith-release
+            - build-neon-release
      - docker-image:
          # Context gives an ability to login
          context: Docker Hub
@@ -371,14 +651,92 @@ workflows:
          requires:
            - pg_regress-tests-release
            - other-tests-release
-      - remote-ci-trigger:
-          # Context passes credentials for gh api
-          context: CI_ACCESS_TOKEN
-          remote_repo: "zenithdb/console"
+      - docker-image-compute:
+          # Context gives an ability to login
+          context: Docker Hub
+          # Build image only for commits to main
+          filters:
+            branches:
+              only:
+                - main
          requires:
-            # XXX: Successful build doesn't mean everything is OK, but
-            # the job to be triggered takes so much time to complete (~22 min)
-            # that it's better not to wait for the commented-out steps
-            - build-zenith-debug
-            # - pg_regress-tests-release
-            # - other-tests-release
+            - pg_regress-tests-release
+            - other-tests-release
+      - deploy-staging:
+          # Context gives an ability to login
+          context: Docker Hub
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+      - deploy-staging-proxy:
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+
+      - deploy-neon-stress:
+          # Context gives an ability to login
+          context: Docker Hub
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+      - deploy-neon-stress-proxy:
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+
+      - docker-image-release:
+          # Context gives an ability to login
+          context: Docker Hub
+          # Build image only for commits to release
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - pg_regress-tests-release
+            - other-tests-release
+      - docker-image-compute-release:
+          # Context gives an ability to login
+          context: Docker Hub
+          # Build image only for commits to release
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - pg_regress-tests-release
+            - other-tests-release
+      - deploy-release:
+          # Context gives an ability to login
+          context: Docker Hub
+          # deploy only for commits to release
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - docker-image-release
+      - deploy-release-proxy:
+          # deploy only for commits to release
+          filters:
+            branches:
+              only:
+                - release
+          requires:
+            - docker-image-release
--- a/.circleci/helm-values/neon-stress.proxy-scram.yaml
+++ b/.circleci/helm-values/neon-stress.proxy-scram.yaml
@@ -0,0 +1,26 @@
+fullnameOverride: "neon-stress-proxy-scram"
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://neon-stress-console.local/management/api/v2"
+  domain: "*.stress.neon.tech"
+
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: staging
+  zenith_region: eu-west-1
+  zenith_region_slug: ireland
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/neon-stress.proxy.yaml
+++ b/.circleci/helm-values/neon-stress.proxy.yaml
@@ -0,0 +1,34 @@
+fullnameOverride: "neon-stress-proxy"
+
+settings:
+  authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
+  uri: "https://console.dev.neon.tech/psql_session/"
+
+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy
+  zenith_env: staging
+  zenith_region: eu-west-1
+  zenith_region_slug: ireland
+
+service:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
+    external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
+  type: LoadBalancer
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/production.proxy-scram.yaml
+++ b/.circleci/helm-values/production.proxy-scram.yaml
@@ -0,0 +1,24 @@
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-release.local/management/api/v2"
+  domain: "*.cloud.neon.tech"
+
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: production
+  zenith_region: us-west-2
+  zenith_region_slug: oregon
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/production.proxy.yaml
+++ b/.circleci/helm-values/production.proxy.yaml
@@ -0,0 +1,32 @@
+settings:
+  authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
+  uri: "https://console.neon.tech/psql_session/"
+
+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy
+  zenith_env: production
+  zenith_region: us-west-2
+  zenith_region_slug: oregon
+
+service:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
+    external-dns.alpha.kubernetes.io/hostname: proxy-release.local
+  type: LoadBalancer
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/staging.proxy-scram.yaml
+++ b/.circleci/helm-values/staging.proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for zenith-proxy.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-staging.local/management/api/v2"
+  domain: "*.cloud.stage.neon.tech"
+
+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: staging
+  zenith_region: us-east-1
+  zenith_region_slug: virginia
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: '*.cloud.stage.neon.tech'  # wildcard, must match settings.domain
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/staging.proxy.yaml
+++ b/.circleci/helm-values/staging.proxy.yaml
@@ -0,0 +1,30 @@
+# Helm chart values for zenith-proxy.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
+  uri: "https://console.stage.neon.tech/psql_session/"
+
+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy
+  zenith_env: staging
+  zenith_region: us-east-1
+  zenith_region_slug: virginia
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -0,0 +1,26 @@
+# This file contains settings for `cargo hakari`.
+# See https://docs.rs/cargo-hakari/latest/cargo_hakari/config for a full list of options.
+
+hakari-package = "workspace_hack"
+
+# Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above.
+dep-format-version = "2"
+
+# Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended.
+# Hakari works much better with the new feature resolver.
+# For more about the new feature resolver, see:
+# https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver
+# The resolver still has to be set here because hakari requires this field,
+# even though it is now the default for the 2021 edition & cargo.
+resolver = "2"
+
+# Add triples corresponding to platforms commonly used by developers here.
+# https://doc.rust-lang.org/rustc/platform-support.html
+platforms = [
+    # "x86_64-unknown-linux-gnu",
+    # "x86_64-apple-darwin",
+    # "x86_64-pc-windows-msvc",
+]
+
+# Write out exact versions rather than a semver range. (Defaults to false.)
+# exact-versions = true
--- a/.dockerignore
+++ b/.dockerignore
@@ -9,6 +9,10 @@ tmp_install
 tmp_check_cli
 test_output
 .vscode
-.zenith
-integration_tests/.zenith
-.mypy_cache
+.neon
+integration_tests/.neon
+.mypy_cache
+
+Dockerfile
+.dockerignore
+
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -0,0 +1,140 @@
+name: 'Run python test'
+description: 'Runs a Neon python test set, performing all the required preparations before'
+
+inputs:
+  build_type:
+    description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
+    required: true
+  rust_toolchain:
+    description: 'Rust toolchain version to fetch the caches'
+    required: true
+  test_selection:
+    description: 'A python test suite to run'
+    required: true
+  extra_params:
+    description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr'
+    required: false
+    default: ''
+  needs_postgres_source:
+    description: 'Set to true if the test suite requires postgres source checked out'
+    required: false
+    default: 'false'
+  run_in_parallel:
+    description: 'Whether to run tests in parallel'
+    required: false
+    default: 'true'
+  save_perf_report:
+    description: 'Whether to upload the performance report'
+    required: false
+    default: 'false'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Get Neon artifact for restoration
+      uses: actions/download-artifact@v3
+      with:
+        name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
+        path: ./neon-artifact/
+
+    - name: Extract Neon artifact
+      shell: bash -ex {0}
+      run: |
+        mkdir -p /tmp/neon/
+        tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
+        rm -rf ./neon-artifact/
+
+    - name: Checkout
+      if: inputs.needs_postgres_source == 'true'
+      uses: actions/checkout@v3
+      with:
+        submodules: true
+        fetch-depth: 1
+
+    - name: Cache poetry deps
+      id: cache_poetry
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      shell: bash -ex {0}
+      run: ./scripts/pysync
+
+    - name: Run pytest
+      env:
+        NEON_BIN: /tmp/neon/bin
+        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+        TEST_OUTPUT: /tmp/test_output
+        # this variable will be embedded in perf test report
+        # and is needed to distinguish different environments
+        PLATFORM: github-actions-selfhosted
+      shell: bash -ex {0}
+      run: |
+        PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
+        rm -rf "$PERF_REPORT_DIR"
+
+        # Validate the raw input: the prefixed path built below is never empty.
+        if [ -z "${{ inputs.test_selection }}" ]; then
+          echo "test_selection must be set"
+          exit 1
+        fi
+        TEST_SELECTION="test_runner/${{ inputs.test_selection }}"
+        EXTRA_PARAMS="${{ inputs.extra_params }}"
+        if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
+          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
+        fi
+        # GITHUB_REF holds the fully-formed ref (refs/heads/<branch>), so compare
+        # against refs/heads/main; the bare name "main" would never match.
+        if [[ "${{ inputs.save_perf_report }}" == "true" && "$GITHUB_REF" == "refs/heads/main" ]]; then
+          mkdir -p "$PERF_REPORT_DIR"
+          EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
+        fi
+
+        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
+          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
+          cov_prefix=()
+        fi
+
+        # Run the tests.
+        #
+        # The junit.xml file holds fine-grained per-test results (CircleCI shows it in
+        # its "Tests" tab); here it is kept as part of the uploaded test logs.
+        # --verbose prints name of each test (helpful when there are
+        # multiple tests in one file)
+        # -rA prints summary in the end
+        # -n4 uses four processes to run tests via pytest-xdist
+        # -s is deliberately NOT passed, so pytest keeps capturing output: tests run
+        # in parallel and uncaptured logs of different tests would get mixed up
+        "${cov_prefix[@]}" ./scripts/pytest \
+          --junitxml=$TEST_OUTPUT/junit.xml \
+          --tb=short \
+          --verbose \
+          -m "not remote_cluster" \
+          -rA $TEST_SELECTION $EXTRA_PARAMS
+
+        # Publish the perf report only for main (same gate as --out-dir above).
+        if [[ "${{ inputs.save_perf_report }}" == "true" && "$GITHUB_REF" == "refs/heads/main" ]]; then
+          export REPORT_FROM="$PERF_REPORT_DIR"
+          export REPORT_TO=local
+          scripts/generate_and_push_perf_report.sh
+        fi
+
+    - name: Delete all data but logs
+      shell: bash -ex {0}
+      if: always()
+      run: |
+        du -sh /tmp/test_output/*
+        find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
+        du -sh /tmp/test_output/*
+
+    - name: Upload python test logs
+      if: always()
+      uses: actions/upload-artifact@v3
+      with:
+        retention-days: 7
+        if-no-files-found: error
+        name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
+        path: /tmp/test_output/
--- a/.github/actions/save-coverage-data/action.yml
+++ b/.github/actions/save-coverage-data/action.yml
@@ -0,0 +1,17 @@
+name: 'Merge and upload coverage data'
+description: 'Compresses and uploads the coverage data as an artifact'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Merge coverage data
+      shell: bash -ex {0}
+      run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
+
+    - name: Upload coverage data
+      uses: actions/upload-artifact@v3
+      with:
+        retention-days: 7
+        if-no-files-found: error
+        name: coverage-data-artifact
+        path: /tmp/coverage/
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -0,0 +1,106 @@
+name: benchmarking
+
+on:
+  # uncomment to run on push for debugging your PR
+  # push:
+  #   branches: [ your branch ]
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '36 7 * * *' # run once a day, timezone is utc
+
+  workflow_dispatch: # adds ability to run this manually
+
+jobs:
+  bench:
+    # this workflow runs on a self-hosted runner
+    # its environment is quite different from the usual github runner
+    # probably the most important difference is that it doesn't start from a clean workspace each time
+    # e.g. if you install system packages they are not cleaned up since you install them directly on the host machine
+    # not a container or something
+    # See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
+    runs-on: [self-hosted, zenith-benchmarker]
+
+    env:
+      POSTGRES_DISTRIB_DIR: "/usr/pgsql-13"
+
+    steps:
+    - name: Checkout zenith repo
+      uses: actions/checkout@v2
+
+    # actions/setup-python@v2 is not working correctly on self-hosted runners
+    # see https://github.com/actions/setup-python/issues/162
+    # and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
+    # so the simplest solution to me is to use the already installed system python and spin up virtualenvs for job runs.
+    # there is Python 3.7.10 already installed on the machine so use it to install poetry and then use poetry's virtualenvs
+    - name: Install poetry & deps
+      run: |
+        python3 -m pip install --upgrade poetry wheel
+        # since pip/poetry caches are reused there shouldn't be any troubles with install every time
+        ./scripts/pysync
+
+    - name: Show versions
+      run: |
+        echo Python
+        python3 --version
+        poetry run python3 --version
+        echo Poetry
+        poetry --version
+        echo Pgbench
+        $POSTGRES_DISTRIB_DIR/bin/pgbench --version
+
+    # FIXME: cluster setup is skipped due to various changes in the console API;
+    # for now a pre-created cluster is used. When the API gains some stability
+    # after the massive changes, dynamic cluster setup will be revived.
+    # The pre-created cluster needs to be started manually, but it stops automatically after 5 minutes of inactivity
+    - name: Setup cluster
+      env:
+        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
+      shell: bash
+      run: |
+        set -e
+
+        echo "Starting cluster"
+        # wake up the cluster; expansions are quoted so the connection string reaches psql as one argument
+        "$POSTGRES_DISTRIB_DIR/bin/psql" "$BENCHMARK_CONNSTR" -c "SELECT 1"
+
+    - name: Run benchmark
+      # pgbench is installed system wide from official repo
+      # https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
+      # via
+      # sudo tee /etc/yum.repos.d/pgdg.repo<<EOF
+      # [pgdg13]
+      # name=PostgreSQL 13 for RHEL/CentOS 7 - x86_64
+      # baseurl=https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
+      # enabled=1
+      # gpgcheck=0
+      # EOF
+      # sudo yum makecache
+      # sudo yum install postgresql13-contrib
+      # actual binaries are located in /usr/pgsql-13/bin/
+      env:
+        # The pgbench test runs two tests of given duration against each scale.
+        # So the total runtime with these parameters is 2 * 2 * 300 = 1200, or 20 minutes.
+        # Plus time needed to initialize the test databases.
+        TEST_PG_BENCH_DURATIONS_MATRIX: "300"
+        TEST_PG_BENCH_SCALES_MATRIX: "10,100"
+        PLATFORM: "zenith-staging"
+        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
+        REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
+      run: |
+        # just to be sure that no data was cached on self hosted runner
+        # since it might generate duplicates when calling ingest_perf_test_result.py
+        rm -rf perf-report-staging
+        mkdir -p perf-report-staging
+        ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
+
+    - name: Submit result
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+      run: |
+        REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -0,0 +1,389 @@
+name: Test
+
+on:
+  push:
+    branches:
+    - main
+  pull_request:
+
+defaults:
+  run:
+    shell: bash -ex {0}
+
+concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+
+jobs:
+  build-postgres:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ '1.58' ]  # quoted: an unquoted 1.60 would be loaded as the number 1.6
+
+    env:
+      BUILD_TYPE: ${{ matrix.build_type }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg revision for caching
+        id: pg_ver
+        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
+
+      - name: Cache postgres build
+        id: cache_pg
+        uses: actions/cache@v3
+        with:
+          path: tmp_install/
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres
+        if: steps.cache_pg.outputs.cache-hit != 'true'
+        run: mold -run make postgres -j$(nproc)
+
+      # actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache
+      - name: Prepare postgres artifact
+        run: tar -C tmp_install/ -czf ./pg.tgz .
+      - name: Upload postgres artifact
+        uses: actions/upload-artifact@v3
+        with:
+          retention-days: 7
+          if-no-files-found: error
+          name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
+          path: ./pg.tgz
+
+
+  build-neon:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-postgres ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ '1.58' ]  # quoted: an unquoted 1.60 would be loaded as the number 1.6
+
+    env:
+      BUILD_TYPE: ${{ matrix.build_type }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Get postgres artifact for restoration
+        uses: actions/download-artifact@v3
+        with:
+          name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
+          path: ./postgres-artifact/
+      - name: Extract postgres artifact
+        run: |
+          mkdir ./tmp_install/
+          tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/
+          rm -rf ./postgres-artifact/
+
+      - name: Cache cargo deps
+        id: cache_cargo
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry/
+            ~/.cargo/git/
+            target/
+          key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
+          restore-keys: |
+            v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
+
+      - name: Run cargo build
+        run: |
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+            CARGO_FLAGS=
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=()
+            CARGO_FLAGS="--release --features profiling"
+          fi
+
+          "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
+
+      - name: Run cargo test
+        run: |
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+            CARGO_FLAGS=
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=()
+            CARGO_FLAGS=--release
+          fi
+
+          "${cov_prefix[@]}" cargo test $CARGO_FLAGS
+
+      - name: Install rust binaries
+        run: |
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=()
+          fi
+
+          binaries=$(
+            "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
+            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
+          )
+
+          test_exe_paths=$(
+            "${cov_prefix[@]}" cargo test --message-format=json --no-run |
+            jq -r '.executable | select(. != null)'
+          )
+
+          mkdir -p /tmp/neon/bin/
+          mkdir -p /tmp/neon/test_bin/
+          mkdir -p /tmp/neon/etc/
+
+          # Keep bloated coverage data files away from the rest of the artifact
+          mkdir -p /tmp/coverage/
+
+          # Install target binaries
+          for bin in $binaries; do
+            SRC=target/$BUILD_TYPE/$bin
+            DST=/tmp/neon/bin/$bin
+            cp "$SRC" "$DST"
+          done
+
+          # Install test executables and write list of all binaries (for code coverage)
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            for bin in $binaries; do
+              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
+            done
+            for bin in $test_exe_paths; do
+              SRC=$bin
+              DST=/tmp/neon/test_bin/$(basename $bin)
+              cp "$SRC" "$DST"
+              echo "$DST" >> /tmp/coverage/binaries.list
+            done
+          fi
+
+      - name: Install postgres binaries
+        run: cp -a tmp_install /tmp/neon/pg_install
+
+      - name: Prepare neon artifact
+        run: tar -C /tmp/neon/ -czf ./neon.tgz .
+
+      - name: Upload neon binaries
+        uses: actions/upload-artifact@v3
+        with:
+          retention-days: 7
+          if-no-files-found: error
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
+          path: ./neon.tgz
+
+      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+
+  pg_regress-tests:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-neon ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ '1.58' ]  # quoted: an unquoted 1.60 would be loaded as the number 1.6
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Pytest regress tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ matrix.build_type }}
+          rust_toolchain: ${{ matrix.rust_toolchain }}
+          test_selection: batch_pg_regress
+          needs_postgres_source: true
+
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  other-tests:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-neon ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ '1.58' ]  # quoted: an unquoted 1.60 would be loaded as the number 1.6
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Pytest other tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ matrix.build_type }}
+          rust_toolchain: ${{ matrix.rust_toolchain }}
+          test_selection: batch_others
+
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  benchmarks:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-neon ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ release ]
+        rust_toolchain: [ '1.58' ]  # quoted: an unquoted 1.60 would be loaded as the number 1.6
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Pytest benchmarks
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ matrix.build_type }}
+          rust_toolchain: ${{ matrix.rust_toolchain }}
+          test_selection: performance
+          run_in_parallel: false
+          save_perf_report: true
+      # XXX: no coverage data handling here, since benchmarks are run on release builds,
+      # while coverage is currently collected for the debug ones
+
+  coverage-report:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ other-tests, pg_regress-tests ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug ]
+        rust_toolchain: [ '1.58' ]  # quoted: an unquoted 1.60 would be loaded as the number 1.6
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Restore cargo deps cache
+        id: cache_cargo
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry/
+            ~/.cargo/git/
+            target/
+          key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+
+      - name: Get Neon artifact for restoration
+        uses: actions/download-artifact@v3
+        with:
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
+          path: ./neon-artifact/
+
+      - name: Extract Neon artifact
+        run: |
+          mkdir -p /tmp/neon/
+          tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
+          rm -rf ./neon-artifact/
+
+      - name: Restore coverage data
+        uses: actions/download-artifact@v3
+        with:
+          name: coverage-data-artifact
+          path: /tmp/coverage/
+
+      - name: Merge coverage data
+        run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
+
+      - name: Build and upload coverage report
+        run: |
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
+
+          scripts/coverage \
+            --dir=/tmp/coverage report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --commit-url=$COMMIT_URL \
+            --format=github
+
+          REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
+
+          scripts/git-upload \
+            --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
+            --message="Add code coverage for $COMMIT_URL" \
+            copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
+
+          # Add link to the coverage report to the commit
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"success\",
+              \"context\": \"neon-coverage\",
+              \"description\": \"Coverage report is ready\",
+              \"target_url\": \"$REPORT_URL\"
+            }"
+
+  trigger-e2e-tests:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-neon ]
+    steps:
+      - name: Set PR's status to pending and request a remote CI test
+        run: |
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+
+          REMOTE_REPO="${{ github.repository_owner }}/cloud"
+
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"neon-cloud-e2e\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"
+
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"neon-cloud-e2e\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\"
+              }
+            }"
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -0,0 +1,133 @@
+name: Check code style and build
+
+on:
+  push:
+    branches:
+    - main
+  pull_request:
+
+defaults:
+  run:
+    shell: bash -ex {0}
+
+concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+
+jobs:
+  check-codestyle-rust:
+    strategy:
+      fail-fast: false
+      matrix:
+        # If we want to duplicate this job for different Rust toolchains (e.g. nightly or '1.60'),
+        # add them here; keep versions quoted so YAML does not load them as floats (1.60 -> 1.6).
+        rust_toolchain: ['1.58']
+        os: [ubuntu-latest, macos-latest]
+    timeout-minutes: 50
+    name: run regression test suite
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Install rust toolchain ${{ matrix.rust_toolchain }}
+        uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: ${{ matrix.rust_toolchain }}
+          components: rustfmt, clippy
+          override: true
+
+      - name: Check formatting
+        run: cargo fmt --all -- --check
+
+      - name: Install Ubuntu postgres dependencies
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          sudo apt update
+          sudo apt install -y build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev  # -y: CI has no terminal to answer a confirmation prompt
+
+      - name: Install macOS postgres dependencies
+        if: matrix.os == 'macos-latest'
+        run: brew install flex bison openssl
+
+      - name: Set pg revision for caching
+        id: pg_ver
+        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
+
+      - name: Cache postgres build
+        id: cache_pg
+        uses: actions/cache@v2
+        with:
+          path: |
+            tmp_install/
+          key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }}
+
+      - name: Set extra env for macOS
+        if: matrix.os == 'macos-latest'
+        run: |
+          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
+          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
+
+      - name: Build postgres
+        if: steps.cache_pg.outputs.cache-hit != 'true'
+        run: make postgres
+
+      # Plain configure output can contain weird errors like 'error: C compiler cannot create executables'
+      # and the real cause will be inside config.log
+      - name: Print configure logs in case of failure
+        if: failure()
+        continue-on-error: true
+        run: |
+          echo '' && echo '=== config.log ===' && echo ''
+          cat tmp_install/build/config.log
+          echo '' && echo '=== configure.log ===' && echo ''
+          cat tmp_install/build/configure.log
+
+      - name: Cache cargo deps
+        id: cache_cargo
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
+
+      - name: Run cargo clippy
+        run: ./run_clippy.sh
+
+      - name: Ensure all project builds
+        run: cargo build --all --all-targets
+
+  check-codestyle-python:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: false
+          fetch-depth: 1
+
+      - name: Cache poetry deps
+        id: cache_poetry
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pypoetry/virtualenvs
+          key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
+
+      - name: Install Python deps
+        run: ./scripts/pysync
+
+      - name: Run yapf to ensure code format
+        run: poetry run yapf --recursive --diff .
+
+      - name: Run mypy to check types
+        run: poetry run mypy .
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -0,0 +1,74 @@
+name: Test Postgres client libraries
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '23 02 * * *' # run once a day, timezone is utc
+
+  workflow_dispatch:
+
+concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+jobs:
+  test-postgres-client-libs:
+    runs-on: [ ubuntu-latest ]
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+
+    - uses: actions/setup-python@v4
+      with:
+        python-version: '3.9'  # quoted: an unquoted 3.10 would be loaded as the number 3.1
+
+    - name: Install Poetry
+      uses: snok/install-poetry@v1
+
+    - name: Cache poetry deps
+      id: cache_poetry
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      shell: bash -ex {0}
+      run: ./scripts/pysync
+
+    - name: Run pytest
+      env:
+        REMOTE_ENV: 1
+        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
+        TEST_OUTPUT: /tmp/test_output
+        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+        # this variable will be embedded in perf test report
+        # and is needed to distinguish different environments
+        PLATFORM: github-actions-selfhosted
+      shell: bash -ex {0}
+      run: |
+        # Test framework expects we have psql binary;
+        # but since we don't really need it in this test, let's mock it
+        mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql";
+        ./scripts/pytest \
+          --junitxml=$TEST_OUTPUT/junit.xml \
+          --tb=short \
+          --verbose \
+          -m "remote_cluster" \
+          -rA "test_runner/pg_clients"
+
+    - name: Post to a Slack channel
+      if: failure()
+      id: slack
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -1,73 +0,0 @@
-name: Build and Test
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-
-jobs:
-  regression-check:
-    strategy:
-      matrix:
-        # If we want to duplicate this job for different
-        # Rust toolchains (e.g. nightly or 1.37.0), add them here.
-        rust_toolchain: [stable]
-        os: [ubuntu-latest]
-    timeout-minutes: 30
-    name: run regression test suite
-    runs-on: ${{ matrix.os }}
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-        with:
-          submodules: true
-          fetch-depth: 2
-
-      - name: install rust toolchain ${{ matrix.rust_toolchain }}
-        uses: actions-rs/toolchain@v1
-        with:
-          profile: minimal
-          toolchain: ${{ matrix.rust_toolchain }}
-          override: true
-
-      - name: Install postgres dependencies
-        run: |
-          sudo apt update
-          sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
-
-      - name: Set pg revision for caching
-        id: pg_ver
-        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
-
-      - name: Cache postgres build
-        id: cache_pg
-        uses: actions/cache@v2
-        with:
-          path: |
-            tmp_install/
-          key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }}
-
-      - name: Build postgres
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          make postgres
-
-      - name: Cache cargo deps
-        id: cache_cargo
-        uses: actions/cache@v2
-        with:
-          path: |
-            ~/.cargo/registry
-            ~/.cargo/git
-            target
-          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
-
-      - name: Run cargo build
-        run: |
-          cargo build --workspace --bins --examples --tests
-
-      - name: Run cargo test
-        run: |
-          cargo test -- --nocapture --test-threads=1
--- a/.gitignore
+++ b/.gitignore
@@ -5,5 +5,13 @@
 __pycache__/
 test_output/
 .vscode
-/.zenith
-/integration_tests/.zenith
+.idea
+/.neon
+/integration_tests/.neon
+
+# Coverage
+*.profraw
+*.profdata
+
+*.key
+*.crt
--- a/.yapfignore
+++ b/.yapfignore
@@ -0,0 +1,10 @@
+# This file is only read when `yapf` is run from this directory.
+# Hence we only list top-level directories here to avoid confusion.
+# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43
+vendor/
+target/
+tmp_install/
+__pycache__/
+test_output/
+.neon/
+.git/
--- a/20
+++ b/20
@@ -1,20 +0,0 @@
-This software is licensed under the Apache 2.0 License:
-
----------------------------------------------------------------------------
-Copyright 2021 Zenith Labs, Inc
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
----------------------------------------------------------------------------
-
-The PostgreSQL submodule in vendor/postgres is licensed under the
-PostgreSQL license. See vendor/postgres/COPYRIGHT.
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,17 +1,21 @@
 [workspace]
 members = [
+    "compute_tools",
    "control_plane",
    "pageserver",
-    "postgres_ffi",
    "proxy",
-    "walkeeper",
+    "safekeeper",
    "workspace_hack",
-    "zenith",
-    "zenith_metrics",
-    "zenith_utils",
+    "neon_local",
+    "libs/*",
 ]

 [profile.release]
 # This is useful for profiling and, to some extent, debug.
 # Besides, debug info should not affect the performance.
 debug = true
+
+# This is only needed for proxy's tests.
+# TODO: we should probably fork `tokio-postgres-rustls` instead.
+[patch.crates-io]
+tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
--- a/87
+++ b/87
@@ -1,57 +1,60 @@
-#
-# Docker image for console integration testing.
-#
+# Build Postgres
+FROM neondatabase/rust:1.58 AS pg-build
+WORKDIR /pg

-#
-# Build Postgres separately --- this layer will be rebuilt only if one of
-# mentioned paths will get any changes.
-#
-FROM zenithdb/build:buster AS pg-build
-WORKDIR /zenith
-COPY ./vendor/postgres vendor/postgres
-COPY ./Makefile Makefile
-RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
-RUN rm -rf postgres_install/build
+USER root
+
+COPY vendor/postgres vendor/postgres
+COPY Makefile Makefile
+
+ENV BUILD_TYPE release
+RUN set -e \
+    && mold -run make -j $(nproc) -s postgres \
+    && rm -rf tmp_install/build \
+    && tar -C tmp_install -czf /postgres_install.tar.gz .

-#
 # Build zenith binaries
-#
-# TODO: build cargo deps as separate layer. We used cargo-chef before but that was
-# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
-#
-FROM zenithdb/build:buster AS build
-WORKDIR /zenith
-COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
+FROM neondatabase/rust:1.58 AS build
+ARG GIT_VERSION=local

+ARG CACHEPOT_BUCKET=zenith-rust-cachepot
+ARG AWS_ACCESS_KEY_ID
+ARG AWS_SECRET_ACCESS_KEY
+
+COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
 COPY . .
-RUN cargo build --release

+# Show build caching stats to check if it was used in the end.
+# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
+RUN set -e \
+    && sudo -E "PATH=$PATH" mold -run cargo build --release \
+    && cachepot -s
+
+# Build final image
 #
-# Copy binaries to resulting image.
-#
-FROM debian:buster-slim
+FROM debian:bullseye-slim
 WORKDIR /data

-RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \
-    mkdir zenith_install
+RUN set -e \
+    && apt-get update \
+    && apt-get install -y \
+        libreadline-dev \
+        libseccomp-dev \
+        openssl \
+        ca-certificates \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
+    && useradd -d /data zenith \
+    && chown -R zenith:zenith /data
+
+COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy      /usr/local/bin
+
+COPY --from=pg-build /pg/tmp_install/         /usr/local/
+COPY --from=pg-build /postgres_install.tar.gz /data/

-COPY --from=build /zenith/target/release/pageserver /usr/local/bin
-COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
-COPY --from=build /zenith/target/release/proxy /usr/local/bin
-COPY --from=pg-build /zenith/tmp_install postgres_install
 COPY docker-entrypoint.sh /docker-entrypoint.sh

-# Remove build artifacts (~ 500 MB)
-RUN rm -rf postgres_install/build && \
-    # 'Install' Postgres binaries locally
-    cp -r postgres_install/* /usr/local/ && \
-    # Prepare an archive of Postgres binaries (should be around 11 MB)
-    # and keep it inside container for an ease of deploy pipeline.
-    cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \
-    rm -rf postgres_install
-
-RUN useradd -d /data zenith && chown -R zenith:zenith /data
-
 VOLUME ["/data"]
 USER zenith
 EXPOSE 6400
--- a/Dockerfile.alpine
+++ b/Dockerfile.alpine
@@ -1,95 +0,0 @@
-#
-# Docker image for console integration testing.
-#
-# We may also reuse it in CI to unify installation process and as a general binaries building
-# tool for production servers.
-#
-# Dynamic linking is used for librocksdb and libstdc++ bacause librocksdb-sys calls
-# bindgen with "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust
-# images which are statically linked and have guards against any dlopen. I would rather
-# prefer all static binaries so we may change the way librocksdb-sys builds or wait until
-# we will have our own storage and drop rockdb dependency.
-#
-# Cargo-chef is used to separate dependencies building from main binaries building. This
-# way `docker build` will download and install dependencies only of there are changes to
-# out Cargo.toml files.
-#
-
-
-#
-# build postgres separately -- this layer will be rebuilt only if one of
-# mentioned paths will get any changes
-#
-FROM alpine:3.13 as pg-build
-RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \
-                     make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev
-WORKDIR zenith
-COPY ./vendor/postgres vendor/postgres
-COPY ./Makefile Makefile
-# Build using clang and lld
-RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4
-
-#
-# Calculate cargo dependencies.
-# This will always run, but only generate recipe.json with list of dependencies without
-# installing them.
-#
-FROM alpine:20210212 as cargo-deps-inspect
-RUN apk add --update rust cargo
-RUN cargo install cargo-chef
-WORKDIR zenith
-COPY . .
-RUN cargo chef prepare --recipe-path recipe.json
-
-#
-# Build cargo dependencies.
-# This temp cantainner would be build only if recipe.json was changed.
-#
-FROM alpine:20210212 as deps-build
-RUN apk add --update rust cargo openssl-dev clang build-base
-# rust-rocksdb can be built against system-wide rocksdb -- that saves about
-# 10 minutes during build. Rocksdb apk package is in testing now, but use it
-# anyway. In case of any troubles we can download and build rocksdb here manually
-# (to cache it as a docker layer).
-RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
-WORKDIR zenith
-COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
-COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/
-COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
-RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
-
-#
-# Build zenith binaries
-#
-FROM alpine:20210212 as build
-RUN apk add --update rust cargo openssl-dev clang build-base
-RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
-WORKDIR zenith
-COPY . .
-# Copy cached dependencies
-COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
-COPY --from=deps-build /zenith/target target
-COPY --from=deps-build /root/.cargo /root/.cargo
-RUN cargo build --release
-
-#
-# Copy binaries to resulting image.
-# build-base hare to provide libstdc++ (it will also bring gcc, but leave it this way until we figure
-# out how to statically link rocksdb or avoid it at all).
-#
-FROM alpine:3.13
-RUN apk add --update openssl build-base libseccomp-dev
-RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
-COPY --from=build /zenith/target/release/pageserver /usr/local/bin
-COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
-COPY --from=build /zenith/target/release/proxy /usr/local/bin
-COPY --from=pg-build /zenith/tmp_install /usr/local
-COPY docker-entrypoint.sh /docker-entrypoint.sh
-
-RUN addgroup zenith && adduser -h /data -D -G zenith zenith
-VOLUME ["/data"]
-WORKDIR /data
-USER zenith
-EXPOSE 6400
-ENTRYPOINT ["/docker-entrypoint.sh"]
-CMD ["pageserver"]
--- a/Dockerfile.build
+++ b/Dockerfile.build
@@ -1,15 +0,0 @@
-#
-# Image with all the required dependencies to build https://github.com/zenithdb/zenith
-# and Postgres from https://github.com/zenithdb/postgres
-# Also includes some rust development and build tools.
-#
-FROM rust:slim-buster
-WORKDIR /zenith
-
-# Install postgres and zenith build dependencies
-# clang is for rocksdb
-RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-                                          libseccomp-dev pkg-config libssl-dev clang
-
-# Install rust tools
-RUN rustup component add clippy && cargo install cargo-audit
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -0,0 +1,18 @@
+# First transient image to build compute_tools binaries
+# NB: keep in sync with rust image version in .circleci/config.yml
+FROM neondatabase/rust:1.58 AS rust-build
+
+ARG CACHEPOT_BUCKET=zenith-rust-cachepot
+ARG AWS_ACCESS_KEY_ID
+ARG AWS_SECRET_ACCESS_KEY
+
+COPY . .
+
+RUN set -e \
+    && sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \
+    && cachepot -s
+
+# Final image that only has one binary
+FROM debian:buster-slim
+
+COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl
--- a/79
+++ b/79
@@ -6,34 +6,61 @@ else
 	SECCOMP =
 endif

+#
+# We differentiate between release / debug build types using the BUILD_TYPE
+# environment variable.
+#
+BUILD_TYPE ?= debug
+ifeq ($(BUILD_TYPE),release)
+	PG_CONFIGURE_OPTS = --enable-debug --with-openssl
+	PG_CFLAGS = -O2 -g3 $(CFLAGS)
+	# Unfortunately, `--profile=...` is a nightly feature
+	CARGO_BUILD_FLAGS += --release
+else ifeq ($(BUILD_TYPE),debug)
+	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
+	PG_CFLAGS = -O0 -g3 $(CFLAGS)
+else
+	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
+endif
+
+# macOS with brew-installed openssl requires explicit paths
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Darwin)
+    PG_CONFIGURE_OPTS += --with-includes=$(HOMEBREW_PREFIX)/opt/openssl/include --with-libraries=$(HOMEBREW_PREFIX)/opt/openssl/lib
+endif
+
+# Choose whether we should be silent or verbose
+CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
+# Fix for a corner case when make doesn't pass a jobserver
+CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
+
+# This option has a side effect of passing make jobserver to cargo.
+# However, we shouldn't do this if `make -n` (--dry-run) has been asked.
+CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
+# Force cargo not to print progress bar
+CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
+
 #
 # Top level Makefile to build Zenith and PostgreSQL
 #
+.PHONY: all
 all: zenith postgres

-# We don't want to run 'cargo build' in parallel with the postgres build,
-# because interleaving cargo build output with postgres build output looks
-# confusing. Also, 'cargo build' is parallel on its own, so it would be too
-# much parallelism. (Recursive invocation of postgres target still gets any
-# '-j' flag from the command line, so 'make -j' is still useful.)
-.NOTPARALLEL:
-
 ### Zenith Rust bits
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: zenith
 zenith: postgres-headers
-	cargo build
+	+@echo "Compiling Zenith"
+	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

 ### PostgreSQL parts
 tmp_install/build/config.status:
 	+@echo "Configuring postgres build"
 	mkdir -p tmp_install/build
 	(cd tmp_install/build && \
-	../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
-		--enable-cassert \
-		--enable-debug \
-		--enable-depend \
+	../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
+		$(PG_CONFIGURE_OPTS) \
 		$(SECCOMP) \
 		--prefix=$(abspath tmp_install) > configure.log)

@@ -47,29 +74,37 @@ postgres-headers: postgres-configure
 	+@echo "Installing PostgreSQL headers"
 	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install

-
-# Compile and install PostgreSQL and contrib/zenith
+# Compile and install PostgreSQL and contrib/neon
 .PHONY: postgres
-postgres: postgres-configure
+postgres: postgres-configure \
+		  postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
 	+@echo "Compiling PostgreSQL"
 	$(MAKE) -C tmp_install/build MAKELEVEL=0 install
-	+@echo "Compiling contrib/zenith"
-	$(MAKE) -C tmp_install/build/contrib/zenith install
-	+@echo "Compiling contrib/zenith_test_utils"
-	$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install
+	+@echo "Compiling contrib/neon"
+	$(MAKE) -C tmp_install/build/contrib/neon install
+	+@echo "Compiling contrib/neon_test_utils"
+	$(MAKE) -C tmp_install/build/contrib/neon_test_utils install
+	+@echo "Compiling pg_buffercache"
+	$(MAKE) -C tmp_install/build/contrib/pg_buffercache install
+	+@echo "Compiling pageinspect"
+	$(MAKE) -C tmp_install/build/contrib/pageinspect install

+
+.PHONY: postgres-clean
 postgres-clean:
 	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean

 # This doesn't remove the effects of 'configure'.
+.PHONY: clean
 clean:
-	cd tmp_install/build && ${MAKE} clean
-	cargo clean
+	cd tmp_install/build && $(MAKE) clean
+	$(CARGO_CMD_PREFIX) cargo clean

 # This removes everything
+.PHONY: distclean
 distclean:
 	rm -rf tmp_install
-	cargo clean
+	$(CARGO_CMD_PREFIX) cargo clean

 .PHONY: fmt
 fmt:
--- a/5
+++ b/5
@@ -0,0 +1,5 @@
+Neon
+Copyright 2022 Neon Inc.
+
+The PostgreSQL submodule in vendor/postgres is licensed under the
+PostgreSQL license. See vendor/postgres/COPYRIGHT.
--- a/1
+++ b/1
@@ -1 +0,0 @@
-./test_runner/Pipfile
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1 +0,0 @@
-./test_runner/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -1,73 +1,131 @@
-# Zenith
+# Neon

-Zenith substitutes PostgreSQL storage layer and redistributes data across a cluster of nodes
+Neon is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
+
+The project used to be called "Zenith". Many of the commands and code comments
+still refer to "zenith", but we are in the process of renaming things.
+
+## Quick start
+[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor.
+
+Alternatively, compile and run the project [locally](#running-local-installation).

 ## Architecture overview

-A Zenith installation consists of Compute nodes and Storage engine.
+A Neon installation consists of compute nodes and Neon storage engine.

-Compute nodes are stateless PostgreSQL nodes, backed by zenith storage.
+Compute nodes are stateless PostgreSQL nodes, backed by Neon storage engine.

-Zenith storage engine consists of two major components:
+Neon storage engine consists of two major components:
 - Pageserver. Scalable storage backend for compute nodes.
 - WAL service. The service that receives WAL from compute node and ensures that it is stored durably.

 Pageserver consists of:
- Repository - Zenith storage implementation.
+- Repository - Neon storage implementation.
 - WAL receiver - service that receives WAL from WAL service and stores it in the repository.
 - Page service - service that communicates with compute nodes and responds with pages from the repository.
 - WAL redo - service that builds pages from base images and WAL records on Page service request.

 ## Running local installation

+
+#### Installing dependencies on Linux
 1. Install build dependencies and other useful packages

-On Ubuntu or Debian this set of packages should be sufficient to build the code:
-```text
+* On Ubuntu or Debian this set of packages should be sufficient to build the code:
+```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
-libssl-dev clang pkg-config libpq-dev
+libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client
+```
+* On Fedora these packages are needed:
+```bash
+dnf install flex bison readline-devel zlib-devel openssl-devel \
+  libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib
 ```

-[Rust] 1.52 or later is also required.
+2. [Install Rust](https://www.rust-lang.org/tools/install)
+```
+# recommended approach from https://www.rust-lang.org/tools/install
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+```

+#### Installing dependencies on OSX (12.3.1)
+1. Install XCode and dependencies
+```
+xcode-select --install
+brew install protobuf etcd openssl
+```
+
+2. [Install Rust](https://www.rust-lang.org/tools/install)
+```
+# recommended approach from https://www.rust-lang.org/tools/install
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+```
+
+3. Install PostgreSQL Client
+```
+# from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos
+brew install libpq
+brew link --force libpq
+```
+
+#### Building on Linux and OSX
+
+1. Build neon and patched postgres
+```
+# Note: The path to the neon sources can not contain a space.
+
+git clone --recursive https://github.com/neondatabase/neon.git
+cd neon
+
+# The preferred and default option is to make a debug build. It produces
+# noticeably slower binaries than a release build. If you want a release
+# build, use "BUILD_TYPE=release make -j`nproc`" instead.
+
+make -j`nproc`
+```
+
+#### dependency installation notes
 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.

-To run the integration tests (not required to use the code), install
-Python (3.6 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
+To run the integration tests or Python scripts (not required to use the code), install
+Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.

-2. Build zenith and patched postgres
-```sh
-git clone --recursive https://github.com/zenithdb/zenith.git
-cd zenith
-make -j5
-```

-3. Start pageserver and postgres on top of it (should be called from repo root):
+#### running neon database
+1. Start pageserver and postgres on top of it (should be called from repo root):
 ```sh
-# Create repository in .zenith with proper paths to binaries and data
+# Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
-> ./target/debug/zenith init
+> ./target/debug/neon_local init
+initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
+created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50
+initial timeline de200bd42b49cc1814412c7e592dd6e9 created
 pageserver init succeeded

-# start pageserver
-> ./target/debug/zenith start
-Starting pageserver at '127.0.0.1:64000' in .zenith
+# start pageserver and safekeeper
+> ./target/debug/neon_local start
+Starting pageserver at '127.0.0.1:64000' in '.neon'
 Pageserver started
+initializing for sk 1 for 7676
+Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
+Safekeeper started

-# start postgres on top on the pageserver
-> ./target/debug/zenith pg start main
-Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
-waiting for server to start.... done
+# start postgres compute node
+> ./target/debug/neon_local pg start main
+Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
+Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
+Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

 # check list of running postgres instances
-> ./target/debug/zenith pg list
-BRANCH	ADDRESS		LSN		STATUS
-main	127.0.0.1:55432	0/1609610	running
+> ./target/debug/neon_local pg list
+ NODE  ADDRESS          TIMELINE                          BRANCH NAME  LSN        STATUS
+ main  127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
 ```

-4. Now it is possible to connect to postgres and run some queries:
+2. Now it is possible to connect to postgres and run some queries:
 ```text
-> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
+> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
 postgres=# insert into t values(1,1);
@@ -79,25 +137,32 @@ postgres=# select * from t;
 (1 row)
 ```

-5. And create branches and run postgres on them:
+3. And create branches and run postgres on them:
 ```sh
 # create branch named migration_check
-> ./target/debug/zenith branch migration_check main
-Created branch 'migration_check' at 0/1609610
+> ./target/debug/neon_local timeline branch --branch-name migration_check
+Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'

 # check branches tree
-> ./target/debug/zenith branch
- main
- ┗━ @0/1609610: migration_check
+> ./target/debug/neon_local timeline list
+(L) main [de200bd42b49cc1814412c7e592dd6e9]
+(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]

 # start postgres on that branch
-> ./target/debug/zenith pg start migration_check
-Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
-waiting for server to start.... done
+> ./target/debug/neon_local pg start migration_check --branch-name migration_check
+Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
+Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
+Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
+
+# check the new list of running postgres instances
+> ./target/debug/neon_local pg list
+ NODE             ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
+ main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
+ migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running

 # this new postgres instance will have all the data from 'main' postgres,
 # but all modifications would not affect data in original postgres
-> psql -p55433 -h 127.0.0.1 -U zenith_admin postgres
+> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres
 postgres=# select * from t;
 key | value
 -----+-------
@@ -106,22 +171,28 @@ postgres=# select * from t;

 postgres=# insert into t values(2,2);
 INSERT 0 1
+
+# check that the new change doesn't affect the 'main' postgres
+> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
+postgres=# select * from t;
+ key | value
+-----+-------
+   1 | 1
+(1 row)
 ```

-6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started:
+4. If you want to run tests afterwards (see below), you have to stop all the running pageserver, safekeeper and postgres instances
+   you have just started. You can stop them all with one command:
 ```sh
-> ./target/debug/zenith pg stop migration_check
-> ./target/debug/zenith pg stop main
-> ./target/debug/zenith stop
+> ./target/debug/neon_local stop
 ```

 ## Running tests

 ```sh
-git clone --recursive https://github.com/zenithdb/zenith.git
+git clone --recursive https://github.com/neondatabase/neon.git
 make # builds also postgres and installs it to ./tmp_install
-cd test_runner
-pytest
+./scripts/pytest
 ```

 ## Documentation
@@ -134,14 +205,14 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d

 ### Postgres-specific terms

-Due to Zenith's very close relation with PostgreSQL internals, there are numerous specific terms used.
+Due to Neon's very close relation with PostgreSQL internals, there are numerous specific terms used.
 Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.

 To get more familiar with this aspect, refer to:

- [Zenith glossary](/docs/glossary.md)
- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html)
- Other PostgreSQL documentation and sources (Zenith fork sources can be found [here](https://github.com/zenithdb/postgres))
+- [Neon glossary](/docs/glossary.md)
+- [PostgreSQL glossary](https://www.postgresql.org/docs/14/glossary.html)
+- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))

 ## Join the development

--- a/compute_tools/.dockerignore
+++ b/compute_tools/.dockerignore
@@ -0,0 +1 @@
+target
--- a/compute_tools/.gitignore
+++ b/compute_tools/.gitignore
@@ -0,0 +1 @@
+target
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "compute_tools"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+libc = "0.2"
+anyhow = "1.0"
+chrono = "0.4"
+clap = "3.0"
+env_logger = "0.9"
+hyper = { version = "0.14", features = ["full"] }
+log = { version = "0.4", features = ["std", "serde"] }
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+regex = "1"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1"
+tar = "0.4"
+tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
+tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+url = "2.2.2"
+workspace_hack = { version = "0.1", path = "../workspace_hack" }
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -0,0 +1,81 @@
+# Compute node tools
+
+Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
+`ExecStart` option. It will handle all the `Neon` specifics during compute node
+initialization:
+- `compute_ctl` accepts cluster (compute node) specification as a JSON file.
+- Every start is a fresh start, so the data directory is removed and
+  initialized again on each run.
+- Next it will put configuration files into the `PGDATA` directory.
+- Sync safekeepers and get commit LSN.
+- Get `basebackup` from pageserver using the LSN returned on the previous step.
+- Try to start `postgres` and wait until it is ready to accept connections.
+- Check and alter/drop/create roles and databases.
+- Hang waiting on the `postmaster` process to exit.
+
+Also `compute_ctl` spawns two separate service threads:
+- `compute-monitor` checks the last Postgres activity timestamp and saves it
+  into the shared `ComputeNode`;
+- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
+  last activity requests.
+
+Usage example:
+```sh
+compute_ctl -D /var/db/postgres/compute \
+            -C 'postgresql://cloud_admin@localhost/postgres' \
+            -S /var/db/postgres/specs/current.json \
+            -b /usr/local/bin/postgres
+```
+
+## Tests
+
+Cargo formatter:
+```sh
+cargo fmt
+```
+
+Run tests:
+```sh
+cargo test
+```
+
+Clippy linter:
+```sh
+cargo clippy --all --all-targets -- -Dwarnings -Drust-2018-idioms
+```
+
+## Cross-platform compilation
+
+Imagine that you are on macOS (x86) and you want a Linux GNU (`x86_64-unknown-linux-gnu` platform in `rust` terminology) executable.
+
+### Using docker
+
+You can use a throw-away Docker container ([rustlang/rust](https://hub.docker.com/r/rustlang/rust/) image) for doing that:
+```sh
+docker run --rm \
+    -v $(pwd):/compute_tools \
+    -w /compute_tools \
+    -t rustlang/rust:nightly cargo build --release --target=x86_64-unknown-linux-gnu
+```
+or one-line:
+```sh
+docker run --rm -v $(pwd):/compute_tools -w /compute_tools -t rust:latest cargo build --release --target=x86_64-unknown-linux-gnu
+```
+
+### Using rust native cross-compilation
+
+Another way is to add `x86_64-unknown-linux-gnu` target on your host system:
+```sh
+rustup target add x86_64-unknown-linux-gnu
+```
+
+Install macOS cross-compiler toolchain:
+```sh
+brew tap SergioBenitez/osxct
+brew install x86_64-unknown-linux-gnu
+```
+
+And finally run `cargo build`:
+```sh
+CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=x86_64-unknown-linux-gnu-gcc cargo build --target=x86_64-unknown-linux-gnu --release
+```
--- a/compute_tools/rustfmt.toml
+++ b/compute_tools/rustfmt.toml
@@ -0,0 +1 @@
+max_width = 100
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -0,0 +1,175 @@
+//!
+//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
+//! `ExecStart` option. It will handle all the `Neon` specifics during compute node
+//! initialization:
+//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
+//! - Every start is a fresh start, so the data directory is removed and
+//!   initialized again on each run.
+//! - Next it will put configuration files into the `PGDATA` directory.
+//! - Sync safekeepers and get commit LSN.
+//! - Get `basebackup` from pageserver using the LSN returned on the previous step.
+//! - Try to start `postgres` and wait until it is ready to accept connections.
+//! - Check and alter/drop/create roles and databases.
+//! - Hang waiting on the `postmaster` process to exit.
+//!
+//! Also `compute_ctl` spawns two separate service threads:
+//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
+//!   into the shared `ComputeNode`;
+//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
+//!   last activity requests.
+//!
+//! Usage example:
+//! ```sh
+//! compute_ctl -D /var/db/postgres/compute \
+//!             -C 'postgresql://cloud_admin@localhost/postgres' \
+//!             -S /var/db/postgres/specs/current.json \
+//!             -b /usr/local/bin/postgres
+//! ```
+//!
+use std::fs::File;
+use std::panic;
+use std::path::Path;
+use std::process::exit;
+use std::sync::{Arc, RwLock};
+use std::{thread, time::Duration};
+
+use anyhow::{Context, Result};
+use chrono::Utc;
+use clap::Arg;
+use log::{error, info};
+
+use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
+use compute_tools::http::api::launch_http_server;
+use compute_tools::logger::*;
+use compute_tools::monitor::launch_monitor;
+use compute_tools::params::*;
+use compute_tools::pg_helpers::*;
+use compute_tools::spec::*;
+use url::Url;
+
+fn main() -> Result<()> {
+    // TODO: re-use `utils::logging` later
+    init_logger(DEFAULT_LOG_LEVEL)?;
+
+    // Env variable is set by `cargo`
+    let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
+    let matches = clap::App::new("compute_ctl")
+        .version(version.unwrap_or("unknown"))
+        .arg(
+            Arg::new("connstr")
+                .short('C')
+                .long("connstr")
+                .value_name("DATABASE_URL")
+                .required(true),
+        )
+        .arg(
+            Arg::new("pgdata")
+                .short('D')
+                .long("pgdata")
+                .value_name("DATADIR")
+                .required(true),
+        )
+        .arg(
+            Arg::new("pgbin")
+                .short('b')
+                .long("pgbin")
+                .value_name("POSTGRES_PATH"),
+        )
+        .arg(
+            Arg::new("spec")
+                .short('s')
+                .long("spec")
+                .value_name("SPEC_JSON"),
+        )
+        .arg(
+            Arg::new("spec-path")
+                .short('S')
+                .long("spec-path")
+                .value_name("SPEC_PATH"),
+        )
+        .get_matches();
+
+    let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
+    let connstr = matches
+        .value_of("connstr")
+        .expect("Postgres connection string is required");
+    let spec = matches.value_of("spec");
+    let spec_path = matches.value_of("spec-path");
+
+    // Try to use just 'postgres' if no path is provided
+    let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
+
+    let spec: ComputeSpec = match spec {
+        // First, try to get cluster spec from the cli argument
+        Some(json) => serde_json::from_str(json)?,
+        None => {
+            // Second, try to read it from the file if path is provided
+            if let Some(sp) = spec_path {
+                let path = Path::new(sp);
+                let file = File::open(path)?;
+                serde_json::from_reader(file)?
+            } else {
+                panic!("cluster spec should be provided via --spec or --spec-path argument");
+            }
+        }
+    };
+
+    let pageserver_connstr = spec
+        .cluster
+        .settings
+        .find("neon.pageserver_connstring")
+        .expect("pageserver connstr should be provided");
+    let tenant = spec
+        .cluster
+        .settings
+        .find("neon.tenant_id")
+        .expect("tenant id should be provided");
+    let timeline = spec
+        .cluster
+        .settings
+        .find("neon.timeline_id")
+        .expect("timeline id should be provided");
+
+    let compute_state = ComputeNode {
+        start_time: Utc::now(),
+        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
+        pgdata: pgdata.to_string(),
+        pgbin: pgbin.to_string(),
+        spec,
+        tenant,
+        timeline,
+        pageserver_connstr,
+        metrics: ComputeMetrics::new(),
+        state: RwLock::new(ComputeState::new()),
+    };
+    let compute = Arc::new(compute_state);
+
+    // Launch service threads first, so that we can serve availability
+    // requests while configuration is still in progress.
+    let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
+    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
+
+    // Run compute (Postgres) and block until it exits; its exit code becomes ours.
+    match compute.prepare_and_run() {
+        Ok(ec) => {
+            let code = ec.code().unwrap_or(1);
+            info!("Postgres exited with code {}, shutting down", code);
+            exit(code)
+        }
+        Err(error) => {
+            error!("could not start the compute node: {}", error);
+
+            let mut state = compute.state.write().unwrap();
+            state.error = Some(format!("{:?}", error));
+            state.status = ComputeStatus::Failed;
+            drop(state);
+
+            // Keep serving HTTP requests for a while, so that the cloud control
+            // plane is able to get the actual error.
+            info!("giving control plane 30s to collect the error before shutdown");
+            thread::sleep(Duration::from_secs(30));
+            info!("shutting down");
+            Err(error)
+        }
+    }
+}
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -0,0 +1,43 @@
+use anyhow::{anyhow, Result};
+use log::error;
+use postgres::Client;
+use tokio_postgres::NoTls;
+
+use crate::compute::ComputeNode;
+
+/// Create the `health_check` table if it is missing and upsert its `id = 1` row.
+/// Fails when fewer than two result messages come back for the two statements.
+pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
+    let setup_sql = "
+    CREATE TABLE IF NOT EXISTS health_check (
+        id serial primary key,
+        updated_at timestamptz default now()
+    );
+    INSERT INTO health_check VALUES (1, now())
+        ON CONFLICT (id) DO UPDATE
+         SET updated_at = now();";
+    let executed = client.simple_query(setup_sql)?.len();
+    anyhow::ensure!(executed >= 2, "executed  {} queries", executed);
+    Ok(())
+}
+
+/// Update the `health_check` row with `id = 1`; errors unless exactly one result message returns.
+pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
+    let (pg, conn) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
+    if pg.is_closed() {
+        return Err(anyhow!("connection to postgres closed"));
+    }
+    // The connection future is awaited on a spawned task; a failure there is only logged.
+    tokio::spawn(async move {
+        if let Err(e) = conn.await {
+            error!("connection error: {}", e);
+        }
+    });
+
+    let update_sql = "UPDATE health_check SET updated_at = now() WHERE id = 1;";
+    let messages = pg.simple_query(update_sql).await?;
+    match messages.len() {
+        1 => Ok(()),
+        _ => Err(anyhow!("statement can't be executed")),
+    }
+}
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -0,0 +1,350 @@
+//
+// XXX: This is starting to look scarily similar to the `PostgresNode` from `control_plane`,
+// but there are several things that make `PostgresNode` usage inconvenient in the
+// cloud:
+// - it inherits from `LocalEnv`, which contains **all-all** the information about
+//   a complete service running
+// - it uses `PageServerNode` with information about http endpoint, which we do not
+//   need in the cloud again
+// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
+//
+// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
+// attributes (not required for the cloud). Yet, it is still tempting to unify these
+// `PostgresNode` and `ComputeNode` and use one in both places.
+//
+// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
+//
+use std::fs;
+use std::os::unix::fs::PermissionsExt;
+use std::path::Path;
+use std::process::{Command, ExitStatus, Stdio};
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::RwLock;
+
+use anyhow::{Context, Result};
+use chrono::{DateTime, Utc};
+use log::info;
+use postgres::{Client, NoTls};
+use serde::{Serialize, Serializer};
+
+use crate::checker::create_writablity_check_data;
+use crate::config;
+use crate::pg_helpers::*;
+use crate::spec::*;
+
+/// Compute node info shared across several `compute_ctl` threads.
+pub struct ComputeNode {
+    /// Reference point used when computing the `total_startup_ms` metric.
+    pub start_time: DateTime<Utc>,
+    /// Connection string of the Postgres instance started by `run()`.
+    /// Url type maintains proper escaping
+    pub connstr: url::Url,
+    /// Postgres data directory; recreated from scratch by `create_pgdata()`.
+    pub pgdata: String,
+    /// Path of the `postgres` executable that gets spawned.
+    pub pgbin: String,
+    /// Specification the node is configured from (settings, roles, databases, ...).
+    pub spec: ComputeSpec,
+    /// Tenant identifier sent in the `basebackup` command.
+    pub tenant: String,
+    /// Timeline identifier sent in the `basebackup` command.
+    pub timeline: String,
+    /// Connection string used to fetch the basebackup from the pageserver.
+    pub pageserver_connstr: String,
+    /// Startup timings; atomics, so they can be updated through `&self`.
+    pub metrics: ComputeMetrics,
+    /// Volatile part of the `ComputeNode` so should be used under `RwLock`
+    /// to allow HTTP API server to serve status requests, while configuration
+    /// is in progress.
+    pub state: RwLock<ComputeState>,
+}
+
+/// Serde `serialize_with` helper: writes a UTC timestamp as an RFC 3339 string.
+fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+{
+    let formatted = x.to_rfc3339();
+    s.serialize_str(&formatted)
+}
+
+/// Volatile part of the compute node info. It derives `Serialize`, so it can
+/// be returned to API clients as a JSON document.
+#[derive(Serialize)]
+#[serde(rename_all = "snake_case")]
+pub struct ComputeState {
+    /// Current lifecycle stage of the compute node.
+    pub status: ComputeStatus,
+    /// Timestamp of the last Postgres activity
+    #[serde(serialize_with = "rfc3339_serialize")]
+    pub last_active: DateTime<Utc>,
+    /// Text of the startup error, if any. `None` is serialized as `null`.
+    pub error: Option<String>,
+}
+
+impl ComputeState {
+    /// Create the initial state: status `Init`, last activity set to the
+    /// current time and no error recorded.
+    pub fn new() -> Self {
+        Self {
+            status: ComputeStatus::Init,
+            last_active: Utc::now(),
+            error: None,
+        }
+    }
+}
+
+impl Default for ComputeState {
+    /// Same as `ComputeState::new()`.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Lifecycle stage of the compute node. Serialized in snake case, i.e. as
+/// `init`, `running` or `failed`.
+#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeStatus {
+    /// Initial value set by `ComputeState::new()`.
+    Init,
+    /// Set by `ComputeNode::run()` once the configuration has finished.
+    Running,
+    /// Startup failed; presumably accompanied by `ComputeState::error`.
+    Failed,
+}
+
+/// Compute startup timings in milliseconds. Every field is an atomic, so the
+/// values can be stored through a shared reference.
+#[derive(Serialize)]
+pub struct ComputeMetrics {
+    /// Duration of the `postgres --sync-safekeepers` run.
+    pub sync_safekeepers_ms: AtomicU64,
+    /// Time spent fetching and unpacking the basebackup.
+    pub basebackup_ms: AtomicU64,
+    /// Time from the start of `ComputeNode::run()` until configuration is done.
+    pub config_ms: AtomicU64,
+    /// Time from `ComputeNode::start_time` until configuration is done.
+    pub total_startup_ms: AtomicU64,
+}
+
+impl ComputeMetrics {
+    /// Create a metrics set with all timings at zero.
+    pub fn new() -> Self {
+        Self {
+            sync_safekeepers_ms: AtomicU64::new(0),
+            basebackup_ms: AtomicU64::new(0),
+            config_ms: AtomicU64::new(0),
+            total_startup_ms: AtomicU64::new(0),
+        }
+    }
+}
+
+impl Default for ComputeMetrics {
+    /// Same as `ComputeMetrics::new()`.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ComputeNode {
+    /// Store a new status in the shared state, taking the write lock.
+    /// Panics if the lock is poisoned.
+    pub fn set_status(&self, status: ComputeStatus) {
+        let mut state = self.state.write().unwrap();
+        state.status = status;
+    }
+
+    /// Read the current status from the shared state, taking the read lock.
+    /// Panics if the lock is poisoned.
+    pub fn get_status(&self) -> ComputeStatus {
+        let state = self.state.read().unwrap();
+        state.status
+    }
+
+    // Recreate the `pgdata` directory from scratch with 0700 permissions.
+    fn create_pgdata(&self) -> Result<()> {
+        let pgdata = &self.pgdata;
+
+        // Best-effort cleanup: the usual failure is that the directory does
+        // not exist yet. Anything more serious will surface in create_dir().
+        let _ = fs::remove_dir_all(pgdata);
+        fs::create_dir(pgdata)?;
+
+        let mode = fs::Permissions::from_mode(0o700);
+        fs::set_permissions(pgdata, mode)?;
+
+        Ok(())
+    }
+
+    // Get basebackup from the libpq connection to pageserver using
+    // `pageserver_connstr` and unarchive it to `pgdata` directory overriding
+    // all its previous content.
+    //
+    // `lsn` is the LSN to request. The value "0/0" (first start of the
+    // compute) requests a basebackup without an explicit LSN. The elapsed
+    // time is stored in `metrics.basebackup_ms`.
+    //
+    // Returns an error if the connection, the COPY command or the unpacking
+    // fails.
+    fn get_basebackup(&self, lsn: &str) -> Result<()> {
+        // Measure with the monotonic clock: a wall-clock difference can be
+        // negative if the system time is adjusted, and converting that to
+        // an unsigned duration would panic.
+        let started = std::time::Instant::now();
+
+        let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
+        let basebackup_cmd = match lsn {
+            "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
+            _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
+        };
+        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
+
+        // Read the archive directly from the `CopyOutReader`
+        //
+        // Set `ignore_zeros` so that unpack() reads all the Copy data and
+        // doesn't stop at the end-of-archive marker. Otherwise, if the server
+        // sends an Error after finishing the tarball, we will not notice it.
+        let mut ar = tar::Archive::new(copyreader);
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.pgdata)?;
+
+        self.metrics
+            .basebackup_ms
+            .store(started.elapsed().as_millis() as u64, Ordering::Relaxed);
+
+        Ok(())
+    }
+
+    // Run `postgres` in a special mode with `--sync-safekeepers` argument
+    // and return the reported LSN back to the caller.
+    //
+    // The elapsed time is stored in `metrics.sync_safekeepers_ms`.
+    //
+    // Returns an error if the process cannot be started or waited for, if it
+    // exits with a non-zero status, or if its stdout is not valid UTF-8.
+    fn sync_safekeepers(&self) -> Result<String> {
+        // Monotonic clock, so the measurement cannot go negative when the
+        // system time is adjusted.
+        let started = std::time::Instant::now();
+
+        // Failures are propagated as errors instead of panicking, so that the
+        // caller can handle them like any other startup error.
+        let sync_handle = Command::new(&self.pgbin)
+            .args(&["--sync-safekeepers"])
+            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
+            .stdout(Stdio::piped())
+            .spawn()
+            .context("postgres --sync-safekeepers failed to start")?;
+
+        // `postgres --sync-safekeepers` will print all log output to stderr and
+        // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
+        // redirected to the caller output.
+        let sync_output = sync_handle
+            .wait_with_output()
+            .context("postgres --sync-safekeepers failed")?;
+        if !sync_output.status.success() {
+            anyhow::bail!(
+                "postgres --sync-safekeepers exited with non-zero status: {}",
+                sync_output.status,
+            );
+        }
+
+        self.metrics
+            .sync_safekeepers_ms
+            .store(started.elapsed().as_millis() as u64, Ordering::Relaxed);
+
+        let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
+
+        Ok(lsn)
+    }
+
+    /// Build a ready-to-start PGDATA directory: recreate it, write
+    /// `postgresql.conf`, sync safekeepers, fetch the basebackup at the
+    /// reported LSN and finally update `pg_hba.conf`.
+    pub fn prepare_pgdata(&self) -> Result<()> {
+        let pgdata_path = Path::new(&self.pgdata);
+        let pageserver = &self.pageserver_connstr;
+
+        // Start from an empty data directory and put the configuration there.
+        self.create_pgdata()?;
+        let conf_path = pgdata_path.join("postgresql.conf");
+        config::write_postgres_conf(&conf_path, &self.spec)?;
+
+        info!("starting safekeepers syncing");
+        let lsn = self
+            .sync_safekeepers()
+            .context("failed to sync safekeepers")?;
+        info!("safekeepers synced at LSN {}", lsn);
+
+        info!("getting basebackup@{} from pageserver {}", lsn, pageserver);
+        self.get_basebackup(&lsn).with_context(|| {
+            format!(
+                "failed to get basebackup@{} from pageserver {}",
+                lsn, pageserver
+            )
+        })?;
+
+        // The basebackup brings its own pg_hba.conf, adjust it.
+        update_pg_hba(pgdata_path)?;
+
+        Ok(())
+    }
+
+    /// Start Postgres as a child process and manage DBs/roles.
+    /// After that this will hang waiting on the postmaster process to exit.
+    ///
+    /// On success the `config_ms` and `total_startup_ms` metrics are stored
+    /// and the status is switched to `Running` before waiting.
+    ///
+    /// Returns the exit status of the Postgres process, or an error if
+    /// Postgres cannot be spawned or waited for, if `wait_for_postgres`
+    /// fails, or if any of the connection / configuration steps fails.
+    pub fn run(&self) -> Result<ExitStatus> {
+        let start_time = Utc::now();
+
+        let pgdata_path = Path::new(&self.pgdata);
+
+        // Run postgres as a child process. A spawn failure is returned as an
+        // error rather than a panic, so the caller can handle it like any
+        // other startup error.
+        let mut pg = Command::new(&self.pgbin)
+            .args(&["-D", &self.pgdata])
+            .spawn()
+            .context("cannot start postgres process")?;
+
+        // Try default Postgres port if it is not provided
+        let port = self
+            .spec
+            .cluster
+            .settings
+            .find("port")
+            .unwrap_or_else(|| "5432".to_string());
+        wait_for_postgres(&mut pg, &port, pgdata_path)?;
+
+        // If connection fails,
+        // it may be the old node with `zenith_admin` superuser.
+        //
+        // In this case we need to connect with the old `zenith_admin` name
+        // and create a new user. We cannot simply rename the connected user,
+        // but we can create a new one and grant it all privileges.
+        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
+            Err(e) => {
+                info!(
+                    "cannot connect to postgres: {}, retrying with `zenith_admin` username",
+                    e
+                );
+                let mut zenith_admin_connstr = self.connstr.clone();
+
+                zenith_admin_connstr
+                    .set_username("zenith_admin")
+                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
+
+                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
+                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
+                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
+                drop(client);
+
+                // Reconnect using the connection string with the expected name.
+                Client::connect(self.connstr.as_str(), NoTls)?
+            }
+            Ok(client) => client,
+        };
+
+        handle_roles(&self.spec, &mut client)?;
+        handle_databases(&self.spec, &mut client)?;
+        handle_role_deletions(self, &mut client)?;
+        handle_grants(&self.spec, &mut client)?;
+        create_writablity_check_data(&mut client)?;
+
+        // 'Close' connection
+        drop(client);
+        let startup_end_time = Utc::now();
+
+        // Wall-clock time may step backwards, which makes the signed duration
+        // negative and `to_std()` fail. Record 0 ms in that case instead of
+        // panicking after a successful startup.
+        let elapsed_ms = |since: DateTime<Utc>| -> u64 {
+            startup_end_time
+                .signed_duration_since(since)
+                .to_std()
+                .unwrap_or_default()
+                .as_millis() as u64
+        };
+
+        self.metrics
+            .config_ms
+            .store(elapsed_ms(start_time), Ordering::Relaxed);
+        self.metrics
+            .total_startup_ms
+            .store(elapsed_ms(self.start_time), Ordering::Relaxed);
+
+        self.set_status(ComputeStatus::Running);
+
+        info!(
+            "finished configuration of compute for project {}",
+            self.spec.cluster.cluster_id
+        );
+
+        // Wait for child Postgres process basically forever. In this state Ctrl+C
+        // will propagate to Postgres and it will be shut down as well.
+        let ecode = pg
+            .wait()
+            .context("failed to start waiting on Postgres process")?;
+
+        Ok(ecode)
+    }
+
+    /// Prepare the data directory with `prepare_pgdata()` and then start
+    /// Postgres with `run()`, returning its exit status.
+    pub fn prepare_and_run(&self) -> Result<ExitStatus> {
+        // The operation id is optional; a spec without one must not abort
+        // the startup with a panic just because of a log line.
+        let operation = self
+            .spec
+            .operation_uuid
+            .as_ref()
+            .map_or_else(|| "None".to_string(), |uuid| uuid.to_string());
+
+        info!(
+            "starting compute for project {}, operation {}, tenant {}, timeline {}",
+            self.spec.cluster.cluster_id, operation, self.tenant, self.timeline,
+        );
+
+        self.prepare_pgdata()?;
+        self.run()
+    }
+}
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -0,0 +1,51 @@
+use std::fs::{File, OpenOptions};
+use std::io;
+use std::io::prelude::*;
+use std::path::Path;
+
+use anyhow::Result;
+
+use crate::pg_helpers::PgOptionsSerialize;
+use crate::spec::ComputeSpec;
+
+/// Make sure the text file at `path` contains `line`, appending it when absent.
+/// The file is created if it doesn't exist.
+///
+/// Returns `Ok(false)` when an identical line is already present and
+/// `Ok(true)` when the line has been written.
+pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
+    let mut file = OpenOptions::new()
+        .read(true)
+        .write(true)
+        .create(true)
+        .append(false)
+        .open(path)?;
+
+    // Remember whether the file has any content, in which case the new line
+    // has to be separated from it by a newline.
+    let mut has_content = false;
+    for existing in io::BufReader::new(&file).lines() {
+        if existing? == line {
+            return Ok(false);
+        }
+        has_content = true;
+    }
+
+    // The whole file has been read at this point, so the write goes to its end.
+    let separator = if has_content { "\n" } else { "" };
+    write!(file, "{}{}", separator, line)?;
+    Ok(true)
+}
+
+/// Write the settings from `spec` into the configuration file at `path`,
+/// creating the file or replacing whatever it contained before.
+pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
+    // File::create() truncates an existing file.
+    let mut conf_file = File::create(path)?;
+    let settings = spec.cluster.settings.as_pg_settings();
+
+    write_auto_managed_block(&mut conf_file, &settings)
+}
+
+// Write `buf` to `file`, framed by the "Managed by compute_ctl" begin/end
+// marker comments, one item per line.
+fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> {
+    let block = [
+        "# Managed by compute_ctl: begin",
+        buf,
+        "# Managed by compute_ctl: end",
+    ];
+
+    for part in block.iter() {
+        writeln!(file, "{}", part)?;
+    }
+
+    Ok(())
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -0,0 +1,109 @@
+use std::convert::Infallible;
+use std::net::SocketAddr;
+use std::sync::Arc;
+use std::thread;
+
+use anyhow::Result;
+use hyper::service::{make_service_fn, service_fn};
+use hyper::{Body, Method, Request, Response, Server, StatusCode};
+use log::{error, info};
+use serde_json;
+
+use crate::compute::{ComputeNode, ComputeStatus};
+
+// Service function to handle all available routes.
+//
+// Every known route answers with `200 OK`; failures of the writability check
+// are reported in the response body. Unknown routes get `404 Not Found`.
+// JSON routes carry a `Content-Type: application/json` header, matching the
+// media type declared for them in `openapi_spec.yaml`.
+async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
+    // Build a `200 OK` response carrying a JSON document.
+    fn json_response(body: String) -> Response<Body> {
+        let mut resp = Response::new(Body::from(body));
+        resp.headers_mut().insert(
+            hyper::header::CONTENT_TYPE,
+            hyper::header::HeaderValue::from_static("application/json"),
+        );
+        resp
+    }
+
+    match (req.method(), req.uri().path()) {
+        // Timestamp of the last Postgres activity in the plain text.
+        // DEPRECATED in favour of /status
+        (&Method::GET, "/last_activity") => {
+            info!("serving /last_activity GET request");
+            let state = compute.state.read().unwrap();
+
+            // Use RFC3339 format for consistency.
+            Response::new(Body::from(state.last_active.to_rfc3339()))
+        }
+
+        // Has compute setup process finished? -> true/false.
+        // DEPRECATED in favour of /status
+        (&Method::GET, "/ready") => {
+            info!("serving /ready GET request");
+            let status = compute.get_status();
+            Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
+        }
+
+        // Serialized compute state.
+        (&Method::GET, "/status") => {
+            info!("serving /status GET request");
+            let state = compute.state.read().unwrap();
+            json_response(serde_json::to_string(&*state).unwrap())
+        }
+
+        // Startup metrics in JSON format. Keep /metrics reserved for a possible
+        // future use for Prometheus metrics format.
+        (&Method::GET, "/metrics.json") => {
+            info!("serving /metrics.json GET request");
+            json_response(serde_json::to_string(&compute.metrics).unwrap())
+        }
+
+        // Writability check. GET is DEPRECATED, use POST instead; both
+        // methods are served by the same code.
+        (&Method::GET, "/check_writability") | (&Method::POST, "/check_writability") => {
+            info!("serving /check_writability {} request", req.method());
+            let res = crate::checker::check_writability(&compute).await;
+            match res {
+                Ok(_) => Response::new(Body::from("true")),
+                // The error text is returned in the body with a 200 status.
+                Err(e) => Response::new(Body::from(e.to_string())),
+            }
+        }
+
+        // Return the `404 Not Found` for any other routes.
+        _ => {
+            let mut not_found = Response::new(Body::from("404 Not Found"));
+            *not_found.status_mut() = StatusCode::NOT_FOUND;
+            not_found
+        }
+    }
+}
+
+// Entry point of the Hyper HTTP server: binds to 0.0.0.0:3080 and blocks
+// until the server stops, logging the error if it stops with one.
+#[tokio::main]
+async fn serve(state: Arc<ComputeNode>) {
+    let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
+
+    // A service is created per connection; every request handled by it gets
+    // its own handle to the shared compute node.
+    let make_service = make_service_fn(move |_conn| {
+        let conn_compute = Arc::clone(&state);
+        async move {
+            let handler = service_fn(move |req: Request<Body>| {
+                let req_compute = Arc::clone(&conn_compute);
+                async move {
+                    let response = routes(req, req_compute).await;
+                    Ok::<_, Infallible>(response)
+                }
+            });
+            Ok::<_, Infallible>(handler)
+        }
+    });
+
+    info!("starting HTTP server on {}", addr);
+
+    let server = Server::bind(&addr).serve(make_service);
+
+    // Run this server forever
+    if let Err(e) = server.await {
+        error!("server error: {}", e);
+    }
+}
+
+/// Spawn the Hyper HTTP API server on a dedicated thread named
+/// `http-endpoint` and return its `JoinHandle`.
+pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+    let compute = Arc::clone(state);
+    let handle = thread::Builder::new()
+        .name("http-endpoint".into())
+        .spawn(move || serve(compute))?;
+
+    Ok(handle)
+}
--- a/compute_tools/src/http/mod.rs
+++ b/compute_tools/src/http/mod.rs
@@ -0,0 +1 @@
+pub mod api;
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -0,0 +1,158 @@
+openapi: "3.0.2"
+info:
+  title: Compute node control API
+  version: "1.0"
+
+servers:
+  - url: "http://localhost:3080"
+
+paths:
+  /status:
+    get:
+      tags:
+      - "info"
+      summary: Get compute node internal status
+      description: ""
+      operationId: getComputeStatus
+      responses:
+        "200":
+          description: ComputeState
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ComputeState"
+
+  /metrics.json:
+    get:
+      tags:
+      - "info"
+      summary: Get compute node startup metrics in JSON format
+      description: ""
+      operationId: getComputeMetricsJSON
+      responses:
+        "200":
+          description: ComputeMetrics
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ComputeMetrics"
+
+  /ready:
+    get:
+      deprecated: true
+      tags:
+      - "info"
+      summary: Check whether compute startup process finished successfully
+      description: ""
+      operationId: computeIsReady
+      responses:
+        "200":
+          description: Compute is ready ('true') or not ('false')
+          content:
+            text/plain:
+              schema:
+                type: string
+                example: "true"
+
+  /last_activity:
+    get:
+      deprecated: true
+      tags:
+      - "info"
+      summary: Get timestamp of the last compute activity
+      description: ""
+      operationId: getLastComputeActivityTS
+      responses:
+        "200":
+          description: Timestamp of the last compute activity
+          content:
+            text/plain:
+              schema:
+                type: string
+                example: "2022-10-12T07:20:50.52Z"
+
+  /check_writability:
+    get:
+      deprecated: true
+      tags:
+      - "check"
+      summary: Check that we can write new data on this compute
+      description: ""
+      operationId: checkComputeWritabilityDeprecated
+      responses:
+        "200":
+          description: Check result
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'true' if check passed
+                example: "true"
+
+    post:
+      tags:
+      - "check"
+      summary: Check that we can write new data on this compute
+      description: ""
+      operationId: checkComputeWritability
+      responses:
+        "200":
+          description: Check result
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'true' if check passed
+                example: "true"
+
+components:
+  securitySchemes:
+    JWT:
+      type: http
+      scheme: bearer
+      bearerFormat: JWT
+
+  schemas:
+    ComputeMetrics:
+      type: object
+      description: Compute startup metrics
+      required:
+        - sync_safekeepers_ms
+        - basebackup_ms
+        - config_ms
+        - total_startup_ms
+      properties:
+        sync_safekeepers_ms:
+          type: integer
+        basebackup_ms:
+          type: integer
+        config_ms:
+          type: integer
+        total_startup_ms:
+          type: integer
+
+    ComputeState:
+      type: object
+      required:
+        - status
+        - last_active
+      properties:
+        status:
+          $ref: '#/components/schemas/ComputeStatus'
+        last_active:
+          type: string
+          description: The last detected compute activity timestamp in UTC and RFC3339 format
+          example: "2022-10-12T07:20:50.52Z"
+        error:
+          type: string
+          # The server serializes an optional string here, so the value is
+          # `null` as long as no startup error has been recorded.
+          nullable: true
+          description: Text of the error during compute startup, if any
+
+
+    ComputeStatus:
+      type: string
+      enum:
+        - init
+        - failed
+        - running
+
+security:
+  - JWT: []
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -0,0 +1,14 @@
+//!
+//! Various tools and helpers to handle cluster / compute node (Postgres)
+//! configuration.
+//!
+pub mod checker;
+pub mod config;
+pub mod http;
+#[macro_use]
+pub mod logger;
+pub mod compute;
+pub mod monitor;
+pub mod params;
+pub mod pg_helpers;
+pub mod spec;
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -0,0 +1,43 @@
+use std::io::Write;
+
+use anyhow::Result;
+use chrono::Utc;
+use env_logger::{Builder, Env};
+
+// `println!` that only prints when the `Info` log level is enabled.
+// The expansion names `log_enabled!` and `Level` unqualified, so both have
+// to be in scope at the call site.
+macro_rules! info_println {
+    ($($tts:tt)*) => {
+        if log_enabled!(Level::Info) {
+            println!($($tts)*);
+        }
+    }
+}
+
+// `print!` (no trailing newline) that only prints when the `Info` log level
+// is enabled. The expansion names `log_enabled!` and `Level` unqualified, so
+// both have to be in scope at the call site.
+macro_rules! info_print {
+    ($($tts:tt)*) => {
+        if log_enabled!(Level::Info) {
+            print!($($tts)*);
+        }
+    }
+}
+
+/// Set up `env_logger` as the global logger.
+///
+/// The log filter is taken from the `RUST_LOG` environment variable and
+/// falls back to `default_level`. Every record is written as
+/// `<UTC timestamp> [<thread name>] <level>: <message>`, with `main` used
+/// for unnamed threads.
+pub fn init_logger(default_level: &str) -> Result<()> {
+    let env = Env::default().filter_or("RUST_LOG", default_level);
+    let mut builder = Builder::from_env(env);
+
+    builder.format(|buf, record| {
+        let timestamp = Utc::now().format("%Y-%m-%d %H:%M:%S%.3f %Z");
+        let current = std::thread::current();
+        let thread_name = current.name().unwrap_or("main");
+
+        writeln!(
+            buf,
+            "{} [{}] {}: {}",
+            timestamp,
+            thread_name,
+            record.level(),
+            record.args()
+        )
+    });
+    builder.init();
+
+    Ok(())
+}
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -0,0 +1,109 @@
+use std::sync::Arc;
+use std::{thread, time};
+
+use anyhow::Result;
+use chrono::{DateTime, Utc};
+use log::{debug, info};
+use postgres::{Client, NoTls};
+
+use crate::compute::ComputeNode;
+
+const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
+
+// Spin in a loop and figure out the last activity time in the Postgres.
+// Then update it in the shared state. This function never errors out.
+// XXX: the only expected panic is at `RwLock` unwrap().
+//
+// Every `MONITOR_CHECK_INTERVAL` milliseconds the loop queries
+// `pg_stat_activity` for client backends other than itself and other than
+// the `cloud_admin` user:
+// - if any of them is not idle, the last activity time is "now";
+// - otherwise it is the most recent `state_change` among the idle ones.
+// The shared `last_active` value is only ever moved forward.
+//
+// NOTE(review): `Row::get` is documented to panic when a value cannot be
+// converted, so a NULL `state` or `state_change` would presumably panic here
+// as well; confirm these columns are never NULL for the selected backends.
+fn watch_compute_activity(compute: &ComputeNode) {
+    // Suppose that `connstr` doesn't change
+    let connstr = compute.connstr.as_str();
+    // Define `client` outside of the loop to reuse existing connection if it's active.
+    // It holds the `Result` of the last connection attempt, not a bare client.
+    let mut client = Client::connect(connstr, NoTls);
+    let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
+
+    info!("watching Postgres activity at {}", connstr);
+
+    loop {
+        // Should be outside of the write lock to allow others to read while we sleep.
+        thread::sleep(timeout);
+
+        match &mut client {
+            Ok(cli) => {
+                if cli.is_closed() {
+                    info!("connection to postgres closed, trying to reconnect");
+
+                    // Connection is closed, reconnect and try again.
+                    client = Client::connect(connstr, NoTls);
+                    continue;
+                }
+
+                // Get all running client backends except ourself, use RFC3339 DateTime format.
+                let backends = cli
+                    .query(
+                        "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
+                         FROM pg_stat_activity
+                         WHERE backend_type = 'client backend'
+                            AND pid != pg_backend_pid()
+                            AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
+                        &[],
+                    );
+                // Start from the currently stored value; the read lock is
+                // released at the end of this statement.
+                let mut last_active = compute.state.read().unwrap().last_active;
+
+                // A failed query is ignored: the stored value stays as is.
+                if let Ok(backs) = backends {
+                    let mut idle_backs: Vec<DateTime<Utc>> = vec![];
+
+                    for b in backs.into_iter() {
+                        let state: String = b.get("state");
+                        let change: String = b.get("state_change");
+
+                        if state == "idle" {
+                            let change = DateTime::parse_from_rfc3339(&change);
+                            match change {
+                                Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
+                                Err(e) => {
+                                    info!("cannot parse backend state_change DateTime: {}", e);
+                                    continue;
+                                }
+                            }
+                        } else {
+                            // Found non-idle backend, so the last activity is NOW.
+                            // Save it and exit the for loop. Also clear the idle backend
+                            // `state_change` timestamps array as it doesn't matter now.
+                            last_active = Utc::now();
+                            idle_backs.clear();
+                            break;
+                        }
+                    }
+
+                    // Sort idle backend `state_change` timestamps. The last one corresponds
+                    // to the last activity.
+                    idle_backs.sort();
+                    if let Some(last) = idle_backs.last() {
+                        last_active = *last;
+                    }
+                }
+
+                // Update the last activity in the shared state if we got a more recent one.
+                let mut state = compute.state.write().unwrap();
+                if last_active > state.last_active {
+                    state.last_active = last_active;
+                    debug!("set the last compute activity time to: {}", last_active);
+                }
+            }
+            Err(e) => {
+                debug!("cannot connect to postgres: {}, retrying", e);
+
+                // Establish a new connection and try again.
+                client = Client::connect(connstr, NoTls);
+            }
+        }
+    }
+}
+
+/// Spawn the compute activity monitor on a dedicated thread named
+/// `compute-monitor` and return its `JoinHandle`.
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+    let compute = Arc::clone(state);
+    let handle = thread::Builder::new()
+        .name("compute-monitor".into())
+        .spawn(move || watch_compute_activity(&compute))?;
+
+    Ok(handle)
+}
--- a/compute_tools/src/params.rs
+++ b/compute_tools/src/params.rs
@@ -0,0 +1,3 @@
+// Log level name; presumably the fallback passed to the logger setup (TODO confirm at the call site).
+pub const DEFAULT_LOG_LEVEL: &str = "info";
+// Connection string in `key=value` form for host `localhost` and user `postgres`.
+pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres";
+// Tab-separated `host all all 0.0.0.0/0 md5` line; judging by the name, a pg_hba.conf entry.
+pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -0,0 +1,282 @@
+use std::fmt::Write;
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+use std::net::{SocketAddr, TcpStream};
+use std::os::unix::fs::PermissionsExt;
+use std::path::Path;
+use std::process::Child;
+use std::str::FromStr;
+use std::{fs, thread, time};
+
+use anyhow::{bail, Result};
+use postgres::{Client, Transaction};
+use serde::Deserialize;
+
+/// Upper bound on the total time `wait_for_postgres()` sleeps between polls
+/// before it gives up with a timeout error.
+const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds
+
+/// Rust representation of Postgres role info with only those fields
+/// that matter for us.
+#[derive(Clone, Deserialize)]
+pub struct Role {
+    /// Role name, unquoted; use `quote()` before embedding it into SQL.
+    pub name: PgIdent,
+    /// Password hash. `Role::to_pg_options()` passes values starting with
+    /// `SCRAM-SHA-256` through as is, prepends `md5` to anything else and
+    /// emits `PASSWORD NULL` for `None`.
+    pub encrypted_password: Option<String>,
+    /// Generic role options; not serialized by `Role::to_pg_options()` yet.
+    pub options: GenericOptions,
+}
+
+/// Rust representation of Postgres database info with only those fields
+/// that matter for us.
+#[derive(Clone, Deserialize)]
+pub struct Database {
+    /// Database name, unquoted; use `quote()` before embedding it into SQL.
+    pub name: PgIdent,
+    /// Name of the role owning the database.
+    pub owner: PgIdent,
+    /// Extra parameters serialized by `Database::to_pg_options()` in front
+    /// of the `OWNER` clause.
+    pub options: GenericOptions,
+}
+
+/// Common type representing both SQL statement params with or without value,
+/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
+/// options like `wal_level = logical`.
+#[derive(Clone, Deserialize)]
+pub struct GenericOption {
+    /// Option name, emitted verbatim.
+    pub name: String,
+    /// Option value; when `None` only the bare `name` is emitted.
+    pub value: Option<String>,
+    /// Value type tag. Only `"string"` is treated specially: such values are
+    /// wrapped in single quotes, every other type is emitted as is.
+    pub vartype: String,
+}
+
+/// Optional collection of `GenericOption`'s. Type alias allows us to
+/// declare a `trait` on it.
+pub type GenericOptions = Option<Vec<GenericOption>>;
+
+impl GenericOption {
+    /// Wrap `val` in single quotes, doubling every embedded `'` so that the
+    /// result stays a single well-formed quoted string both in SQL and in
+    /// `postgresql.conf`.
+    fn quote_literal(val: &str) -> String {
+        format!("'{}'", val.replace('\'', "''"))
+    }
+
+    /// Represent `GenericOption` as SQL statement parameter.
+    ///
+    /// Produces `name 'value'` for `"string"` options, `name value` for any
+    /// other `vartype` and just `name` when there is no value.
+    pub fn to_pg_option(&self) -> String {
+        match &self.value {
+            Some(val) if self.vartype == "string" => {
+                format!("{} {}", self.name, Self::quote_literal(val))
+            }
+            Some(val) => format!("{} {}", self.name, val),
+            None => self.name.to_owned(),
+        }
+    }
+
+    /// Represent `GenericOption` as configuration option.
+    ///
+    /// Produces `name = 'value'` for `"string"` options, `name = value` for
+    /// any other `vartype` and just `name` when there is no value.
+    pub fn to_pg_setting(&self) -> String {
+        match &self.value {
+            Some(val) if self.vartype == "string" => {
+                format!("{} = {}", self.name, Self::quote_literal(val))
+            }
+            Some(val) => format!("{} = {}", self.name, val),
+            None => self.name.to_owned(),
+        }
+    }
+}
+
+/// Serialization of an options collection into the two textual forms
+/// used by this crate.
+pub trait PgOptionsSerialize {
+    /// Render the options as arguments of a SQL statement.
+    fn as_pg_options(&self) -> String;
+    /// Render the options as `postgresql.conf` style settings.
+    fn as_pg_settings(&self) -> String;
+}
+
+impl PgOptionsSerialize for GenericOptions {
+    /// Render every option with `to_pg_option()` and glue the pieces
+    /// together with single spaces. `None` yields an empty string.
+    fn as_pg_options(&self) -> String {
+        let rendered: Vec<String> = self
+            .iter()
+            .flatten()
+            .map(GenericOption::to_pg_option)
+            .collect();
+
+        rendered.join(" ")
+    }
+
+    /// Render every option with `to_pg_setting()`, one per line, in a
+    /// `postgresql.conf` compatible format. `None` yields an empty string.
+    fn as_pg_settings(&self) -> String {
+        let rendered: Vec<String> = self
+            .iter()
+            .flatten()
+            .map(GenericOption::to_pg_setting)
+            .collect();
+
+        rendered.join("\n")
+    }
+}
+
+/// Lookup of a single option inside an options collection.
+pub trait GenericOptionsSearch {
+    /// Return the value of the option called `name`, if any.
+    fn find(&self, name: &str) -> Option<String>;
+}
+
+impl GenericOptionsSearch for GenericOptions {
+    /// Return a copy of the value of the first option called `name`.
+    /// `None` is returned when there are no options at all, when no option
+    /// matches, or when the matching option carries no value.
+    fn find(&self, name: &str) -> Option<String> {
+        self.as_ref()
+            .and_then(|ops| ops.iter().find(|candidate| candidate.name == name))
+            .and_then(|found| found.value.clone())
+    }
+}
+
+impl Role {
+    /// Build the argument string for `CREATE ROLE` / `ALTER ROLE`:
+    /// always `LOGIN`, followed by the password clause.
+    pub fn to_pg_options(&self) -> String {
+        // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in Rails.
+        // For now we do not use generic `options` for roles. Once used, add
+        // `self.options.as_pg_options()` somewhere here.
+        let mut params = String::from("LOGIN");
+
+        match self.encrypted_password.as_deref() {
+            None => params.push_str(" PASSWORD NULL"),
+            Some(pass) => {
+                // Historically every encrypted_password was an md5 hash without
+                // the `md5` prefix. SCRAM-SHA-256 verifiers are recognised by
+                // their own prefix and passed through untouched; everything
+                // else is still treated as md5 to stay compatible.
+                let prefix = if pass.starts_with("SCRAM-SHA-256") {
+                    ""
+                } else {
+                    "md5"
+                };
+                write!(params, " PASSWORD '{prefix}{pass}'")
+                    .expect("String is documented to not to error during write operations");
+            }
+        }
+
+        params
+    }
+}
+
+impl Database {
+    /// Build the argument string for `CREATE DATABASE`: the generic options
+    /// followed by an `OWNER` clause with the quoted owner name.
+    /// NB: `TEMPLATE` is actually also an identifier, but so far we only need
+    /// to use `template0` and `template1`, so it is not a problem. Yet in the future
+    /// it may require a proper quoting too.
+    pub fn to_pg_options(&self) -> String {
+        let owner = self.owner.quote();
+        let mut params = self.options.as_pg_options();
+
+        write!(params, " OWNER {}", owner)
+            .expect("String is documented to not to error during write operations");
+
+        params
+    }
+}
+
+/// Postgres identifier (DB or role name) stored as a plain `String`.
+pub type PgIdent = String;
+
+/// Quoting of strings that get embedded into Postgres SQL queries.
+/// Only identifier quoting is implemented so far, but the trait could
+/// be reused for literals later.
+pub trait PgQuote {
+    fn quote(&self) -> String;
+}
+
+impl PgQuote for PgIdent {
+    /// Simplified take on Postgres quote_ident(): the identifier is always
+    /// wrapped in `""` and every `"` inside it is doubled. Not idempotent,
+    /// quoting an already quoted string escapes it once more.
+    fn quote(&self) -> String {
+        let mut quoted = String::with_capacity(self.len() + 2);
+
+        quoted.push('"');
+        for ch in self.chars() {
+            if ch == '"' {
+                quoted.push('"');
+            }
+            quoted.push(ch);
+        }
+        quoted.push('"');
+
+        quoted
+    }
+}
+
+/// Read all roles from `pg_catalog.pg_authid` inside the given transaction.
+/// `encrypted_password` holds the raw `rolpassword` value, `options` is
+/// always `None`.
+pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
+    let rows = xact.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?;
+
+    let mut roles = Vec::with_capacity(rows.len());
+    for row in &rows {
+        roles.push(Role {
+            name: row.get("rolname"),
+            encrypted_password: row.get("rolpassword"),
+            options: None,
+        });
+    }
+
+    Ok(roles)
+}
+
+/// Read all databases and their owners from `pg_catalog.pg_database`.
+/// The owner is taken from `datdba::regrole::text`, `options` is always `None`.
+pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
+    let rows = client.query(
+        "SELECT datname, datdba::regrole::text as owner
+               FROM pg_catalog.pg_database;",
+        &[],
+    )?;
+
+    let mut dbs = Vec::with_capacity(rows.len());
+    for row in &rows {
+        dbs.push(Database {
+            name: row.get("datname"),
+            owner: row.get("owner"),
+            options: None,
+        });
+    }
+
+    Ok(dbs)
+}
+
+/// Wait for Postgres to become ready to accept connections:
+/// - state should be `ready` in the `pgdata/postmaster.pid`
+/// - and we should be able to connect to `127.0.0.1:<port>`
+///
+/// Returns an error if `port` does not form a valid socket address, if
+/// Postgres exits while we are waiting, if `postmaster.pid` cannot be opened
+/// for a reason other than not existing yet, or if `POSTGRES_WAIT_TIMEOUT`
+/// elapses.
+pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> {
+    // Polling interval; `slept` is advanced by the same amount on every turn.
+    const PAUSE_MS: u64 = 100;
+
+    let pid_path = pgdata.join("postmaster.pid");
+    let mut slept: u64 = 0; // ms
+    let pause = time::Duration::from_millis(PAUSE_MS);
+
+    let timeout = time::Duration::from_millis(10);
+    // A malformed port is reported as an error instead of a panic.
+    let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port))?;
+
+    loop {
+        // Sleep POSTGRES_WAIT_TIMEOUT at max (a bit longer actually if consider a TCP timeout,
+        // but postgres starts listening almost immediately, even if it is not really
+        // ready to accept connections).
+        if slept >= POSTGRES_WAIT_TIMEOUT {
+            bail!("timed out while waiting for Postgres to start");
+        }
+
+        if let Ok(Some(status)) = pg.try_wait() {
+            // Postgres exited, that is not what we expected, bail out earlier.
+            let code = status.code().unwrap_or(-1);
+            bail!("Postgres exited unexpectedly with code {}", code);
+        }
+
+        // Open the file directly instead of checking for existence first:
+        // it may appear or disappear at any moment while Postgres starts.
+        match File::open(&pid_path) {
+            Ok(file) => {
+                // The file may still be empty or unreadable right after
+                // creation; treat that as "not ready yet" rather than panic.
+                let status = BufReader::new(file)
+                    .lines()
+                    .last()
+                    .and_then(|line| line.ok())
+                    .unwrap_or_else(|| "unknown".to_string());
+
+                // Now Postgres is ready to accept connections
+                if status.trim() == "ready"
+                    && TcpStream::connect_timeout(&addr, timeout).is_ok()
+                {
+                    break;
+                }
+            }
+            // Not created yet, keep polling.
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => return Err(e.into()),
+        }
+
+        thread::sleep(pause);
+        slept += PAUSE_MS;
+    }
+
+    Ok(())
+}
+
+/// Start from a clean `pgdata`: drop whatever is there, create the directory
+/// again and restrict its mode to `0700`.
+pub fn create_pgdata(pgdata: &str) -> Result<()> {
+    // A failed removal is most likely 'No such file or directory (os error 2)',
+    // so it is ignored. Anything more serious will make create_dir() fail below.
+    fs::remove_dir_all(pgdata).ok();
+
+    fs::create_dir(pgdata)?;
+
+    let owner_only = fs::Permissions::from_mode(0o700);
+    fs::set_permissions(pgdata, owner_only)?;
+
+    Ok(())
+}
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -0,0 +1,384 @@
+use std::path::Path;
+
+use anyhow::Result;
+use log::{info, log_enabled, warn, Level};
+use postgres::{Client, NoTls};
+use serde::Deserialize;
+
+use crate::compute::ComputeNode;
+use crate::config;
+use crate::params::PG_HBA_ALL_MD5;
+use crate::pg_helpers::*;
+
+/// Cluster spec or configuration represented as an optional number of
+/// delta operations + final cluster state description.
+#[derive(Clone, Deserialize)]
+pub struct ComputeSpec {
+    pub format_version: f32,
+    pub timestamp: String,
+    pub operation_uuid: Option<String>,
+    /// Expected cluster state at the end of transition process.
+    pub cluster: Cluster,
+    /// Drops and renames applied by the `handle_*` functions in addition to
+    /// reconciling `cluster`.
+    pub delta_operations: Option<Vec<DeltaOp>>,
+}
+
+/// Cluster state seen from the perspective of the external tools
+/// like Rails web console.
+#[derive(Clone, Deserialize)]
+pub struct Cluster {
+    pub cluster_id: String,
+    pub name: String,
+    pub state: Option<String>,
+    /// Roles that `handle_roles()` creates or updates.
+    pub roles: Vec<Role>,
+    /// Databases that `handle_databases()` creates or re-owns.
+    pub databases: Vec<Database>,
+    /// Postgres settings.
+    // NOTE(review): presumably written into `postgresql.conf` by
+    // `config::write_postgres_conf()`; confirm there.
+    pub settings: GenericOptions,
+}
+
+/// Single cluster state changing operation that could not be represented as
+/// a static `Cluster` structure. For example:
+/// - DROP DATABASE
+/// - DROP ROLE
+/// - ALTER ROLE name RENAME TO new_name
+/// - ALTER DATABASE name RENAME TO new_name
+#[derive(Clone, Deserialize)]
+pub struct DeltaOp {
+    /// Operation kind. The handlers in this file act on `delete_role`,
+    /// `rename_role`, `delete_db` and `rename_db`; anything else is ignored.
+    pub action: String,
+    /// Current name of the role or database the operation applies to.
+    pub name: PgIdent,
+    /// Target name; must be present for the `rename_*` actions.
+    pub new_name: Option<PgIdent>,
+}
+
+/// Apply the file-level part of the cluster specification:
+/// - `postgresql.conf` is regenerated from scratch out of the spec;
+/// - `pg_hba.conf` is updated to allow external connections.
+pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
+    // `basebackup` no longer ships `postgresql.conf`, so the whole config is
+    // always written into a freshly created file.
+    let postgresql_conf = pgdata_path.join("postgresql.conf");
+    config::write_postgres_conf(&postgresql_conf, spec)?;
+
+    update_pg_hba(pgdata_path)
+}
+
+/// Make sure `pg_hba.conf` contains the `PG_HBA_ALL_MD5` line that allows
+/// external connections, and log whether the file had to be changed.
+pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
+    // XXX: consider making it a part of spec.json
+    info!("checking pg_hba.conf");
+
+    let hba_file = pgdata_path.join("pg_hba.conf");
+    match config::line_in_file(&hba_file, PG_HBA_ALL_MD5)? {
+        true => info!("updated pg_hba.conf to allow external connections"),
+        false => info!("pg_hba.conf is up-to-date"),
+    }
+
+    Ok(())
+}
+
+/// Given a cluster spec and a Postgres client it handles roles renaming,
+/// creation and update. Everything runs inside a single transaction that is
+/// opened here and committed at the end; role deletions are not performed
+/// here, see `handle_role_deletions()`.
+///
+/// Returns an error if any query fails or if a `rename_role` delta operation
+/// comes without `new_name`.
+pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+    let mut xact = client.transaction()?;
+    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
+
+    // Print a list of existing Postgres roles (only in debug mode)
+    info!("postgres roles:");
+    for r in &existing_roles {
+        info_println!(
+            "{} - {}:{}",
+            " ".repeat(27 + 5),
+            r.name,
+            if r.encrypted_password.is_some() {
+                "[FILTERED]"
+            } else {
+                "(null)"
+            }
+        );
+    }
+
+    // Process delta operations first
+    if let Some(ops) = &spec.delta_operations {
+        info!("processing role renames");
+        for op in ops {
+            match op.action.as_ref() {
+                "delete_role" => {
+                    // no-op now, roles will be deleted at the end of configuration
+                }
+                // Renaming role drops its password, since role name is
+                // used as a salt there.  It is important that this role
+                // is recorded with a new `name` in the `roles` list.
+                // Follow up roles update will set the new password.
+                "rename_role" => {
+                    // A malformed spec is reported as an error, not a panic.
+                    let new_name = op.new_name.as_ref().ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "rename_role operation for role '{}' has no new_name",
+                            op.name
+                        )
+                    })?;
+
+                    // XXX: with a limited number of roles it is fine, but consider making it a HashMap
+                    if existing_roles.iter().any(|r| r.name == op.name) {
+                        let query: String = format!(
+                            "ALTER ROLE {} RENAME TO {}",
+                            op.name.quote(),
+                            new_name.quote()
+                        );
+
+                        warn!("renaming role '{}' to '{}'", op.name, new_name);
+                        xact.execute(query.as_str(), &[])?;
+                    }
+                }
+                _ => {}
+            }
+        }
+    }
+
+    // Refresh Postgres roles info to handle possible roles renaming
+    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
+
+    info!("cluster spec roles:");
+    for role in &spec.cluster.roles {
+        let name = &role.name;
+
+        info_print!(
+            "{} - {}:{}",
+            " ".repeat(27 + 5),
+            name,
+            if role.encrypted_password.is_some() {
+                "[FILTERED]"
+            } else {
+                "(null)"
+            }
+        );
+
+        // XXX: with a limited number of roles it is fine, but consider making it a HashMap
+        let pg_role = existing_roles.iter().find(|r| r.name == *name);
+
+        if let Some(r) = pg_role {
+            // Compare the password stored in Postgres with what
+            // `Role::to_pg_options()` would set: SCRAM-SHA-256 verifiers are
+            // stored as is, everything else is stored with an `md5` prefix.
+            let update_role = match (&r.encrypted_password, &role.encrypted_password) {
+                (None, None) => false,
+                (Some(pg_pwd), Some(spec_pwd)) => {
+                    if spec_pwd.starts_with("SCRAM-SHA-256") {
+                        pg_pwd != spec_pwd
+                    } else {
+                        // `strip_prefix` never panics, unlike slicing by a
+                        // fixed byte offset.
+                        pg_pwd.strip_prefix("md5") != Some(spec_pwd.as_str())
+                    }
+                }
+                // Password was added or removed.
+                _ => true,
+            };
+
+            if update_role {
+                let mut query: String = format!("ALTER ROLE {} ", name.quote());
+                info_print!(" -> update");
+
+                query.push_str(&role.to_pg_options());
+                xact.execute(query.as_str(), &[])?;
+            }
+        } else {
+            info!("role name: '{}'", &name);
+            let mut query: String = format!("CREATE ROLE {} ", name.quote());
+            info!("role create query: '{}'", &query);
+            info_print!(" -> create");
+
+            query.push_str(&role.to_pg_options());
+            xact.execute(query.as_str(), &[])?;
+
+            let grant_query = format!(
+                "GRANT pg_read_all_data, pg_write_all_data TO {}",
+                name.quote()
+            );
+            xact.execute(grant_query.as_str(), &[])?;
+            info!("role grant query: '{}'", &grant_query);
+        }
+
+        info_print!("\n");
+    }
+
+    xact.commit()?;
+
+    Ok(())
+}
+
+/// Reassign all dependent objects and delete requested roles.
+///
+/// For every `delete_role` delta operation the objects owned by the role are
+/// first reassigned in each database (over separate connections, see
+/// `reassign_owned_objects()`), then all the roles are dropped within one
+/// transaction on `client`.
+pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
+    let spec = &node.spec;
+
+    // First, reassign all dependent objects to db owners.
+    if let Some(ops) = &spec.delta_operations {
+        info!("reassigning dependent objects of to-be-deleted roles");
+        for op in ops {
+            if op.action == "delete_role" {
+                reassign_owned_objects(node, &op.name)?;
+            }
+        }
+    }
+
+    // Second, proceed with role deletions.
+    let mut xact = client.transaction()?;
+    if let Some(ops) = &spec.delta_operations {
+        info!("processing role deletions");
+        for op in ops {
+            // We do not check either role exists or not,
+            // Postgres will take care of it for us
+            if op.action == "delete_role" {
+                let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
+
+                warn!("deleting role '{}'", &op.name);
+                xact.execute(query.as_str(), &[])?;
+            }
+        }
+    }
+
+    // A transaction that is dropped without an explicit commit is rolled
+    // back, which would silently undo every DROP ROLE above.
+    xact.commit()?;
+
+    Ok(())
+}
+
+// In every spec database that `role_name` does not own itself, hand all
+// objects of `role_name` over to the database owner and drop what is left
+// of the role's privileges. Each database gets its own connection.
+fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
+    let foreign_dbs = node
+        .spec
+        .cluster
+        .databases
+        .iter()
+        .filter(|db| db.owner != *role_name);
+
+    for db in foreign_dbs {
+        // The path of the connection string consists of the database name only,
+        // so replacing it is enough to connect to another database.
+        let mut db_connstr = node.connstr.clone();
+        db_connstr.set_path(&db.name);
+
+        let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
+
+        // Everything the role owns goes to the db owner
+        let reassign_query = format!(
+            "REASSIGN OWNED BY {} TO {}",
+            role_name.quote(),
+            db.owner.quote()
+        );
+        info!(
+            "reassigning objects owned by '{}' in db '{}' to '{}'",
+            role_name, &db.name, &db.owner
+        );
+        db_client.simple_query(&reassign_query)?;
+
+        // After the reassignment only privileges of the role are left to drop
+        let drop_query = format!("DROP OWNED BY {}", role_name.quote());
+        db_client.simple_query(&drop_query)?;
+    }
+
+    Ok(())
+}
+
+/// It follows mostly the same logic as `handle_roles()` except that we
+/// do not use an explicit transaction block, since major database operations
+/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
+/// atomicity should be enough here due to the order of operations and various checks,
+/// which together provide us idempotency.
+///
+/// Returns an error if any query fails or if a `rename_db` delta operation
+/// comes without `new_name`.
+pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+
+    // Print a list of existing Postgres databases (only in debug mode)
+    info!("postgres databases:");
+    for r in &existing_dbs {
+        info_println!("{} - {}:{}", " ".repeat(27 + 5), r.name, r.owner);
+    }
+
+    // Process delta operations first
+    if let Some(ops) = &spec.delta_operations {
+        info!("processing delta operations on databases");
+        for op in ops {
+            match op.action.as_ref() {
+                // We do not check either DB exists or not,
+                // Postgres will take care of it for us
+                "delete_db" => {
+                    let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.quote());
+
+                    warn!("deleting database '{}'", &op.name);
+                    client.execute(query.as_str(), &[])?;
+                }
+                "rename_db" => {
+                    // A malformed spec is reported as an error, not a panic.
+                    let new_name = op.new_name.as_ref().ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "rename_db operation for database '{}' has no new_name",
+                            op.name
+                        )
+                    })?;
+
+                    // XXX: with a limited number of databases it is fine, but consider making it a HashMap
+                    if existing_dbs.iter().any(|r| r.name == op.name) {
+                        let query: String = format!(
+                            "ALTER DATABASE {} RENAME TO {}",
+                            op.name.quote(),
+                            new_name.quote()
+                        );
+
+                        warn!("renaming database '{}' to '{}'", op.name, new_name);
+                        client.execute(query.as_str(), &[])?;
+                    }
+                }
+                _ => {}
+            }
+        }
+    }
+
+    // Refresh Postgres databases info to handle possible renames
+    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+
+    info!("cluster spec databases:");
+    for db in &spec.cluster.databases {
+        let name = &db.name;
+
+        info_print!("{} - {}:{}", " ".repeat(27 + 5), db.name, db.owner);
+
+        // XXX: with a limited number of databases it is fine, but consider making it a HashMap
+        let pg_db = existing_dbs.iter().find(|r| r.name == *name);
+
+        if let Some(r) = pg_db {
+            // XXX: db owner name is returned as quoted string from Postgres,
+            // when quoting is needed.
+            let new_owner = if r.owner.starts_with('"') {
+                db.owner.quote()
+            } else {
+                db.owner.clone()
+            };
+
+            if new_owner != r.owner {
+                let query: String = format!(
+                    "ALTER DATABASE {} OWNER TO {}",
+                    name.quote(),
+                    db.owner.quote()
+                );
+                info_print!(" -> update");
+
+                client.execute(query.as_str(), &[])?;
+            }
+        } else {
+            let mut query: String = format!("CREATE DATABASE {} ", name.quote());
+            info_print!(" -> create");
+
+            query.push_str(&db.to_pg_options());
+            client.execute(query.as_str(), &[])?;
+        }
+
+        info_print!("\n");
+    }
+
+    Ok(())
+}
+
+/// Grant `CREATE ON DATABASE` on every database of the spec to every role of
+/// the spec, to allow clients create trusted extensions.
+///
+/// Nothing is executed when the spec has no roles, since a `GRANT` without
+/// grantees is not valid SQL.
+pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+    info!("cluster spec grants:");
+
+    // We now have a separate `web_access` role to connect to the database
+    // via the web interface and proxy link auth. And also we grant a
+    // read / write all data privilege to every role. So also grant
+    // create to everyone.
+    // XXX: later we should stop messing with Postgres ACL in such horrible
+    // ways.
+    if spec.cluster.roles.is_empty() {
+        info!("no roles in the cluster spec, nothing to grant");
+        return Ok(());
+    }
+
+    // The list of grantees is the same for every database, build it once.
+    let grantees = spec
+        .cluster
+        .roles
+        .iter()
+        .map(|r| r.name.quote())
+        .collect::<Vec<_>>()
+        .join(", ");
+
+    for db in &spec.cluster.databases {
+        let dbname = &db.name;
+
+        let query: String = format!(
+            "GRANT CREATE ON DATABASE {} TO {}",
+            dbname.quote(),
+            grantees
+        );
+        info!("grant query {}", &query);
+
+        client.execute(query.as_str(), &[])?;
+    }
+
+    Ok(())
+}
--- a/compute_tools/tests/cluster_spec.json
+++ b/compute_tools/tests/cluster_spec.json
@@ -0,0 +1,205 @@
+{
+    "format_version": 1.0,
+
+    "timestamp": "2021-05-23T18:25:43.511Z",
+    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
+
+    "cluster": {
+        "cluster_id": "test-cluster-42",
+        "name": "Zenith Test",
+        "state": "restarted",
+        "roles": [
+            {
+                "name": "postgres",
+                "encrypted_password": "6b1d16b78004bbd51fa06af9eda75972",
+                "options": null
+            },
+            {
+                "name": "alexk",
+                "encrypted_password": null,
+                "options": null
+            },
+            {
+                "name": "zenith \"new\"",
+                "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972",
+                "options": null
+            },
+            {
+                "name": "zen",
+                "encrypted_password": "9b1d16b78004bbd51fa06af9eda75972"
+            },
+            {
+                "name": "\"name\";\\n select 1;",
+                "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
+            },
+            {
+                "name": "MyRole",
+                "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
+            }
+        ],
+        "databases": [
+            {
+                "name": "DB2",
+                "owner": "alexk",
+                "options": [
+                    {
+                        "name": "LC_COLLATE",
+                        "value": "C",
+                        "vartype": "string"
+                    },
+                    {
+                        "name": "LC_CTYPE",
+                        "value": "C",
+                        "vartype": "string"
+                    },
+                    {
+                        "name": "TEMPLATE",
+                        "value": "template0",
+                        "vartype": "enum"
+                    }
+                ]
+            },
+            {
+                "name": "zenith",
+                "owner": "MyRole"
+            },
+            {
+                "name": "zen",
+                "owner": "zen"
+            }
+        ],
+        "settings": [
+            {
+                "name": "fsync",
+                "value": "off",
+                "vartype": "bool"
+            },
+            {
+                "name": "wal_level",
+                "value": "replica",
+                "vartype": "enum"
+            },
+            {
+                "name": "hot_standby",
+                "value": "on",
+                "vartype": "bool"
+            },
+            {
+                "name": "safekeepers",
+                "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
+                "vartype": "string"
+            },
+            {
+                "name": "wal_log_hints",
+                "value": "on",
+                "vartype": "bool"
+            },
+            {
+                "name": "log_connections",
+                "value": "on",
+                "vartype": "bool"
+            },
+            {
+                "name": "shared_buffers",
+                "value": "32768",
+                "vartype": "integer"
+            },
+            {
+                "name": "port",
+                "value": "55432",
+                "vartype": "integer"
+            },
+            {
+                "name": "max_connections",
+                "value": "100",
+                "vartype": "integer"
+            },
+            {
+                "name": "max_wal_senders",
+                "value": "10",
+                "vartype": "integer"
+            },
+            {
+                "name": "listen_addresses",
+                "value": "0.0.0.0",
+                "vartype": "string"
+            },
+            {
+                "name": "wal_sender_timeout",
+                "value": "0",
+                "vartype": "integer"
+            },
+            {
+                "name": "password_encryption",
+                "value": "md5",
+                "vartype": "enum"
+            },
+            {
+                "name": "maintenance_work_mem",
+                "value": "65536",
+                "vartype": "integer"
+            },
+            {
+                "name": "max_parallel_workers",
+                "value": "8",
+                "vartype": "integer"
+            },
+            {
+                "name": "max_worker_processes",
+                "value": "8",
+                "vartype": "integer"
+            },
+            {
+                "name": "neon.tenant_id",
+                "value": "b0554b632bd4d547a63b86c3630317e8",
+                "vartype": "string"
+            },
+            {
+                "name": "max_replication_slots",
+                "value": "10",
+                "vartype": "integer"
+            },
+            {
+                "name": "neon.timeline_id",
+                "value": "2414a61ffc94e428f14b5758fe308e13",
+                "vartype": "string"
+            },
+            {
+                "name": "shared_preload_libraries",
+                "value": "neon",
+                "vartype": "string"
+            },
+            {
+                "name": "synchronous_standby_names",
+                "value": "walproposer",
+                "vartype": "string"
+            },
+            {
+                "name": "neon.pageserver_connstring",
+                "value": "host=127.0.0.1 port=6400",
+                "vartype": "string"
+            }
+        ]
+    },
+
+    "delta_operations": [
+        {
+            "action": "delete_db",
+            "name": "zenith_test"
+        },
+        {
+            "action": "rename_db",
+            "name": "DB",
+            "new_name": "DB2"
+        },
+        {
+            "action": "delete_role",
+            "name": "zenith2"
+        },
+        {
+            "action": "rename_role",
+            "name": "zenith new",
+            "new_name": "zenith \"new\""
+        }
+    ]
+}
--- a/compute_tools/tests/config_test.rs
+++ b/compute_tools/tests/config_test.rs
@@ -0,0 +1,48 @@
+#[cfg(test)]
+mod config_tests {
+
+    use std::fs;
+    use std::path::Path;
+
+    use compute_tools::config::*;
+
+    /// Create (or truncate) `path` and fill it with `content`.
+    fn write_test_file(path: &Path, content: &str) {
+        fs::write(path, content.as_bytes()).unwrap();
+    }
+
+    /// Assert that the whole file at `path` reads back as `expected_content`.
+    fn check_file_content(path: &Path, expected_content: &str) {
+        let content = fs::read_to_string(path).unwrap();
+        assert_eq!(content, expected_content);
+    }
+
+    #[test]
+    fn test_line_in_file() {
+        let existing = Path::new("./tests/tmp/config_test.txt");
+        write_test_file(existing, "line1\nline2.1\t line2.2\nline3");
+
+        // A line that is already there: nothing is reported as changed and
+        // the file stays as it was.
+        let changed = line_in_file(existing, "line2.1\t line2.2").unwrap();
+        assert!(!changed);
+        check_file_content(existing, "line1\nline2.1\t line2.2\nline3");
+
+        // A missing line is appended at the end of the file.
+        let changed = line_in_file(existing, "line4").unwrap();
+        assert!(changed);
+        check_file_content(existing, "line1\nline2.1\t line2.2\nline3\nline4");
+
+        fs::remove_file(existing).unwrap();
+
+        // A file that does not exist yet is created with just that line.
+        let fresh = Path::new("./tests/tmp/new_config_test.txt");
+        let changed = line_in_file(fresh, "line4").unwrap();
+        assert!(changed);
+        check_file_content(fresh, "line4");
+
+        fs::remove_file(fresh).unwrap();
+    }
+}
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -0,0 +1,41 @@
+#[cfg(test)]
+mod pg_helpers_tests {
+
+    use std::fs::File;
+
+    use compute_tools::pg_helpers::*;
+    use compute_tools::spec::ComputeSpec;
+
+    /// Parse the shared `tests/cluster_spec.json` fixture.
+    fn load_spec() -> ComputeSpec {
+        let fixture = File::open("tests/cluster_spec.json").unwrap();
+        serde_json::from_reader(fixture).unwrap()
+    }
+
+    /// The first database and the first role of the fixture serialize into
+    /// the expected SQL argument strings.
+    #[test]
+    fn params_serialize() {
+        let spec = load_spec();
+
+        let db_options = spec.cluster.databases.first().unwrap().to_pg_options();
+        assert_eq!(
+            db_options,
+            "LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0 OWNER \"alexk\""
+        );
+
+        let role_options = spec.cluster.roles.first().unwrap().to_pg_options();
+        assert_eq!(
+            role_options,
+            "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
+        );
+    }
+
+    /// Cluster settings of the fixture serialize into `postgresql.conf` lines.
+    #[test]
+    fn settings_serialize() {
+        let spec = load_spec();
+        let settings = spec.cluster.settings.as_pg_settings();
+
+        assert_eq!(
+            settings,
+            "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
+        );
+    }
+
+    /// Embedded double quotes are doubled and the whole identifier is wrapped
+    /// in double quotes.
+    #[test]
+    fn quote_ident() {
+        let ident: PgIdent = PgIdent::from("\"name\";\\n select 1;");
+        let quoted = ident.quote();
+
+        assert_eq!(quoted, "\"\"\"name\"\";\\n select 1;\"");
+    }
+}
--- a/compute_tools/tests/tmp/.gitignore
+++ b/compute_tools/tests/tmp/.gitignore
@@ -0,0 +1 @@
+**/*
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -1,30 +1,23 @@
 [package]
 name = "control_plane"
 version = "0.1.0"
-authors = ["Stas Kelvich <stas@zenith.tech>"]
-edition = "2018"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+edition = "2021"

 [dependencies]
-rand = "0.8.3"
-tar = "0.4.33"
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+tar = "0.4.38"
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 serde = { version = "1.0", features = ["derive"] }
-serde_json = "1"
+serde_with = "1.12.0"
 toml = "0.5"
 lazy_static = "1.4"
 regex = "1"
 anyhow = "1.0"
 thiserror = "1"
-bytes = "1.0.1"
-nix = "0.20"
+nix = "0.23"
 url = "2.2.2"
-hex = { version = "0.4.3", features = ["serde"] }
-reqwest = { version = "0.11", features = ["blocking", "json"] }
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }

 pageserver = { path = "../pageserver" }
-walkeeper = { path = "../walkeeper" }
-postgres_ffi = { path = "../postgres_ffi" }
-zenith_utils = { path = "../zenith_utils" }
-workspace_hack = { path = "../workspace_hack" }
+safekeeper = { path = "../safekeeper" }
+utils = { path = "../libs/utils" }
+workspace_hack = { version = "0.1", path = "../workspace_hack" }
--- a/control_plane/safekeepers.conf
+++ b/control_plane/safekeepers.conf
@@ -0,0 +1,20 @@
+# Page server and three safekeepers.
+[pageserver]
+listen_pg_addr = '127.0.0.1:64000'
+listen_http_addr = '127.0.0.1:9898'
+auth_type = 'Trust'
+
+[[safekeepers]]
+id = 1
+pg_port = 5454
+http_port = 7676
+
+[[safekeepers]]
+id = 2
+pg_port = 5455
+http_port = 7677
+
+[[safekeepers]]
+id = 3
+pg_port = 5456
+http_port = 7678
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -0,0 +1,14 @@
+# Minimal zenith environment with one safekeeper. This is equivalent to the built-in
+# defaults that you get with no --config
+[pageserver]
+listen_pg_addr = '127.0.0.1:64000'
+listen_http_addr = '127.0.0.1:9898'
+auth_type = 'Trust'
+
+[[safekeepers]]
+id = 1
+pg_port = 5454
+http_port = 7676
+
+[etcd_broker]
+broker_endpoints = ['http://127.0.0.1:2379']
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -11,11 +11,12 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{Context, Result};
-use zenith_utils::connstring::connection_host_port;
-use zenith_utils::lsn::Lsn;
-use zenith_utils::postgres_backend::AuthType;
-use zenith_utils::zid::ZTenantId;
-use zenith_utils::zid::ZTimelineId;
+use utils::{
+    connstring::connection_host_port,
+    lsn::Lsn,
+    postgres_backend::AuthType,
+    zid::{ZTenantId, ZTimelineId},
+};

 use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
@@ -37,10 +38,8 @@ impl ComputeControlPlane {
    // pgdatadirs
    // |- tenants
    // |  |- <tenant_id>
-    // |  |   |- <branch name>
+    // |  |   |- <node name>
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
-        // TODO: since pageserver do not have config file yet we believe here that
-        // it is running on default port. Change that when pageserver will have config.
        let pageserver = Arc::new(PageServerNode::from_env(&env));

        let mut nodes = BTreeMap::default();
@@ -54,7 +53,7 @@ impl ComputeControlPlane {
                .with_context(|| format!("failed to list {}", tenant_dir.path().display()))?
            {
                let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?;
-                nodes.insert((node.tenantid, node.name.clone()), Arc::new(node));
+                nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node));
            }
        }

@@ -75,43 +74,32 @@ impl ComputeControlPlane {
            .unwrap_or(self.base_port)
    }

-    pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
-        ComputeControlPlane {
-            base_port: 65431,
-            pageserver: Arc::clone(pageserver),
-            nodes: BTreeMap::new(),
-            env: local_env.clone(),
-        }
-    }
-
    pub fn new_node(
        &mut self,
-        tenantid: ZTenantId,
-        branch_name: &str,
+        tenant_id: ZTenantId,
+        name: &str,
+        timeline_id: ZTimelineId,
+        lsn: Option<Lsn>,
        port: Option<u16>,
    ) -> Result<Arc<PostgresNode>> {
-        let timeline_id = self
-            .pageserver
-            .branch_get_by_name(&tenantid, branch_name)?
-            .timeline_id;
-
        let port = port.unwrap_or_else(|| self.get_port());
        let node = Arc::new(PostgresNode {
-            name: branch_name.to_owned(),
+            name: name.to_owned(),
            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
            is_test: false,
-            timelineid: timeline_id,
-            tenantid,
+            timeline_id,
+            lsn,
+            tenant_id,
            uses_wal_proposer: false,
        });

        node.create_pgdata()?;
-        node.setup_pg_conf(self.env.auth_type)?;
+        node.setup_pg_conf(self.env.pageserver.auth_type)?;

        self.nodes
-            .insert((tenantid, node.name.clone()), Arc::clone(&node));
+            .insert((tenant_id, node.name.clone()), Arc::clone(&node));

        Ok(node)
    }
@@ -126,8 +114,9 @@ pub struct PostgresNode {
    pub env: LocalEnv,
    pageserver: Arc<PageServerNode>,
    is_test: bool,
-    pub timelineid: ZTimelineId,
-    pub tenantid: ZTenantId,
+    pub timeline_id: ZTimelineId,
+    pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
+    pub tenant_id: ZTenantId,
    uses_wal_proposer: bool,
 }

@@ -159,10 +148,13 @@ impl PostgresNode {
        // Read a few options from the config file
        let context = format!("in config file {}", cfg_path_str);
        let port: u16 = conf.parse_field("port", &context)?;
-        let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
-        let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
+        let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?;
+        let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?;
+        let uses_wal_proposer = conf.get("safekeepers").is_some();

-        let uses_wal_proposer = conf.get("wal_acceptors").is_some();
+        // parse recovery_target_lsn, if any
+        let recovery_target_lsn: Option<Lsn> =
+            conf.parse_field_optional("recovery_target_lsn", &context)?;

        // ok now
        Ok(PostgresNode {
@@ -171,23 +163,31 @@ impl PostgresNode {
            env: env.clone(),
            pageserver: Arc::clone(pageserver),
            is_test: false,
-            timelineid,
-            tenantid,
+            timeline_id,
+            lsn: recovery_target_lsn,
+            tenant_id,
            uses_wal_proposer,
        })
    }

-    fn sync_walkeepers(&self) -> Result<Lsn> {
+    fn sync_safekeepers(&self, auth_token: &Option<String>) -> Result<Lsn> {
        let pg_path = self.env.pg_bin_dir().join("postgres");
-        let sync_handle = Command::new(pg_path)
-            .arg("--sync-safekeepers")
+        let mut cmd = Command::new(&pg_path);
+
+        cmd.arg("--sync-safekeepers")
            .env_clear()
            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
            .env("PGDATA", self.pgdata().to_str().unwrap())
            .stdout(Stdio::piped())
            // Comment this to avoid capturing stderr (useful if command hangs)
-            .stderr(Stdio::piped())
+            .stderr(Stdio::piped());
+
+        if let Some(token) = auth_token {
+            cmd.env("ZENITH_AUTH_TOKEN", token);
+        }
+
+        let sync_handle = cmd
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");

@@ -202,7 +202,7 @@ impl PostgresNode {
        }

        let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
-        println!("Walkeepers synced on {}", lsn);
+        println!("Safekeepers synced on {}", lsn);
        Ok(lsn)
    }

@@ -216,24 +216,29 @@ impl PostgresNode {
        );

        let sql = if let Some(lsn) = lsn {
-            format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn)
+            format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
        } else {
-            format!("basebackup {} {}", self.tenantid, self.timelineid)
+            format!("basebackup {} {}", self.tenant_id, self.timeline_id)
        };

        let mut client = self
            .pageserver
            .page_server_psql_client()
-            .with_context(|| "connecting to page server failed")?;
+            .context("connecting to page server failed")?;

        let copyreader = client
            .copy_out(sql.as_str())
-            .with_context(|| "page server 'basebackup' command failed")?;
+            .context("page server 'basebackup' command failed")?;

        // Read the archive directly from the `CopyOutReader`
-        tar::Archive::new(copyreader)
-            .unpack(&self.pgdata())
-            .with_context(|| "extracting page backup failed")?;
+        //
+        // Set `ignore_zeros` so that unpack() reads all the Copy data and
+        // doesn't stop at the end-of-archive marker. Otherwise, if the server
+        // sends an Error after finishing the tarball, we will not notice it.
+        let mut ar = tar::Archive::new(copyreader);
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.pgdata())
+            .context("extracting base backup failed")?;

        Ok(())
    }
@@ -267,16 +272,15 @@ impl PostgresNode {
        conf.append("shared_buffers", "1MB");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
-        conf.append("wal_sender_timeout", "0");
        conf.append("wal_level", "replica");
+        // wal_sender_timeout is the maximum time to wait for WAL replication.
+        // It also defines how often the walreceiver will send a feedback message to the wal sender.
+        conf.append("wal_sender_timeout", "5s");
        conf.append("listen_addresses", &self.address.ip().to_string());
        conf.append("port", &self.address.port().to_string());
-
-        // Never clean up old WAL. TODO: We should use a replication
-        // slot or something proper, to prevent the compute node
-        // from removing WAL that hasn't been streamed to the safekeeper or
-        // page server yet. (gh issue #349)
-        conf.append("wal_keep_size", "10TB");
+        conf.append("wal_keep_size", "0");
+        // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
+        conf.append("restart_after_crash", "off");

        // Configure the node to fetch pages from pageserver
        let pageserver_connstr = {
@@ -293,19 +297,62 @@ impl PostgresNode {
            } else {
                ""
            };
-
-            format!("host={} port={} password={}", host, port, password)
+            // NOTE: avoid spaces in the connection string; it is less error-prone if we forward it somewhere.
+            // Also note that not all parameters are supported here: compute substitutes $ZENITH_AUTH_TOKEN
+            // by parsing this string and rebuilding it with the token from the env var, and for simplicity
+            // the rebuilt string uses only the needed parts, namely host, port, user and password.
+            format!("postgresql://no_user:{}@{}:{}", password, host, port)
        };
-        conf.append("shared_preload_libraries", "zenith");
-        conf.append_line("");
-        conf.append("zenith.page_server_connstring", &pageserver_connstr);
-        conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
-        conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
+        conf.append("shared_preload_libraries", "neon");
        conf.append_line("");
+        conf.append("neon.pageserver_connstring", &pageserver_connstr);
+        conf.append("neon.tenant_id", &self.tenant_id.to_string());
+        conf.append("neon.timeline_id", &self.timeline_id.to_string());
+        if let Some(lsn) = self.lsn {
+            conf.append("recovery_target_lsn", &lsn.to_string());
+        }

-        // Configure the node to stream WAL directly to the pageserver
-        conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg?
-        conf.append("zenith.callmemaybe_connstring", &self.connstr());
+        conf.append_line("");
+        // Configure backpressure
+        // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
+        //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
+        //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
+        //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
+        //   updated pages are not requested from pageserver.
+        // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
+        //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
+        //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
+        //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
+        // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
+        //   To be able to restore database in case of pageserver node crash, safekeeper should not
+        //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
+        //   (if they are not able to upload WAL to S3).
+        conf.append("max_replication_write_lag", "500MB");
+        conf.append("max_replication_flush_lag", "10GB");
+
+        if !self.env.safekeepers.is_empty() {
+            // Configure the node to connect to the safekeepers
+            conf.append("synchronous_standby_names", "walproposer");
+
+            let safekeepers = self
+                .env
+                .safekeepers
+                .iter()
+                .map(|sk| format!("localhost:{}", sk.pg_port))
+                .collect::<Vec<String>>()
+                .join(",");
+            conf.append("safekeepers", &safekeepers);
+        } else {
+            // We only use setup without safekeepers for tests,
+            // and don't care about data durability on pageserver,
+            // so set more relaxed synchronous_commit.
+            conf.append("synchronous_commit", "remote_write");
+
+            // Configure the node to stream WAL directly to the pageserver
+            // This isn't really a supported configuration, but can be useful for
+            // testing.
+            conf.append("synchronous_standby_names", "pageserver");
+        }

        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
        file.write_all(conf.to_string().as_bytes())?;
@@ -313,13 +360,15 @@ impl PostgresNode {
        Ok(())
    }

-    fn load_basebackup(&self) -> Result<()> {
-        let lsn = if self.uses_wal_proposer {
+    fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
+        let backup_lsn = if let Some(lsn) = self.lsn {
+            Some(lsn)
+        } else if self.uses_wal_proposer {
            // LSN 0 means that it is bootstrap and we need to download just
            // latest data from the pageserver. That is a bit clumsy but whole bootstrap
            // procedure evolves quite actively right now, so let's think about it again
            // when things would be more stable (TODO).
-            let lsn = self.sync_walkeepers()?;
+            let lsn = self.sync_safekeepers(auth_token)?;
            if lsn == Lsn(0) {
                None
            } else {
@@ -329,13 +378,13 @@ impl PostgresNode {
            None
        };

-        self.do_basebackup(lsn)?;
+        self.do_basebackup(backup_lsn)?;

        Ok(())
    }

    pub fn pgdata(&self) -> PathBuf {
-        self.env.pg_data_dir(&self.tenantid, &self.name)
+        self.env.pg_data_dir(&self.tenant_id, &self.name)
    }

    pub fn status(&self) -> &str {
@@ -370,14 +419,18 @@ impl PostgresNode {
        .env_clear()
        .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
        .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
-
        if let Some(token) = auth_token {
            cmd.env("ZENITH_AUTH_TOKEN", token);
        }
-        let pg_ctl = cmd.status().with_context(|| "pg_ctl failed")?;

-        if !pg_ctl.success() {
-            anyhow::bail!("pg_ctl failed");
+        let pg_ctl = cmd.output().context("pg_ctl failed")?;
+        if !pg_ctl.status.success() {
+            anyhow::bail!(
+                "pg_ctl failed, exit code: {}, stdout: {}, stderr: {}",
+                pg_ctl.status,
+                String::from_utf8_lossy(&pg_ctl.stdout),
+                String::from_utf8_lossy(&pg_ctl.stderr),
+            );
        }
        Ok(())
    }
@@ -404,7 +457,11 @@ impl PostgresNode {
        fs::write(&postgresql_conf_path, postgresql_conf)?;

        // 3. Load basebackup
-        self.load_basebackup()?;
+        self.load_basebackup(auth_token)?;
+
+        if self.lsn.is_some() {
+            File::create(self.pgdata().join("standby.signal"))?;
+        }

        // 4. Finally start the compute node postgres
        println!("Starting postgres node at '{}'", self.connstr());
@@ -441,7 +498,7 @@ impl PostgresNode {
            "host={} port={} user={} dbname={}",
            self.address.ip(),
            self.address.port(),
-            "zenith_admin",
+            "cloud_admin",
            "postgres"
        )
    }
--- a/control_plane/src/etcd.rs
+++ b/control_plane/src/etcd.rs
@@ -0,0 +1,97 @@
+use std::{
+    fs,
+    path::PathBuf,
+    process::{Command, Stdio},
+};
+
+use anyhow::Context;
+use nix::{
+    sys::signal::{kill, Signal},
+    unistd::Pid,
+};
+
+use crate::{local_env, read_pidfile};
+
+/// Starts the etcd broker as a child process.
+///
+/// etcd's data dir and stdout/stderr logs are placed in `<base_data_dir>/etcd`, and the
+/// child's pid is written to the etcd pid file so that `stop_etcd_process` can find it.
+pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+    let etcd_broker = &env.etcd_broker;
+    println!(
+        "Starting etcd broker using {}",
+        etcd_broker.etcd_binary_path.display()
+    );
+
+    let etcd_data_dir = env.base_data_dir.join("etcd");
+    fs::create_dir_all(&etcd_data_dir).with_context(|| {
+        format!(
+            "Failed to create etcd data dir: {}",
+            etcd_data_dir.display()
+        )
+    })?;
+
+    let create_log_file = |stream: &str| {
+        fs::File::create(etcd_data_dir.join(format!("etcd.{stream}.log"))).with_context(|| {
+            format!(
+                "Failed to create etcd {stream} file in directory {}",
+                etcd_data_dir.display()
+            )
+        })
+    };
+    let etcd_stdout_file = create_log_file("stdout")?;
+    let etcd_stderr_file = create_log_file("stderr")?;
+    let client_urls = etcd_broker.comma_separated_endpoints();
+
+    let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
+        .args(&[
+            format!("--data-dir={}", etcd_data_dir.display()),
+            format!("--listen-client-urls={client_urls}"),
+            format!("--advertise-client-urls={client_urls}"),
+            // Set --quota-backend-bytes to keep the etcd virtual memory
+            // size smaller. Our test etcd clusters are very small.
+            // See https://github.com/etcd-io/etcd/issues/7910
+            "--quota-backend-bytes=100000000".to_string(),
+        ])
+        .stdout(Stdio::from(etcd_stdout_file))
+        .stderr(Stdio::from(etcd_stderr_file))
+        .spawn()
+        .context("Failed to spawn etcd subprocess")?;
+    let pid = etcd_process.id();
+
+    let etcd_pid_file_path = etcd_pid_file_path(env);
+    fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
+        format!(
+            "Failed to create etcd pid file at {}",
+            etcd_pid_file_path.display()
+        )
+    })?;
+
+    Ok(())
+}
+
+/// Stops the etcd broker by sending SIGTERM to the process whose pid is stored in the etcd
+/// pid file.
+pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+    println!(
+        "Stopping etcd broker at {}",
+        env.etcd_broker.etcd_binary_path.display()
+    );
+
+    let pid_file = etcd_pid_file_path(env);
+    let raw_pid = read_pidfile(&pid_file)
+        .with_context(|| format!("Failed to read etcd pid file at {}", pid_file.display()))?;
+    let pid = Pid::from_raw(raw_pid);
+
+    // The signal is only sent here; nothing waits for the process to actually exit.
+    kill(pid, Signal::SIGTERM).with_context(|| {
+        format!(
+            "Failed to stop etcd with pid {pid} at {}",
+            pid_file.display()
+        )
+    })
+}
+
+fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
+    env.base_data_dir.join("etcd.pid") // the pid file sits directly in the base data dir
+}
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -9,10 +9,13 @@
 use anyhow::{anyhow, bail, Context, Result};
 use std::fs;
 use std::path::Path;
+use std::process::Command;

 pub mod compute;
+pub mod etcd;
 pub mod local_env;
 pub mod postgresql_conf;
+pub mod safekeeper;
 pub mod storage;

 /// Read a PID file
@@ -30,3 +33,28 @@ pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
    }
    Ok(pid)
 }
+
+/// Clears `cmd`'s environment, sets `RUST_BACKTRACE=1`, and forwards `LLVM_PROFILE_FILE`
+/// and `RUST_LOG` from our own environment when they are set.
+fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
+    cmd.env_clear().env("RUST_BACKTRACE", "1");
+
+    if let Some(profile_file) = std::env::var_os("LLVM_PROFILE_FILE") {
+        cmd.env("LLVM_PROFILE_FILE", profile_file);
+    }
+
+    // Unlike LLVM_PROFILE_FILE, RUST_LOG is forwarded only if its value is valid unicode.
+    match std::env::var("RUST_LOG") {
+        Ok(log_filter) => cmd.env("RUST_LOG", log_filter),
+        Err(_) => cmd,
+    }
+}
+
+/// Forwards the AWS credential variables that are set in our environment to `cmd`.
+fn fill_aws_secrets_vars(cmd: &mut Command) -> &mut Command {
+    // `std::env::var` fails for unset or non-unicode values; such keys are not forwarded.
+    let forwarded = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"]
+        .into_iter()
+        .filter_map(|env_key| Some((env_key, std::env::var(env_key).ok()?)));
+    cmd.envs(forwarded)
+}
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -1,52 +1,197 @@
-//
-// This module is responsible for locating and loading paths in a local setup.
-//
-// Now it also provides init method which acts like a stub for proper installation
-// script which will use local paths.
-//
-use anyhow::{Context, Result};
+//! This module is responsible for locating and loading paths in a local setup.
+//!
+//! Now it also provides init method which acts like a stub for proper installation
+//! script which will use local paths.
+
+use anyhow::{bail, ensure, Context};
+use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use std::collections::HashMap;
 use std::env;
 use std::fs;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
-use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
-use zenith_utils::postgres_backend::AuthType;
-use zenith_utils::zid::ZTenantId;
+use utils::{
+    auth::{encode_from_key_file, Claims, Scope},
+    postgres_backend::AuthType,
+    zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
+};
+
+use crate::safekeeper::SafekeeperNode;

 //
-// This data structures represent deserialized zenith CLI config
+// These data structures represent the neon_local CLI config
 //
-#[derive(Serialize, Deserialize, Clone, Debug)]
+// It is deserialized from the .neon/config file, or the config file passed
+// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
+// an example.
+//
+#[serde_as]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
-    // Pageserver connection settings
-    pub pageserver_pg_port: u16,
-    pub pageserver_http_port: u16,
-
-    // Base directory for both pageserver and compute nodes
+    // Base directory for all the nodes (the pageserver, safekeepers and
+    // compute nodes).
+    //
+    // This is not stored in the config file. Rather, this is the path where the
+    // config file itself is. It is read from the NEON_REPO_DIR env variable or
+    // '.neon' if not given.
+    #[serde(skip)]
    pub base_data_dir: PathBuf,

    // Path to postgres distribution. It's expected that "bin", "include",
    // "lib", "share" from postgres distribution are there. If at some point
    // in time we will be able to run against vanilla postgres we may split that
    // to four separate paths and match OS-specific installation layout.
+    #[serde(default)]
    pub pg_distrib_dir: PathBuf,

    // Path to pageserver binary.
+    #[serde(default)]
    pub zenith_distrib_dir: PathBuf,

-    // keeping tenant id in config to reduce copy paste when running zenith locally with single tenant
-    #[serde(with = "hex")]
-    pub tenantid: ZTenantId,
+    // Default tenant ID to use with the 'zenith' command line utility, when
+    // --tenantid is not explicitly specified.
+    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub default_tenant_id: Option<ZTenantId>,

-    // jwt auth token used for communication with pageserver
-    pub auth_token: String,
+    // used to issue tokens during e.g pg start
+    #[serde(default)]
+    pub private_key_path: PathBuf,
+
+    pub etcd_broker: EtcdBroker,
+
+    pub pageserver: PageServerConf,
+
+    #[serde(default)]
+    pub safekeepers: Vec<SafekeeperConf>,
+
+    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
+    #[serde(default)]
+    // A `HashMap<String, HashMap<ZTenantId, ZTimelineId>>` would be more appropriate here,
+    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
+    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
+    branch_name_mappings: HashMap<String, Vec<(ZTenantId, ZTimelineId)>>,
+}
+
+/// Etcd broker config for cluster internal communication.
+#[serde_as]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+pub struct EtcdBroker {
+    /// A prefix to add to any key when pushing/polling etcd from a node.
+    #[serde(default)]
+    pub broker_etcd_prefix: Option<String>,
+
+    /// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
+    #[serde(default)]
+    #[serde_as(as = "Vec<DisplayFromStr>")]
+    pub broker_endpoints: Vec<Url>,
+
+    /// Etcd binary path to use.
+    #[serde(default)]
+    pub etcd_binary_path: PathBuf,
+}
+
+impl EtcdBroker {
+    /// Finds the `etcd` binary by running `which etcd`.
+    ///
+    /// Fails if `which` cannot be run, exits unsuccessfully, or prints a path that is not an
+    /// existing file.
+    pub fn locate_etcd() -> anyhow::Result<PathBuf> {
+        let which_output = Command::new("which")
+            .arg("etcd")
+            .output()
+            .context("Failed to run 'which etcd' command")?;
+        let stdout = String::from_utf8_lossy(&which_output.stdout);
+        ensure!(
+            which_output.status.success(),
+            "'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
+            which_output.status,
+            String::from_utf8_lossy(&which_output.stderr)
+        );
+
+        let etcd_path = PathBuf::from(stdout.trim());
+        ensure!(
+            etcd_path.is_file(),
+            "'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
+            etcd_path.display()
+        );
+
+        Ok(etcd_path)
+    }
+
+    /// Returns all broker endpoints joined into one comma-separated string.
+    ///
+    /// `start_etcd_process` passes this string to etcd's `--listen-client-urls` and
+    /// `--advertise-client-urls` options.
+    pub fn comma_separated_endpoints(&self) -> String {
+        self.broker_endpoints
+            .iter()
+            .map(|url| {
+                // URL by default adds a '/' path at the end, which is not what etcd CLI wants,
+                // so drop one trailing '/' when it is present.
+                let url_string = url.as_str();
+                url_string.strip_suffix('/').unwrap_or(url_string)
+            })
+            .collect::<Vec<_>>()
+            .join(",")
+    }
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[serde(default)]
+pub struct PageServerConf {
+    // node id
+    pub id: NodeId,
+    // Pageserver connection settings
+    pub listen_pg_addr: String,
+    pub listen_http_addr: String,

    // used to determine which auth type is used
    pub auth_type: AuthType,

-    // used to issue tokens during e.g pg start
-    pub private_key_path: PathBuf,
+    // jwt auth token used for communication with pageserver
+    pub auth_token: String,
+}
+
+impl Default for PageServerConf {
+    fn default() -> Self {
+        Self {
+            id: NodeId(0), // the struct is `#[serde(default)]`: absent config fields get these values
+            listen_pg_addr: String::new(),
+            listen_http_addr: String::new(),
+            auth_type: AuthType::Trust,
+            auth_token: String::new(),
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[serde(default)] // fields absent from the config take their value from the `Default` impl
+pub struct SafekeeperConf {
+    pub id: NodeId,
+    pub pg_port: u16, // compute nodes are pointed at `localhost:<pg_port>`
+    pub http_port: u16,
+    pub sync: bool,
+    pub remote_storage: Option<String>,
+    pub backup_threads: Option<u32>,
+    pub auth_enabled: bool,
+}
+
+impl Default for SafekeeperConf {
+    fn default() -> Self {
+        Self {
+            id: NodeId(0),
+            pg_port: 0,
+            http_port: 0,
+            sync: true, // the only default here that is not a zero / `None` / `false` value
+            remote_storage: None,
+            backup_threads: None,
+            auth_enabled: false,
+        }
+    }
+}

 impl LocalEnv {
@@ -58,10 +203,14 @@ impl LocalEnv {
        self.pg_distrib_dir.join("lib")
    }

-    pub fn pageserver_bin(&self) -> Result<PathBuf> {
+    pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
        Ok(self.zenith_distrib_dir.join("pageserver"))
    }

+    pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
+        Ok(self.zenith_distrib_dir.join("safekeeper"))
+    }
+
    pub fn pg_data_dirs_path(&self) -> PathBuf {
        self.base_data_dir.join("pgdatadirs").join("tenants")
    }
@@ -76,127 +225,278 @@ impl LocalEnv {
+    /// Data directory of the pageserver: a copy of `base_data_dir` itself.
    pub fn pageserver_data_dir(&self) -> PathBuf {
        self.base_data_dir.clone()
    }
+
+    /// Data directory of one safekeeper: `<base_data_dir>/safekeepers/<data_dir_name>`.
+    pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf {
+        let mut dir = self.base_data_dir.join("safekeepers");
+        dir.push(data_dir_name);
+        dir
+    }
+
+    /// Records that `branch_name` refers to `timeline_id` within `tenant_id`.
+    ///
+    /// Registering the same (tenant, timeline) pair again is a no-op. An error
+    /// is returned when the branch name is already mapped to a different
+    /// timeline for the same tenant.
+    pub fn register_branch_mapping(
+        &mut self,
+        branch_name: String,
+        tenant_id: ZTenantId,
+        timeline_id: ZTimelineId,
+    ) -> anyhow::Result<()> {
+        let tenant_timelines = self
+            .branch_name_mappings
+            .entry(branch_name.clone())
+            .or_default();
+
+        // Copy the timeline id out so the list can be mutated below.
+        let already_mapped = tenant_timelines
+            .iter()
+            .find(|(mapped_tenant_id, _)| *mapped_tenant_id == tenant_id)
+            .map(|&(_, mapped_timeline_id)| mapped_timeline_id);
+
+        match already_mapped {
+            // First mapping of this branch name for the tenant.
+            None => tenant_timelines.push((tenant_id, timeline_id)),
+            // The very same pair is registered again: nothing to do.
+            Some(old_timeline_id) if old_timeline_id == timeline_id => {}
+            Some(old_timeline_id) => {
+                bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
+            }
+        }
+        Ok(())
+    }
+
+    /// Looks up the timeline registered under `branch_name` for `tenant_id`.
+    ///
+    /// Returns `None` when the branch name is unknown or has no mapping for
+    /// that tenant.
+    pub fn get_branch_timeline_id(
+        &self,
+        branch_name: &str,
+        tenant_id: ZTenantId,
+    ) -> Option<ZTimelineId> {
+        self.branch_name_mappings
+            .get(branch_name)?
+            .iter()
+            .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
+            // The stored value already is a `ZTimelineId`; no conversion is needed.
+            .map(|&(_, timeline_id)| timeline_id)
+    }
+
+    /// Inverts the branch-name table: maps every registered (tenant, timeline)
+    /// pair to the branch name it was registered under.
+    pub fn timeline_name_mappings(&self) -> HashMap<ZTenantTimelineId, String> {
+        let mut names = HashMap::new();
+        for (name, tenant_timelines) in &self.branch_name_mappings {
+            for &(tenant_id, timeline_id) in tenant_timelines {
+                names.insert(ZTenantTimelineId::new(tenant_id, timeline_id), name.clone());
+            }
+        }
+        names
+    }
+
+    /// Builds a `LocalEnv` from the text of a TOML config file.
+    ///
+    /// Unlike `load_config`, settings that are missing from the file are filled
+    /// in here: the postgres and zenith distribution directories and the
+    /// default tenant id. `base_data_dir` is always set from `base_path()`.
+    pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
+        let mut env: LocalEnv = toml::from_str(toml)?;
+
+        // Postgres binaries: POSTGRES_DISTRIB_DIR wins, otherwise "tmp_install"
+        // under the current working directory.
+        if env.pg_distrib_dir == Path::new("") {
+            env.pg_distrib_dir = match env::var_os("POSTGRES_DISTRIB_DIR") {
+                Some(postgres_bin) => postgres_bin.into(),
+                None => env::current_dir()?.join("tmp_install"),
+            };
+        }
+
+        // Zenith binaries: the directory that contains the running executable.
+        if env.zenith_distrib_dir == Path::new("") {
+            let exe_path = env::current_exe()?;
+            env.zenith_distrib_dir = exe_path.parent().unwrap().to_owned();
+        }
+
+        // Generate an initial tenant id when none was given.
+        env.default_tenant_id
+            .get_or_insert_with(ZTenantId::generate);
+
+        env.base_data_dir = base_path();
+
+        Ok(env)
+    }
+
+    /// Locates the repository directory (see `base_path`) and loads the
+    /// `config` file stored in it.
+    ///
+    /// Unlike `parse_config`, no missing defaults are filled in; only
+    /// `base_data_dir` is overwritten with the repository path.
+    ///
+    /// Fails if the repository directory does not exist, or if its `config`
+    /// file cannot be read or parsed.
+    pub fn load_config() -> anyhow::Result<Self> {
+        let repopath = base_path();
+
+        if !repopath.exists() {
+            // `display()` rather than `to_str().unwrap()`: a non-UTF-8 path
+            // must produce this error, not a panic.
+            bail!(
+                "Zenith config is not found in {}. You need to run 'zenith init' first",
+                repopath.display()
+            );
+        }
+
+        // TODO: check that it looks like a zenith repository
+
+        // load and parse file
+        let config_path = repopath.join("config");
+        let config = fs::read_to_string(&config_path).with_context(|| {
+            format!("Failed to read config file '{}'", config_path.display())
+        })?;
+        let mut env: LocalEnv = toml::from_str(config.as_str())?;
+
+        env.base_data_dir = repopath;
+
+        Ok(env)
+    }
+
+    /// Serializes this `LocalEnv` as TOML, prefixed with a short explanatory
+    /// header comment, and writes it to `<base_path>/config`.
+    ///
+    /// Fails if the serialization fails or the file cannot be written.
+    pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
+        // Currently, the user first passes a config file with 'zenith init --config=<path>'.
+        // We read that in and fill any missing defaults (see `parse_config`). Then it's
+        // saved to '<base_path>/config'. TODO: We lose any formatting and comments along
+        // the way, which is a bit sad.
+        let mut conf_content = r#"# This file describes a local deployment of the page server
+# and safekeeper node. It is read by the 'zenith' command-line
+# utility.
+"#
+        .to_string();
+
+        // Convert the LocalEnv to a toml file.
+        //
+        // This could be as simple as this:
+        //
+        // conf_content += &toml::to_string_pretty(env)?;
+        //
+        // But it results in a "values must be emitted before tables". I'm not sure
+        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
+        // Maybe rust reorders the fields to avoid padding or something?
+        // In any case, converting to toml::Value first, and serializing that, works.
+        // See https://github.com/alexcrichton/toml-rs/issues/142
+        conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
+
+        let target_config_path = base_path.join("config");
+        fs::write(&target_config_path, conf_content).with_context(|| {
+            format!(
+                "Failed to write config file into path '{}'",
+                target_config_path.display()
+            )
+        })
+    }
+
+    /// Reads the private key file and passes its contents, together with
+    /// `claims`, to `encode_from_key_file` to produce a token.
+    ///
+    /// A relative `private_key_path` is resolved against `base_data_dir`.
+    /// Used only for testing purposes in the CLI, e.g. to generate tokens
+    /// during init.
+    pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
+        let key_file = if self.private_key_path.is_relative() {
+            self.base_data_dir.join(&self.private_key_path)
+        } else {
+            self.private_key_path.clone()
+        };
+
+        let key_data = fs::read(key_file)?;
+        encode_from_key_file(claims, &key_data)
+    }
+
+    /// Initializes a new Zenith repository in `base_data_dir`.
+    ///
+    /// Steps, in order:
+    /// 1. check that `base_data_dir` is set and does not exist yet, and that
+    ///    the postgres, pageserver and safekeeper binaries are present;
+    /// 2. create `base_data_dir`;
+    /// 3. if `private_key_path` is empty, generate an RSA key pair with
+    ///    `openssl` (`auth_private_key.pem` and `auth_public_key.pem` inside
+    ///    `base_data_dir`) and set `private_key_path` to the relative name
+    ///    `auth_private_key.pem`;
+    /// 4. generate the pageserver auth token and store it in
+    ///    `pageserver.auth_token`;
+    /// 5. create the postgres data directory tree and one data directory per
+    ///    configured safekeeper;
+    /// 6. write the config file with `persist_config`.
+    ///
+    /// Returns an error as soon as any check or step fails; nothing created
+    /// by earlier steps is removed in that case.
+    pub fn init(&mut self) -> anyhow::Result<()> {
+        // check if config already exists
+        let base_path = &self.base_data_dir;
+        ensure!(
+            base_path != Path::new(""),
+            "repository base path is missing"
+        );
+
+        ensure!(
+            !base_path.exists(),
+            "directory '{}' already exists. Perhaps already initialized?",
+            base_path.display()
+        );
+        if !self.pg_distrib_dir.join("bin/postgres").exists() {
+            bail!(
+                "Can't find postgres binary at {}",
+                self.pg_distrib_dir.display()
+            );
+        }
+        for binary in ["pageserver", "safekeeper"] {
+            if !self.zenith_distrib_dir.join(binary).exists() {
+                bail!(
+                    "Can't find binary '{binary}' in zenith distrib dir '{}'",
+                    self.zenith_distrib_dir.display()
+                );
+            }
+        }
+
+        fs::create_dir(base_path)?;
+
+        // generate keys for jwt, unless a private key path was configured
+        if self.private_key_path == PathBuf::new() {
+            // openssl genrsa -out private_key.pem 2048
+            // Paths are passed as OS strings, so non-UTF-8 paths do not panic.
+            let private_key_path = base_path.join("auth_private_key.pem");
+            let keygen_output = Command::new("openssl")
+                .arg("genrsa")
+                .arg("-out")
+                .arg(&private_key_path)
+                .arg("2048")
+                .stdout(Stdio::null())
+                .output()
+                .context("failed to generate auth private key")?;
+            if !keygen_output.status.success() {
+                bail!(
+                    "openssl failed: '{}'",
+                    String::from_utf8_lossy(&keygen_output.stderr)
+                );
+            }
+            // Stored relative; `generate_auth_token` resolves it against `base_data_dir`.
+            self.private_key_path = PathBuf::from("auth_private_key.pem");
+
+            let public_key_path = base_path.join("auth_public_key.pem");
+            // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
+            let keygen_output = Command::new("openssl")
+                .arg("rsa")
+                .arg("-in")
+                .arg(&private_key_path)
+                .arg("-pubout")
+                .args(["-outform", "PEM"])
+                .arg("-out")
+                .arg(&public_key_path)
+                .stdout(Stdio::null())
+                .output()
+                .context("failed to generate auth public key")?;
+            if !keygen_output.status.success() {
+                bail!(
+                    "openssl failed: '{}'",
+                    String::from_utf8_lossy(&keygen_output.stderr)
+                );
+            }
+        }
+
+        self.pageserver.auth_token =
+            self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
+
+        fs::create_dir_all(self.pg_data_dirs_path())?;
+
+        for safekeeper in &self.safekeepers {
+            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
+        }
+
+        self.persist_config(base_path)
+    }
 }

+/// Repository directory: the value of the `NEON_REPO_DIR` environment variable
+/// when it is set, otherwise the relative path `.neon`.
 fn base_path() -> PathBuf {
-    match std::env::var_os("ZENITH_REPO_DIR") {
-        Some(val) => PathBuf::from(val.to_str().unwrap()),
-        None => ".zenith".into(),
+    match std::env::var_os("NEON_REPO_DIR") {
+        Some(val) => PathBuf::from(val),
+        None => PathBuf::from(".neon"),
    }
 }

-//
-// Initialize a new Zenith repository
-//
-pub fn init(
-    pageserver_pg_port: u16,
-    pageserver_http_port: u16,
-    tenantid: ZTenantId,
-    auth_type: AuthType,
-) -> Result<()> {
-    // check if config already exists
-    let base_path = base_path();
-    if base_path.exists() {
-        anyhow::bail!(
-            "{} already exists. Perhaps already initialized?",
-            base_path.to_str().unwrap()
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Checks that the bundled `simple.conf` parses, and that the same config
+    /// with its broker endpoint replaced by a malformed URL is rejected.
+    #[test]
+    fn simple_conf_parsing() {
+        let simple_conf_toml = include_str!("../simple.conf");
+        let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
+        assert!(
+            simple_conf_parse_result.is_ok(),
+            "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
+        );
+
+        let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']";
+        let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']";
+        let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
+        // Guard against the sample config changing so that nothing gets replaced.
+        assert!(
+            spoiled_url_toml.contains(spoiled_url_str),
+            "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
+        );
+        let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
+        assert!(
+            spoiled_url_parse_result.is_err(),
+            "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
        );
    }
-    fs::create_dir(&base_path)?;
-
-    // ok, now check that expected binaries are present
-
-    // Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
-    let pg_distrib_dir: PathBuf = {
-        if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
-            postgres_bin.into()
-        } else {
-            let cwd = env::current_dir()?;
-            cwd.join("tmp_install")
-        }
-    };
-    if !pg_distrib_dir.join("bin/postgres").exists() {
-        anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
-    }
-
-    // generate keys for jwt
-    // openssl genrsa -out private_key.pem 2048
-    let private_key_path = base_path.join("auth_private_key.pem");
-    let keygen_output = Command::new("openssl")
-        .arg("genrsa")
-        .args(&["-out", private_key_path.to_str().unwrap()])
-        .arg("2048")
-        .stdout(Stdio::null())
-        .output()
-        .with_context(|| "failed to generate auth private key")?;
-    if !keygen_output.status.success() {
-        anyhow::bail!(
-            "openssl failed: '{}'",
-            String::from_utf8_lossy(&keygen_output.stderr)
-        );
-    }
-
-    let public_key_path = base_path.join("auth_public_key.pem");
-    // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
-    let keygen_output = Command::new("openssl")
-        .arg("rsa")
-        .args(&["-in", private_key_path.to_str().unwrap()])
-        .arg("-pubout")
-        .args(&["-outform", "PEM"])
-        .args(&["-out", public_key_path.to_str().unwrap()])
-        .stdout(Stdio::null())
-        .output()
-        .with_context(|| "failed to generate auth private key")?;
-    if !keygen_output.status.success() {
-        anyhow::bail!(
-            "openssl failed: '{}'",
-            String::from_utf8_lossy(&keygen_output.stderr)
-        );
-    }
-
-    let auth_token =
-        encode_from_key_path(&Claims::new(None, Scope::PageServerApi), &private_key_path)?;
-
-    // Find zenith binaries.
-    let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
-    if !zenith_distrib_dir.join("pageserver").exists() {
-        anyhow::bail!("Can't find pageserver binary.",);
-    }
-
-    let conf = LocalEnv {
-        pageserver_pg_port,
-        pageserver_http_port,
-        pg_distrib_dir,
-        zenith_distrib_dir,
-        base_data_dir: base_path,
-        tenantid,
-        auth_token,
-        auth_type,
-        private_key_path,
-    };
-
-    fs::create_dir_all(conf.pg_data_dirs_path())?;
-
-    let toml = toml::to_string_pretty(&conf)?;
-    fs::write(conf.base_data_dir.join("config"), toml)?;
-
-    Ok(())
-}
-
-// Locate and load config
-pub fn load_config() -> Result<LocalEnv> {
-    let repopath = base_path();
-
-    if !repopath.exists() {
-        anyhow::bail!(
-            "Zenith config is not found in {}. You need to run 'zenith init' first",
-            repopath.to_str().unwrap()
-        );
-    }
-
-    // TODO: check that it looks like a zenith repository
-
-    // load and parse file
-    let config = fs::read_to_string(repopath.join("config"))?;
-    toml::from_str(config.as_str()).map_err(|e| e.into())
 }
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -4,7 +4,7 @@
 /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
 /// enough to extract a few settings we need in Zenith, assuming you don't do
 /// funny stuff like include-directives or funny escaping.
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{bail, Context, Result};
 use lazy_static::lazy_static;
 use regex::Regex;
 use std::collections::HashMap;
@@ -78,11 +78,27 @@ impl PostgresConf {
        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
    {
        self.get(field_name)
-            .ok_or_else(|| anyhow!("could not find '{}' option {}", field_name, context))?
+            .with_context(|| format!("could not find '{}' option {}", field_name, context))?
            .parse::<T>()
            .with_context(|| format!("could not parse '{}' option {}", field_name, context))
    }

+    /// Parses the value of `field_name` into `T` if the option is present.
+    ///
+    /// Returns `Ok(None)` when the option is absent and an error when it is
+    /// present but cannot be parsed; `context` is appended to that error
+    /// message.
+    pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
+    where
+        T: FromStr,
+        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
+    {
+        self.get(field_name)
+            .map(|raw_value| {
+                raw_value.parse::<T>().with_context(|| {
+                    format!("could not parse '{}' option {}", field_name, context)
+                })
+            })
+            .transpose()
+    }
+
    ///
    /// Note: if you call this multiple times for the same option, the config
    /// file will a line for each call. It would be nice to have a function
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -0,0 +1,318 @@
+use std::io::Write;
+use std::net::TcpStream;
+use std::path::PathBuf;
+use std::process::Command;
+use std::sync::Arc;
+use std::time::Duration;
+use std::{io, result, thread};
+
+use anyhow::bail;
+use nix::errno::Errno;
+use nix::sys::signal::{kill, Signal};
+use nix::unistd::Pid;
+use postgres::Config;
+use reqwest::blocking::{Client, RequestBuilder, Response};
+use reqwest::{IntoUrl, Method};
+use safekeeper::http::models::TimelineCreateRequest;
+use thiserror::Error;
+use utils::{
+    connstring::connection_address,
+    http::error::HttpErrorBody,
+    zid::{NodeId, ZTenantId, ZTimelineId},
+};
+
+use crate::local_env::{LocalEnv, SafekeeperConf};
+use crate::storage::PageServerNode;
+use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
+
+/// Errors produced by the safekeeper HTTP helpers in this file.
+#[derive(Error, Debug)]
+pub enum SafekeeperHttpError {
+    /// The request failed inside `reqwest`; created automatically through `#[from]`.
+    #[error("Reqwest error: {0}")]
+    Transport(#[from] reqwest::Error),
+
+    /// The server answered with a client or server error status; carries the
+    /// message built by `error_from_body`.
+    #[error("Error: {0}")]
+    Response(String),
+}
+
+type Result<T> = result::Result<T, SafekeeperHttpError>;
+
+/// Extension trait that turns an HTTP error response into a `SafekeeperHttpError`.
+pub trait ResponseErrorMessageExt: Sized {
+    /// Returns `self` unchanged when it is not an error response, otherwise an
+    /// error describing it.
+    fn error_from_body(self) -> Result<Self>;
+}
+
+impl ResponseErrorMessageExt for Response {
+    /// Passes the response through unless its status is a client or server
+    /// error. In that case the body is consumed and
+    /// `SafekeeperHttpError::Response` is returned, carrying either the
+    /// message from the JSON error body or a generic status/URL message when
+    /// the body cannot be decoded.
+    ///
+    /// NOTE(review): the message built here starts with "Error: " and the
+    /// `Response` variant's display format adds another "Error: " prefix.
+    fn error_from_body(self) -> Result<Self> {
+        let status = self.status();
+        if !status.is_client_error() && !status.is_server_error() {
+            return Ok(self);
+        }
+
+        // reqwest does not export its error construction utility functions, so
+        // the message is crafted here. The URL is copied first because
+        // `json()` consumes the response.
+        let url = self.url().to_owned();
+        let message = match self.json::<HttpErrorBody>() {
+            Ok(err_body) => format!("Error: {}", err_body.msg),
+            Err(_) => format!("Http error ({}) at {url}.", status.as_u16()),
+        };
+        Err(SafekeeperHttpError::Response(message))
+    }
+}
+
+//
+// Control routines for safekeeper.
+//
+// Used in CLI and tests.
+//
+#[derive(Debug)]
+pub struct SafekeeperNode {
+    // Node id, copied from `conf.id` by `from_env`.
+    pub id: NodeId,
+
+    // This node's own configuration section.
+    pub conf: SafekeeperConf,
+
+    // Connection settings for the safekeeper's postgres-protocol port on 127.0.0.1.
+    pub pg_connection_config: Config,
+    // Copy of the whole local environment.
+    pub env: LocalEnv,
+    pub http_client: Client,
+    // "http://127.0.0.1:<http_port>/v1"; endpoint names are appended to it.
+    pub http_base_url: String,
+
+    // Handle to the pageserver built from the same environment.
+    pub pageserver: Arc<PageServerNode>,
+}
+
+impl SafekeeperNode {
+    /// Builds a node handle for the safekeeper described by `conf`.
+    ///
+    /// `env` and `conf` are cloned into the handle; the connection config and
+    /// HTTP base URL are derived from `conf` and point at 127.0.0.1.
+    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
+        let pageserver = Arc::new(PageServerNode::from_env(env));
+        let pg_connection_config = Self::safekeeper_connection_config(conf.pg_port);
+        let http_base_url = format!("http://127.0.0.1:{}/v1", conf.http_port);
+
+        SafekeeperNode {
+            id: conf.id,
+            conf: conf.clone(),
+            pg_connection_config,
+            env: env.clone(),
+            http_client: Client::new(),
+            http_base_url,
+            pageserver,
+        }
+    }
+
+    /// Builds the postgres connection config for a safekeeper listening on
+    /// `127.0.0.1:<port>`.
+    fn safekeeper_connection_config(port: u16) -> Config {
+        // TODO safekeeper authentication not implemented yet
+        let connstr = format!("postgresql://no_user@127.0.0.1:{}/no_db", port);
+        connstr.parse().unwrap()
+    }
+
+    /// Data directory of the safekeeper with id `sk_id`; the directory is
+    /// named `sk<id>`.
+    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
+        let dir_name = format!("sk{}", sk_id);
+        env.safekeeper_data_dir(&dir_name)
+    }
+
+    /// Data directory of this safekeeper.
+    pub fn datadir_path(&self) -> PathBuf {
+        Self::datadir_path_by_id(&self.env, self.id)
+    }
+
+    /// Path of `safekeeper.pid` inside this safekeeper's data directory.
+    pub fn pid_file(&self) -> PathBuf {
+        let mut path = self.datadir_path();
+        path.push("safekeeper.pid");
+        path
+    }
+
+    /// Launches the safekeeper binary with `--daemonize` and waits until its
+    /// HTTP status endpoint answers.
+    ///
+    /// The command line is assembled from `self.conf` and the etcd broker
+    /// settings in `self.env`. After the launcher process exits successfully,
+    /// `check_status` is polled up to `RETRIES` times, sleeping one second
+    /// after each failed attempt.
+    ///
+    /// Returns an error if the binary cannot be run or exits unsuccessfully,
+    /// if the status endpoint answers with an error response, or if the
+    /// safekeeper is still unreachable after the last attempt.
+    pub fn start(&self) -> anyhow::Result<()> {
+        print!(
+            "Starting safekeeper at '{}' in '{}'",
+            connection_address(&self.pg_connection_config),
+            self.datadir_path().display()
+        );
+        io::stdout().flush().unwrap();
+
+        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
+        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
+
+        let mut cmd = Command::new(self.env.safekeeper_bin()?);
+        fill_rust_env_vars(
+            cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
+                .args(&["--id", self.id.to_string().as_ref()])
+                .args(&["--listen-pg", &listen_pg])
+                .args(&["--listen-http", &listen_http])
+                .args(&["--recall", "1 second"])
+                .arg("--daemonize"),
+        );
+        if !self.conf.sync {
+            cmd.arg("--no-sync");
+        }
+
+        let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
+        if !comma_separated_endpoints.is_empty() {
+            cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
+        }
+        if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
+            cmd.args(&["--broker-etcd-prefix", prefix]);
+        }
+        if let Some(threads) = self.conf.backup_threads {
+            cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
+        }
+        if let Some(ref remote_storage) = self.conf.remote_storage {
+            cmd.args(&["--remote-storage", remote_storage]);
+        }
+        if self.conf.auth_enabled {
+            cmd.arg("--auth-validation-public-key-path");
+            // PathBuf is better be passed as is, not via `String`.
+            cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
+        }
+
+        fill_aws_secrets_vars(&mut cmd);
+
+        if !cmd.status()?.success() {
+            bail!(
+                "Safekeeper failed to start. See '{}' for details.",
+                self.datadir_path().join("safekeeper.log").display()
+            );
+        }
+
+        // It takes a while for the safekeeper to start up. Wait until it is
+        // open for business.
+        const RETRIES: i8 = 15;
+        // Inclusive range: exactly RETRIES attempts, one second apart, so the
+        // final error message reports the time actually waited.
+        for retries in 1..=RETRIES {
+            match self.check_status() {
+                Ok(_) => {
+                    println!("\nSafekeeper started");
+                    return Ok(());
+                }
+                Err(err) => {
+                    match err {
+                        SafekeeperHttpError::Transport(err) => {
+                            // The first few connection failures only print a dot.
+                            if err.is_connect() && retries < 5 {
+                                print!(".");
+                                io::stdout().flush().unwrap();
+                            } else {
+                                if retries == 5 {
+                                    println!() // put a line break after dots for second message
+                                }
+                                println!(
+                                    "Safekeeper not responding yet, err {} retrying ({})...",
+                                    err, retries
+                                );
+                            }
+                        }
+                        // An error response means the server is up but failing: give up.
+                        SafekeeperHttpError::Response(msg) => {
+                            bail!("safekeeper failed to start: {} ", msg)
+                        }
+                    }
+                    thread::sleep(Duration::from_secs(1));
+                }
+            }
+        }
+        bail!("safekeeper failed to start in {} seconds", RETRIES);
+    }
+
+    ///
+    /// Stop the server.
+    ///
+    /// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
+    /// Otherwise we use SIGTERM, triggering a clean shutdown
+    ///
+    /// If the server is not running, returns success
+    ///
+    /// After the signal is sent, the postgres port and then the HTTP status
+    /// endpoint are polled once per second, for at most 100 iterations; an
+    /// error is returned if the safekeeper still answers after that.
+    ///
+    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        let pid_file = self.pid_file();
+        // No PID file: treated as already stopped.
+        if !pid_file.exists() {
+            println!("Safekeeper {} is already stopped", self.id);
+            return Ok(());
+        }
+        let pid = read_pidfile(&pid_file)?;
+        let pid = Pid::from_raw(pid);
+
+        let sig = if immediate {
+            print!("Stopping safekeeper {} immediately..", self.id);
+            Signal::SIGQUIT
+        } else {
+            print!("Stopping safekeeper {} gracefully..", self.id);
+            Signal::SIGTERM
+        };
+        io::stdout().flush().unwrap();
+        match kill(pid, sig) {
+            Ok(_) => (),
+            // ESRCH: no process with that pid, so the PID file is stale.
+            Err(Errno::ESRCH) => {
+                println!(
+                    "Safekeeper with pid {} does not exist, but a PID file was found",
+                    pid
+                );
+                return Ok(());
+            }
+            Err(err) => bail!(
+                "Failed to send signal to safekeeper with pid {}: {}",
+                pid,
+                err.desc()
+            ),
+        }
+
+        let address = connection_address(&self.pg_connection_config);
+
+        // TODO Remove this "timeout" and handle it on caller side instead.
+        // Shutting down may take a long time,
+        // if safekeeper flushes a lot of data
+        let mut tcp_stopped = false;
+        for _ in 0..100 {
+            // Phase 1: wait until connecting to the postgres port fails.
+            if !tcp_stopped {
+                if let Err(err) = TcpStream::connect(&address) {
+                    tcp_stopped = true;
+                    if err.kind() != io::ErrorKind::ConnectionRefused {
+                        eprintln!("\nSafekeeper connection failed with error: {err}");
+                    }
+                }
+            }
+            // Phase 2: the HTTP port must stop answering as well.
+            if tcp_stopped {
+                // Also check status on the HTTP port
+                match self.check_status() {
+                    Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
+                        println!("done!");
+                        return Ok(());
+                    }
+                    // Any other failure is reported but still treated as stopped.
+                    Err(err) => {
+                        eprintln!("\nSafekeeper status check failed with error: {err}");
+                        return Ok(());
+                    }
+                    Ok(()) => {
+                        // keep waiting
+                    }
+                }
+            }
+            print!(".");
+            io::stdout().flush().unwrap();
+            thread::sleep(Duration::from_secs(1));
+        }
+
+        bail!("Failed to stop safekeeper with pid {}", pid);
+    }
+
+    /// Starts building a request to `url` with this node's HTTP client.
+    /// No authentication is attached yet.
+    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
+        // TODO: authentication
+        //if self.env.auth_type == AuthType::ZenithJWT {
+        //    builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
+        //}
+        Client::request(&self.http_client, method, url)
+    }
+
+    /// Sends `GET <http_base_url>/status`.
+    ///
+    /// Succeeds when the request goes through and the response is not a client
+    /// or server error; the response body is ignored.
+    pub fn check_status(&self) -> Result<()> {
+        let status_url = format!("{}/{}", self.http_base_url, "status");
+        let response = self.http_request(Method::GET, status_url).send()?;
+        response.error_from_body().map(|_| ())
+    }
+
+    /// Sends `POST <http_base_url>/timeline` with a `TimelineCreateRequest`
+    /// built from the arguments as the JSON body.
+    ///
+    /// Fails if the request cannot be sent, if the response is a client or
+    /// server error, or if the response body cannot be decoded as JSON.
+    pub fn timeline_create(
+        &self,
+        tenant_id: ZTenantId,
+        timeline_id: ZTimelineId,
+        peer_ids: Vec<NodeId>,
+    ) -> Result<()> {
+        let timeline_url = format!("{}/{}", self.http_base_url, "timeline");
+        let request_body = TimelineCreateRequest {
+            tenant_id,
+            timeline_id,
+            peer_ids,
+        };
+
+        let response = self
+            .http_request(Method::POST, timeline_url)
+            .json(&request_body)
+            .send()?
+            .error_from_body()?;
+        Ok(response.json()?)
+    }
+}
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -1,26 +1,34 @@
-use std::io::Write;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{BufReader, Write};
 use std::net::TcpStream;
+use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::process::Command;
 use std::time::Duration;
 use std::{io, result, thread};

-use anyhow::{anyhow, bail};
+use anyhow::{bail, Context};
+use nix::errno::Errno;
 use nix::sys::signal::{kill, Signal};
 use nix::unistd::Pid;
-use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
+use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest};
+use pageserver::tenant_mgr::TenantInfo;
+use pageserver::timelines::TimelineInfo;
 use postgres::{Config, NoTls};
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
-use zenith_utils::http::error::HttpErrorBody;
-use zenith_utils::postgres_backend::AuthType;
-use zenith_utils::zid::ZTenantId;
+use utils::{
+    connstring::connection_address,
+    http::error::HttpErrorBody,
+    lsn::Lsn,
+    postgres_backend::AuthType,
+    zid::{ZTenantId, ZTimelineId},
+};

 use crate::local_env::LocalEnv;
-use crate::read_pidfile;
-use pageserver::branches::BranchInfo;
-use zenith_utils::connstring::connection_address;
+use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};

 #[derive(Error, Debug)]
 pub enum PageserverHttpError {
@@ -31,6 +39,12 @@ pub enum PageserverHttpError {
    Response(String),
 }

+/// Lets `?` turn an `anyhow::Error` into `PageserverHttpError::Response`,
+/// keeping only the error's `to_string()` text.
+impl From<anyhow::Error> for PageserverHttpError {
+    fn from(err: anyhow::Error) -> Self {
+        let message = err.to_string();
+        PageserverHttpError::Response(message)
+    }
+}
+
 type Result<T> = result::Result<T, PageserverHttpError>;

 pub trait ResponseErrorMessageExt: Sized {
@@ -62,7 +76,6 @@ impl ResponseErrorMessageExt for Response {
 //
 #[derive(Debug)]
 pub struct PageServerNode {
-    pub kill_on_exit: bool,
    pub pg_connection_config: Config,
    pub env: LocalEnv,
    pub http_client: Client,
@@ -71,67 +84,122 @@ pub struct PageServerNode {

 impl PageServerNode {
    pub fn from_env(env: &LocalEnv) -> PageServerNode {
-        let password = if env.auth_type == AuthType::ZenithJWT {
-            &env.auth_token
+        let password = if env.pageserver.auth_type == AuthType::ZenithJWT {
+            &env.pageserver.auth_token
        } else {
            ""
        };

-        PageServerNode {
-            kill_on_exit: false,
+        Self {
            pg_connection_config: Self::pageserver_connection_config(
                password,
-                env.pageserver_pg_port,
+                &env.pageserver.listen_pg_addr,
            ),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://localhost:{}/v1", env.pageserver_http_port),
+            http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
        }
    }

-    fn pageserver_connection_config(password: &str, port: u16) -> Config {
-        format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
+    /// Construct libpq connection string for connecting to the pageserver.
+    fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
+        format!("postgresql://no_user:{}@{}/no_db", password, listen_addr)
            .parse()
            .unwrap()
    }

-    pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
+    pub fn init(
+        &self,
+        create_tenant: Option<ZTenantId>,
+        initial_timeline_id: Option<ZTimelineId>,
+        config_overrides: &[&str],
+    ) -> anyhow::Result<ZTimelineId> {
        let mut cmd = Command::new(self.env.pageserver_bin()?);
-        let listen_pg = format!("localhost:{}", self.env.pageserver_pg_port);
-        let listen_http = format!("localhost:{}", self.env.pageserver_http_port);
-        let mut args = vec![
-            "--init",
-            "-D",
-            self.env.base_data_dir.to_str().unwrap(),
-            "--postgres-distrib",
-            self.env.pg_distrib_dir.to_str().unwrap(),
-            "--listen-pg",
-            &listen_pg,
-            "--listen-http",
-            &listen_http,
-        ];

-        if enable_auth {
-            args.extend(&["--auth-validation-public-key-path", "auth_public_key.pem"]);
-            args.extend(&["--auth-type", "ZenithJWT"]);
+        let id = format!("id={}", self.env.pageserver.id);
+
+        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
+        let base_data_dir_param = self.env.base_data_dir.display().to_string();
+        let pg_distrib_dir_param =
+            format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display());
+        let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type);
+        let listen_http_addr_param = format!(
+            "listen_http_addr='{}'",
+            self.env.pageserver.listen_http_addr
+        );
+        let listen_pg_addr_param =
+            format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
+        let broker_endpoints_param = format!(
+            "broker_endpoints=[{}]",
+            self.env
+                .etcd_broker
+                .broker_endpoints
+                .iter()
+                .map(|url| format!("'{url}'"))
+                .collect::<Vec<_>>()
+                .join(",")
+        );
+        let mut args = Vec::with_capacity(20);
+
+        args.push("--init");
+        args.extend(["-D", &base_data_dir_param]);
+        args.extend(["-c", &pg_distrib_dir_param]);
+        args.extend(["-c", &authg_type_param]);
+        args.extend(["-c", &listen_http_addr_param]);
+        args.extend(["-c", &listen_pg_addr_param]);
+        args.extend(["-c", &broker_endpoints_param]);
+        args.extend(["-c", &id]);
+
+        let broker_etcd_prefix_param = self
+            .env
+            .etcd_broker
+            .broker_etcd_prefix
+            .as_ref()
+            .map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
+        if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
+            args.extend(["-c", broker_etcd_prefix_param]);
        }

-        if let Some(tenantid) = create_tenant {
-            args.extend(&["--create-tenant", tenantid])
+        for config_override in config_overrides {
+            args.extend(["-c", config_override]);
        }

-        let status = cmd
-            .args(args)
-            .env_clear()
-            .env("RUST_BACKTRACE", "1")
-            .status()
-            .expect("pageserver init failed");
-
-        if status.success() {
-            Ok(())
-        } else {
-            Err(anyhow!("pageserver init failed"))
+        if self.env.pageserver.auth_type != AuthType::Trust {
+            args.extend([
+                "-c",
+                "auth_validation_public_key_path='auth_public_key.pem'",
+            ]);
        }
+
+        let create_tenant = create_tenant.map(|id| id.to_string());
+        if let Some(tenant_id) = create_tenant.as_deref() {
+            args.extend(["--create-tenant", tenant_id])
+        }
+
+        let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
+        let initial_timeline_id_string = initial_timeline_id.to_string();
+        args.extend(["--initial-timeline-id", &initial_timeline_id_string]);
+
+        let cmd_with_args = cmd.args(args);
+        let init_output = fill_rust_env_vars(cmd_with_args)
+            .output()
+            .with_context(|| {
+                format!("failed to init pageserver with command {:?}", cmd_with_args)
+            })?;
+
+        if !init_output.status.success() {
+            bail!(
+                "init invocation failed, {}\nStdout: {}\nStderr: {}",
+                init_output.status,
+                String::from_utf8_lossy(&init_output.stdout),
+                String::from_utf8_lossy(&init_output.stderr)
+            );
+        }
+
+        // echo the captured output of the init command
+        println!("{}", String::from_utf8_lossy(&init_output.stdout));
+
+        Ok(initial_timeline_id)
    }

    pub fn repo_path(&self) -> PathBuf {
@@ -142,7 +210,7 @@ impl PageServerNode {
        self.repo_path().join("pageserver.pid")
    }

-    pub fn start(&self) -> anyhow::Result<()> {
+    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        print!(
            "Starting pageserver at '{}' in '{}'",
            connection_address(&self.pg_connection_config),
@@ -150,13 +218,18 @@ impl PageServerNode {
        );
        io::stdout().flush().unwrap();

-        let mut cmd = Command::new(self.env.pageserver_bin()?);
-        cmd.args(&["-D", self.repo_path().to_str().unwrap()])
-            .arg("-d")
-            .env_clear()
-            .env("RUST_BACKTRACE", "1");
+        let repo_path = self.repo_path();
+        let mut args = vec!["-D", repo_path.to_str().unwrap()];

-        if !cmd.status()?.success() {
+        for config_override in config_overrides {
+            args.extend(["-c", config_override]);
+        }
+
+        let mut cmd = Command::new(self.env.pageserver_bin()?);
+        let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
+        filled_cmd = fill_aws_secrets_vars(filled_cmd);
+
+        if !filled_cmd.status()?.success() {
            bail!(
                "Pageserver failed to start. See '{}' for details.",
                self.repo_path().join("pageserver.log").display()
@@ -199,23 +272,81 @@ impl PageServerNode {
        bail!("pageserver failed to start in {} seconds", RETRIES);
    }

-    pub fn stop(&self) -> anyhow::Result<()> {
-        let pid = read_pidfile(&self.pid_file())?;
-        let pid = Pid::from_raw(pid);
-        if kill(pid, Signal::SIGTERM).is_err() {
-            bail!("Failed to kill pageserver with pid {}", pid);
+    ///
+    /// Stop the server.
+    ///
+    /// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
+    /// Otherwise we use SIGTERM, triggering a clean shutdown
+    ///
+    /// If the server is not running, returns success
+    ///
+    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        let pid_file = self.pid_file();
+        if !pid_file.exists() {
+            println!("Pageserver is already stopped");
+            return Ok(());
        }
+        let pid = Pid::from_raw(read_pidfile(&pid_file)?);

-        // wait for pageserver stop
-        let address = connection_address(&self.pg_connection_config);
-        for _ in 0..5 {
-            let stream = TcpStream::connect(&address);
-            thread::sleep(Duration::from_secs(1));
-            if let Err(_e) = stream {
-                println!("Pageserver stopped");
+        let sig = if immediate {
+            print!("Stopping pageserver immediately..");
+            Signal::SIGQUIT
+        } else {
+            print!("Stopping pageserver gracefully..");
+            Signal::SIGTERM
+        };
+        io::stdout().flush().unwrap();
+        match kill(pid, sig) {
+            Ok(_) => (),
+            Err(Errno::ESRCH) => {
+                println!(
+                    "Pageserver with pid {} does not exist, but a PID file was found",
+                    pid
+                );
                return Ok(());
            }
-            println!("Stopping pageserver on {}", address);
+            Err(err) => bail!(
+                "Failed to send signal to pageserver with pid {}: {}",
+                pid,
+                err.desc()
+            ),
+        }
+
+        let address = connection_address(&self.pg_connection_config);
+
+        // TODO Remove this "timeout" and handle it on caller side instead.
+        // Shutting down may take a long time,
+        // if pageserver checkpoints a lot of data
+        let mut tcp_stopped = false;
+        for _ in 0..100 {
+            if !tcp_stopped {
+                if let Err(err) = TcpStream::connect(&address) {
+                    tcp_stopped = true;
+                    if err.kind() != io::ErrorKind::ConnectionRefused {
+                        eprintln!("\nPageserver connection failed with error: {err}");
+                    }
+                }
+            }
+            if tcp_stopped {
+                // Also check status on the HTTP port
+
+                match self.check_status() {
+                    Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
+                        println!("done!");
+                        return Ok(());
+                    }
+                    Err(err) => {
+                        eprintln!("\nPageserver status check failed with error: {err}");
+                        return Ok(());
+                    }
+                    Ok(()) => {
+                        // keep waiting
+                    }
+                }
+            }
+            print!(".");
+            io::stdout().flush().unwrap();
+            thread::sleep(Duration::from_secs(1));
        }

        bail!("Failed to stop pageserver with pid {}", pid);
@@ -234,87 +365,217 @@ impl PageServerNode {

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
        let mut builder = self.http_client.request(method, url);
-        if self.env.auth_type == AuthType::ZenithJWT {
-            builder = builder.bearer_auth(&self.env.auth_token)
+        if self.env.pageserver.auth_type == AuthType::ZenithJWT {
+            builder = builder.bearer_auth(&self.env.pageserver.auth_token)
        }
        builder
    }

    pub fn check_status(&self) -> Result<()> {
-        self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
+        self.http_request(Method::GET, format!("{}/status", self.http_base_url))
            .send()?
            .error_from_body()?;
        Ok(())
    }

-    pub fn tenant_list(&self) -> Result<Vec<String>> {
+    pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
        Ok(self
-            .http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
+            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))
            .send()?
            .error_from_body()?
            .json()?)
    }

-    pub fn tenant_create(&self, tenantid: ZTenantId) -> Result<()> {
-        Ok(self
-            .http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant"))
+    pub fn tenant_create(
+        &self,
+        new_tenant_id: Option<ZTenantId>,
+        settings: HashMap<&str, &str>,
+    ) -> anyhow::Result<Option<ZTenantId>> {
+        let tenant_id_string = self
+            .http_request(Method::POST, format!("{}/tenant", self.http_base_url))
            .json(&TenantCreateRequest {
-                tenant_id: tenantid,
+                new_tenant_id,
+                checkpoint_distance: settings
+                    .get("checkpoint_distance")
+                    .map(|x| x.parse::<u64>())
+                    .transpose()?,
+                compaction_target_size: settings
+                    .get("compaction_target_size")
+                    .map(|x| x.parse::<u64>())
+                    .transpose()?,
+                compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
+                compaction_threshold: settings
+                    .get("compaction_threshold")
+                    .map(|x| x.parse::<usize>())
+                    .transpose()?,
+                gc_horizon: settings
+                    .get("gc_horizon")
+                    .map(|x| x.parse::<u64>())
+                    .transpose()?,
+                gc_period: settings.get("gc_period").map(|x| x.to_string()),
+                image_creation_threshold: settings
+                    .get("image_creation_threshold")
+                    .map(|x| x.parse::<usize>())
+                    .transpose()?,
+                pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
+                walreceiver_connect_timeout: settings
+                    .get("walreceiver_connect_timeout")
+                    .map(|x| x.to_string()),
+                lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()),
+                max_lsn_wal_lag: settings
+                    .get("max_lsn_wal_lag")
+                    .map(|x| x.parse::<NonZeroU64>())
+                    .transpose()
+                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
            })
            .send()?
            .error_from_body()?
-            .json()?)
+            .json::<Option<String>>()?;
+
+        tenant_id_string
+            .map(|id| {
+                id.parse().with_context(|| {
+                    format!(
+                        "Failed to parse tennat creation response as tenant id: {}",
+                        id
+                    )
+                })
+            })
+            .transpose()
    }

-    pub fn branch_list(&self, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
-        Ok(self
+    pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> {
+        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
+            .json(&TenantConfigRequest {
+                tenant_id,
+                checkpoint_distance: settings
+                    .get("checkpoint_distance")
+                    .map(|x| x.parse::<u64>())
+                    .transpose()
+                    .context("Failed to parse 'checkpoint_distance' as an integer")?,
+                compaction_target_size: settings
+                    .get("compaction_target_size")
+                    .map(|x| x.parse::<u64>())
+                    .transpose()
+                    .context("Failed to parse 'compaction_target_size' as an integer")?,
+                compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
+                compaction_threshold: settings
+                    .get("compaction_threshold")
+                    .map(|x| x.parse::<usize>())
+                    .transpose()
+                    .context("Failed to parse 'compaction_threshold' as an integer")?,
+                gc_horizon: settings
+                    .get("gc_horizon")
+                    .map(|x| x.parse::<u64>())
+                    .transpose()
+                    .context("Failed to parse 'gc_horizon' as an integer")?,
+                gc_period: settings.get("gc_period").map(|x| x.to_string()),
+                image_creation_threshold: settings
+                    .get("image_creation_threshold")
+                    .map(|x| x.parse::<usize>())
+                    .transpose()
+                    .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+                pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
+                walreceiver_connect_timeout: settings
+                    .get("walreceiver_connect_timeout")
+                    .map(|x| x.to_string()),
+                lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()),
+                max_lsn_wal_lag: settings
+                    .get("max_lsn_wal_lag")
+                    .map(|x| x.parse::<NonZeroU64>())
+                    .transpose()
+                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+            })
+            .send()?
+            .error_from_body()?;
+
+        Ok(())
+    }
+
+    pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
+        let timeline_infos: Vec<TimelineInfo> = self
            .http_request(
                Method::GET,
-                format!("{}/branch/{}", self.http_base_url, tenantid),
+                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
            )
            .send()?
            .error_from_body()?
-            .json()?)
+            .json()?;
+
+        Ok(timeline_infos)
    }

-    pub fn branch_create(
+    pub fn timeline_create(
        &self,
-        branch_name: &str,
-        startpoint: &str,
-        tenantid: &ZTenantId,
-    ) -> Result<BranchInfo> {
-        Ok(self
-            .http_request(Method::POST, format!("{}/branch", self.http_base_url))
-            .json(&BranchCreateRequest {
-                tenant_id: tenantid.to_owned(),
-                name: branch_name.to_owned(),
-                start_point: startpoint.to_owned(),
+        tenant_id: ZTenantId,
+        new_timeline_id: Option<ZTimelineId>,
+        ancestor_start_lsn: Option<Lsn>,
+        ancestor_timeline_id: Option<ZTimelineId>,
+    ) -> anyhow::Result<Option<TimelineInfo>> {
+        let timeline_info_response = self
+            .http_request(
+                Method::POST,
+                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
+            )
+            .json(&TimelineCreateRequest {
+                new_timeline_id,
+                ancestor_start_lsn,
+                ancestor_timeline_id,
            })
            .send()?
            .error_from_body()?
-            .json()?)
+            .json::<Option<TimelineInfo>>()?;
+
+        Ok(timeline_info_response)
    }

-    pub fn branch_get_by_name(
+    /// Import a basebackup prepared using either:
+    /// a) `pg_basebackup -F tar`, or
+    /// b) The `fullbackup` pageserver endpoint
+    ///
+    /// # Arguments
+    /// * `tenant_id` - tenant to import into. Created if it does not exist
+    /// * `timeline_id` - id to assign to imported timeline
+    /// * `base` - (start lsn of basebackup, path to `base.tar` file)
+    /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
+    pub fn timeline_import(
        &self,
-        tenantid: &ZTenantId,
-        branch_name: &str,
-    ) -> Result<BranchInfo> {
-        Ok(self
-            .http_request(
-                Method::GET,
-                format!("{}/branch/{}/{}", self.http_base_url, tenantid, branch_name),
-            )
-            .send()?
-            .error_for_status()?
-            .json()?)
-    }
-}
+        tenant_id: ZTenantId,
+        timeline_id: ZTimelineId,
+        base: (Lsn, PathBuf),
+        pg_wal: Option<(Lsn, PathBuf)>,
+    ) -> anyhow::Result<()> {
+        let mut client = self.pg_connection_config.connect(NoTls).unwrap();

-impl Drop for PageServerNode {
-    fn drop(&mut self) {
-        if self.kill_on_exit {
-            let _ = self.stop();
+        // Init base reader
+        let (start_lsn, base_tarfile_path) = base;
+        let base_tarfile = File::open(base_tarfile_path)?;
+        let mut base_reader = BufReader::new(base_tarfile);
+
+        // Init wal reader if necessary
+        let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
+            let wal_tarfile = File::open(wal_tarfile_path)?;
+            let wal_reader = BufReader::new(wal_tarfile);
+            (end_lsn, Some(wal_reader))
+        } else {
+            (start_lsn, None)
+        };
+
+        // Import base
+        let import_cmd =
+            format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
+        let mut writer = client.copy_in(&import_cmd)?;
+        io::copy(&mut base_reader, &mut writer)?;
+        writer.finish()?;
+
+        // Import wal if necessary
+        if let Some(mut wal_reader) = wal_reader {
+            let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
+            let mut writer = client.copy_in(&import_cmd)?;
+            io::copy(&mut wal_reader, &mut writer)?;
+            writer.finish()?;
        }
+
+        Ok(())
    }
 }
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -1,13 +1,20 @@
 #!/bin/sh
 set -eux

+broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
+if [ "$broker_endpoints_param" != "absent" ]; then
+    broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
+else
+    broker_endpoints_param=''
+fi
+
 if [ "$1" = 'pageserver' ]; then
    if [ ! -d "/data/tenants" ]; then
        echo "Initializing pageserver data directory"
-        pageserver --init -D /data --postgres-distrib /usr/local
+        pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
    fi
    echo "Staring pageserver at 0.0.0.0:6400"
-    pageserver -l 0.0.0.0:6400 -D /data
+    pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data
 else
    "$@"
 fi
--- a/docs/README.md
+++ b/docs/README.md
@@ -6,9 +6,9 @@
 - [docker.md](docker.md) — Docker images and building pipeline.
 - [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
 - [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout.
- [pageserver/README](/pageserver/README) — pageserver overview.
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
+- [sourcetree.md](sourcetree.md) — Overview of the source tree layout.
+- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
+- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
 - [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [walkeeper/README](/walkeeper/README) — WAL service overview.
+- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
 - [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -27,4 +27,4 @@ management_token = jwt.encode({"scope": "pageserverapi"}, auth_keys.priv, algori
 tenant_token = jwt.encode({"scope": "tenant", "tenant_id": ps.initial_tenant}, auth_keys.priv, algorithm="RS256")
 ```

-Utility functions to work with jwts in rust are located in zenith_utils/src/auth.rs
+Utility functions to work with jwts in rust are located in libs/utils/src/auth.rs
--- a/docs/core_changes.md
+++ b/docs/core_changes.md
@@ -188,7 +188,7 @@ Not currently committed but proposed:
 3. Prefetching
 - Why?
  As far as pages in Zenith are loaded on demand, to reduce node startup time
-  and also sppedup some massive queries we need some mechanism for bulk loading to
+  and also speedup some massive queries we need some mechanism for bulk loading to
  reduce page request round-trip overhead.

  Currently Postgres is supporting prefetching only for bitmap scan.
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -1,38 +1,20 @@
-# Docker images of Zenith
+# Docker images of Neon

 ## Images

 Currently we build two main images:

- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
+- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
+- [neondatabase/compute-node](https://hub.docker.com/repository/docker/neondatabase/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).

-And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
+And one additional intermediate image:

- [zenithdb/build](https://hub.docker.com/repository/docker/zenithdb/build) — image with all the dependencies required to build Zenith and compute node images. This image is based on `rust:slim-buster`, so it also has a proper `rust` environment. Built from [/Dockerfile.build](/Dockerfile.build).
- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.
+- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.

 ## Building pipeline

-1. Image `zenithdb/compute-tools` is re-built automatically.
+We build all images after a successful `release` tests run and push them automatically to Docker Hub with two parallel CI jobs:

-2. Image `zenithdb/build` is built manually. If you want to introduce any new compile time dependencies to Zenith or compute node you have to update this image as well, build it and push to Docker Hub.
+1. `neondatabase/compute-tools` and `neondatabase/compute-node`

-Build:
-```sh
-docker build -t zenithdb/build:buster -f Dockerfile.build .
-```
-
-Login:
-```sh
-docker login
-```
-
-Push to Docker Hub:
-```sh
-docker push zenithdb/build:buster
-```
-
-3. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
-
-4. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.
+2. `neondatabase/neon`
--- a/docs/glossary.md
+++ b/docs/glossary.md
@@ -2,6 +2,16 @@

 ### Authentication

+### Backpressure
+
+Backpressure is used to limit the lag between pageserver and compute node or WAL service.
+
+If the compute node or WAL service runs far ahead of the page server,
+the time needed to serve page requests increases. This may lead to timeout errors.
+
+To tune backpressure limits, use the `max_replication_write_lag`, `max_replication_flush_lag` and `max_replication_apply_lag` settings.
+When the lag between the current LSN (`pg_current_wal_flush_lsn()` at the compute node) and the minimal write/flush/apply position of the replica exceeds the limit,
+backends performing writes are blocked until the replica has caught up.
 ### Base image (page image)

 ### Basebackup
@@ -11,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup.

 ### Branch

-We can create branch at certain LSN using `zenith branch` command.
+We can create a branch at a certain LSN using the `neon_local timeline branch` command.
 Each Branch lives in a corresponding timeline[] and has an ancestor[].


@@ -19,24 +29,32 @@ Each Branch lives in a corresponding timeline[] and has an ancestor[].

 NOTE: This is an overloaded term.

-A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint; 
+A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint;

 ### Checkpoint (Layered repository)

 NOTE: This is an overloaded term.

 Whenever enough WAL has been accumulated in memory, the page server []
-writes out the changes from in-memory layers into new layer files[]. This process
-is called "checkpointing". The page server only creates layer files for
-relations that have been modified since the last checkpoint. 
+writes out the changes from the in-memory layer into a new delta layer file. This process
+is called "checkpointing".

 Configuration parameter `checkpoint_distance` defines the distance
 from current LSN to perform checkpoint of in-memory layers.
 Default is `DEFAULT_CHECKPOINT_DISTANCE`.
-Set this parameter to `0` to force checkpoint of every layer.

-Configuration parameter `checkpoint_period` defines the interval between checkpoint iterations.
-Default is `DEFAULT_CHECKPOINT_PERIOD`.
+### Compaction
+
+A background operation on layer files. Compaction takes a number of L0
+layer files, each of which covers the whole key space and a range of
+LSN, and reshuffles the data in them into L1 files so that each file
+covers the whole LSN range, but only part of the key space.
+
+Compaction should also opportunistically leave out obsolete page versions
+from the L1 files, and materialize other page versions for faster
+access. That hasn't been implemented as of this writing, though.
+
+
 ### Compute node

 Stateless Postgres node that stores data in pageserver.
@@ -44,36 +62,69 @@ Stateless Postgres node that stores data in pageserver.
 ### Garbage collection

 The process of removing old on-disk layers that are not needed by any timeline anymore.
+
 ### Fork

 Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
-Each PostgreSQL fork is considered a separate relish.

 ### Layer

-Each layer corresponds to the specific version of a relish Segment in a range of LSNs.
+A layer contains data needed to reconstruct any page versions within the
+layer's Segment and range of LSNs.
+
 There are two kinds of layers, in-memory and on-disk layers. In-memory
 layers are used to ingest incoming WAL, and provide fast access
 to the recent page versions. On-disk layers are stored as files on disk, and
-are immutable.
+are immutable. See pageserver/src/layered_repository/README.md for more.
+
 ### Layer file (on-disk layer)

 Layered repository on-disk format is based on immutable files.  The
-files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE
-segment of a PostgreSQL relation fork. There are two kinds of layer
-files: image files and delta files. An image file contains a
-"snapshot" of the segment at a particular LSN, and a delta file
-contains WAL records applicable to the segment, in a range of LSNs.
+files are called "layer files". There are two kinds of layer files:
+image files and delta files. An image file contains a "snapshot" of a
+range of keys at a particular LSN, and a delta file contains WAL
+records applicable to a range of keys, in a range of LSNs.

 ### Layer map

-The layer map tracks what layers exist for all the relishes in a timeline.
+The layer map tracks what layers exist in a timeline.
+
 ### Layered repository

-Zenith repository implementation that keeps data in layers.
+Neon repository implementation that keeps data in layers.
 ### LSN

+The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log.
+The insert position is a byte offset into the logs, increasing monotonically with each new record.
+Internally, an LSN is a 64-bit integer, representing a byte position in the write-ahead log stream.
+It is printed as two hexadecimal numbers of up to 8 digits each, separated by a slash.
+Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html)
+Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery.

+In Postgres and Neon LSNs are used to describe certain points in WAL handling.
+
+PostgreSQL LSNs and functions to monitor them:
+* `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location.
+* `pg_current_wal_lsn()` - Returns the current write-ahead log write location.
+* `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location.
+* `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically.
+* `pg_last_wal_replay_lsn()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
+[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
+
+Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
+* `CommitLSN`: position in WAL confirmed by a quorum of safekeepers.
+* `RestartLSN`: position in WAL confirmed by all safekeepers.
+* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
+* `VCL`: the largest LSN for which we can guarantee availability of all prior records.
+
+Neon pageserver LSNs:
+* `last_record_lsn` - the end of last processed WAL record.
+* `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN.
+* `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash.
+TODO: use this name consistently in remote storage code. Now `disk_consistent_lsn` is used and meaning depends on the context.
+* `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created)
+
+TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs.
 ### Page (block)

 The basic structure used to store relation data. All pages are of the same size.
@@ -81,7 +132,7 @@ This is the unit of data exchange between compute node and pageserver.

 ### Pageserver

-Zenith storage engine: repositories + wal receiver + page service + wal redo.
+Neon storage engine: repositories + wal receiver + page service + wal redo.

 ### Page service

@@ -106,14 +157,6 @@ and create new databases and accounts (control plane API in our case).

 The generic term in PostgreSQL for all objects in a database that have a name and a list of attributes defined in a specific order.

-### Relish
-
-We call each relation and other file that is stored in the
-repository a "relish". It comes from "rel"-ish, as in "kind of a
-rel", because it covers relations as well as other things that are
-not relations, but are treated similarly for the purposes of the
-storage layer.
-
 ### Replication slot


@@ -130,33 +173,24 @@ One repository corresponds to one Tenant.

 How much history do we need to keep around for PITR and read-only nodes?

-### Segment (PostgreSQL)
-
-NOTE: This is an overloaded term.
+### Segment

 A physical file that stores data for a given relation. File segments are
 limited in size by a compile-time setting (1 gigabyte by default), so if a
 relation exceeds that size, it is split into multiple segments.

-### Segment (Layered Repository)
-
-NOTE: This is an overloaded term.
-
-Segment is a RELISH_SEG_SIZE slice of relish (identified by a SegmentTag).
-
 ### SLRU

 SLRUs include pg_clog, pg_multixact/members, and
 pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
 they don't need to be stored permanently (e.g. pg_subtrans),
-or we do not support them in zenith yet (pg_commit_ts).
-Each SLRU segment is considered a separate relish[].
+or we do not support them in neon yet (pg_commit_ts).

 ### Tenant (Multitenancy)
-Tenant represents a single customer, interacting with Zenith.
+Tenant represents a single customer, interacting with Neon.
 Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
 One pageserver[] can serve multiple tenants at once.
-One safekeeper 
+One safekeeper

 See `docs/multitenancy.md` for more.

--- a/docs/multitenancy.md
+++ b/docs/multitenancy.md
@@ -6,7 +6,7 @@ Zenith supports multitenancy. One pageserver can serve multiple tenants at once.

 ### Tenants in other commands

-By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct arugment `--tenantid=<tenantid>` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants.
+By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenantid=<tenantid>` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI supports creation of new tenants.

 Examples for cli:

@@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id

 ### Safety

-For now particular tenant can only appear on a particular pageserver. Set of WAL acceptors are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
+For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
--- a/docs/pageserver-tenant-migration.md
+++ b/docs/pageserver-tenant-migration.md
@@ -0,0 +1,22 @@
+## Pageserver tenant migration
+
+### Overview
+
+This feature allows migrating a timeline from one pageserver to another by utilizing the remote storage capability.
+
+### Migration process
+
+Pageserver implements two new http handlers: timeline attach and timeline detach.
+Timeline migration is performed in the following way:
+1. Timeline attach is called on a target pageserver. This asks pageserver to download latest checkpoint uploaded to s3.
+2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/zenithdb/zenith/issues/997)/[#1049](https://github.com/zenithdb/zenith/issues/1049))).
+3. Replication state can be tracked via timeline detail pageserver call.
+4. Compute node should be restarted with new pageserver connection string. Issue with multiple compute nodes for one timeline is handled on the safekeeper consensus level. So this is not a problem here. Currently responsibility for rescheduling the compute with updated config lies on external coordinator (console).
+5. Timeline is detached from old pageserver. On disk data is removed.
+
+
+### Implementation details
+
+Now safekeeper needs to track which pageserver it is replicating to. This introduces complications into replication code:
+* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/zenithdb/zenith/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented).
+* We need to track which pageserver is the primary. This is needed to avoid reconnections to non-primary pageservers, because we shouldn't reconnect to them when they decide to stop their walreceiver. I.e. this can happen when there is a load on the compute and we are trying to detach timeline from old pageserver. In this case callmemaybe will try to reconnect to it because replication termination condition is not met (page server with active compute could never catch up to the latest lsn, so there is always some wal tail)
--- a/docs/rfcs/002-storage.md
+++ b/docs/rfcs/002-storage.md
@@ -0,0 +1,186 @@
+# Zenith storage node — alternative
+
+## **Design considerations**
+
+Simplify storage operations for people => Gain adoption/installs on laptops and small private installation => Attract customers to DBaaS by seamless integration between our tooling and cloud.
+
+Proposed architecture addresses:
+
+- High availability -- tolerates n/2 - 1 failures
+- Multi-tenancy -- one storage for all databases
+- Elasticity -- increase storage size on the go by adding nodes
+- Snapshots / backups / PITR with S3 offload
+- Compression
+
+Minuses are:
+
+- Quite a lot of work
+- Single page access may touch few disk pages
+- Some bloat in data — may slow down sequential scans
+
+## **Summary**
+
+Storage cluster is sharded key-value store with ordered keys. Key (**page_key**) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. Value is either page or page diff/wal. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when grows bigger than soft 100GB limit~~ by having a fixed range of pageno's it is responsible for. Chunks placement on storage nodes is stored in a separate metadata service, so chunk can be freely moved around the cluster if needed. Chunk itself is a filesystem directory with following sub directories:
+
+```
+
+|-chunk_42/
+  |-store/ -- contains lsm with pages/pagediffs ranging from
+  |	      page_key_lo to page_key_hi
+  |-wal/
+  |  |- db_1234/ db-specific wal files with pages from page_key_lo
+  |		 to page_key_hi
+  |
+  |-chunk.meta -- small file with snapshot references
+		  (page_key_prefix+lsn+name)
+		  and PITR regions (page_key_start, page_key_end)
+```
+
+## **Chunk**
+
+Chunk is responsible for storing pages potentially from different databases and relations. Each page is addressed by a lexicographically ordered tuple (**page_key**) with the following fields:
+
+- `pg_id` -- unique id of given postgres instance (or postgres cluster as it is called in postgres docs)
+- `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance
+- `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later
+- `rel_id` -- tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so table indices were closer to table itself on our global key space.
+- `(forkno, segno, pageno)` -- page coordinates in postgres data files
+- `lsn_timeline` -- postgres feature, increments when PITR was done.
+- `lsn` -- lsn of current page version.
+
+Chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. Processing node looks at page in wal record and sends record to a chunk responsible for this page range. When wal record arrives to a chunk it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then background process moves records from that wal files to the lsm tree in `chunk_id/store`. Or, more precisely, wal records would be materialized into lsm memtable and when that memtable is flushed to SSTable on disk we may trim the wal. That way some not durably (in the distributed sense) committed pages may enter the tree -- here we rely on processing node behavior: page request from processing node should contain proper lsm horizons so that storage node may respond with proper page version.
+
+LSM here is a usual LSM for variable-length values: at first data is stored in memory (we hold incoming wal records to be able to regenerate it after restart) in some balanced tree. When this tree grows big enough we dump it into disk file (SSTable) sorting records by key. Then SSTables are mergesorted in the background into different files. All file operations are sequential and do not require WAL for durability.
+
+Content of SSTable can be following:
+
+```jsx
+(pg_id, db_id, ... , pageno=42, lsn=100) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff)
+(pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data)
+```
+
+So query for `pageno=42 up to lsn=260` would need to find closest entry less than this key, iterate back to the latest full page and iterate forward to apply diffs. How often page is materialized in lsn-version sequence is up to us -- let's say each 5th version should be a full page.
+
+### **Page deletion**
+
+To delete old pages we insert blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into a lsm tree. During merges such marker would indicate that all pages with smaller lsn should be discarded. Delete marker will travel down the tree levels hierarchy until it reaches last level. In non-PITR scenario where old page versions are not needed at all such deletion marker would (on average) prevent old page versions propagation down the tree -- so all bloat would concentrate at higher tree layers without affecting bigger bottom layers.
+
+### **Recovery**
+
+Upon storage node restart recent WAL files are applied to appropriate pages and resulting pages stored in lsm memtable. So this should be fast since we are not writing anything to disk.
+
+### **Checkpointing**
+
+No such mechanism is needed. Or we may look at the storage node as at kind of continuous checkpointer.
+
+### **Full page writes (torn page protection)**
+
+Storage node never updates individual pages, only merges SSTables, so torn pages are not an issue.
+
+### **Snapshot**
+
+That is the part that I like about this design -- snapshot creation is instant and cheap operation that can have flexible granularity level: whole instance, database, table. Snapshot creation inserts a record in `chunk.meta` file with lsn of this snapshot and key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits pages deletion within this range. Storage node may not know anything about page internals, but by changing number of fields in our prefix we may change snapshot granularity.
+
+It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)` so that snapshot of relation would include its indices. Also table snapshot would trickily interact with catalog. Probably all table snapshots should hold also a catalog snapshot. And when node is started with such snapshot it should check that only tables from snapshot are queried. I assume here that for snapshot reading one needs to start a new postgres instance.
+
+Storage consumed by snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on cost of different storages) about when to offload old snapshot to s3. For example, if current database has more than 40% of changed pages with respect to previous snapshot then we may offload that snapshot to s3, and release this space.
+
+**Starting db from snapshot**
+
+When we are starting database from snapshot it can be done in two ways. First, we may create new db_id, move all the data from snapshot to a new db and start a database. Second option is to create Copy-on-Write (CoW) instance out of snapshot and read old pages from old snapshot and store new pages separately. That is why there is `db_timeline` key field near `db_id` -- CoW (🐮) database should create new `db_timeline` and remember old `db_timeline`. Such a database can have hashmap of pages that it has changed to query pages from proper snapshot on the first try. `db_timeline` is located near `db_id` so that new page versions generated by new instance would not bloat data of initial snapshot. It is not clear whether it is possible to effectively support "stacked" CoW snapshots, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- so we may scan neighboring pages and find right one. But again that way we bloat snapshot with unrelated data and may slow down full scans that are happening in different database).
+
+**Snapshot export/import**
+
+Once we may start CoW instances it is easy to run auxiliary postgres instance on this snapshot and run `COPY FROM (...) TO stdout` or `pg_dump` and export data from the snapshot to some portable formats. Also we may start postgres on a new empty database and run `COPY FROM stdin`. This way we can initialize new non-CoW databases and transfer snapshots via network.
+
+### **PITR area**
+
+In described scheme PITR is just a prohibition to delete any versions within some key prefix, either it is a database or a table key prefix. So PITR may have different settings for different tables, databases, etc.
+
+PITR is quite bloaty, so we may aggressively offload it to s3 -- we may push same (or bigger) SSTables to s3 and maintain lsm structure there.
+
+### **Compression**
+
+Since we are storing page diffs of variable sizes there is no structural dependency on a page size and we may compress it. Again that could be enabled only on pages with some key prefixes, so we may have this with db/table granularity.
+
+### **Chunk metadata**
+
+Chunk metadata is a file that lies in the chunk directory and stores info about current snapshots and PITR regions. Chunk should always consult this data when merging SSTables and applying delete markers.
+
+### **Chunk splitting**
+
+*(NB: following paragraph is about how to avoid page splitting)*
+
+When a chunk hits some soft storage limit (let's say 100Gb) it should be split in half and global metadata about chunk boundaries should be updated. Here I assume that chunk split is a local operation happening on single node. Process of chunk splitting should look like the following:
+
+1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries.
+
+2. Prohibit WAL deletion and old SSTables deletion on original chunk.
+
+3. On each lsm layer we would need to split only one SSTable, all other would fit within left or right range. Symlink/split that files to new chunks.
+
+4. Start WAL replay on new chunks.
+
+5. Update global metadata about new chunk boundaries.
+
+6. Eventually (metadata update should be pushed to processing node by metadata service) storage node will start sending WAL and page requests to the new nodes.
+
+7. New chunk may start serving read queries when following conditions are met:
+
+a) it receives at least one WAL record from processing node
+
+b) it replayed all WAL up to the new received one
+
+c) checked by downlinks that there were no WAL gaps.
+
+Chunk split as it is described here is quite fast operation when it is happening on the local disk -- vast majority of files will be just moved without copying anything. I suggest to keep split always local and not to mix it with chunk moving around cluster. So if we want to split some chunk but there is small amount of free space left on the device, we should first move some chunks away from the node and then proceed with splitting.
+
+### Fixed chunks
+
+Alternative strategy is not to split at all and have pageno-fixed chunk boundaries. When table is created we first materialize this chunk by storing first new pages only and the chunk is small. Then chunk is growing while table is filled, but it can't grow substantially bigger than the allowed pageno range, so at max it would be 1GB or whatever limit we want + some bloat due to snapshots and old page versions.
+
+### **Chunk lsm internals**
+
+So how to implement chunk's lsm?
+
+- Write from scratch and use RocksDB to prototype/benchmark, then switch to own lsm implementation. RocksDB can provide some sanity check for performance of home-brewed implementation and it would be easier to prototype.
+- Use postgres as lego constructor. We may model memtable with postgres B-tree referencing some in-memory log of incoming records. SSTable merging may reuse postgres external merging algorithm, etc. One thing that would definitely not fit (or I didn't come up with an idea how to fit that) -- is multi-tenancy. If we are storing pages from different databases we can't use postgres buffer pool, since there is no db_id in the page header. We can add new field there but IMO it would be no go for committing that to vanilla.
+
+Other possibility is not to try to fit few databases in one storage node. But that way it is no go for multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, each with its own local page cache. So that would be much closer to ordinary managed RDS.
+
+Multi-tenant storage makes sense even on a laptop, when you work with different databases, running tests with temp database, etc. And when installation grows bigger it starts to make more and more sense, so it seems important.
+
+# Storage fleet
+
+# **Storage fleet**
+
+- When database is smaller than a chunk size we naturally can store them in one chunk (since their page_key would fit in some chunk's [lo, hi) range).
+
+<img width="937" alt="Screenshot_2021-02-22_at_16 49 17" src="https://user-images.githubusercontent.com/284219/108729836-ffcbd200-753b-11eb-9412-db802ec30021.png">
+
+Few databases are stored in one chunk, replicated three times
+
+- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we always may manually move chunks around the cluster.
+
+<img width="940" alt="Screenshot_2021-02-22_at_16 49 10" src="https://user-images.githubusercontent.com/284219/108729815-fb071e00-753b-11eb-86e0-be6703e47d82.png">
+
+Here one big database occupies two set of nodes. Also some chunks were moved around to restore replication factor after disk failure. In this case we also have "sharded" storage for a big database and issue wal writes to different chunks in parallel.
+
+## **Chunk placement strategies**
+
+There are few scenarios where we may want to move chunks around the cluster:
+
+- disk usage on some node is big
+- some disk experienced a failure
+- some node experienced a failure or need maintenance
+
+## **Chunk replication**
+
+Chunk replication may be done by cloning page ranges with respect to some lsn from peer nodes, updating global metadata, waiting for WAL to come, replaying previous WAL and becoming online -- more or less like during chunk split.
+
--- a/docs/rfcs/003-laptop-cli.md
+++ b/docs/rfcs/003-laptop-cli.md
@@ -0,0 +1,267 @@
+# Command line interface (end-user)
+
+Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start.
+
+This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots.
+
+The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start temporary read-only postgres instance over any local snapshot. A more complex scenario would consist of several basic operations over snapshots.
+
+# Possible usage scenarios
+
+## Install zenith, run a postgres
+
+```
+> brew install pg-zenith 
+> zenith pg create # creates pgdata with default pattern pgdata$i
+> zenith pg list
+ID            PGDATA        USED    STORAGE            ENDPOINT
+primary1      pgdata1       0G      zenith-local       localhost:5432
+```
+
+## Import standalone postgres to zenith
+
+```
+> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg
+[====================------------] 60% | 20MB/s
+> zenith snapshot list
+ID          SIZE        PARENT
+oldpg       5G          -
+
+> zenith pg create --snapshot oldpg
+Started postgres on localhost:5432
+
+> zenith pg list
+ID            PGDATA        USED    STORAGE            ENDPOINT
+primary1      pgdata1       5G      zenith-local       localhost:5432
+
+> zenith snapshot destroy oldpg
+Ok
+```
+
+Also, we may start snapshot import implicitly by looking at snapshot schema
+
+```
+> zenith pg create --snapshot basebackup://replication@localhost:5432/
+Downloading snapshot... Done.
+Started postgres on localhost:5432
+Destroying snapshot... Done.
+```
+
+## Pull snapshot with some publicly shared database
+
+Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage).
+
+```
+> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies
+```
+
+## Create snapshot and push it to the cloud
+
+```
+> zenith snapshot create pgdata1@snap1
+> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1
+```
+
+## Rollback database to the snapshot
+
+One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`.
+
+```
+> zenith pg list
+ID            PGDATA        USED    STORAGE            ENDPOINT
+primary1      pgdata1       5G      zenith-local       localhost:5432
+
+> zenith snapshot create pgdata1@snap1
+
+> zenith snapshot list
+ID                    SIZE        PARENT
+oldpg                 5G          -
+pgdata1@snap1         6G          -
+pgdata1@CURRENT       6G          -
+
+> zenith pg checkout pgdata1@snap1
+Stopping postgres on pgdata1.
+Rolling back pgdata1@CURRENT to pgdata1@snap1.
+Starting postgres on pgdata1.
+
+> zenith snapshot list
+ID                    SIZE        PARENT
+oldpg                 5G          -
+pgdata1@snap1         6G          -
+pgdata1@HEAD{0}       6G          -
+pgdata1@CURRENT       6G          -
+```
+
+Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state of the database in the data directory. When we are checking out some snapshot CURRENT will be set to this snapshot and the old CURRENT state will be named HEAD{0} (0 is the number of postgres timeline, it would be incremented after each such checkout).
+
+## Configure PITR area (Point In Time Recovery).
+
+PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite).
+
+```
+> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month
+```
+
+Resetting the database to some state in the past would require creating a snapshot on some lsn / time in this pitr area.
+
+# Manual
+
+## storage
+
+Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
+
+**zenith storage attach** -t [native|s3] -c key=value -n name
+
+Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'.
+
+
+**zenith storage list**
+
+Show currently attached storages. For example:
+
+```
+> zenith storage list
+NAME            USED    TYPE                OPTIONS          PATH
+local           5.1G    zenith-local                         /opt/zenith/store/local
+local.compr     20.4G   zenith-local        compression=on    /opt/zenith/store/local.compr
+zcloud          60G     zenith-remote                        zenith.tech/stas/mystore
+s3tank          80G     S3
+```
+
+**zenith storage detach**
+
+**zenith storage show**
+
+
+
+## pg
+
+Manages postgres data directories and can start postgres instances with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themselves.
+
+Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both of these concepts are bundled together here.
+
+**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
+
+Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr.
+
+--no-start: just init datadir without starting postgres
+
+--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1)
+
+--cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of a currently running database)
+
+**zenith pg destroy**
+
+**zenith pg start** [--replica] pgdata
+
+Start postgres with proper extensions preloaded/installed.
+
+**zenith pg checkout**
+
+Rollback data directory to some previous snapshot. 
+
+**zenith pg stop** pg_id
+
+**zenith pg list**
+
+```
+ROLE                 PGDATA        USED    STORAGE            ENDPOINT
+primary              my_pg         5.1G    local              localhost:5432
+replica-1                                                     localhost:5433
+replica-2                                                     localhost:5434
+primary              my_pg2        3.2G    local.compr        localhost:5435
+-                    my_pg3        9.2G    local.compr        -
+```
+
+**zenith pg show**
+
+```
+my_pg:
+    storage: local
+    space used on local: 5.1G
+    space used on all storages: 15.1G
+    snapshots:
+        on local:
+            snap1: 1G
+            snap2: 1G
+        on zcloud:
+            snap2: 1G
+        on s3tank:
+            snap5: 2G
+    pitr:
+        on s3tank:
+            pitr_one_month: 45G
+
+```
+
+**zenith pg start-rest/graphql** pgdata
+
+Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea.
+
+
+## snapshot
+
+Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout.
+
+**zenith snapshot create** pgdata_name@snap_name
+
+Creates a new snapshot in the same storage where pgdata_name exists.
+
+**zenith snapshot push** --to url pgdata_name@snap_name
+
+Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth and start `zenith snapshot recv` on the go.
+
+**zenith snapshot recv**
+
+Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket.
+
+**zenith snapshot pull** --from url or path
+
+Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format.
+
+**zenith snapshot import** --from basebackup://<...>  or path
+
+Creates a new snapshot out of running postgres via basebackup protocol or basebackup files.
+
+**zenith snapshot export**
+
+Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay).
+
+**zenith snapshot diff** snap1 snap2
+
+Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses.
+
+**zenith snapshot destroy**
+
+## pitr
+
+Pitr represents wal stream and ttl policy for that stream
+
+XXX: any suggestions on a better name?
+
+**zenith pitr create** name
+
+--ttl = inf | period
+
+--size-limit = inf | limit
+
+--storage = storage_name
+
+**zenith pitr extract-snapshot** pitr_name --lsn xxx
+
+Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export)
+
+**zenith pitr gc** pitr_name
+
+Force garbage collection on some PITR area.
+
+**zenith pitr list**
+
+**zenith pitr destroy**
+
+
+## console
+
+**zenith console**
+
+Opens browser targeted at web console with the more or less same functionality as described here.
--- a/docs/rfcs/004-durability.md
+++ b/docs/rfcs/004-durability.md
@@ -0,0 +1,218 @@
+Durability & Consensus
+======================
+
+When a transaction commits, a commit record is generated in the WAL.
+When do we consider the WAL record as durable, so that we can
+acknowledge the commit to the client and be reasonably certain that we
+will not lose the transaction?
+
+Zenith uses a group of WAL safekeeper nodes to hold the generated WAL.
+A WAL record is considered durable, when it has been written to a
+majority of WAL safekeeper nodes. In this document, I use 5
+safekeepers, because I have five fingers. A WAL record is durable,
+when at least 3 safekeepers have written it to disk.
+
+First, assume that only one primary node can be running at a
+time. This can be achieved by Kubernetes or etcd or some
+cloud-provider specific facility, or we can implement it
+ourselves. These options are discussed in later chapters.  For now,
+assume that there is a Magic STONITH Fairy that ensures that.
+
+In addition to the WAL safekeeper nodes, the WAL is archived in
+S3. WAL that has been archived to S3 can be removed from the
+safekeepers, so the safekeepers don't need a lot of disk space.
+
+```
+                                +----------------+
+                        +-----> | WAL safekeeper |
+                        |       +----------------+
+                        |       +----------------+
+                        +-----> | WAL safekeeper |
++------------+          |       +----------------+
+|  Primary   |          |       +----------------+
+| Processing | ---------+-----> | WAL safekeeper |
+|   Node     |          |       +----------------+
++------------+          |       +----------------+
+            \           +-----> | WAL safekeeper |
+             \          |       +----------------+
+              \         |       +----------------+
+               \        +-----> | WAL safekeeper |
+                \               +----------------+
+                 \
+                  \
+                   \
+                    \
+                     \          +--------+
+                      \         |        |
+                       +------> |   S3   |
+                                |        |
+                                +--------+
+
+```
+Every WAL safekeeper holds a section of WAL, and a VCL value.
+The WAL can be divided into three portions:
+
+```
+                                    VCL                   LSN
+                                     |                     |
+                                     V                     V
+.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
+Archived WAL       Completed WAL          In-flight WAL
+```
+
+Note that all this WAL kept in a safekeeper is a contiguous section.
+This is different from Aurora: In Aurora, there can be holes in the
+WAL, and there is a Gossip protocol to fill the holes. That could be
+implemented in the future, but let's keep it simple for now. WAL needs
+to be written to a safekeeper in order. However, during crash
+recovery, In-flight WAL that has already been stored in a safekeeper
+can be truncated or overwritten.
+
+The Archived WAL has already been stored in S3, and can be removed from
+the safekeeper.
+
+The Completed WAL has been written to at least three safekeepers. The
+algorithm ensures that it is not lost, when at most two nodes fail at
+the same time.
+
+The In-flight WAL has been persisted in the safekeeper, but if a crash
+happens, it may still be overwritten or truncated.
+
+
+The VCL point is determined in the Primary. It is not strictly
+necessary to store it in the safekeepers, but it allows some
+optimizations and sanity checks and is probably generally useful for
+the system as a whole. The VCL values stored in the safekeepers can lag
+behind the VCL computed by the primary.
+
+
+Primary node Normal operation
+-----------------------------
+
+1. Generate some WAL.
+
+2. Send the WAL to all the safekeepers that you can reach.
+
+3. As soon as a quorum of safekeepers have acknowledged that they have
+   received and durably stored the WAL up to that LSN, update local VCL
+   value in memory, and acknowledge commits to the clients.
+
+4. Send the new VCL to all the safekeepers that were part of the quorum.
+   (Optional)
+
+
+Primary Crash recovery
+----------------------
+
+When a new Primary node starts up, before it can generate any new WAL
+it needs to contact a majority of the WAL safekeepers to compute the
+VCL. Remember that there is a Magic STONITH fairy that ensures that
+only one node can be doing this at a time.
+
+1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you
+   can reach. This is the Winner safekeeper, and its LSN becomes the new VCL.
+
+2. Update the other safekeepers you can reach, by copying all the WAL
+   from the Winner, starting from each safekeeper's old VCL point. Any old
+   In-Flight WAL from previous Epoch is truncated away.
+
+3. Increment Epoch, and send the new Epoch to the quorum of
+   safekeepers.  (This ensures that if any of the safekeepers that we
+   could not reach later come back online, they will be considered as
+   older than this in any future recovery)
+
+You can now start generating new WAL, starting from the newly-computed
+VCL.
+
+Optimizations
+-------------
+
+As described, the Primary node sends all the WAL to all the WAL safekeepers. That
+can be a lot of network traffic. Instead of sending the WAL directly from Primary,
+some safekeepers can be daisy-chained off other safekeepers, or there can be a
+broadcast mechanism among them. There should still be a direct connection from
+each safekeeper to the Primary for the acknowledgments though.
+
+Similarly, the responsibility for archiving WAL to S3 can be delegated to one of
+the safekeepers, to reduce the load on the primary.
+
+
+Magic STONITH fairy
+-------------------
+
+Now that we have a system that works as long as only one primary node is running at a time, how
+do we ensure that?
+
+1. Use etcd to grant a lease on a key. The primary node is only allowed to operate as primary
+   when it's holding a valid lease. If the primary node dies, the lease expires after a timeout
+   period, and a new node is allowed to become the primary.
+
+2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you
+   cannot do this safely. In practice, it would probably be OK if you make the lease times and
+   timeouts long enough. This has the advantage that we don't need to introduce a new
+   component to the architecture.
+
+3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The
+   next chapter describes this option.
+
+
+Built-in Paxos
+--------------
+
+The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes
+as both Proposers and Learners.
+
+Each WAL safekeeper holds an Epoch value in addition to the VCL and
+the WAL. Each request by the primary to safekeep WAL is accompanied by
+an Epoch value. If a safekeeper receives a request with Epoch that
+doesn't match its current Accepted Epoch, it must ignore (NACK) it.
+(In different Paxos papers, Epochs are called "terms" or "round
+numbers")
+
+When a node wants to become the primary, it generates a new Epoch
+value that is higher than any previously observed Epoch value, and
+globally unique.
+
+
+Accepted Epoch: 555                VCL                   LSN
+                                     |                     |
+                                     V                     V
+.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
+Archived WAL       Completed WAL          In-flight WAL
+
+
+Primary node startup:
+
+1. Contact all WAL safekeepers that you can reach (if you cannot
+   connect to a quorum of them, you can give up immediately). Find the
+   latest Epoch among them.
+
+2. Generate a new globally unique Epoch, greater than the latest Epoch
+   found in previous step.
+
+2. Send the new Epoch in a Prepare message to a quorum of
+   safekeepers. (PAXOS Prepare message)
+
+3. Each safekeeper responds with a Promise. If a safekeeper has
+   already made a promise with a higher Epoch, it doesn't respond (or
+   responds with a NACK). After making a promise, the safekeeper stops
+   responding to any write requests with earlier Epoch.
+
+4. Once you have received a majority of promises, you know that the
+   VCL cannot advance on the old Epoch anymore. This effectively kills
+   any old primary server.
+
+5. Find the highest written LSN among the quorum of safekeepers (these
+   can be included in the Promise messages already). This is the new
+   VCL.  If a new node starts the election process after this point,
+   it will compute the same or higher VCL.
+
+6. Copy the WAL from the safekeeper with the highest LSN to the other
+   safekeepers in the quorum, using the new Epoch. (PAXOS Accept
+   phase)
+
+7. You can now start generating new WAL starting from the VCL. If
+   another process starts the election process after this point and
+   gains control of a majority of the safekeepers, we will no longer
+   be able to advance the VCL.
+
--- a/docs/rfcs/005-zenith_local.md
+++ b/docs/rfcs/005-zenith_local.md
@@ -0,0 +1,103 @@
+# Zenith local
+
+Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together.  Your comments on both parts are very welcome.
+
+#### Why do we need it?
+- For distribution - this easy to use binary will help us to build adoption among developers.
+- For internal use - to test all components together.
+
+In my understanding, we consider it to be just a mock-up version of zenith-cloud.
+> Question: How much should we care about durability and security issues for a local setup?
+
+
+#### Why is it better than a simple local postgres?
+
+- Easy one-line setup. As simple as `cargo install zenith && zenith start`
+
+- Quick and cheap creation of compute nodes over the same storage.
+> Question: How can we describe a use-case for this feature?
+
+- Zenith-local can work with S3 directly. 
+
+- Push and pull images (snapshots) to remote S3 to exchange data with other users.
+
+- Quick and cheap snapshot checkouts to switch back and forth in the database history.
+> Question: Do we want it in the very first release? This feature seems quite complicated.
+
+#### Distribution:
+
+Ideally, just one binary that incorporates all elements we need.
+> Question: Let's discuss pros and cons of having a separate package with modified PostgreSQL.
+
+#### Components:
+
+- **zenith-CLI** - interface for end-users.  Turns commands to REST requests and handles responses to show them in a user-friendly way.  
+CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
+WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli
+
+- **zenith-console** - WEB UI with same functionality as CLI.
+>Note: not for the first release.
+
+- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
+    > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local.
+
+- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation).
+> Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server?
+
+WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src
+
+- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith.
+> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)?
+> Question: Do we use it together with local page store or they are interchangeable?
+
+WIP code is ???
+
+- **zenith-safekeeper** - receives WAL from postgres, stores it durably, and acknowledges to Postgres that "sync" has succeeded.
+> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system.
+
+WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper
+
+- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database.
+ 
+ WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node
+
+#### REST API:
+
+Service endpoint: `http://localhost:3000`
+
+Resources:
+- /storages - Where data lives: zenith-pageserver or zenith-s3
+- /pgs - Postgres - zenith-computenode
+- /snapshots - snapshots **TODO**
+
+>Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all?
+
+Methods and their mapping to CLI:
+
+- /storages - zenith-pageserver or zenith-s3
+
+CLI  | REST API
+------------- | -------------
+storage attach -n name --type [native\s3]  --path=[datadir\URL] | PUT  -d { "name": "name", "type": "native", "path": "/tmp" } /storages
+storage detach -n name | DELETE /storages/:storage_name 
+storage list | GET /storages
+storage show -n name | GET /storages/:storage_name 
+
+
+- /pgs - zenith-computenode
+
+CLI  | REST API
+------------- | -------------
+pg create -n name --s storage_name | PUT  -d { "name": "name", "storage_name": "storage_name" } /pgs
+pg destroy -n name | DELETE /pgs/:pg_name 
+pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"}  /pgs/:pg_name /actions
+pg stop -n name | POST  -d {"action": "stop"}  /pgs/:pg_name /actions
+pg promote -n name | POST  -d {"action": "promote"}  /pgs/:pg_name /actions
+pg list | GET /pgs
+pg show -n name | GET /pgs/:pg_name 
+
+- /snapshots **TODO**
+
+CLI  | REST API
+------------- | -------------
+
--- a/docs/rfcs/006-laptop-cli-v2-CLI.md
+++ b/docs/rfcs/006-laptop-cli-v2-CLI.md
@@ -0,0 +1,64 @@
+Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog".
+
+# CLI v2 (after chatting with Carl)
+
+Zenith introduces the notion of a repository.
+
+```bash
+zenith init
+zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory
+```
+
+Once you have a cluster catalog you can explore it
+
+```bash
+zenith log -- returns a list of commits
+zenith status -- returns if there are changes in the catalog that can be committed
+zenith commit -- commits the changes and generates a new commit hash
+zenith branch experimental <hash> -- creates a branch called experimental based on a given commit hash
+```
+
+To make changes in the catalog you need to run compute nodes
+
+```bash
+-- here is how you start a compute node
+zenith start /home/pipedpiper/northwind:main -- starts a compute instance
+zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
+-- you can start a compute node against any hash or branch
+zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
+-- you can start a compute node against any hash or branch
+zenith start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
+
+-- After running some DML you can run 
+-- zenith status and see how there are two WAL streams one on top of 
+-- the main branch
+zenith status 
+-- and another on top of the experimental branch
+zenith status -b experimental
+
+-- you can commit each branch separately
+zenith commit main
+-- or
+zenith commit -c /home/pipedpiper/northwind:experimental
+```
+
+Starting compute instances against cloud environments
+
+```bash
+-- you can start a compute instance against the cloud environment
+-- in this case all of the changes will be streamed into the cloud
+zenith start https://zenith.tech/pipedpiper/northwind:main
+zenith start https://zenith.tech/pipedpiper/northwind:main
+zenith status -c https://zenith.tech/pipedpiper/northwind:main
+zenith commit -c https://zenith.tech/pipedpiper/northwind:main
+zenith branch -c https://zenith.tech/pipedpiper/northwind:<hash> experimental
+```
+
+Pushing data into the cloud
+
+```bash
+-- pull all the commits from the cloud
+zenith pull
+-- push all the commits to the cloud
+zenith push
+```
--- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md
+++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md
@@ -0,0 +1,140 @@
+# Repository format
+
+A Zenith repository is similar to a traditional PostgreSQL backup
+archive, like a WAL-G bucket or pgbarman backup catalogue. It holds
+multiple versions of a PostgreSQL database cluster.
+
+The distinguishing feature is that you can launch a Zenith Postgres
+server directly against a branch in the repository, without having to
+"restore" it first. Also, Zenith manages the storage automatically,
+there is no separation between full and incremental backups nor WAL
+archive. Zenith relies heavily on the WAL, and uses concepts similar
+to incremental backups and WAL archiving internally, but it is hidden
+from the user.
+
+## Directory structure, version 1
+
+This first version is pretty straightforward but not very
+efficient. Just something to get us started.
+
+The repository directory looks like this:
+
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
+    
+    .zenith/refs/branches/mybranch
+    .zenith/refs/tags/foo
+    .zenith/refs/tags/bar
+    
+    .zenith/datadirs/<timeline uuid>
+
+### Timelines
+
+A timeline is similar to PostgreSQL's timeline, but is identified by a
+UUID instead of a 32-bit timeline Id.  For user convenience, it can be
+given a name that refers to the UUID (called a branch).
+
+All WAL is generated on a timeline. You can launch a read-only node
+against a tag or arbitrary LSN on a timeline, but in order to write,
+you need to create a timeline.
+
+Each timeline is stored in a directory under .zenith/timelines. It
+consists of a WAL archive, containing all the WAL in the standard
+PostgreSQL format, under the wal/ subdirectory.
+
+The 'snapshots/' subdirectory contains "base backups" of the data
+directory at different LSNs. Each snapshot is simply a copy of the
+Postgres data directory.
+
+When a new timeline is forked from a previous timeline, the ancestor
+timeline's UUID is stored in the 'history' file.
+
+### Refs
+
+There are two kinds of named objects in the repository: branches and
+tags.  A branch is a human-friendly name for a timeline UUID, and a
+tag is a human-friendly name for a specific LSN on a timeline
+(timeline UUID + LSN).  Like in git, these are just for user
+convenience; you can also use timeline UUIDs and LSNs directly.
+
+Refs do have one additional purpose though: naming a timeline or LSN
+prevents it from being automatically garbage collected.
+
+The refs directory contains a small text file for each tag/branch. It
+contains the UUID of the timeline (and LSN, for tags).
+
+### Datadirs
+
+.zenith/datadirs contains PostgreSQL data directories. You can launch
+a Postgres instance on one of them with:
+
+```
+  postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
+```
+
+All the actual data is kept in the timeline directories, under
+.zenith/timelines. The data directories are only needed for active
+PostgreSQL instances. After an instance is stopped, the data directory
+can be safely removed. "zenith start" will recreate it quickly from
+the data in .zenith/timelines, if it's missing.
+
+## Version 2
+
+The format described above isn't very different from a traditional
+daily base backup + WAL archive configuration. The main difference is
+the nicer naming of branches and tags.
+
+That's not very efficient. For performance, we need something like
+incremental backups that don't require making a full copy of all
+data. So only store modified files or pages. And instead of having to
+replay all WAL from the last snapshot, "slice" the WAL into
+per-relation WAL files and only recover what's needed when a table is
+accessed.
+
+In version 2, the file format in the "snapshots" subdirectory gets
+more advanced. The exact format is TODO. But it should support:
+- storing WAL records of individual relations/pages
+- storing a delta from an older snapshot
+- compression
+
+
+## Operations
+
+### Garbage collection
+
+When you run "zenith gc", old timelines that are no longer needed are
+removed. That involves collecting the list of "unreachable" objects,
+starting from the named branches and tags.
+
+Also, if enough WAL has been generated on a timeline since last
+snapshot, a new snapshot or delta is created.
+
+### zenith push/pull
+
+Compare the tags and branches on both servers, and copy missing ones.
+For each branch, compare the timeline it points to in both servers. If
+one is behind the other, copy the missing parts.
+
+FIXME: how do you prevent confusion if you have two clones of the same
+repository, launch an instance on the same branch in both clones, and
+later try to push/pull between them? Perhaps create a new timeline
+every time you start up an instance? Then you would detect that the
+timelines have diverged. That would match with the "epoch" concept
+that we have in the WAL safekeeper
+
+### zenith checkout/commit
+
+In this format, there is no concept of a "working tree", and hence no
+concept of checking out or committing. All modifications are done on
+a branch or a timeline. As soon as you launch a server, the changes are
+appended to the timeline.
+
+You can easily fork off a temporary timeline to emulate a "working tree".
+You can later remove it and have it garbage collected, or to "commit",
+re-point the branch to the new timeline.
+
+If we want to have a worktree and "zenith checkout/commit" concept, we can
+emulate that with a temporary timeline. Create the temporary timeline at
+"zenith checkout", and have "zenith commit" modify the branch to point to
+the new timeline.
--- a/docs/rfcs/007-serverless-on-laptop.md
+++ b/docs/rfcs/007-serverless-on-laptop.md
@@ -0,0 +1,93 @@
+How it works now
+----------------
+
+1. Create repository, start page server on it
+
+```
+$ zenith init
+...
+created main branch
+new zenith repository was created in .zenith
+
+$ zenith pageserver start
+Starting pageserver at '127.0.0.1:64000' in .zenith
+Page server started
+```
+
+2. Create a branch, and start a Postgres instance on it
+
+```
+$ zenith branch heikki main
+branching at end of WAL: 0/15ECF68
+
+$ zenith pg create heikki
+Initializing Postgres on timeline 76cf9279915be7797095241638e64644...
+Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432
+
+$ zenith pg start pg1
+Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki'
+waiting for server to start.... done
+server started
+```
+
+
+3. Connect to it and run queries
+
+```
+$ psql "dbname=postgres port=55432"
+psql (14devel)
+Type "help" for help.
+
+postgres=# 
+```
+
+
+Proposal: Serverless on your Laptop
+-----------------------------------
+
+We've been talking about doing the "pg create" step automatically at
+"pg start", to eliminate that step. What if we go further, go
+serverless on your laptop, so that the workflow becomes just:
+
+1. Create repository, start page server on it (same as before)
+
+```
+$ zenith init
+...
+created main branch
+new zenith repository was created in .zenith
+
+$ zenith pageserver start
+Starting pageserver at '127.0.0.1:64000' in .zenith
+Page server started
+```
+
+2. Create branch
+
+```
+$ zenith branch heikki main
+branching at end of WAL: 0/15ECF68
+```
+
+3. Connect to it:
+
+```
+$ psql "dbname=postgres port=5432 branch=heikki"
+psql (14devel)
+Type "help" for help.
+
+postgres=# 
+```
+
+
+The trick behind the scenes is that when you launch the page server,
+it starts to listen on port 5432. When you connect to it with psql, it
+looks at the 'branch' parameter that you passed in the connection
+string. It automatically performs the "pg create" and "pg start" steps
+for that branch, and then forwards the connection to the Postgres
+instance that it launched. After you disconnect, if there are no more
+active connections to the server running on the branch, it can
+automatically shut it down again.
+
+This is how serverless would work in the cloud. We can do it on your
+laptop, too.
--- a/docs/rfcs/008-push-pull.md
+++ b/docs/rfcs/008-push-pull.md
@@ -0,0 +1,66 @@
+# Push and pull between pageservers
+
+Here is a proposal about implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3 but that would depend on the exact storage format so we don't touch that in this proposal.
+
+## Origin management
+
+The origin represents connection info for some remote pageserver. Let's use the same commands as git here, except with an explicit list subcommand (git uses `origin -v` for that).
+
+```
+zenith origin add <name> <connection_uri>
+zenith origin list
+zenith origin remove <name>
+```
+
+Connection URI is a string of the form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport.
+
+Behind the scenes, these commands may update a toml file inside the .zenith directory.
+
+## Push
+
+### Pushing branch
+
+```
+zenith push mybranch cloudserver # push to eponymous branch in cloudserver
+zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
+```
+
+Exact mechanics would be slightly different in the following situations:
+
+1) Destination branch does not exist.
+
+    That is the simplest scenario. We can just create an empty branch (or timeline in internal terminology) and transfer all the pages/records that we have in our timeline. Right now each timeline is quite independent of other timelines so I suggest skipping any checks that there is a common ancestor and just filling it with data. Later, when CoW timelines land in the pageserver, we may add that check and decide whether this timeline belongs to this pageserver repository or not [*].
+
+    The exact mechanics may be the following:
+
+    * CLI asks local pageserver to perform push and hands over connection uri: `perform_push <branch_name> <uri>`.
+    * local pageserver connects to the remote pageserver and runs `branch_push <branch_name> <timeline_id>`
+        Handler for branch_push would create destination timeline and switch connection to copyboth mode.
+    * Sending pageserver may start iterator on that timeline and send all the records as copy messages.
+
+2) Destination branch exists and latest_valid_lsn is less than ours.
+
+    In this case, we need to send missing records. To do that we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send ones that are newer than remote LSN. Later we probably should add a sparse bitmap that would track changed pages to avoid full scan.
+
+3) Destination branch exists and latest_valid_lsn is bigger than ours.
+
+    In this case, we can't push to that branch. We can only pull.
+
+### Pulling branch
+
+Here we need to handle the same three cases, but also keep in mind that local pageserver can be behind NAT and we can't trivially re-use pushing by asking remote to 'perform_push' to our address. So we would need a new set of commands:
+
+* CLI calls `perform_pull <branch_name> <uri>` on local pageserver.
+* local pageserver calls `branch_pull <branch_name> <timeline_id>` on remote pageserver.
+* remote pageserver sends records in our direction
+
+But despite the different sets of commands, the code that iterates over records and the receiving code that inserts those records can be the same for both pull and push.
+
+
+
+[*] It looks to me that there are two different possible approaches to handling unrelated timelines:
+
+1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not.
+2) Transparently create and manage several repositories in one pageserver.
+
+But that is the topic for a separate RFC/discussion.
--- a/docs/rfcs/009-snapshot-first-storage-cli.md
+++ b/docs/rfcs/009-snapshot-first-storage-cli.md
@@ -0,0 +1,56 @@
+While working on export/import commands, I understood that they fit really well into "snapshot-first design".
+
+We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files.
+
+Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith.
+
+So here is an attempt to design consistent CLI for different usage scenarios:
+
+#### 1. Start empty pageserver.
+That is what we have now.
+Init empty pageserver using `initdb` in temporary directory.
+
+`--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines object storage type, all other parameters are passed via env variables. Inspired by WAL-G style naming : https://wal-g.readthedocs.io/STORAGES/.
+
+Save `storage_dest` and other parameters in config.
+Push snapshots to `storage_dest` in background.
+
+```
+zenith init --storage_dest=S3_PREFIX
+zenith start
+```
+
+#### 2. Restart pageserver (manually or crash-recovery).
+Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`.
+Push snapshots to `storage_dest` in background.
+
+```
+zenith start
+```
+
+#### 3. Import.
+Start pageserver from existing snapshot.
+Path to snapshot provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
+Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time operation.
+Save `storage_dest` parameters in config.
+Push snapshots to `storage_dest` in background.
+```
+//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
+zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
+zenith start
+```
+How to pass credentials needed for `snapshot_path`?
+
+#### 4. Export.
+Manually push snapshot to `snapshot_path` which differs from `storage_dest`
+Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
+```
+zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
+```
+
+#### Notes and questions
+- safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
+- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
+- We can think of better names for all options.
+- Export to plain postgres format will be useless, if we are not 100% compatible on page level.
+I can recall at least one such difference - PD_WAL_LOGGED flag in pages.
--- a/docs/rfcs/009-snapshot-first-storage-pitr.md
+++ b/docs/rfcs/009-snapshot-first-storage-pitr.md
@@ -0,0 +1,227 @@
+# Preface
+
+GetPage@LSN can be called with older LSNs, and the page server needs
+to be able to reconstruct older page versions. That's needed for
+having read-only replicas that lag behind the primary, or that are
+"anchored" at an older LSN, and internally in the page server when you
+branch at an older point in time. How do you do that?
+
+For now, I'm not considering incremental snapshots at all. I don't
+think that changes things. So whenever you create a snapshot or a
+snapshot file, it contains an image of all the pages, there is no need
+to look at an older snapshot file.
+
+Also, I'm imagining that this works on a per-relation basis, so that
+each snapshot file contains data for one relation. A "relation" is a
+fuzzy concept - it could actually be one 1 GB relation segment. Or it
+could include all the different "forks" of a relation, or you could
+treat each fork as a separate relation for storage purpose. And once
+the "non-relational" work is finished, a "relation" could
+actually mean some other versioned object kept in the PostgreSQL data
+directory. Let's ignore that for now.
+
+# Eric's RFC:
+
+Every now and then, you create a "snapshot". It means that you create
+a new snapshot file for each relation that was modified after the last
+snapshot, and write out the contents of the relation as it is/was at the
+snapshot LSN. Write-ahead log is stored separately in S3 by the WAL
+safekeeping service, in the original PostgreSQL WAL file format.
+
+    SNAPSHOT @100       WAL
+       .                 |
+       .                 |
+       .                 |
+       .                 |
+    SNAPSHOT @200        |
+       .                 |
+       .                 |
+       .                 |
+       .                 |
+    SNAPSHOT @300        |
+       .                 |
+       .                 V
+    IN-MEMORY @400
+
+If a GetPage@LSN request comes from the primary, you return the latest
+page from the in-memory layer. If there is no trace of the page in
+memory, it means that it hasn't been modified since the last snapshot,
+so you return the page from the latest snapshot, at LSN 300 in the
+above example.
+
+PITR is implemented using the original WAL files:
+
+If a GetPage@LSN request comes from a read replica with LSN 250, you
+read the image of the page from the snapshot at LSN 200, and you also
+scan the WAL between 200 and 250, and apply all WAL records for the
+requested page, to reconstruct it at LSN 250.
+
+Scanning the WAL naively for every GetPage@LSN request would be
+expensive, so in practice you'd construct an in-memory data structure
+of all the WAL between 200 and 250 once that allows quickly looking up
+records for a given page.
+
+## Problems/questions
+
+I think you'll need to store the list of snapshot LSNs on each
+timeline somewhere.
+
+If the latest snapshot of a relation is at LSN 100, and you request a
+page at LSN 1000000, how do you know if there are some modifications
+to it between 100 and 1000000 that you need to replay? You can scan
+all the WAL between 100 and 1000000, but that would be expensive.
+
+You can skip that, if you know that a snapshot was taken e.g. at LSN
+999900. Then you know that the fact that there is no snapshot file at
+999900 means that the relation hasn't been modified between
+100-999900.  Then you only need to scan the WAL between 999900 and
+1000000. However, there is no trace of a snapshot happening at LSN
+999900 in the snapshot file for this relation, so you need to get
+that information from somewhere else.
+
+Where do you get that information from? Perhaps you can scan all the
+other relations, and if you see a snapshot file for *any* relation at
+LSN 999900, you know that if there were modifications to this
+relation, there would be a newer snapshot file for it, too. In other
+words, the list of snapshots that have been taken can be constructed
+by scanning all relations and computing the union of all snapshot LSNs
+that you see for any relation. But that's expensive so at least you
+should keep that in memory, after computing it once. Also, if you rely
+on that, it's not possible to have snapshots at different intervals
+for different files. That seems limiting.
+
+Another option is to explicitly store a list of snapshot LSNs in a
+separate metadata file.
+
+
+# Current implementation in the 'layered_repo' branch:
+
+We store snapshot files like in the RFC, but each snapshot file also
+contains all the WAL in the range of LSNs, so that you don't need to
+fetch the WAL separately from S3. So you have "layers" like this:
+
+    SNAPSHOT+WAL 100-200
+          |
+          |
+          |
+          |
+    SNAPSHOT+WAL 200-300
+          |
+          |
+          |
+          |
+    IN-MEMORY 300-
+
+Each "snapshot+WAL" is a file that contains a snapshot - i.e. full
+copy of each page in the relation, at the *start* LSN. In addition to
+that, it contains all the WAL applicable to the relation from the
+start LSN to the end LSN. With that, you can reconstruct any page
+version in the range that the file covers.
+
+
+## Problems/questions
+
+I can see one potential performance issue here, compared to the RFC.
+Let's focus on a single relation for now. Imagine that you start from
+an empty relation, and you receive WAL from 100 to 200, containing
+a bunch of inserts and updates to the relation. You now have all that
+WAL in memory:
+
+    memory:  WAL from 100-200
+
+We decide that it's time to materialize that to a snapshot file on
+disk.  We materialize full image of the relation as it was at LSN 100
+to the snapshot file, and include all of the WAL. Since the relation
+was initially empty, the "image" at the beginning of the range is empty
+too.
+
+So now you have one file on disk:
+
+    SNAPSHOT+WAL 100-200
+
+It contains a full image of the relation at LSN 100 and all WAL
+between 100-200. (It's actually stored as a serialized BTreeMap of
+page versions, with the page images and WAL records all stored
+together in the same BtreeMap. But for this story, that's not
+important.)
+
+We now receive more WAL updating the relation, up to LSN 300. We
+decide it's time to materialize a new snapshot file, and we now have
+two files:
+
+    SNAPSHOT+WAL 100-200
+    SNAPSHOT+WAL 200-300
+
+Note that the latest "full snapshot" that we store on disk always lags
+behind by one snapshot cycle. The first file contains a full image of
+the relation at LSN 100, the second at LSN 200. When we have received
+WAL up to LSN 300, we write a materialized image at LSN 200. That
+seems a bit silly. In the design per your RFC, you would write
+snapshots at LSNs 200 and 300, instead. That seems better.
+
+
+
+# Third option (not implemented yet)
+
+Store snapshot files like in the RFC, but also store per-relation
+WAL files that contain WAL in a range of LSNs for that relation.
+
+    SNAPSHOT @100   WAL 100-200
+       .                 |
+       .                 |
+       .                 |
+       .                 |
+    SNAPSHOT @200   WAL 200-300
+       .                 |
+       .                 |
+       .                 |
+       .                 |
+    SNAPSHOT @300
+       .
+       .
+    IN-MEMORY 300-
+
+
+This could be the best of both worlds. The snapshot files would be
+independent of the PostgreSQL WAL format. When it's time to write
+snapshot file @300, you write a full image of the relation at LSN 300,
+and you write the WAL that you had accumulated between 200 and 300 to
+a separate file. That way, you don't "lag behind" for one snapshot
+cycle like in the current implementation. But you still have the WAL
+for a particular relation readily available alongside the snapshot
+files, and you don't need to track what snapshot LSNs exist
+separately.
+
+(If we wanted to minimize the number of files, you could include the
+snapshot @300 and the WAL between 200 and 300 in the same file, but I
+feel it's probably better to keep them separate)
+
+
+
+# Further thoughts
+
+There's no fundamental reason why the LSNs of the snapshot files and the
+ranges of the WAL files would need to line up. So this would be possible
+too:
+
+    SNAPSHOT @100   WAL 100-150
+       .                 |
+       .                 |
+       .            WAL 150-250
+       .                 |
+    SNAPSHOT @200        |
+       .                 |
+       .            WAL 250-400
+       .                 |
+       .                 |
+    SNAPSHOT @300        |
+       .                 |
+       .                 |
+    IN-MEMORY 300-
+
+I'm not sure what the benefit of this would be. You could materialize
+additional snapshot files in the middle of a range covered by a WAL
+file, maybe? Might be useful to speed up access when you create a new
+branch in the middle of an LSN range or if there's some other reason
+to believe that a particular LSN is "interesting" and there will be
+a lot of requests using it.
--- a/docs/rfcs/009-snapshot-first-storage.md
+++ b/docs/rfcs/009-snapshot-first-storage.md
@@ -0,0 +1,148 @@
+# Snapshot-first storage architecture
+
+Goals:
+- Long-term storage of database pages.
+- Easy snapshots; simple snapshot and branch management.
+- Allow cloud-based snapshot/branch management.
+- Allow cloud-centric branching; decouple branch state from running pageserver.
+- Allow customer ownership of data via s3 permissions.
+- Provide same or better performance for typical workloads, vs plain postgres.
+
+Non-goals:
+- Service database reads from s3 (reads should be serviced from the pageserver cache).
+- Keep every version of every page / Implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot).
+
+## Principle of operation
+
+The database “lives in s3”. This means that all of the long term page storage is in s3, and the “live database”-- the version that lives in the pageserver-- is a set of “dirty pages” that haven’t yet been written back to s3.
+
+In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere.
+
+The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not.
+
+It’s expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn’t seem very useful right now.
+
+Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling.
+
+Objects in s3 are immutable snapshots, never to be modified once written (only deleted).
+
+Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It’s expected that most snapshots are incremental to keep storage costs low.
+
+It’s expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between 30 seconds and 15 minutes, depending on cost/performance balance.
+
+No-longer needed snapshots can be “squashed”-- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots.
+
+A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica).
+
+WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.)
+
+## Pageserver operation
+
+To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed.
+
+To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down.
+
+It’s assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch.
+
+The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot.
+
+The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. As before, the page at LSN 300 can be discarded when the LSN 400 page is written (regardless of whether the LSN 200 snapshot has completed.)
+
+If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space using an awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches.
+
+The pageserver may store dirty pages to memory or to local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions.
+
+The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow.
+
+The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. write-heavy workload causes long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that’s still not good enough, we could look at other options (e.g. redundant pageserver or an EBS page journal).
+
+A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot.
+
+## Cloud snapshot manager operation
+
+Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent):
+Create/delete/clone/rename a database
+Create a new branch (possibly from a historical snapshot)
+Start/stop the pageserver/safekeeper on a branch
+List databases/branches/snapshots that are visible to this user account
+
+Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries.
+
+This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries.
+
+## Snapshot names, deletion and concurrency
+
+There may be race conditions between operations-- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It’s assumed that concurrent delete won’t disrupt a read in flight, but it may be possible for some process to read B’s header, and then discover on the next operation that B is gone.
+
+For this reason, any attempted read should attempt a fallback procedure (list objects; search list for an equivalent object) if an attempted read fails.  This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values.  `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded.
+
+## Branching
+
+A user may request a new branch from the cloud user interface. There is a sequence of things that needs to happen:
+- If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch.
+- Cloud software should create the new branch, by generating a new (random) unique branch identifier, and creating a placeholder snapshot object.
+    - The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages.
+    - The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data.
+- If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice.
+
+Some of these steps could be combined into the pageserver, but that process would not be possible under all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same.
+
+## Long-term file format
+
+Snapshot files (and any other object stored in s3) must be readable by future software versions.
+
+It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management.
+
+Files should contain the following metadata, in addition to the set of pages:
+- The version of the file format.
+- A unique identifier for this branch (should be worldwide-unique and unchanging).
+- Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging).
+- For incremental snapshots, the identifier of the predecessor snapshot. For new branches, this will be the parent snapshot (the point at which history diverges).
+- The location of the predecessor branch snapshot, if different from this branch’s location.
+- The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0.
+- The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle).
+- A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity.
+
+A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database.
+
+Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or treat them as debugging information only.
+
+## S3 semantics, and other kinds of storage
+
+For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don’t edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket.
+
+Some users may want to use a local or network filesystem in place of s3. This isn’t prohibited but it’s not a priority, either.
+
+Alternate implementations of s3 should be supported, including Google Cloud Storage.
+
+Azure Blob Storage should be supported. We assume (without evidence) that it’s semantically equivalent to s3 for this purpose.
+
+The properties of s3 that we depend on are:
+list objects
+streaming read of entire object
+read byte range from object
+streaming write new object (may use multipart upload for better reliability)
+delete object (that should not disrupt an already-started read).
+
+Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, corrupt, or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully.
+
+## Notes
+
+Possible simplifications, for a first draft implementation:
+- Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later.
+- Don’t worry about the details of the squashing process yet.
+- Don’t implement cloud metadata service; try to make everything work using basic s3 list-objects and reads.
+- Don’t implement rename, delete at first.
+- Don’t implement public/private, just use s3 permissions.
+- Don’t worry about sharing history yet-- each user has their own bucket and a full copy of all data.
+- Don’t worry about history that spans multiple buckets.
+- Don’t worry about s3 regions.
+- Don’t support user-writeable s3 buckets; users get only read-only access at most.
+
+Open questions:
+- How important is point-in-time recovery? When should we add this? How should it work?
+- Should snapshot files use compression?
+- Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they’re created.
+- Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy?
+- When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver?
+- How can pageserver software upgrade be done with minimal downtime?
--- a/docs/rfcs/010-storage_details.md
+++ b/docs/rfcs/010-storage_details.md
@@ -0,0 +1,144 @@
+# Storage details
+
+Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details.
+
+## Overview
+
+![storage](images/storage.jpeg)
+
+### MemStore
+
+MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of PageIndex that holds references to WAL records or pages, PageStore that stores recently materialized pages, and WalStore that stores recently received WAL.
+
+### PageIndex
+
+PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean some information that is needed to access that data, e.g. file_id and offset):
+
+* PageStoreRef -- page offset in the PageStore
+* LocalStoreRef -- snapshot_id and page offset inside of that snapshot
+* WalStoreRef -- offset (and size optionally) of WalRecord in WalStore
+
+PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots we actually can avoid storing references to the full snapshot and calculate page offsets based on relation sizes metadata in the full snapshot (assuming that full snapshot stores pages sorted by page number). However, I would suggest embracing page compression from the beginning and treat all pages as variable-sized.
+
+We assume that PageIndex is a few orders of magnitude smaller than the addressed data, hence it should fit in memory. We also don't care about crash tolerance, as we can rebuild it from snapshot metadata and WAL records from WalStore and/or Safekeeper.
+
+### WalStore
+
+WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. On top of that, we can add some fixed-size cache that would keep some amount of segments in memory.
+
+For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would be also responsible for the recent WAL pushdown to S3 (and Safekeeper may just delete WAL that was confirmed as S3-durable by the page server).
+
+### PageStore
+
+PageStore is storage for recently materialized pages (or, in other words, a cache of getPage results). It can also be implemented as a file-based queue with some memory cache on top of it.
+
+There are a few possible options for PageStore:
+
+a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation.
+
+b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During page overwrite, we would also need to change PageStoreRef back to WalStoreRef in PageIndex.
+
+I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on single page version rule) and cut off that whole set when snapshot creation is complete.
+
+With option b) we can also treat PageStore as an uncompleted incremental snapshot.
+
+### LocalStore
+
+LocalStore keeps the latest full snapshot and set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold.
+
+## Granularity
+
+By granularity, I mean a set of pages that goes into a certain full snapshot. Following things should be taken into account:
+
+* can we shard big databases between page servers?
+* how much time will we spend applying WAL to access certain pages with older LSN's?
+* how many files do we create for a single database?
+
+I can think of the following options here:
+
+1. whole database goes to one full snapshot.
+    * +: we never create a lot of files for one database
+    * +: the approach is quite straightforward, moving data around is simple
+    * -: can not be sharded
+    * -: long recovery -- we always need to recover the whole database
+2. table segment is the unit of snapshotting
+    * +: straightforward for sharding
+    * +: individual segment can be quickly recovered with sliced WAL
+    * -: full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend an eternity in directory scans, and the amount of metadata for sharding can also be quite big.
+3. range-partitioned snapshots -- snapshot includes all pages between [BuffTagLo, BuffTagHi] mixing different relations, databases, and potentially clusters (albeit from one tenant only). When full snapshot outgrows a certain limit (could be also a few gigabytes) we split the snapshot in two during the next full snapshot write. That approach would also require pages sorted by BuffTag inside our snapshots.
+    * +: addresses all mentioned issues
+    * -: harder to implement
+
+I think it is okay to start with table segment granularity, check how we perform in the case of lots of small tables, and check whether there is any way besides option 3 to deal with it.
+
+Both PageStore and WalStore should be "sharded" by this granularity level.
+
+## Security
+
+We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their S3 buckets credentials.
+
+Also, S3 backups are usually encrypted by per-tenant private keys. I'm not sure in what threat model such encryption would improve something (taking into account per-tenant IAM keys), but it seems that everybody is doing that (both AMZN and YNDX). Most likely that comes as a requirement about "cold backups" by some certification procedure.
+
+## Dynamics
+
+### WAL stream handling
+
+When a new WAL record is received we need to parse BufferTags in that record and insert them in PageIndex with WalStoreRef as a value.
+
+### getPage queries
+
+Look up the page in PageIndex. If the value is a page reference then just respond with that page. If the referenced value is WAL record then find the most recent page with the same BuffTag (that is why we need ordering in PageIndex); recover it by applying WAL records; save it in PageStore; respond with that page.
+
+### Starting page server without local data
+
+* build set of latest full snapshots and incremental snapshots on top of them
+* load all their metadata into PageIndex
+* Safekeeper should connect soon and we can ask for a WAL stream starting from the latest incremental snapshot
+* for databases that are connected to us through the Safekeeper we can start loading the set of the latest snapshots or we can do that lazily based on getPage request (I'd better avoid doing that lazily for now without some access stats from the previous run and just transfer all data for active database from S3 to LocalStore).
+
+### Starting page server with local data (aka restart or reboot)
+
+* check that local snapshot files are consistent with S3
+
+### Snapshot creation
+
+Track size of future snapshots based on info in MemStore and when it exceeds some threshold (taking into account our granularity level) create a new incremental snapshot. Always emit incremental snapshots from MemStore.
+
+To create a new snapshot we need to walk through WalStore to get the list of all changed pages, sort it, and get the latest versions of those pages from PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream to avoid parsing WAL during snapshot creation.
+
+Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) by merging the previous full snapshot with several incremental snapshots.
+
+### S3 pushdown
+
+When we have several full snapshots GC can push the old one with its increments to S3.
+
+### Branch creation
+
+Create a new timeline and replay sliced WAL up to a requested point. When the page is not in PageIndex ask the parent timeline about a page. Relation sizes are tricky.
+
+## File formats
+
+As far as I understand Bookfile/Aversion addresses versioning and serialization parts.
+
+As for exact data that should go to snapshots I think it is the following for each snapshot:
+
+* format version number
+* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end of the file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknown keys are present. If we add something backward compatible to the file we can keep the version number.
+* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile
+* array of [(BuffTag, LSN), corresponding offset in file] for the WAL records
+* pages, one by one
+* WAL records, one by one
+
+It is also important to be able to load metadata quickly since it would be one of the main factors impacting the time of page server start. E.g. if we would store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))).
+
+1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when relation_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small).
+2) It makes sense to keep ToC at the beginning of the file to avoid extra seeks to locate it. Doesn't matter too much with the local files but matters on S3 -- if we are accessing a lot of ~1 GB files with the size of metadata ~1 MB then the time to transfer this metadata would be comparable with access latency itself (which is about half a second). So by slurping metadata with one read of file header instead of N reads we can improve the speed of page server start by this N factor.
+
+I think both of those optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines.
+
+Also, there were some discussions about how to embed WAL in incremental snapshots. So far following ideas were mentioned:
+1. snapshot lsn=200, includes WAL in range 200-300
+2. snapshot lsn=200, includes WAL in range 100-200
+3. data snapshots are separated from WAL snapshots
+
+Both options 2 and 3 look good. I'm inclined towards option 3 as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep a data snapshot until the next full snapshot, but we may push WAL snapshots to S3 as soon as they appear if there are no replicas).
--- a/docs/rfcs/011-retention-policy.md
+++ b/docs/rfcs/011-retention-policy.md
@@ -0,0 +1,91 @@
+# User-visible timeline history
+
+The user can specify a retention policy. The retention policy is
+presented to the user as a PITR period and snapshots. The PITR period
+is the amount of recent history that needs to be retained, as minutes,
+hours, or days. Within that period, you can create a branch or
+snapshot at any point in time, open a compute node, and start running
+queries. Internally, a PITR period is represented as a range of LSNs.
+
+The user can also create snapshots. A snapshot is a point in time,
+internally represented by an LSN. The user gives the snapshot a name.
+
+The user can also specify an interval, at which the system creates
+snapshots automatically. For example, create a snapshot every night at
+2 AM. After some user-specified time, old automatically created
+snapshots are removed.
+
+                     Snapshot       Snapshot
+         PITR        "Monday"       "Tuesday"        PITR
+    ----######----------+-------------+-------------######>
+
+If there are multiple branches, you can specify different policies for
+different branches.
+
+The PITR period and user-visible snapshots together define the
+retention policy.
+
+NOTE: As presented here, this is probably overly flexible. In reality,
+we want to keep the user interface simple. Only allow a PITR period at
+the tip of a branch, for example. But that doesn't make much
+difference to the internals.
+
+
+# Retention policy behind the scenes
+
+The retention policy consists of points (for snapshots) and ranges
+(for PITR periods).
+
+The system must be able to reconstruct any page within the retention
+policy. Other page versions can be garbage collected away. We have a
+lot of flexibility on when to perform the garbage collection and how
+aggressive it is.
+
+
+# Base images and WAL slices
+
+The page versions are stored in two kinds of files: base images and
+WAL slices. A base image contains a dump of all the pages of one
+relation at a specific LSN. A WAL slice contains all the WAL in an LSN
+range.
+
+
+    |
+    |
+    |
+    | --Base img @100   +
+    |                   |
+    |                   | WAL slice
+    |                   | 100-200
+    |                   |
+    | --Base img @200   +
+    |                   |
+    |                   | WAL slice
+    |                   | 200-300
+    |                   |
+    |                   +
+    |
+    V
+
+
+To recover a page e.g. at LSN 150, you need the base image at LSN 100,
+and the WAL slice 100-200.
+
+All of this works at a per-relation or per-relation-segment basis. If
+a relation is updated very frequently, we create base images and WAL
+slices for it more quickly. For a relation that's updated
+infrequently, we hold the recent WAL for that relation longer, and
+only write it out when we need to release the disk space occupied by
+the original WAL. (We need a backstop like that, because until all the
+WAL/base images have been durably copied to S3, we must keep the
+original WAL for that period somewhere, in the WAL service or in S3.)
+
+
+# Branching
+
+Internally, branch points are also "retention points", in addition to
+the user-visible snapshots. If a branch has been forked off at LSN
+100, we need to be able to reconstruct any page on the parent branch
+at that LSN, because it is needed by the child branch. If a page is
+modified in the child, we don't need to keep that in the parent
+anymore, though.
--- a/docs/rfcs/012-background-tasks.md
+++ b/docs/rfcs/012-background-tasks.md
@@ -0,0 +1,38 @@
+# Eviction
+
+ Write out in-memory layer to disk, into a delta layer.
+
+- To release memory
+- To make it possible to advance disk_consistent_lsn and allow the WAL
+  service to release some WAL.
+
+- Triggered if we are short on memory
+- Or if the oldest in-memory layer is so old that it's holding back
+  the WAL service from removing old WAL
+
+# Materialization
+
+Create a new image layer of a segment, by performing WAL redo
+
+- To reduce the amount of WAL that needs to be replayed on a GetPage request.
+- To allow garbage collection of old layers
+
+- Triggered by distance to last full image of a page
+
+# Coalescing
+
+Replace N consecutive layers of a segment with one larger layer.
+
+- To reduce the number of small files that needs to be uploaded to S3
+
+
+# Bundling
+
+Zip together multiple small files belonging to different segments.
+
+- To reduce the number of small files that needs to be uploaded to S3
+
+
+# Garbage collection
+
+Remove a layer that's older than the GC horizon, and isn't needed anymore.
--- a/docs/rfcs/013-term-history.md
+++ b/docs/rfcs/013-term-history.md
@@ -0,0 +1,147 @@
+# What
+
+Currently, apart from WAL safekeeper persistently stores only two logical clock
+counter (aka term) values, sourced from the same sequence. The first is bumped
+whenever safekeeper gives vote to proposer (or acknowledges already elected one)
+and e.g. prevents electing two proposers with the same term -- it is actually
+called `term` in the code. The second, called `epoch`, reflects progress of log
+receival and this might lag behind `term`; safekeeper switches to epoch `n` when
+it has received all committed log records from all `< n` terms. This roughly
+corresponds to what is proposed in
+
+https://github.com/zenithdb/rfcs/pull/3/files
+
+
+This makes our biggest difference from Raft. In Raft, every log record is
+stamped with term in which it was generated; while we essentially store in
+`epoch` only the term of the highest record on this safekeeper -- when we know
+it -- because during recovery generally we don't, and `epoch` is bumped directly
+to the term of the proposer who performs the recovery when it is finished. It is
+not immediately obvious that this simplification is safe. I thought and I still
+think it is; model checking confirmed that. However, some details now make me
+believe it is better to keep full term switching history (which is equivalent to
+knowing term of each record).
+
+# Why
+
+Without knowing full history (list of <term, LSN> pairs) of terms it is hard to
+determine the exact divergence point, and if we don't perform truncation at that
+point safety becomes questionable. Consider the following history, with
+safekeepers A, B, C, D, E. n_m means record created by proposer in term n with
+LSN m; (t=x, e=y) means safekeeper currently has term x and epoch y.
+
+1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only
+on A.
+
+<pre>
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=1, e=1) 1.1
+D(t=1, e=1) 1.1
+E(t=1, e=1) 1.1
+</pre>
+
+2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD:
+
+<pre>
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=2, e=2) 1.1 2.2 2.3
+D(t=2, e=2) 1.1 2.2 2.3
+E(t=2, e=1) 1.1
+</pre>
+
+
+3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D:
+
+<pre>
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+</pre>
+
+
+Now, A gets back and P3 starts recovering it. How should it proceed? There are
+two options.
+
+## Don't try to find divergence point at all
+
+...start sending WAL conservatively since the horizon (1.1), and truncate
+obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is
+reached, i.e. 2.3 transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes.
+
+Then the following is possible:
+
+4) P3 moves one record 2.2 to A.
+
+<pre>
+A(t=1, e=1) 1.1 <b>2.2</b> 1.3 1.4
+B(t=1, e=1) 1.1 1.2
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+</pre>
+
+Now log of A is basically corrupted. Moreover, since ABE are all in epoch 1 and
+A's log is the longest one, they can elect P4 who will commit such log.
+
+Note that this particular history couldn't happen if we forbid to *create* new
+records in term n until majority of safekeepers switch to it. It would force CDE
+to switch to 2 before 2.2 is created, and A could never become donor while his
+log is corrupted. Generally with this additional barrier I believe the algorithm
+becomes safe, but
+ - I don't like this kind of artificial barrier;
+ - I also feel somewhat uncomfortable about even temporarily having intentionally
+   corrupted WAL;
+ - I'd still model check the idea.
+
+## Find divergence point and truncate at it
+
+Then step 4 would delete 1.3 1.4 on A, and we are ok. The question is, how do we
+do that? Without term switching history we have to resort to sending again since
+the horizon and memcmp'ing records, which is inefficient and ugly. Or we can
+maintain full history and determine truncation point by comparing 'wrong' and
+'right' histories -- much like pg_rewind does -- and perform truncation + start
+streaming right there.
+
+# Proposal
+
+- Add term history as array of <term, LSN> pairs to safekeeper controlfile.
+- Return it to proposer with VoteResponse so 1) proposer can tell it to other
+  nodes and 2) determine personal streaming starting point. However, since we
+  don't append WAL and update controlfile atomically, let's first always update
+  controlfile but send only the history of what we really have (up to highest
+  term in history where begin_lsn >= end of wal; this highest term replaces
+  current `epoch`). We also send end of wal as we do now to determine the donor.
+- Create ProposerAnnouncement message which proposer sends before starting
+  streaming. It announces proposer as elected and
+  1) Truncates wrong part of WAL on safekeeper
+     (divergence point is already calculated at proposer, but can be
+     cross-verified here).
+  2) Communicates the 'right' history of its term (taken from donor). Seems
+     better to immediately put the history in the controlfile,
+     though safekeeper might not have full WAL for previous terms in it --
+     this way is simpler, and we can't update WAL and controlfile atomically anyway.
+
+     This also constitutes an analogue of current epoch bump for those safekeepers
+     which don't need recovery, which is important for sync-safekeepers (bump
+     epoch without waiting for records from new term).
+- After ProposerAnnouncement proposer streams WAL since calculated starting
+  point -- only what is missing.
+
+
+pros/cons:
++ (more) clear safety of WAL truncation -- we get very close to Raft
++ no unnecessary data sending (faster recovery for not-oldest-safekeepers, matters
+  only for 5+ nodes)
++ adds some observability at safekeepers
+
+- complexity, but not that much
+
+
+# Misc
+
+- During model checking I did truncation on first locally non existent or
+  different record -- analogue of 'memcmp' variant described above.
--- a/docs/rfcs/014-safekeepers-gossip.md
+++ b/docs/rfcs/014-safekeepers-gossip.md
@@ -0,0 +1,69 @@
+# Safekeeper gossip
+
+Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13)
+
+## Motivation
+
+In some situations, safekeeper (SK) needs coordination with other SK's that serve the same tenant:
+
+1. WAL deletion. SK needs to know what WAL was already safely replicated to delete it. Now we keep WAL indefinitely.
+2. Deciding on who is sending WAL to the pageserver. Now sending SK crash may lead to a livelock where nobody sends WAL to the pageserver.
+3. To enable SK to SK direct recovery without involving the compute
+
+## Summary
+
+Compute node has connection strings to each safekeeper. During each compute->safekeeper connection establishment, the compute node should pass down all those connection strings to each safekeeper. With that info, safekeepers may establish Postgres connections to each other and periodically send ping messages with LSN payload.
+
+## Components
+
+safekeeper, compute, compute<->safekeeper protocol, possibly console (group SK addresses)
+
+## Proposed implementation
+
+Each safekeeper can periodically ping all its peers and share connectivity and liveness info. If the ping was not received for, let's say, four ping periods, we may consider the sending safekeeper dead. That would mean some of the alive safekeepers should connect to the pageserver. One way to decide which one exactly: `make_connection = my_node_id == min(alive_nodes)`
+
+Since safekeepers are multi-tenant, we may establish either per-tenant physical connections or per-safekeeper ones. So it makes sense to group "logical" connections between corresponding tenants on different nodes into a single physical connection. That means that we should implement an interconnect thread that maintains physical connections and periodically broadcasts info about all tenants.
+
+Right now console may assign any 3 SK addresses to a given compute node. That may lead to a high number of gossip connections between SK's. Instead, we can assign safekeeper triples to the compute node. But if we want to "break"/"change" a group by an ad-hoc action, we can do it.
+
+### Corner cases
+
+- Current safekeeper may be alive but may not have connectivity to the pageserver
+
+  To address that, we need to gossip visibility info. Based on that info, we may define SK as alive only when it can connect to the pageserver.
+
+- Current safekeeper may be alive but may not have connectivity with the compute node.
+
+  We may broadcast last_received_lsn and presence of compute connection and decide who is alive based on that.
+
+- It is tricky to decide when to shut down gossip connections because we need to be sure that pageserver got all the committed (in the distributed sense, so local SK info is not enough) records, and it may never lose them. It is not a strict requirement since `--sync-safekeepers` that happen before the compute start will allow the pageserver to consume missing WAL, but it is better to do that in the background. So the condition may look like that: `majority_max(flush_lsn) == pageserver_s3_lsn` Here we rely on the two facts:
+    - that `--sync-safekeepers` happened after the compute shutdown, and it advanced local commit_lsn's allowing pageserver to consume that WAL.
+
+    - we wait for the `pageserver_s3_lsn` advancement to avoid pageserver's last_received_lsn/disk_consistent_lsn going backward due to the disk/hardware failure and subsequent S3 recovery
+
+    If those conditions are not met, we will have some gossip activity (but that may be okay).
+
+## Pros/cons
+
+Pros:
+
+- distributed, does not introduce new services (like etcd), does not add console as a storage dependency
+- lays the foundation for gossip-based recovery
+
+Cons:
+
+- Only compute knows the set of safekeepers, but they should communicate even without a compute node. In case of a safekeeper restart, we will lose that info and can't gossip anymore. Hence we can't trim some WAL tail until the compute node starts. Also, it is ugly.
+
+- If the console assigns a random set of safekeepers to each Postgres, we may end up in a situation where each safekeeper needs to have a connection with all other safekeepers. We can group safekeepers into isolated triples in the console to avoid that. Then "mixing" would happen only if we do rebalancing.
+
+## Alternative implementation
+
+We can have a selected node (e.g., console) with everybody reporting to it.
+
+## Security implications
+
+We don't increase the attack surface here. Communication can happen in a private network that is not exposed to users.
+
+## Scalability implications
+
+The only thing that may grow as we grow the number of computes is the number of gossip connections. But if we group safekeepers and assign a compute node to the random SK triple, the number of connections would be constant.
--- a/docs/rfcs/014-storage-lsm.md
+++ b/docs/rfcs/014-storage-lsm.md
@@ -0,0 +1,145 @@
+# Why LSM trees?
+
+In general, an LSM tree has the nice property that random updates are
+fast, but the disk writes are sequential. When a new file is created,
+it is immutable. New files are created and old ones are deleted, but
+existing files are never modified. That fits well with storing the
+files on S3.
+
+Currently, we create a lot of small files. That is mostly a problem
+with S3, because each GET/PUT operation is expensive, and LIST
+operation only returns 1000 objects at a time, and isn't free
+either. Currently, the files are "archived" together into larger
+checkpoint files before they're uploaded to S3 to alleviate that
+problem, but garbage collecting data from the archive files would be
+difficult and we have not implemented it. This proposal addresses that
+problem.
+
+
+# Overview
+
+
+```
+^ LSN
+|
+|      Memtable:     +-----------------------------+
+|                    |                             |
+|                    +-----------------------------+
+|
+|
+|            L0:     +-----------------------------+
+|                    |                             |
+|                    +-----------------------------+
+|
+|                    +-----------------------------+
+|                    |                             |
+|                    +-----------------------------+
+|
+|                    +-----------------------------+
+|                    |                             |
+|                    +-----------------------------+
+|
+|                    +-----------------------------+
+|                    |                             |
+|                    +-----------------------------+
+|
+|
+|           L1:      +-------+ +-----+ +--+  +-+
+|                    |       | |     | |  |  | |
+|                    |       | |     | |  |  | |
+|                    +-------+ +-----+ +--+  +-+
+|
+|                       +----+ +-----+ +--+  +----+
+|                       |    | |     | |  |  |    |
+|                       |    | |     | |  |  |    |
+|                       +----+ +-----+ +--+  +----+
+|
+--------------------------------------------------------------> Page ID
+
+
++---+
+|   |   Layer file
++---+
+```
+
+
+# Memtable
+
+When new WAL arrives, it is first put into the Memtable. Despite the
+name, the Memtable is not a purely in-memory data structure. It can
+spill to a temporary file on disk if the system is low on memory, and
+is accessed through a buffer cache.
+
+If the page server crashes, the Memtable is lost. It is rebuilt by
+processing again the WAL that's newer than the latest layer in L0.
+
+The size of the Memtable is configured by the "checkpoint distance"
+setting. Because anything that hasn't been flushed to disk and
+uploaded to S3 yet needs to be kept in the safekeeper, the "checkpoint
+distance" also determines the amount of WAL that needs to be kept in the
+safekeeper.
+
+# L0
+
+When the Memtable fills up, it is written out to a new file in L0. The
+files are immutable; when a file is created, it is never
+modified. Each file in L0 is roughly 1 GB in size (*). Like the
+Memtable, each file in L0 covers the whole key range.
+
+When enough files have been accumulated in L0, compaction
+starts. Compaction processes all the files in L0 and reshuffles the
+data to create a new set of files in L1.
+
+
+(*) except in corner cases like if we want to shut down the page
+server and want to flush out the memtable to disk even though it's not
+full yet.
+
+
+# L1
+
+L1 consists of ~ 1 GB files like L0. But each file covers only part of
+the overall key space, and a larger range of LSNs. This speeds up
+searches. When you're looking for a given page, you need to check all
+the files in L0, to see if they contain a page version for the requested
+page. But in L1, you only need to check the files whose key range covers
+the requested page. This is particularly important at cold start, when
+checking a file means downloading it from S3.
+
+Partitioning by key range also helps with garbage collection. If only a
+part of the database is updated, we will accumulate more files for
+the hot part in L1, and old files can be removed without affecting the
+cold part.
+
+
+# Image layers
+
+So far, we've only talked about delta layers. In addition to the delta
+layers, we create image layers, when "enough" WAL has been accumulated
+for some part of the database. Each image layer covers a 1 GB range of
+key space. It contains images of the pages at a single LSN, a snapshot
+if you will.
+
+The exact heuristic for what "enough" means is not clear yet. Maybe
+create a new image layer when 10 GB of WAL has been accumulated for a
+1 GB segment.
+
+The image layers limit the number of layers that a search needs to
+check. That puts a cap on read latency, and it also allows garbage
+collecting layers that are older than the GC horizon.
+
+
+# Partitioning scheme
+
+When compaction happens and creates a new set of files in L1, how do
+we partition the data into the files?
+
+- Goal is that each file is ~ 1 GB in size
+- Try to match partition boundaries at relation boundaries. (See [1]
+  for how PebblesDB does this, and for why that's important)
+- Greedy algorithm
+
+# Additional Reading
+
+[1] Paper on PebblesDB and how it does partitioning.
+https://www.cs.utexas.edu/~rak/papers/sosp17-pebblesdb.pdf
--- a/docs/rfcs/015-storage-messaging.md
+++ b/docs/rfcs/015-storage-messaging.md
@@ -0,0 +1,295 @@
+# Storage messaging
+
+Created on 19.01.22
+
+Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich.
+
+This is an alternative to 014-safekeeper-gossip.
+
+## Motivation
+
+As in 014-safekeeper-gossip we need to solve the following problems:
+
+* Trim WAL on safekeepers
+* Decide on which SK should push WAL to the S3
+* Decide on which SK should forward WAL to the pageserver
+* Decide on when to shut down SK<->pageserver connection
+
+This RFC suggests a more generic and hopefully more manageable way to address those problems. However, unlike 014-safekeeper-gossip, it does not bring us any closer to safekeeper-to-safekeeper recovery but rather unties two sets of different issues we previously wanted to solve with gossip.
+
+Also, with this approach, we would not need "call me maybe" anymore, and the pageserver will have all the data required to understand that it needs to reconnect to another safekeeper.
+
+## Summary
+
+Instead of p2p gossip, let's have a centralized broker where all the storage nodes report per-timeline state. Each storage node should have a `--broker-url=1.2.3.4` CLI param.
+
+Here I propose two ways to do that. After a lot of arguing with myself, I'm leaning towards the etcd approach. My arguments for it are in the pros/cons section. Both options require adding a Grpc client in our codebase either directly or as an etcd dependency.
+
+## Non-goals
+
+This RFC does *not* suggest moving the compute-to-pageserver and compute-to-safekeeper mappings out of the console. The console is still the only place in the cluster responsible for the persistency of that info. So I'm implying that each pageserver and safekeeper knows exactly which timelines it serves, as is currently the case. We need some mechanism for a new pageserver to discover mapping info, but that is out of the scope of this RFC.
+
+## Impacted components
+
+pageserver, safekeeper
+adds either etcd or console as a storage dependency
+
+## Possible implementation: custom message broker in the console
+
+We've decided to go with an etcd approach instead of the message broker.
+
+<details closed>
+<summary>Original suggestion</summary>
+<br>
+We can add a Grpc service in the console that acts as a message broker since the console knows the addresses of all the components. The broker can ignore the payload and only redirect messages. So, for example, each safekeeper may send a message to the peering safekeepers or to the pageserver responsible for a given timeline.
+
+Message format could be `{sender, destination, payload}`.
+
+The destination is either:
+1. `sk_#{tenant}_#{timeline}` -- to be broadcasted on all safekeepers, responsible for that timeline, or
+2. `pserver_#{tenant}_#{timeline}` -- to be broadcasted on all pageservers, responsible for that timeline
+
+Sender is either:
+1. `sk_#{sk_id}`, or
+2. `pserver_#{pserver_id}`
+
+I can think of the following behavior to address our original problems:
+
+* WAL trimming
+  Each safekeeper periodically broadcasts `(write_lsn, commit_lsn)` to all peering (peering == responsible for that timeline) safekeepers
+
+* Decide on which SK should push WAL to the S3
+
+  Each safekeeper periodically broadcasts `i_am_alive_#{current_timestamp}` message to all peering safekeepers. That way, safekeepers may maintain the vector of alive peers (loose one, with false negatives). Alive safekeeper with the minimal id pushes data to S3.
+
+* Decide on which SK should forward WAL to the pageserver
+
+  Each safekeeper periodically sends (write_lsn, commit_lsn, compute_connected) to the relevant pageservers. With that info, pageserver can maintain a view of the safekeepers' state, connect to a random one, and detect the moments (e.g., one of the safekeepers is not making progress or is down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`.
+
+  Pageserver connection to the safekeeper is triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore.
+
+  Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other's addresses until the next compute connection).
+
+* Decide on when to shutdown sk<->pageserver connection
+
+  Again, pageserver would have all the info to understand when to shut down the safekeeper connection.
+
+### Scalability
+
+One node is enough (c) No, seriously, it is enough.
+
+### High Availability
+
+Broker lives in the console, so we can rely on k8s maintaining the console app alive.
+
+If the console is down, we won't trim WAL and reconnect the pageserver to another safekeeper. But, at the same time, if the console is down, we already can't accept new compute connections and start stopped computes, so we are making things a bit worse, but not dramatically.
+
+### Interactions
+
+```
+         .________________.
+sk_1 <-> |                | <-> pserver_1
+...      | Console broker |     ...
+sk_n <-> |________________| <-> pserver_m
+```
+</details>
+
+
+## Implementation: etcd state store
+
+Alternatively, we can set up `etcd` and maintain the following data structure in it:
+
+```ruby
+"compute_#{tenant}_#{timeline}" => {
+    safekeepers => {
+        "sk_#{sk_id}" => {
+            write_lsn: "0/AEDF130",
+            commit_lsn: "0/AEDF100",
+            compute_connected: true,
+            last_updated: 1642621138,
+        },
+    }
+}
+```
+
+As etcd doesn't support field updates in nested objects, that translates to the following set of keys:
+
+```ruby
+"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/write_lsn",
+"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/commit_lsn",
+...
+```
+
+Each storage node can subscribe to the relevant sets of keys and maintain a local view of that structure. So in terms of the data flow, everything is the same as in the previous approach. Still, we can avoid implementing the message broker and prevent runtime storage dependency on a console.
+
+### Safekeeper address discovery
+
+During startup, the safekeeper should publish the address it is listening on as part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have an `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertise something more useful.
+
+### Safekeeper behavior
+
+For each timeline, the safekeeper periodically broadcasts `compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/*` fields. It subscribes to changes of `compute_#{tenant}_#{timeline}` -- that way the safekeeper will have information about peering safekeepers.
+That amount of information is enough to properly trim WAL. To decide who pushes the data to S3, the safekeeper may use etcd leases or broadcast a timestamp and hence track who is alive.
+
+### Pageserver behavior
+
+Pageserver subscribes to `compute_#{tenant}_#{timeline}` for each tenant it owns. With that info, pageserver can maintain a view of the safekeepers' state, connect to a random one, and detect the moments (e.g., one of the safekeepers is not making progress or is down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`.
+
+Pageserver connection to the safekeeper can be triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore.
+
+As an alternative to compute_connected, we can track the timestamp of the latest message that arrived at the safekeeper from compute. Usually compute broadcasts KeepAlive to all safekeepers every second, so it'll be updated every second when the connection is ok. Then the connection can be considered down when this timestamp isn't updated for several seconds.
+
+This will help to detect issues with the safekeeper faster (and switch to another one) in the following cases:
+
+1. when compute failed but the TCP connection stays alive until timeout (usually about a minute)
+2. when the safekeeper failed and didn't set compute_connected to false
+
+Another way to deal with [2] is to process (write_lsn, commit_lsn, compute_connected) as a KeepAlive on the pageserver side and detect issues when sk_id doesn't send anything for some time. This way is fully compliant with this RFC.
+
+Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other's addresses until the next compute connection).
+
+### Interactions
+
+```
+         .________________.
+sk_1 <-> |                | <-> pserver_1
+...      |      etcd      |     ...
+sk_n <-> |________________| <-> pserver_m
+```
+
+### Sequence diagrams for different workflows
+
+#### Cluster startup
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant C as Compute
+    participant SK1
+    participant SK2
+    participant SK3
+    participant PS1
+    participant PS2
+    participant O as Orchestrator
+    participant M as Metadata Service
+
+    PS1->>M: subscribe to updates to state of timeline N
+    C->>+SK1: WAL push
+    loop constantly update current lsns
+        SK1->>-M: I'm at lsn A
+    end
+    C->>+SK2: WAL push
+    loop constantly update current lsns
+        SK2->>-M: I'm at lsn B
+    end
+    C->>+SK3: WAL push
+    loop constantly update current lsns
+        SK3->>-M: I'm at lsn C
+    end
+    loop request pages
+        C->>+PS1: get_page@lsn
+        PS1->>-C: page image
+    end
+    M->>PS1: New compute appeared for timeline N. SK1 at A, SK2 at B, SK3 at C
+    note over PS1: Say SK1 at A=200, SK2 at B=150 SK3 at C=100 <br> so connect to SK1 because it is the most up to date one
+    PS1->>SK1: start replication
+```
+
+#### Behaviour of services during typical operations
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant C as Compute
+    participant SK1
+    participant SK2
+    participant SK3
+    participant PS1
+    participant PS2
+    participant O as Orchestrator
+    participant M as Metadata Service
+
+    note over C,M: Scenario 1: Pageserver checkpoint
+    note over PS1: Upload data to S3
+    PS1->>M: Update remote consistent lsn
+    M->>SK1: propagate remote consistent lsn update
+    note over SK1: truncate WAL up to remote consistent lsn
+    M->>SK2: propagate remote consistent lsn update
+    note over SK2: truncate WAL up to remote consistent lsn
+    M->>SK3: propagate remote consistent lsn update
+    note over SK3: truncate WAL up to remote consistent lsn
+    note over C,M: Scenario 2: SK1 finds itself lagging behind MAX(150 (SK3), 200 (SK2)) - 100 (SK1) > THRESHOLD
+    SK1->>SK2: Fetch WAL delta between 100 (SK1) and 200 (SK2)
+    note over C,M: Scenario 3: PS1 detects that SK1 is lagging behind: Connection from SK1 is broken or there have been no messages from it in 30 seconds.
+    note over PS1: e.g. SK2 is at 150, SK3 is at 100, choose SK2 as a new replication source
+    PS1->>SK2: start replication
+```
+
+#### Behaviour during timeline relocation
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant C as Compute
+    participant SK1
+    participant SK2
+    participant SK3
+    participant PS1
+    participant PS2
+    participant O as Orchestrator
+    participant M as Metadata Service
+
+    note over C,M: Timeline is being relocated from PS1 to PS2
+    O->>+PS2: Attach timeline
+    PS2->>-O: 202 Accepted if timeline exists in S3
+    note over PS2: Download timeline from S3
+     note over O: Poll for timeline download (or subscribe to metadata service)
+    loop wait for attach to complete
+        O->>PS2: timeline detail should answer that timeline is ready
+    end
+    PS2->>M: Register downloaded timeline
+    PS2->>M: Get safekeepers for timeline, subscribe to changes
+    PS2->>SK1: Start replication to catch up
+    note over O: PS2 caught up, time to switch compute
+    O->>C: Restart compute with new pageserver url in config
+    note over C: Wal push is restarted
+    loop request pages
+        C->>+PS2: get_page@lsn
+        PS2->>-C: page image
+    end
+    O->>PS1: detach timeline
+    note over C,M: Scenario 1: Attach call failed
+    O--xPS2: Attach timeline
+    note over O: The operation can be safely retried, <br> if we hit some threshold we can try another pageserver
+    note over C,M: Scenario 2: Attach succeeded but pageserver failed to download the data or start replication
+    loop wait for attach to complete
+        O--xPS2: timeline detail should answer that timeline is ready
+    end
+    note over O: Can wait for a timeout, and then try another pageserver <br> there should be a limit on number of different pageservers to try
+    note over C,M: Scenario 3: Detach fails
+    O--xPS1: Detach timeline
+    note over O: can be retried, if continues to fail might lead to data duplication in s3
+```
+
+# Pros/cons
+
+## Console broker/etcd vs gossip:
+
+Gossip pros:
+* gossip allows running storage without the console or etcd
+
+Console broker/etcd pros:
+* simpler
+* solves "call me maybe" as well
+* avoid possible N-to-N connection issues with gossip without grouping safekeepers in pre-defined triples
+
+## Console broker vs. etcd:
+
+Initially, I wanted to avoid etcd as a dependency mostly because I've seen how painful the ZooKeeper dependency was for ClickHouse: in each chat, at each conference, people were complaining about configuration and maintenance barriers with ZooKeeper. It was so bad that ClickHouse re-implemented ZooKeeper to embed it: https://clickhouse.com/docs/en/operations/clickhouse-keeper/.
+
+But with etcd we are in a somewhat different situation:
+
+1. We don't need persistency and strong consistency guarantees for the data we store in the etcd
+2. etcd uses Grpc as a protocol, and messages are pretty simple
+
+So it looks like implementing an in-memory store with an etcd interface is a straightforward thing _if we want that in the future_. At the same time, we can avoid implementing it right now, and we will be able to run a local zenith installation with etcd running somewhere in the background (as opposed to building and running the console, which in turn requires Postgres).
--- a/Show More
+++ b/Show More