Prevent commit_lsn <= flush_lsn violation after a42eba3cd7.

Nothing complained about that yet, but we definitely don't hold at least one assert, so let's keep it this way until better version.
s3 WAL offloading staging review.
2026-05-30 19:40:39 +00:00 · 2022-05-27 14:11:56 +04:00 · 2022-05-27 13:17:10 +04:00 · 2022-05-27 06:19:23 +04:00 · 2022-05-26 14:59:03 -04:00 · 2022-05-26 12:21:15 -04:00
183 changed files with 9783 additions and 4823 deletions
--- a/.circleci/ansible/get_binaries.sh
+++ b/.circleci/ansible/get_binaries.sh
@@ -7,7 +7,7 @@ RELEASE=${RELEASE:-false}
 # look at docker hub for latest tag for neon docker image
 if [ "${RELEASE}" = "true" ]; then
    echo "search latest relase tag"
-    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
    if [ -z "${VERSION}" ]; then
        echo "no any docker tags found, exiting..."
        exit 1
@@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then
    fi
 else
    echo "search latest dev tag"
-    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -v release | tail -1)
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
    if [ -z "${VERSION}" ]; then
        echo "no any docker tags found, exiting..."
        exit 1
--- a/.circleci/ansible/neon-stress.hosts
+++ b/.circleci/ansible/neon-stress.hosts
@@ -0,0 +1,19 @@
+[pageservers]
+neon-stress-ps-1 console_region_id=1
+neon-stress-ps-2 console_region_id=1
+
+[safekeepers]
+neon-stress-sk-1 console_region_id=1
+neon-stress-sk-2 console_region_id=1
+neon-stress-sk-3 console_region_id=1
+
+[storage:children]
+pageservers
+safekeepers
+
+[storage:vars]
+console_mgmt_base_url = http://neon-stress-console.local
+bucket_name           = neon-storage-ireland
+bucket_region         = eu-west-1
+etcd_endpoints        = etcd-stress.local:2379
+safekeeper_enable_s3_offload = false
--- a/.circleci/ansible/production.hosts
+++ b/.circleci/ansible/production.hosts
@@ -1,5 +1,6 @@
 [pageservers]
-zenith-1-ps-1 console_region_id=1
+#zenith-1-ps-1 console_region_id=1
+zenith-1-ps-2 console_region_id=1

 [safekeepers]
 zenith-1-sk-1 console_region_id=1
--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
@@ -4,8 +4,9 @@ zenith-us-stage-ps-2 console_region_id=27

 [safekeepers]
 zenith-us-stage-sk-1 console_region_id=27
-zenith-us-stage-sk-2 console_region_id=27
 zenith-us-stage-sk-4 console_region_id=27
+zenith-us-stage-sk-5 console_region_id=27
+zenith-us-stage-sk-6 console_region_id=27

 [storage:children]
 pageservers
--- a/.circleci/ansible/systemd/pageserver.service
+++ b/.circleci/ansible/systemd/pageserver.service
@@ -6,7 +6,7 @@ After=network.target auditd.service
 Type=simple
 User=pageserver
 Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
-ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data
+ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
--- a/.circleci/ansible/systemd/safekeeper.service
+++ b/.circleci/ansible/systemd/safekeeper.service
@@ -6,7 +6,7 @@ After=network.target auditd.service
 Type=simple
 User=safekeeper
 Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }}
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}'
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -11,15 +11,6 @@ executors:
      - image: zimg/rust:1.58

 jobs:
-  check-codestyle-rust:
-    executor: neon-xlarge-executor
-    steps:
-      - checkout
-      - run:
-          name: rustfmt
-          when: always
-          command: cargo fmt --all -- --check
-
  # A job to build postgres
  build-postgres:
    executor: neon-xlarge-executor
@@ -222,6 +213,12 @@ jobs:
          key: v2-python-deps-{{ checksum "poetry.lock" }}
          paths:
            - /home/circleci/.cache/pypoetry/virtualenvs
+      - run:
+          name: Print versions
+          when: always
+          command: |
+              poetry run python --version
+              poetry show
      - run:
          name: Run yapf to ensure code format
          when: always
@@ -355,7 +352,7 @@ jobs:
          when: always
          command: |
            du -sh /tmp/test_output/*
-            find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" -delete
+            find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
            du -sh /tmp/test_output/*
      - store_artifacts:
          path: /tmp/test_output
@@ -584,9 +581,59 @@ jobs:
          name: Re-deploy proxy
          command: |
            DOCKER_TAG=$(git log --oneline|wc -l)
-            helm upgrade zenith-proxy     neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait

+  deploy-neon-stress:
+    docker:
+      - image: cimg/python:3.10
+    steps:
+      - checkout
+      - setup_remote_docker
+      - run:
+          name: Setup ansible
+          command: |
+            pip install --progress-bar off --user ansible boto3
+      - run:
+          name: Redeploy
+          command: |
+            cd "$(pwd)/.circleci/ansible"
+
+            ./get_binaries.sh
+
+            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
+            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+            chmod 0600 ssh-key
+            ssh-add ssh-key
+            rm -f ssh-key ssh-key-cert.pub
+
+            ansible-playbook deploy.yaml -i neon-stress.hosts
+            rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-neon-stress-proxy:
+    docker:
+      - image: cimg/base:2021.04
+    environment:
+      KUBECONFIG: .kubeconfig
+    steps:
+      - checkout
+      - run:
+          name: Store kubeconfig file
+          command: |
+            echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
+            chmod 0600 ${KUBECONFIG}
+      - run:
+          name: Setup helm v3
+          command: |
+            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+            helm repo add neondatabase https://neondatabase.github.io/helm-charts
+      - run:
+          name: Re-deploy proxy
+          command: |
+            DOCKER_TAG=$(git log --oneline|wc -l)
+            helm upgrade neon-stress-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+            helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
+
  deploy-release:
    docker:
      - image: cimg/python:3.10
@@ -629,12 +676,13 @@ jobs:
          name: Setup helm v3
          command: |
            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add zenithdb https://neondatabase.github.io/helm-charts
+            helm repo add neondatabase https://neondatabase.github.io/helm-charts
      - run:
          name: Re-deploy proxy
          command: |
            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait

  # Trigger a new remote CI job
  remote-ci-trigger:
@@ -683,7 +731,6 @@ jobs:
 workflows:
  build_and_test:
    jobs:
-      - check-codestyle-rust
      - check-codestyle-python
      - build-postgres:
          name: build-postgres-<< matrix.build_type >>
@@ -771,6 +818,25 @@ workflows:
          requires:
            - docker-image

+      - deploy-neon-stress:
+          # Context gives an ability to login
+          context: Docker Hub
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+      - deploy-neon-stress-proxy:
+          # deploy only for commits to main
+          filters:
+            branches:
+              only:
+                - main
+          requires:
+            - docker-image
+
      - docker-image-release:
          # Context gives an ability to login
          context: Docker Hub
--- a/.circleci/helm-values/neon-stress.proxy-scram.yaml
+++ b/.circleci/helm-values/neon-stress.proxy-scram.yaml
@@ -0,0 +1,26 @@
+fullnameOverride: "neon-stress-proxy-scram"
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://neon-stress-console.local/management/api/v2"
+  domain: "*.stress.neon.tech"
+
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: staging
+  zenith_region: eu-west-1
+  zenith_region_slug: ireland
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/neon-stress.proxy.yaml
+++ b/.circleci/helm-values/neon-stress.proxy.yaml
@@ -0,0 +1,34 @@
+fullnameOverride: "neon-stress-proxy"
+
+settings:
+  authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
+  uri: "https://console.dev.neon.tech/psql_session/"
+
+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy
+  zenith_env: staging
+  zenith_region: eu-west-1
+  zenith_region_slug: ireland
+
+service:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
+    external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
+  type: LoadBalancer
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/production.proxy-scram.yaml
+++ b/.circleci/helm-values/production.proxy-scram.yaml
@@ -0,0 +1,24 @@
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-release.local/management/api/v2"
+  domain: "*.cloud.neon.tech"
+
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: production
+  zenith_region: us-west-2
+  zenith_region_slug: oregon
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/production.proxy.yaml
+++ b/.circleci/helm-values/production.proxy.yaml
@@ -1,9 +1,3 @@
-# Helm chart values for zenith-proxy.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
 settings:
  authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
  uri: "https://console.neon.tech/psql_session/"
@@ -28,7 +22,7 @@ exposedService:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: start.zenith.tech,connect.neon.tech,pg.neon.tech
+    external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech

 metrics:
  enabled: true
--- a/.circleci/helm-values/staging.proxy-scram.yaml
+++ b/.circleci/helm-values/staging.proxy-scram.yaml
@@ -6,7 +6,8 @@ image:

 settings:
  authBackend: "console"
-  authEndpoint: "http://console-staging.local:9095/management/api/v2"
+  authEndpoint: "http://console-staging.local/management/api/v2"
+  domain: "*.cloud.stage.neon.tech"

 # -- Additional labels for zenith-proxy pods
 podLabels:
@@ -20,7 +21,7 @@ exposedService:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: *.cloud.stage.neon.tech
+    external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech

 metrics:
  enabled: true
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -1,6 +1,10 @@
 name: Build and Test

-on: push
+on:
+  push:
+    branches:
+    - main
+  pull_request:

 jobs:
  regression-check:
@@ -21,13 +25,17 @@ jobs:
          submodules: true
          fetch-depth: 2

-      - name: install rust toolchain ${{ matrix.rust_toolchain }}
+      - name: Install rust toolchain ${{ matrix.rust_toolchain }}
        uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: ${{ matrix.rust_toolchain }}
+          components: rustfmt, clippy
          override: true

+      - name: Check formatting
+        run: cargo fmt --all -- --check
+
      - name: Install Ubuntu postgres dependencies
        if: matrix.os == 'ubuntu-latest'
        run: |
--- a/20
+++ b/20
@@ -1,20 +0,0 @@
-This software is licensed under the Apache 2.0 License:
-
----------------------------------------------------------------------------
-Copyright 2021 Zenith Labs, Inc
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
----------------------------------------------------------------------------
-
-The PostgreSQL submodule in vendor/postgres is licensed under the
-PostgreSQL license. See vendor/postgres/COPYRIGHT.
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -113,6 +113,49 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"

+[[package]]
+name = "axum"
+version = "0.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4af7447fc1214c1f3a1ace861d0216a6c8bb13965b64bbad9650f375b67689a"
+dependencies = [
+ "async-trait",
+ "axum-core",
+ "bitflags",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "itoa 1.0.1",
+ "matchit",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "serde",
+ "sync_wrapper",
+ "tokio",
+ "tower",
+ "tower-http",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "axum-core"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3bdc19781b16e32f8a7200368a336fa4509d4b72ef15dd4e41df5290855ee1e6"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "mime",
+]
+
 [[package]]
 name = "backtrace"
 version = "0.3.64"
@@ -123,7 +166,7 @@ dependencies = [
 "cc",
 "cfg-if",
 "libc",
- "miniz_oxide",
+ "miniz_oxide 0.4.4",
 "object",
 "rustc-demangle",
 ]
@@ -320,6 +363,15 @@ dependencies = [
 "textwrap 0.14.2",
 ]

+[[package]]
+name = "cmake"
+version = "0.1.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "combine"
 version = "4.6.3"
@@ -330,6 +382,18 @@ dependencies = [
 "memchr",
 ]

+[[package]]
+name = "comfy-table"
+version = "5.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e"
+dependencies = [
+ "crossterm",
+ "strum",
+ "strum_macros",
+ "unicode-width",
+]
+
 [[package]]
 name = "compute_tools"
 version = "0.1.0"
@@ -526,6 +590,31 @@ dependencies = [
 "lazy_static",
 ]

+[[package]]
+name = "crossterm"
+version = "0.23.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2102ea4f781910f8a5b98dd061f4c2023f479ce7bb1236330099ceb5a93cf17"
+dependencies = [
+ "bitflags",
+ "crossterm_winapi",
+ "libc",
+ "mio",
+ "parking_lot 0.12.0",
+ "signal-hook",
+ "signal-hook-mio",
+ "winapi",
+]
+
+[[package]]
+name = "crossterm_winapi"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
+dependencies = [
+ "winapi",
+]
+
 [[package]]
 name = "crypto-common"
 version = "0.1.3"
@@ -693,9 +782,9 @@ dependencies = [

 [[package]]
 name = "etcd-client"
-version = "0.8.4"
+version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "585de5039d1ecce74773db49ba4e8107e42be7c2cd0b1a9e7fce27181db7b118"
+checksum = "c434d2800b273a506b82397aad2f20971636f65e47b27c027f77d498530c5954"
 dependencies = [
 "http",
 "prost",
@@ -703,9 +792,26 @@ dependencies = [
 "tokio-stream",
 "tonic",
 "tonic-build",
+ "tower",
 "tower-service",
 ]

+[[package]]
+name = "etcd_broker"
+version = "0.1.0"
+dependencies = [
+ "etcd-client",
+ "regex",
+ "serde",
+ "serde_json",
+ "serde_with",
+ "thiserror",
+ "tokio",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "fail"
 version = "0.5.0"
@@ -762,6 +868,18 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e"

+[[package]]
+name = "flate2"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39522e96686d38f4bc984b9198e3a0613264abaebaff2c5c918bfa6b6da09af"
+dependencies = [
+ "cfg-if",
+ "crc32fast",
+ "libc",
+ "miniz_oxide 0.5.1",
+]
+
 [[package]]
 name = "fnv"
 version = "1.0.7"
@@ -990,6 +1108,12 @@ dependencies = [
 "unicode-segmentation",
 ]

+[[package]]
+name = "heck"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
+
 [[package]]
 name = "hermit-abi"
 version = "0.1.19"
@@ -1055,6 +1179,12 @@ dependencies = [
 "pin-project-lite",
 ]

+[[package]]
+name = "http-range-header"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29"
+
 [[package]]
 name = "httparse"
 version = "1.6.0"
@@ -1320,6 +1450,12 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f"

+[[package]]
+name = "matchit"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb"
+
 [[package]]
 name = "md-5"
 version = "0.9.1"
@@ -1403,6 +1539,15 @@ dependencies = [
 "autocfg",
 ]

+[[package]]
+name = "miniz_oxide"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2b29bd4bc3f33391105ebee3589c19197c4271e3e5a9ec9bfe8127eeff8f082"
+dependencies = [
+ "adler",
+]
+
 [[package]]
 name = "mio"
 version = "0.8.2"
@@ -1450,6 +1595,24 @@ dependencies = [
 "tempfile",
 ]

+[[package]]
+name = "neon_local"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap 3.0.14",
+ "comfy-table",
+ "control_plane",
+ "git-version",
+ "pageserver",
+ "postgres",
+ "postgres_ffi",
+ "safekeeper",
+ "serde_json",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "nix"
 version = "0.23.1"
@@ -1559,9 +1722,9 @@ dependencies = [

 [[package]]
 name = "once_cell"
-version = "1.9.0"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
+checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9"

 [[package]]
 name = "oorandom"
@@ -1622,7 +1785,6 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-trait",
 "byteorder",
 "bytes",
 "chrono",
@@ -1631,8 +1793,10 @@ dependencies = [
 "crc32c",
 "crossbeam-utils",
 "daemonize",
+ "etcd_broker",
 "fail",
 "futures",
+ "git-version",
 "hex",
 "hex-literal",
 "humantime",
@@ -1650,8 +1814,7 @@ dependencies = [
 "pprof",
 "rand",
 "regex",
- "rusoto_core",
- "rusoto_s3",
+ "remote_storage",
 "scopeguard",
 "serde",
 "serde_json",
@@ -1663,7 +1826,6 @@ dependencies = [
 "tokio",
 "tokio-postgres",
 "tokio-stream",
- "tokio-util 0.7.0",
 "toml_edit",
 "tracing",
 "url",
@@ -1885,15 +2047,18 @@ dependencies = [
 "bytes",
 "chrono",
 "crc32c",
+ "env_logger",
 "hex",
 "lazy_static",
 "log",
 "memoffset",
+ "postgres",
 "rand",
 "regex",
 "serde",
 "thiserror",
 "utils",
+ "wal_generate",
 "workspace_hack",
 ]

@@ -1922,6 +2087,16 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"

+[[package]]
+name = "prettyplease"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9e07e3a46d0771a8a06b5f4441527802830b43e679ba12f44960f48dd4c6803"
+dependencies = [
+ "proc-macro2",
+ "syn",
+]
+
 [[package]]
 name = "proc-macro-hack"
 version = "0.5.19"
@@ -1937,6 +2112,20 @@ dependencies = [
 "unicode-xid",
 ]

+[[package]]
+name = "procfs"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95e344cafeaeefe487300c361654bcfc85db3ac53619eeccced29f5ea18c4c70"
+dependencies = [
+ "bitflags",
+ "byteorder",
+ "flate2",
+ "hex",
+ "lazy_static",
+ "libc",
+]
+
 [[package]]
 name = "prometheus"
 version = "0.13.0"
@@ -1946,16 +2135,18 @@ dependencies = [
 "cfg-if",
 "fnv",
 "lazy_static",
+ "libc",
 "memchr",
 "parking_lot 0.11.2",
+ "procfs",
 "thiserror",
 ]

 [[package]]
 name = "prost"
-version = "0.9.0"
+version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001"
+checksum = "bc03e116981ff7d8da8e5c220e374587b98d294af7ba7dd7fda761158f00086f"
 dependencies = [
 "bytes",
 "prost-derive",
@@ -1963,12 +2154,14 @@ dependencies = [

 [[package]]
 name = "prost-build"
-version = "0.9.0"
+version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5"
+checksum = "65a1118354442de7feb8a2a76f3d80ef01426bd45542c8c1fdffca41a758f846"
 dependencies = [
 "bytes",
- "heck",
+ "cfg-if",
+ "cmake",
+ "heck 0.4.0",
 "itertools",
 "lazy_static",
 "log",
@@ -1983,9 +2176,9 @@ dependencies = [

 [[package]]
 name = "prost-derive"
-version = "0.9.0"
+version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe"
+checksum = "7b670f45da57fb8542ebdbb6105a925fe571b67f9e7ed9f47a06a84e72b4e7cc"
 dependencies = [
 "anyhow",
 "itertools",
@@ -1996,9 +2189,9 @@ dependencies = [

 [[package]]
 name = "prost-types"
-version = "0.9.0"
+version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a"
+checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68"
 dependencies = [
 "bytes",
 "prost",
@@ -2014,6 +2207,7 @@ dependencies = [
 "bytes",
 "clap 3.0.14",
 "futures",
+ "git-version",
 "hashbrown",
 "hex",
 "hmac 0.12.1",
@@ -2170,9 +2364,9 @@ dependencies = [

 [[package]]
 name = "regex"
-version = "1.5.4"
+version = "1.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
+checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
 dependencies = [
 "aho-corasick",
 "memchr",
@@ -2194,6 +2388,26 @@ version = "0.6.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"

+[[package]]
+name = "remote_storage"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "metrics",
+ "once_cell",
+ "rusoto_core",
+ "rusoto_s3",
+ "serde",
+ "serde_json",
+ "tempfile",
+ "tokio",
+ "tokio-util 0.7.0",
+ "toml_edit",
+ "tracing",
+ "workspace_hack",
+]
+
 [[package]]
 name = "remove_dir_all"
 version = "0.5.3"
@@ -2293,9 +2507,9 @@ dependencies = [

 [[package]]
 name = "rusoto_core"
-version = "0.47.0"
+version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b4f000e8934c1b4f70adde180056812e7ea6b1a247952db8ee98c94cd3116cc"
+checksum = "1db30db44ea73551326269adcf7a2169428a054f14faf9e1768f2163494f2fa2"
 dependencies = [
 "async-trait",
 "base64",
@@ -2318,9 +2532,9 @@ dependencies = [

 [[package]]
 name = "rusoto_credential"
-version = "0.47.0"
+version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a46b67db7bb66f5541e44db22b0a02fed59c9603e146db3a9e633272d3bac2f"
+checksum = "ee0a6c13db5aad6047b6a44ef023dbbc21a056b6dab5be3b79ce4283d5c02d05"
 dependencies = [
 "async-trait",
 "chrono",
@@ -2336,9 +2550,9 @@ dependencies = [

 [[package]]
 name = "rusoto_s3"
-version = "0.47.0"
+version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "048c2fe811a823ad5a9acc976e8bf4f1d910df719dcf44b15c3e96c5b7a51027"
+checksum = "7aae4677183411f6b0b412d66194ef5403293917d66e70ab118f07cc24c5b14d"
 dependencies = [
 "async-trait",
 "bytes",
@@ -2349,9 +2563,9 @@ dependencies = [

 [[package]]
 name = "rusoto_signature"
-version = "0.47.0"
+version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6264e93384b90a747758bcc82079711eacf2e755c3a8b5091687b5349d870bcc"
+checksum = "a5ae95491c8b4847931e291b151127eccd6ff8ca13f33603eb3d0035ecb05272"
 dependencies = [
 "base64",
 "bytes",
@@ -2441,25 +2655,28 @@ name = "safekeeper"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
 "byteorder",
 "bytes",
 "clap 3.0.14",
 "const_format",
 "crc32c",
 "daemonize",
- "etcd-client",
+ "etcd_broker",
 "fs2",
+ "futures",
+ "git-version",
 "hex",
 "humantime",
 "hyper",
 "lazy_static",
 "metrics",
+ "once_cell",
 "postgres",
 "postgres-protocol",
 "postgres_ffi",
 "regex",
- "rusoto_core",
- "rusoto_s3",
+ "remote_storage",
 "serde",
 "serde_json",
 "serde_with",
@@ -2468,6 +2685,7 @@ dependencies = [
 "tokio",
 "tokio-postgres",
 "tokio-util 0.7.0",
+ "toml_edit",
 "tracing",
 "url",
 "utils",
@@ -2664,6 +2882,17 @@ dependencies = [
 "signal-hook-registry",
 ]

+[[package]]
+name = "signal-hook-mio"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
+dependencies = [
+ "libc",
+ "mio",
+ "signal-hook",
+]
+
 [[package]]
 name = "signal-hook-registry"
 version = "1.4.0"
@@ -2753,6 +2982,25 @@ version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"

+[[package]]
+name = "strum"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb"
+
+[[package]]
+name = "strum_macros"
+version = "0.23.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
+dependencies = [
+ "heck 0.3.3",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn",
+]
+
 [[package]]
 name = "subtle"
 version = "2.4.1"
@@ -2784,15 +3032,21 @@ dependencies = [

 [[package]]
 name = "syn"
-version = "1.0.86"
+version = "1.0.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b"
+checksum = "7ff7c592601f11445996a06f8ad0c27f094a58857c2f89e97974ab9235b92c52"
 dependencies = [
 "proc-macro2",
 "quote",
 "unicode-xid",
 ]

+[[package]]
+name = "sync_wrapper"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8"
+
 [[package]]
 name = "tar"
 version = "0.4.38"
@@ -3086,12 +3340,13 @@ dependencies = [

 [[package]]
 name = "tonic"
-version = "0.6.2"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a"
+checksum = "5be9d60db39854b30b835107500cf0aca0b0d14d6e1c3de124217c23a29c2ddb"
 dependencies = [
 "async-stream",
 "async-trait",
+ "axum",
 "base64",
 "bytes",
 "futures-core",
@@ -3107,7 +3362,7 @@ dependencies = [
 "prost-derive",
 "tokio",
 "tokio-stream",
- "tokio-util 0.6.9",
+ "tokio-util 0.7.0",
 "tower",
 "tower-layer",
 "tower-service",
@@ -3117,10 +3372,11 @@ dependencies = [

 [[package]]
 name = "tonic-build"
-version = "0.6.2"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9403f1bafde247186684b230dc6f38b5cd514584e8bec1dd32514be4745fa757"
+checksum = "d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1"
 dependencies = [
+ "prettyplease",
 "proc-macro2",
 "prost-build",
 "quote",
@@ -3147,6 +3403,25 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "tower-http"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e980386f06883cf4d0578d6c9178c81f68b45d77d00f2c2c1bc034b3439c2c56"
+dependencies = [
+ "bitflags",
+ "bytes",
+ "futures-core",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-range-header",
+ "pin-project-lite",
+ "tower",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "tower-layer"
 version = "0.3.1"
@@ -3362,6 +3637,18 @@ version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"

+[[package]]
+name = "wal_generate"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap 3.0.14",
+ "env_logger",
+ "log",
+ "postgres",
+ "tempfile",
+]
+
 [[package]]
 name = "walkdir"
 version = "2.3.2"
@@ -3588,13 +3875,22 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
+ "ahash",
 "anyhow",
 "bytes",
 "chrono",
 "clap 2.34.0",
 "either",
+ "fail",
+ "futures-channel",
+ "futures-task",
+ "futures-util",
+ "generic-array",
 "hashbrown",
+ "hex",
+ "hyper",
 "indexmap",
+ "itoa 0.4.8",
 "libc",
 "log",
 "memchr",
@@ -3608,6 +3904,7 @@ dependencies = [
 "serde",
 "syn",
 "tokio",
+ "tokio-util 0.7.0",
 "tracing",
 "tracing-core",
 ]
@@ -3636,22 +3933,6 @@ dependencies = [
 "chrono",
 ]

-[[package]]
-name = "zenith"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "clap 3.0.14",
- "control_plane",
- "pageserver",
- "postgres",
- "postgres_ffi",
- "safekeeper",
- "serde_json",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "zeroize"
 version = "1.5.2"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,7 @@ members = [
    "proxy",
    "safekeeper",
    "workspace_hack",
-    "zenith",
+    "neon_local",
    "libs/*",
 ]

--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -15,4 +15,4 @@ RUN set -e \
 # Final image that only has one binary
 FROM debian:buster-slim

-COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl
+COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl
--- a/12
+++ b/12
@@ -12,15 +12,21 @@ endif
 #
 BUILD_TYPE ?= debug
 ifeq ($(BUILD_TYPE),release)
-	PG_CONFIGURE_OPTS = --enable-debug
+	PG_CONFIGURE_OPTS = --enable-debug --with-openssl
 	PG_CFLAGS = -O2 -g3 $(CFLAGS)
 	# Unfortunately, `--profile=...` is a nightly feature
 	CARGO_BUILD_FLAGS += --release
 else ifeq ($(BUILD_TYPE),debug)
-	PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
+	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS = -O0 -g3 $(CFLAGS)
 else
-$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
+	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
+endif
+
+# macOS with brew-installed openssl requires explicit paths
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Darwin)
+    PG_CONFIGURE_OPTS += --with-includes=/usr/local/opt/openssl/include --with-libraries=/usr/local/opt/openssl/lib
 endif

 # Choose whether we should be silent or verbose
--- a/5
+++ b/5
@@ -0,0 +1,5 @@
+Neon
+Copyright 2022 Neon Inc.
+
+The PostgreSQL submodule in vendor/postgres is licensed under the
+PostgreSQL license. See vendor/postgres/COPYRIGHT.
--- a/README.md
+++ b/README.md
@@ -23,61 +23,100 @@ Pageserver consists of:

 ## Running local installation

+
+#### building on Ubuntu/ Debian (Linux)
 1. Install build dependencies and other useful packages

 On Ubuntu or Debian this set of packages should be sufficient to build the code:
 ```text
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
-libssl-dev clang pkg-config libpq-dev
+libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd
 ```

-[Rust] 1.58 or later is also required.
+2. [Install Rust](https://www.rust-lang.org/tools/install)
+```
+# recommended approach from https://www.rust-lang.org/tools/install
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+```

-To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
+3. Install PostgreSQL Client
+```
+apt install postgresql-client
+```

-To run the integration tests or Python scripts (not required to use the code), install
-Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
-
-2. Build neon and patched postgres
+4. Build neon and patched postgres
 ```sh
 git clone --recursive https://github.com/neondatabase/neon.git
 cd neon
 make -j5
 ```

-3. Start pageserver and postgres on top of it (should be called from repo root):
+#### building on OSX (12.3.1)
+1. Install XCode and dependencies
+```
+xcode-select --install
+brew install protobuf etcd
+```
+
+2. [Install Rust](https://www.rust-lang.org/tools/install)
+```
+# recommended approach from https://www.rust-lang.org/tools/install
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+```
+
+3. Install PostgreSQL Client
+```
+# from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos
+brew install libpq
+brew link --force libpq
+```
+
+4. Build neon and patched postgres
+```sh
+git clone --recursive https://github.com/neondatabase/neon.git
+cd neon
+make -j5
+```
+
+#### dependency installation notes
+To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
+
+To run the integration tests or Python scripts (not required to use the code), install
+Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
+
+
+#### running neon database
+1. Start pageserver and postgres on top of it (should be called from repo root):
 ```sh
 # Create repository in .zenith with proper paths to binaries and data
 # Later that would be responsibility of a package install script
-> ./target/debug/zenith init
-initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
-created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
-created main branch
+> ./target/debug/neon_local init
+initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
+created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50
+initial timeline de200bd42b49cc1814412c7e592dd6e9 created
 pageserver init succeeded

 # start pageserver and safekeeper
-> ./target/debug/zenith start
-Starting pageserver at 'localhost:64000' in '.zenith'
+> ./target/debug/neon_local start
+Starting pageserver at '127.0.0.1:64000' in '.zenith'
 Pageserver started
-initializing for single for 7676
-Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
+initializing for sk 1 for 7676
+Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1'
 Safekeeper started

 # start postgres compute node
-> ./target/debug/zenith pg start main
-Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
+> ./target/debug/neon_local pg start main
+Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
+Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
 Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
-waiting for server to start.... done
-server started

 # check list of running postgres instances
-> ./target/debug/zenith pg list
-NODE	ADDRESS	TIMELINES	BRANCH NAME	LSN		STATUS
-main	127.0.0.1:55432	5b014a9e41b4b63ce1a1febc04503636	main	0/1609610	running
+> ./target/debug/neon_local pg list
+ NODE  ADDRESS          TIMELINE                          BRANCH NAME  LSN        STATUS
+ main  127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
 ```

-4. Now it is possible to connect to postgres and run some queries:
+2. Now it is possible to connect to postgres and run some queries:
 ```text
 > psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
 postgres=# CREATE TABLE t(key int primary key, value text);
@@ -91,21 +130,28 @@ postgres=# select * from t;
 (1 row)
 ```

-5. And create branches and run postgres on them:
+3. And create branches and run postgres on them:
 ```sh
 # create branch named migration_check
-> ./target/debug/zenith timeline branch --branch-name migration_check
-Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'
+> ./target/debug/neon_local timeline branch --branch-name migration_check
+Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'

 # check branches tree
-> ./target/debug/zenith timeline list
- main [5b014a9e41b4b63ce1a1febc04503636]
- ┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]
+> ./target/debug/neon_local timeline list
+(L) main [de200bd42b49cc1814412c7e592dd6e9]
+(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]

 # start postgres on that branch
-> ./target/debug/zenith pg start migration_check
-Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
-waiting for server to start.... done
+> ./target/debug/neon_local pg start migration_check --branch-name migration_check
+Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
+Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
+Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres'
+
+# check the new list of running postgres instances
+> ./target/debug/neon_local pg list
+ NODE             ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
+ main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
+ migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running

 # this new postgres instance will have all the data from 'main' postgres,
 # but all modifications would not affect data in original postgres
@@ -118,12 +164,20 @@ postgres=# select * from t;

 postgres=# insert into t values(2,2);
 INSERT 0 1
+
+# check that the new change doesn't affect the 'main' postgres
+> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
+postgres=# select * from t;
+ key | value
+-----+-------
+   1 | 1
+(1 row)
 ```

-6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
+4. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
   you have just started. You can stop them all with one command:
 ```sh
-> ./target/debug/zenith stop
+> ./target/debug/neon_local stop
 ```

 ## Running tests
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -1,9 +1,9 @@
 # Compute node tools

-Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
-`ExecStart` option. It will handle all the `zenith` specifics during compute node
+Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
+`ExecStart` option. It will handle all the `Neon` specifics during compute node
 initialization:
- `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
+- `compute_ctl` accepts cluster (compute node) specification as a JSON file.
 - Every start is a fresh start, so the data directory is removed and
  initialized again on each run.
 - Next it will put configuration files into the `PGDATA` directory.
@@ -13,18 +13,18 @@ initialization:
 - Check and alter/drop/create roles and databases.
 - Hang waiting on the `postmaster` process to exit.

-Also `zenith_ctl` spawns two separate service threads:
+Also `compute_ctl` spawns two separate service threads:
 - `compute-monitor` checks the last Postgres activity timestamp and saves it
-  into the shared `ComputeState`;
+  into the shared `ComputeNode`;
 - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
  last activity requests.

 Usage example:
 ```sh
-zenith_ctl -D /var/db/postgres/compute \
-           -C 'postgresql://zenith_admin@localhost/postgres' \
-           -S /var/db/postgres/specs/current.json \
-           -b /usr/local/bin/postgres
+compute_ctl -D /var/db/postgres/compute \
+            -C 'postgresql://zenith_admin@localhost/postgres' \
+            -S /var/db/postgres/specs/current.json \
+            -b /usr/local/bin/postgres
 ```

 ## Tests
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -0,0 +1,174 @@
+//!
+//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
+//! `ExecStart` option. It will handle all the `Neon` specifics during compute node
+//! initialization:
+//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
+//! - Every start is a fresh start, so the data directory is removed and
+//!   initialized again on each run.
+//! - Next it will put configuration files into the `PGDATA` directory.
+//! - Sync safekeepers and get commit LSN.
+//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
+//! - Try to start `postgres` and wait until it is ready to accept connections.
+//! - Check and alter/drop/create roles and databases.
+//! - Hang waiting on the `postmaster` process to exit.
+//!
+//! Also `compute_ctl` spawns two separate service threads:
+//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
+//!   into the shared `ComputeNode`;
+//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
+//!   last activity requests.
+//!
+//! Usage example:
+//! ```sh
+//! compute_ctl -D /var/db/postgres/compute \
+//!             -C 'postgresql://zenith_admin@localhost/postgres' \
+//!             -S /var/db/postgres/specs/current.json \
+//!             -b /usr/local/bin/postgres
+//! ```
+//!
+use std::fs::File;
+use std::panic;
+use std::path::Path;
+use std::process::exit;
+use std::sync::{Arc, RwLock};
+use std::{thread, time::Duration};
+
+use anyhow::Result;
+use chrono::Utc;
+use clap::Arg;
+use log::{error, info};
+
+use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
+use compute_tools::http::api::launch_http_server;
+use compute_tools::logger::*;
+use compute_tools::monitor::launch_monitor;
+use compute_tools::params::*;
+use compute_tools::pg_helpers::*;
+use compute_tools::spec::*;
+
+fn main() -> Result<()> {
+    // TODO: re-use `utils::logging` later
+    init_logger(DEFAULT_LOG_LEVEL)?;
+
+    // Env variable is set by `cargo`
+    let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
+    let matches = clap::App::new("compute_ctl")
+        .version(version.unwrap_or("unknown"))
+        .arg(
+            Arg::new("connstr")
+                .short('C')
+                .long("connstr")
+                .value_name("DATABASE_URL")
+                .required(true),
+        )
+        .arg(
+            Arg::new("pgdata")
+                .short('D')
+                .long("pgdata")
+                .value_name("DATADIR")
+                .required(true),
+        )
+        .arg(
+            Arg::new("pgbin")
+                .short('b')
+                .long("pgbin")
+                .value_name("POSTGRES_PATH"),
+        )
+        .arg(
+            Arg::new("spec")
+                .short('s')
+                .long("spec")
+                .value_name("SPEC_JSON"),
+        )
+        .arg(
+            Arg::new("spec-path")
+                .short('S')
+                .long("spec-path")
+                .value_name("SPEC_PATH"),
+        )
+        .get_matches();
+
+    let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
+    let connstr = matches
+        .value_of("connstr")
+        .expect("Postgres connection string is required");
+    let spec = matches.value_of("spec");
+    let spec_path = matches.value_of("spec-path");
+
+    // Try to use just 'postgres' if no path is provided
+    let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
+
+    let spec: ComputeSpec = match spec {
+        // First, try to get cluster spec from the cli argument
+        Some(json) => serde_json::from_str(json)?,
+        None => {
+            // Second, try to read it from the file if path is provided
+            if let Some(sp) = spec_path {
+                let path = Path::new(sp);
+                let file = File::open(path)?;
+                serde_json::from_reader(file)?
+            } else {
+                panic!("cluster spec should be provided via --spec or --spec-path argument");
+            }
+        }
+    };
+
+    let pageserver_connstr = spec
+        .cluster
+        .settings
+        .find("zenith.page_server_connstring")
+        .expect("pageserver connstr should be provided");
+    let tenant = spec
+        .cluster
+        .settings
+        .find("zenith.zenith_tenant")
+        .expect("tenant id should be provided");
+    let timeline = spec
+        .cluster
+        .settings
+        .find("zenith.zenith_timeline")
+        .expect("tenant id should be provided");
+
+    let compute_state = ComputeNode {
+        start_time: Utc::now(),
+        connstr: connstr.to_string(),
+        pgdata: pgdata.to_string(),
+        pgbin: pgbin.to_string(),
+        spec,
+        tenant,
+        timeline,
+        pageserver_connstr,
+        metrics: ComputeMetrics::new(),
+        state: RwLock::new(ComputeState::new()),
+    };
+    let compute = Arc::new(compute_state);
+
+    // Launch service threads first, so we were able to serve availability
+    // requests, while configuration is still in progress.
+    let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
+    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
+
+    // Run compute (Postgres) and hang waiting on it.
+    match compute.prepare_and_run() {
+        Ok(ec) => {
+            let code = ec.code().unwrap_or(1);
+            info!("Postgres exited with code {}, shutting down", code);
+            exit(code)
+        }
+        Err(error) => {
+            error!("could not start the compute node: {}", error);
+
+            let mut state = compute.state.write().unwrap();
+            state.error = Some(format!("{:?}", error));
+            state.status = ComputeStatus::Failed;
+            drop(state);
+
+            // Keep serving HTTP requests, so the cloud control plane was able to
+            // get the actual error.
+            info!("giving control plane 30s to collect the error before shutdown");
+            thread::sleep(Duration::from_secs(30));
+            info!("shutting down");
+            Err(error)
+        }
+    }
+}
--- a/compute_tools/src/bin/zenith_ctl.rs
+++ b/compute_tools/src/bin/zenith_ctl.rs
@@ -1,252 +0,0 @@
-//!
-//! Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
-//! `ExecStart` option. It will handle all the `zenith` specifics during compute node
-//! initialization:
-//! - `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
-//! - Every start is a fresh start, so the data directory is removed and
-//!   initialized again on each run.
-//! - Next it will put configuration files into the `PGDATA` directory.
-//! - Sync safekeepers and get commit LSN.
-//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
-//! - Try to start `postgres` and wait until it is ready to accept connections.
-//! - Check and alter/drop/create roles and databases.
-//! - Hang waiting on the `postmaster` process to exit.
-//!
-//! Also `zenith_ctl` spawns two separate service threads:
-//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
-//!   into the shared `ComputeState`;
-//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
-//!   last activity requests.
-//!
-//! Usage example:
-//! ```sh
-//! zenith_ctl -D /var/db/postgres/compute \
-//!            -C 'postgresql://zenith_admin@localhost/postgres' \
-//!            -S /var/db/postgres/specs/current.json \
-//!            -b /usr/local/bin/postgres
-//! ```
-//!
-use std::fs::File;
-use std::panic;
-use std::path::Path;
-use std::process::{exit, Command, ExitStatus};
-use std::sync::{Arc, RwLock};
-
-use anyhow::{Context, Result};
-use chrono::Utc;
-use clap::Arg;
-use log::info;
-use postgres::{Client, NoTls};
-
-use compute_tools::checker::create_writablity_check_data;
-use compute_tools::config;
-use compute_tools::http_api::launch_http_server;
-use compute_tools::logger::*;
-use compute_tools::monitor::launch_monitor;
-use compute_tools::params::*;
-use compute_tools::pg_helpers::*;
-use compute_tools::spec::*;
-use compute_tools::zenith::*;
-
-/// Do all the preparations like PGDATA directory creation, configuration,
-/// safekeepers sync, basebackup, etc.
-fn prepare_pgdata(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
-    let state = state.read().unwrap();
-    let spec = &state.spec;
-    let pgdata_path = Path::new(&state.pgdata);
-    let pageserver_connstr = spec
-        .cluster
-        .settings
-        .find("zenith.page_server_connstring")
-        .expect("pageserver connstr should be provided");
-    let tenant = spec
-        .cluster
-        .settings
-        .find("zenith.zenith_tenant")
-        .expect("tenant id should be provided");
-    let timeline = spec
-        .cluster
-        .settings
-        .find("zenith.zenith_timeline")
-        .expect("tenant id should be provided");
-
-    info!(
-        "starting cluster #{}, operation #{}",
-        spec.cluster.cluster_id,
-        spec.operation_uuid.as_ref().unwrap()
-    );
-
-    // Remove/create an empty pgdata directory and put configuration there.
-    create_pgdata(&state.pgdata)?;
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
-
-    info!("starting safekeepers syncing");
-    let lsn = sync_safekeepers(&state.pgdata, &state.pgbin)
-        .with_context(|| "failed to sync safekeepers")?;
-    info!("safekeepers synced at LSN {}", lsn);
-
-    info!(
-        "getting basebackup@{} from pageserver {}",
-        lsn, pageserver_connstr
-    );
-    get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context(
-        || {
-            format!(
-                "failed to get basebackup@{} from pageserver {}",
-                lsn, pageserver_connstr
-            )
-        },
-    )?;
-
-    // Update pg_hba.conf received with basebackup.
-    update_pg_hba(pgdata_path)?;
-
-    Ok(())
-}
-
-/// Start Postgres as a child process and manage DBs/roles.
-/// After that this will hang waiting on the postmaster process to exit.
-fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
-    let read_state = state.read().unwrap();
-    let pgdata_path = Path::new(&read_state.pgdata);
-
-    // Run postgres as a child process.
-    let mut pg = Command::new(&read_state.pgbin)
-        .args(&["-D", &read_state.pgdata])
-        .spawn()
-        .expect("cannot start postgres process");
-
-    // Try default Postgres port if it is not provided
-    let port = read_state
-        .spec
-        .cluster
-        .settings
-        .find("port")
-        .unwrap_or_else(|| "5432".to_string());
-    wait_for_postgres(&port, pgdata_path)?;
-
-    let mut client = Client::connect(&read_state.connstr, NoTls)?;
-
-    handle_roles(&read_state.spec, &mut client)?;
-    handle_databases(&read_state.spec, &mut client)?;
-    handle_grants(&read_state.spec, &mut client)?;
-    create_writablity_check_data(&mut client)?;
-
-    // 'Close' connection
-    drop(client);
-
-    info!(
-        "finished configuration of cluster #{}",
-        read_state.spec.cluster.cluster_id
-    );
-
-    // Release the read lock.
-    drop(read_state);
-
-    // Get the write lock, update state and release the lock, so HTTP API
-    // was able to serve requests, while we are blocked waiting on
-    // Postgres.
-    let mut state = state.write().unwrap();
-    state.ready = true;
-    drop(state);
-
-    // Wait for child postgres process basically forever. In this state Ctrl+C
-    // will be propagated to postgres and it will be shut down as well.
-    let ecode = pg.wait().expect("failed to wait on postgres");
-
-    Ok(ecode)
-}
-
-fn main() -> Result<()> {
-    // TODO: re-use `utils::logging` later
-    init_logger(DEFAULT_LOG_LEVEL)?;
-
-    // Env variable is set by `cargo`
-    let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
-    let matches = clap::App::new("zenith_ctl")
-        .version(version.unwrap_or("unknown"))
-        .arg(
-            Arg::new("connstr")
-                .short('C')
-                .long("connstr")
-                .value_name("DATABASE_URL")
-                .required(true),
-        )
-        .arg(
-            Arg::new("pgdata")
-                .short('D')
-                .long("pgdata")
-                .value_name("DATADIR")
-                .required(true),
-        )
-        .arg(
-            Arg::new("pgbin")
-                .short('b')
-                .long("pgbin")
-                .value_name("POSTGRES_PATH"),
-        )
-        .arg(
-            Arg::new("spec")
-                .short('s')
-                .long("spec")
-                .value_name("SPEC_JSON"),
-        )
-        .arg(
-            Arg::new("spec-path")
-                .short('S')
-                .long("spec-path")
-                .value_name("SPEC_PATH"),
-        )
-        .get_matches();
-
-    let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
-    let connstr = matches
-        .value_of("connstr")
-        .expect("Postgres connection string is required");
-    let spec = matches.value_of("spec");
-    let spec_path = matches.value_of("spec-path");
-
-    // Try to use just 'postgres' if no path is provided
-    let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
-
-    let spec: ClusterSpec = match spec {
-        // First, try to get cluster spec from the cli argument
-        Some(json) => serde_json::from_str(json)?,
-        None => {
-            // Second, try to read it from the file if path is provided
-            if let Some(sp) = spec_path {
-                let path = Path::new(sp);
-                let file = File::open(path)?;
-                serde_json::from_reader(file)?
-            } else {
-                panic!("cluster spec should be provided via --spec or --spec-path argument");
-            }
-        }
-    };
-
-    let compute_state = ComputeState {
-        connstr: connstr.to_string(),
-        pgdata: pgdata.to_string(),
-        pgbin: pgbin.to_string(),
-        spec,
-        ready: false,
-        last_active: Utc::now(),
-    };
-    let compute_state = Arc::new(RwLock::new(compute_state));
-
-    // Launch service threads first, so we were able to serve availability
-    // requests, while configuration is still in progress.
-    let mut _threads = vec![
-        launch_http_server(&compute_state).expect("cannot launch compute monitor thread"),
-        launch_monitor(&compute_state).expect("cannot launch http endpoint thread"),
-    ];
-
-    prepare_pgdata(&compute_state)?;
-
-    // Run compute (Postgres) and hang waiting on it. Panic if any error happens,
-    // it will help us to trigger unwind and kill postmaster as well.
-    match run_compute(&compute_state) {
-        Ok(ec) => exit(ec.success() as i32),
-        Err(error) => panic!("cannot start compute node, error: {}", error),
-    }
-}
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,11 +1,11 @@
-use std::sync::{Arc, RwLock};
+use std::sync::Arc;

 use anyhow::{anyhow, Result};
 use log::error;
 use postgres::Client;
 use tokio_postgres::NoTls;

-use crate::zenith::ComputeState;
+use crate::compute::ComputeNode;

 pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
    let query = "
@@ -23,9 +23,9 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
    Ok(())
 }

-pub async fn check_writability(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
-    let connstr = state.read().unwrap().connstr.clone();
-    let (client, connection) = tokio_postgres::connect(&connstr, NoTls).await?;
+pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
+    let connstr = &compute.connstr;
+    let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
    if client.is_closed() {
        return Err(anyhow!("connection to postgres closed"));
    }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -0,0 +1,321 @@
+//
+// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
+// but there are several things that makes `PostgresNode` usage inconvenient in the
+// cloud:
+// - it inherits from `LocalEnv`, which contains **all-all** the information about
+//   a complete service running
+// - it uses `PageServerNode` with information about http endpoint, which we do not
+//   need in the cloud again
+// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
+//
+// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
+// attributes (not required for the cloud). Yet, it is still tempting to unify these
+// `PostgresNode` and `ComputeNode` and use one in both places.
+//
+// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
+//
+use std::fs;
+use std::os::unix::fs::PermissionsExt;
+use std::path::Path;
+use std::process::{Command, ExitStatus, Stdio};
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::RwLock;
+
+use anyhow::{Context, Result};
+use chrono::{DateTime, Utc};
+use log::info;
+use postgres::{Client, NoTls};
+use serde::{Serialize, Serializer};
+
+use crate::checker::create_writablity_check_data;
+use crate::config;
+use crate::pg_helpers::*;
+use crate::spec::*;
+
+/// Compute node info shared across several `compute_ctl` threads.
+pub struct ComputeNode {
+    pub start_time: DateTime<Utc>,
+    pub connstr: String,
+    pub pgdata: String,
+    pub pgbin: String,
+    pub spec: ComputeSpec,
+    pub tenant: String,
+    pub timeline: String,
+    pub pageserver_connstr: String,
+    pub metrics: ComputeMetrics,
+    /// Volatile part of the `ComputeNode` so should be used under `RwLock`
+    /// to allow HTTP API server to serve status requests, while configuration
+    /// is in progress.
+    pub state: RwLock<ComputeState>,
+}
+
+fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+{
+    x.to_rfc3339().serialize(s)
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "snake_case")]
+pub struct ComputeState {
+    pub status: ComputeStatus,
+    /// Timestamp of the last Postgres activity
+    #[serde(serialize_with = "rfc3339_serialize")]
+    pub last_active: DateTime<Utc>,
+    pub error: Option<String>,
+}
+
+impl ComputeState {
+    pub fn new() -> Self {
+        Self {
+            status: ComputeStatus::Init,
+            last_active: Utc::now(),
+            error: None,
+        }
+    }
+}
+
+impl Default for ComputeState {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeStatus {
+    Init,
+    Running,
+    Failed,
+}
+
+#[derive(Serialize)]
+pub struct ComputeMetrics {
+    pub sync_safekeepers_ms: AtomicU64,
+    pub basebackup_ms: AtomicU64,
+    pub config_ms: AtomicU64,
+    pub total_startup_ms: AtomicU64,
+}
+
+impl ComputeMetrics {
+    pub fn new() -> Self {
+        Self {
+            sync_safekeepers_ms: AtomicU64::new(0),
+            basebackup_ms: AtomicU64::new(0),
+            config_ms: AtomicU64::new(0),
+            total_startup_ms: AtomicU64::new(0),
+        }
+    }
+}
+
+impl Default for ComputeMetrics {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ComputeNode {
+    pub fn set_status(&self, status: ComputeStatus) {
+        self.state.write().unwrap().status = status;
+    }
+
+    pub fn get_status(&self) -> ComputeStatus {
+        self.state.read().unwrap().status
+    }
+
+    // Remove `pgdata` directory and create it again with right permissions.
+    fn create_pgdata(&self) -> Result<()> {
+        // Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
+        // If it is something different then create_dir() will error out anyway.
+        let _ok = fs::remove_dir_all(&self.pgdata);
+        fs::create_dir(&self.pgdata)?;
+        fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?;
+
+        Ok(())
+    }
+
+    // Get basebackup from the libpq connection to pageserver using `connstr` and
+    // unarchive it to `pgdata` directory overriding all its previous content.
+    fn get_basebackup(&self, lsn: &str) -> Result<()> {
+        let start_time = Utc::now();
+
+        let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
+        let basebackup_cmd = match lsn {
+            "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
+            _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
+        };
+        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
+
+        // Read the archive directly from the `CopyOutReader`
+        //
+        // Set `ignore_zeros` so that unpack() reads all the Copy data and
+        // doesn't stop at the end-of-archive marker. Otherwise, if the server
+        // sends an Error after finishing the tarball, we will not notice it.
+        let mut ar = tar::Archive::new(copyreader);
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.pgdata)?;
+
+        self.metrics.basebackup_ms.store(
+            Utc::now()
+                .signed_duration_since(start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64,
+            Ordering::Relaxed,
+        );
+
+        Ok(())
+    }
+
+    // Run `postgres` in a special mode with `--sync-safekeepers` argument
+    // and return the reported LSN back to the caller.
+    fn sync_safekeepers(&self) -> Result<String> {
+        let start_time = Utc::now();
+
+        let sync_handle = Command::new(&self.pgbin)
+            .args(&["--sync-safekeepers"])
+            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
+            .stdout(Stdio::piped())
+            .spawn()
+            .expect("postgres --sync-safekeepers failed to start");
+
+        // `postgres --sync-safekeepers` will print all log output to stderr and
+        // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
+        // redirected to the caller output.
+        let sync_output = sync_handle
+            .wait_with_output()
+            .expect("postgres --sync-safekeepers failed");
+        if !sync_output.status.success() {
+            anyhow::bail!(
+                "postgres --sync-safekeepers exited with non-zero status: {}",
+                sync_output.status,
+            );
+        }
+
+        self.metrics.sync_safekeepers_ms.store(
+            Utc::now()
+                .signed_duration_since(start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64,
+            Ordering::Relaxed,
+        );
+
+        let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
+
+        Ok(lsn)
+    }
+
+    /// Do all the preparations like PGDATA directory creation, configuration,
+    /// safekeepers sync, basebackup, etc.
+    pub fn prepare_pgdata(&self) -> Result<()> {
+        let spec = &self.spec;
+        let pgdata_path = Path::new(&self.pgdata);
+
+        // Remove/create an empty pgdata directory and put configuration there.
+        self.create_pgdata()?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+
+        info!("starting safekeepers syncing");
+        let lsn = self
+            .sync_safekeepers()
+            .with_context(|| "failed to sync safekeepers")?;
+        info!("safekeepers synced at LSN {}", lsn);
+
+        info!(
+            "getting basebackup@{} from pageserver {}",
+            lsn, &self.pageserver_connstr
+        );
+        self.get_basebackup(&lsn).with_context(|| {
+            format!(
+                "failed to get basebackup@{} from pageserver {}",
+                lsn, &self.pageserver_connstr
+            )
+        })?;
+
+        // Update pg_hba.conf received with basebackup.
+        update_pg_hba(pgdata_path)?;
+
+        Ok(())
+    }
+
+    /// Start Postgres as a child process and manage DBs/roles.
+    /// After that this will hang waiting on the postmaster process to exit.
+    pub fn run(&self) -> Result<ExitStatus> {
+        let start_time = Utc::now();
+
+        let pgdata_path = Path::new(&self.pgdata);
+
+        // Run postgres as a child process.
+        let mut pg = Command::new(&self.pgbin)
+            .args(&["-D", &self.pgdata])
+            .spawn()
+            .expect("cannot start postgres process");
+
+        // Try default Postgres port if it is not provided
+        let port = self
+            .spec
+            .cluster
+            .settings
+            .find("port")
+            .unwrap_or_else(|| "5432".to_string());
+        wait_for_postgres(&mut pg, &port, pgdata_path)?;
+
+        let mut client = Client::connect(&self.connstr, NoTls)?;
+
+        handle_roles(&self.spec, &mut client)?;
+        handle_databases(&self.spec, &mut client)?;
+        handle_grants(&self.spec, &mut client)?;
+        create_writablity_check_data(&mut client)?;
+
+        // 'Close' connection
+        drop(client);
+        let startup_end_time = Utc::now();
+
+        self.metrics.config_ms.store(
+            startup_end_time
+                .signed_duration_since(start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64,
+            Ordering::Relaxed,
+        );
+        self.metrics.total_startup_ms.store(
+            startup_end_time
+                .signed_duration_since(self.start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64,
+            Ordering::Relaxed,
+        );
+
+        self.set_status(ComputeStatus::Running);
+
+        info!(
+            "finished configuration of compute for project {}",
+            self.spec.cluster.cluster_id
+        );
+
+        // Wait for child Postgres process basically forever. In this state Ctrl+C
+        // will propagate to Postgres and it will be shut down as well.
+        let ecode = pg
+            .wait()
+            .expect("failed to start waiting on Postgres process");
+
+        Ok(ecode)
+    }
+
+    pub fn prepare_and_run(&self) -> Result<ExitStatus> {
+        info!(
+            "starting compute for project {}, operation {}, tenant {}, timeline {}",
+            self.spec.cluster.cluster_id,
+            self.spec.operation_uuid.as_ref().unwrap(),
+            self.tenant,
+            self.timeline,
+        );
+
+        self.prepare_pgdata()?;
+        self.run()
+    }
+}
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -6,7 +6,7 @@ use std::path::Path;
 use anyhow::Result;

 use crate::pg_helpers::PgOptionsSerialize;
-use crate::zenith::ClusterSpec;
+use crate::spec::ComputeSpec;

 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
@@ -32,20 +32,20 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(path: &Path, spec: &ClusterSpec) -> Result<()> {
+pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut postgres_conf = File::create(path)?;

-    write_zenith_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
+    write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;

    Ok(())
 }

 // Write Postgres config block wrapped with generated comment section
-fn write_zenith_managed_block(file: &mut File, buf: &str) -> Result<()> {
-    writeln!(file, "# Managed by Zenith: begin")?;
+fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> {
+    writeln!(file, "# Managed by compute_ctl: begin")?;
    writeln!(file, "{}", buf)?;
-    writeln!(file, "# Managed by Zenith: end")?;
+    writeln!(file, "# Managed by compute_ctl: end")?;

    Ok(())
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -1,37 +1,64 @@
 use std::convert::Infallible;
 use std::net::SocketAddr;
-use std::sync::{Arc, RwLock};
+use std::sync::Arc;
 use std::thread;

 use anyhow::Result;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use log::{error, info};
+use serde_json;

-use crate::zenith::*;
+use crate::compute::{ComputeNode, ComputeStatus};

 // Service function to handle all available routes.
-async fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Response<Body> {
+async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
    match (req.method(), req.uri().path()) {
        // Timestamp of the last Postgres activity in the plain text.
+        // DEPRECATED in favour of /status
        (&Method::GET, "/last_activity") => {
            info!("serving /last_active GET request");
-            let state = state.read().unwrap();
+            let state = compute.state.read().unwrap();

            // Use RFC3339 format for consistency.
            Response::new(Body::from(state.last_active.to_rfc3339()))
        }

-        // Has compute setup process finished? -> true/false
+        // Has compute setup process finished? -> true/false.
+        // DEPRECATED in favour of /status
        (&Method::GET, "/ready") => {
            info!("serving /ready GET request");
-            let state = state.read().unwrap();
-            Response::new(Body::from(format!("{}", state.ready)))
+            let status = compute.get_status();
+            Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
        }

+        // Serialized compute state.
+        (&Method::GET, "/status") => {
+            info!("serving /status GET request");
+            let state = compute.state.read().unwrap();
+            Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
+        }
+
+        // Startup metrics in JSON format. Keep /metrics reserved for a possible
+        // future use for Prometheus metrics format.
+        (&Method::GET, "/metrics.json") => {
+            info!("serving /metrics.json GET request");
+            Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
+        }
+
+        // DEPRECATED, use POST instead
        (&Method::GET, "/check_writability") => {
            info!("serving /check_writability GET request");
-            let res = crate::checker::check_writability(&state).await;
+            let res = crate::checker::check_writability(&compute).await;
+            match res {
+                Ok(_) => Response::new(Body::from("true")),
+                Err(e) => Response::new(Body::from(e.to_string())),
+            }
+        }
+
+        (&Method::POST, "/check_writability") => {
+            info!("serving /check_writability POST request");
+            let res = crate::checker::check_writability(&compute).await;
            match res {
                Ok(_) => Response::new(Body::from("true")),
                Err(e) => Response::new(Body::from(e.to_string())),
@@ -49,7 +76,7 @@ async fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Respons

 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
-async fn serve(state: Arc<RwLock<ComputeState>>) {
+async fn serve(state: Arc<ComputeNode>) {
    let addr = SocketAddr::from(([0, 0, 0, 0], 3080));

    let make_service = make_service_fn(move |_conn| {
@@ -73,7 +100,7 @@ async fn serve(state: Arc<RwLock<ComputeState>>) {
 }

 /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
-pub fn launch_http_server(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
    let state = Arc::clone(state);

    Ok(thread::Builder::new()
--- a/compute_tools/src/http/mod.rs
+++ b/compute_tools/src/http/mod.rs
@@ -0,0 +1 @@
+pub mod api;
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -0,0 +1,158 @@
+openapi: "3.0.2"
+info:
+  title: Compute node control API
+  version: "1.0"
+
+servers:
+  - url: "http://localhost:3080"
+
+paths:
+  /status:
+    get:
+      tags:
+      - "info"
+      summary: Get compute node internal status
+      description: ""
+      operationId: getComputeStatus
+      responses:
+        "200":
+          description: ComputeState
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ComputeState"
+
+  /metrics.json:
+    get:
+      tags:
+      - "info"
+      summary: Get compute node startup metrics in JSON format
+      description: ""
+      operationId: getComputeMetricsJSON
+      responses:
+        "200":
+          description: ComputeMetrics
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ComputeMetrics"
+
+  /ready:
+    get:
+      deprecated: true
+      tags:
+      - "info"
+      summary: Check whether compute startup process finished successfully
+      description: ""
+      operationId: computeIsReady
+      responses:
+        "200":
+          description: Compute is ready ('true') or not ('false')
+          content:
+            text/plain:
+              schema:
+                type: string
+                example: "true"
+
+  /last_activity:
+    get:
+      deprecated: true
+      tags:
+      - "info"
+      summary: Get timestamp of the last compute activity
+      description: ""
+      operationId: getLastComputeActivityTS
+      responses:
+        "200":
+          description: Timestamp of the last compute activity
+          content:
+            text/plain:
+              schema:
+                type: string
+                example: "2022-10-12T07:20:50.52Z"
+
+  /check_writability:
+    get:
+      deprecated: true
+      tags:
+      - "check"
+      summary: Check that we can write new data on this compute
+      description: ""
+      operationId: checkComputeWritabilityDeprecated
+      responses:
+        "200":
+          description: Check result
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'true' if check passed
+                example: "true"
+
+    post:
+      tags:
+      - "check"
+      summary: Check that we can write new data on this compute
+      description: ""
+      operationId: checkComputeWritability
+      responses:
+        "200":
+          description: Check result
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'true' if check passed
+                example: "true"
+
+components:
+  securitySchemes:
+    JWT:
+      type: http
+      scheme: bearer
+      bearerFormat: JWT
+
+  schemas:
+    ComputeMetrics:
+      type: object
+      description: Compute startup metrics
+      required:
+        - sync_safekeepers_ms
+        - basebackup_ms
+        - config_ms
+        - total_startup_ms
+      properties:
+        sync_safekeepers_ms:
+          type: integer
+        basebackup_ms:
+          type: integer
+        config_ms:
+          type: integer
+        total_startup_ms:
+          type: integer
+
+    ComputeState:
+      type: object
+      required:
+        - status
+        - last_active
+      properties:
+        status:
+          $ref: '#/components/schemas/ComputeStatus'
+        last_active:
+          type: string
+          description: The last detected compute activity timestamp in UTC and RFC3339 format
+          example: "2022-10-12T07:20:50.52Z"
+        error:
+          type: string
+          description: Text of the error during compute startup, if any
+
+    ComputeStatus:
+      type: string
+      enum:
+        - init
+        - failed
+        - running
+
+security:
+  - JWT: []
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -4,11 +4,11 @@
 //!
 pub mod checker;
 pub mod config;
-pub mod http_api;
+pub mod http;
 #[macro_use]
 pub mod logger;
+pub mod compute;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
-pub mod zenith;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -1,4 +1,4 @@
-use std::sync::{Arc, RwLock};
+use std::sync::Arc;
 use std::{thread, time};

 use anyhow::Result;
@@ -6,16 +6,16 @@ use chrono::{DateTime, Utc};
 use log::{debug, info};
 use postgres::{Client, NoTls};

-use crate::zenith::ComputeState;
+use crate::compute::ComputeNode;

 const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds

 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
 // XXX: the only expected panic is at `RwLock` unwrap().
-fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
+fn watch_compute_activity(compute: &Arc<ComputeNode>) {
    // Suppose that `connstr` doesn't change
-    let connstr = state.read().unwrap().connstr.clone();
+    let connstr = compute.connstr.clone();
    // Define `client` outside of the loop to reuse existing connection if it's active.
    let mut client = Client::connect(&connstr, NoTls);
    let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
@@ -46,7 +46,7 @@ fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
                            AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors?
                        &[],
                    );
-                let mut last_active = state.read().unwrap().last_active;
+                let mut last_active = compute.state.read().unwrap().last_active;

                if let Ok(backs) = backends {
                    let mut idle_backs: Vec<DateTime<Utc>> = vec![];
@@ -83,14 +83,14 @@ fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
                }

                // Update the last activity in the shared state if we got a more recent one.
-                let mut state = state.write().unwrap();
+                let mut state = compute.state.write().unwrap();
                if last_active > state.last_active {
                    state.last_active = last_active;
                    debug!("set the last compute activity time to: {}", last_active);
                }
            }
            Err(e) => {
-                info!("cannot connect to postgres: {}, retrying", e);
+                debug!("cannot connect to postgres: {}, retrying", e);

                // Establish a new connection and try again.
                client = Client::connect(&connstr, NoTls);
@@ -100,7 +100,7 @@ fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
 }

 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
    let state = Arc::clone(state);

    Ok(thread::Builder::new()
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -1,7 +1,9 @@
+use std::fs::File;
+use std::io::{BufRead, BufReader};
 use std::net::{SocketAddr, TcpStream};
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
-use std::process::Command;
+use std::process::Child;
 use std::str::FromStr;
 use std::{fs, thread, time};

@@ -220,12 +222,12 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
 /// Wait for Postgres to become ready to accept connections:
 /// - state should be `ready` in the `pgdata/postmaster.pid`
 /// - and we should be able to connect to 127.0.0.1:5432
-pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
+pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> {
    let pid_path = pgdata.join("postmaster.pid");
    let mut slept: u64 = 0; // ms
    let pause = time::Duration::from_millis(100);

-    let timeout = time::Duration::from_millis(200);
+    let timeout = time::Duration::from_millis(10);
    let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap();

    loop {
@@ -236,14 +238,19 @@ pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
            bail!("timed out while waiting for Postgres to start");
        }

+        if let Ok(Some(status)) = pg.try_wait() {
+            // Postgres exited, that is not what we expected, bail out earlier.
+            let code = status.code().unwrap_or(-1);
+            bail!("Postgres exited unexpectedly with code {}", code);
+        }
+
        if pid_path.exists() {
-            // XXX: dumb and the simplest way to get the last line in a text file
-            // TODO: better use `.lines().last()` later
-            let stdout = Command::new("tail")
-                .args(&["-n1", pid_path.to_str().unwrap()])
-                .output()?
-                .stdout;
-            let status = String::from_utf8(stdout)?;
+            let file = BufReader::new(File::open(&pid_path)?);
+            let status = file
+                .lines()
+                .last()
+                .unwrap()
+                .unwrap_or_else(|_| "unknown".to_string());
            let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();

            // Now Postgres is ready to accept connections
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -3,16 +3,53 @@ use std::path::Path;
 use anyhow::Result;
 use log::{info, log_enabled, warn, Level};
 use postgres::Client;
+use serde::Deserialize;

 use crate::config;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;
-use crate::zenith::ClusterSpec;
+
+/// Cluster spec or configuration represented as an optional number of
+/// delta operations + final cluster state description.
+#[derive(Clone, Deserialize)]
+pub struct ComputeSpec {
+    pub format_version: f32,
+    pub timestamp: String,
+    pub operation_uuid: Option<String>,
+    /// Expected cluster state at the end of transition process.
+    pub cluster: Cluster,
+    pub delta_operations: Option<Vec<DeltaOp>>,
+}
+
+/// Cluster state seen from the perspective of the external tools
+/// like Rails web console.
+#[derive(Clone, Deserialize)]
+pub struct Cluster {
+    pub cluster_id: String,
+    pub name: String,
+    pub state: Option<String>,
+    pub roles: Vec<Role>,
+    pub databases: Vec<Database>,
+    pub settings: GenericOptions,
+}
+
+/// Single cluster state changing operation that could not be represented as
+/// a static `Cluster` structure. For example:
+/// - DROP DATABASE
+/// - DROP ROLE
+/// - ALTER ROLE name RENAME TO new_name
+/// - ALTER DATABASE name RENAME TO new_name
+#[derive(Clone, Deserialize)]
+pub struct DeltaOp {
+    pub action: String,
+    pub name: PgIdent,
+    pub new_name: Option<PgIdent>,
+}

 /// It takes cluster specification and does the following:
 /// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
 /// - Update `pg_hba.conf` to allow external connections.
-pub fn handle_configuration(spec: &ClusterSpec, pgdata_path: &Path) -> Result<()> {
+pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
@@ -39,7 +76,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {

 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
-pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
+pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    let mut xact = client.transaction()?;
    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;

@@ -136,13 +173,20 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
                xact.execute(query.as_str(), &[])?;
            }
        } else {
-            info!("role name {}", &name);
+            info!("role name: '{}'", &name);
            let mut query: String = format!("CREATE ROLE {} ", name.quote());
-            info!("role create query {}", &query);
+            info!("role create query: '{}'", &query);
            info_print!(" -> create");

            query.push_str(&role.to_pg_options());
            xact.execute(query.as_str(), &[])?;
+
+            let grant_query = format!(
+                "grant pg_read_all_data, pg_write_all_data to {}",
+                name.quote()
+            );
+            xact.execute(grant_query.as_str(), &[])?;
+            info!("role grant query: '{}'", &grant_query);
        }

        info_print!("\n");
@@ -158,7 +202,7 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
 /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
 /// atomicity should be enough here due to the order of operations and various checks,
 /// which together provide us idempotency.
-pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
+pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;

    // Print a list of existing Postgres databases (only in debug mode)
@@ -247,7 +291,7 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {

 // Grant CREATE ON DATABASE to the database owner
 // to allow clients create trusted extensions.
-pub fn handle_grants(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
+pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    info!("cluster spec grants:");

    for db in &spec.cluster.databases {
--- a/compute_tools/src/zenith.rs
+++ b/compute_tools/src/zenith.rs
@@ -1,109 +0,0 @@
-use std::process::{Command, Stdio};
-
-use anyhow::Result;
-use chrono::{DateTime, Utc};
-use postgres::{Client, NoTls};
-use serde::Deserialize;
-
-use crate::pg_helpers::*;
-
-/// Compute node state shared across several `zenith_ctl` threads.
-/// Should be used under `RwLock` to allow HTTP API server to serve
-/// status requests, while configuration is in progress.
-pub struct ComputeState {
-    pub connstr: String,
-    pub pgdata: String,
-    pub pgbin: String,
-    pub spec: ClusterSpec,
-    /// Compute setup process has finished
-    pub ready: bool,
-    /// Timestamp of the last Postgres activity
-    pub last_active: DateTime<Utc>,
-}
-
-/// Cluster spec or configuration represented as an optional number of
-/// delta operations + final cluster state description.
-#[derive(Clone, Deserialize)]
-pub struct ClusterSpec {
-    pub format_version: f32,
-    pub timestamp: String,
-    pub operation_uuid: Option<String>,
-    /// Expected cluster state at the end of transition process.
-    pub cluster: Cluster,
-    pub delta_operations: Option<Vec<DeltaOp>>,
-}
-
-/// Cluster state seen from the perspective of the external tools
-/// like Rails web console.
-#[derive(Clone, Deserialize)]
-pub struct Cluster {
-    pub cluster_id: String,
-    pub name: String,
-    pub state: Option<String>,
-    pub roles: Vec<Role>,
-    pub databases: Vec<Database>,
-    pub settings: GenericOptions,
-}
-
-/// Single cluster state changing operation that could not be represented as
-/// a static `Cluster` structure. For example:
-/// - DROP DATABASE
-/// - DROP ROLE
-/// - ALTER ROLE name RENAME TO new_name
-/// - ALTER DATABASE name RENAME TO new_name
-#[derive(Clone, Deserialize)]
-pub struct DeltaOp {
-    pub action: String,
-    pub name: PgIdent,
-    pub new_name: Option<PgIdent>,
-}
-
-/// Get basebackup from the libpq connection to pageserver using `connstr` and
-/// unarchive it to `pgdata` directory overriding all its previous content.
-pub fn get_basebackup(
-    pgdata: &str,
-    connstr: &str,
-    tenant: &str,
-    timeline: &str,
-    lsn: &str,
-) -> Result<()> {
-    let mut client = Client::connect(connstr, NoTls)?;
-    let basebackup_cmd = match lsn {
-        "0/0" => format!("basebackup {} {}", tenant, timeline), // First start of the compute
-        _ => format!("basebackup {} {} {}", tenant, timeline, lsn),
-    };
-    let copyreader = client.copy_out(basebackup_cmd.as_str())?;
-    let mut ar = tar::Archive::new(copyreader);
-
-    ar.unpack(&pgdata)?;
-
-    Ok(())
-}
-
-/// Run `postgres` in a special mode with `--sync-safekeepers` argument
-/// and return the reported LSN back to the caller.
-pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result<String> {
-    let sync_handle = Command::new(&pgbin)
-        .args(&["--sync-safekeepers"])
-        .env("PGDATA", &pgdata) // we cannot use -D in this mode
-        .stdout(Stdio::piped())
-        .spawn()
-        .expect("postgres --sync-safekeepers failed to start");
-
-    // `postgres --sync-safekeepers` will print all log output to stderr and
-    // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
-    // redirected to the caller output.
-    let sync_output = sync_handle
-        .wait_with_output()
-        .expect("postgres --sync-safekeepers failed");
-    if !sync_output.status.success() {
-        anyhow::bail!(
-            "postgres --sync-safekeepers exited with non-zero status: {}",
-            sync_output.status,
-        );
-    }
-
-    let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
-
-    Ok(lsn)
-}
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -4,12 +4,12 @@ mod pg_helpers_tests {
    use std::fs::File;

    use compute_tools::pg_helpers::*;
-    use compute_tools::zenith::ClusterSpec;
+    use compute_tools::spec::ComputeSpec;

    #[test]
    fn params_serialize() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();

        assert_eq!(
            spec.cluster.databases.first().unwrap().to_pg_options(),
@@ -24,7 +24,7 @@ mod pg_helpers_tests {
    #[test]
    fn settings_serialize() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();

        assert_eq!(
            spec.cluster.settings.as_pg_settings(),
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
-tar = "0.4.33"
+tar = "0.4.38"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 serde = { version = "1.0", features = ["derive"] }
 serde_with = "1.12.0"
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -9,3 +9,6 @@ auth_type = 'Trust'
 id = 1
 pg_port = 5454
 http_port = 7676
+
+[etcd_broker]
+broker_endpoints = ['http://127.0.0.1:2379']
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -231,8 +231,13 @@ impl PostgresNode {
            .context("page server 'basebackup' command failed")?;

        // Read the archive directly from the `CopyOutReader`
-        tar::Archive::new(copyreader)
-            .unpack(&self.pgdata())
+        //
+        // Set `ignore_zeros` so that unpack() reads all the Copy data and
+        // doesn't stop at the end-of-archive marker. Otherwise, if the server
+        // sends an Error after finishing the tarball, we will not notice it.
+        let mut ar = tar::Archive::new(copyreader);
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.pgdata())
            .context("extracting base backup failed")?;

        Ok(())
@@ -274,6 +279,8 @@ impl PostgresNode {
        conf.append("listen_addresses", &self.address.ip().to_string());
        conf.append("port", &self.address.port().to_string());
        conf.append("wal_keep_size", "0");
+        // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
+        conf.append("restart_after_crash", "off");

        // Configure the node to fetch pages from pageserver
        let pageserver_connstr = {
--- a/control_plane/src/etcd.rs
+++ b/control_plane/src/etcd.rs
@@ -0,0 +1,97 @@
+use std::{
+    fs,
+    path::PathBuf,
+    process::{Command, Stdio},
+};
+
+use anyhow::Context;
+use nix::{
+    sys::signal::{kill, Signal},
+    unistd::Pid,
+};
+
+use crate::{local_env, read_pidfile};
+
+pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+    let etcd_broker = &env.etcd_broker;
+    println!(
+        "Starting etcd broker using {}",
+        etcd_broker.etcd_binary_path.display()
+    );
+
+    let etcd_data_dir = env.base_data_dir.join("etcd");
+    fs::create_dir_all(&etcd_data_dir).with_context(|| {
+        format!(
+            "Failed to create etcd data dir: {}",
+            etcd_data_dir.display()
+        )
+    })?;
+
+    let etcd_stdout_file =
+        fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
+            format!(
+                "Failed to create ectd stout file in directory {}",
+                etcd_data_dir.display()
+            )
+        })?;
+    let etcd_stderr_file =
+        fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
+            format!(
+                "Failed to create ectd stderr file in directory {}",
+                etcd_data_dir.display()
+            )
+        })?;
+    let client_urls = etcd_broker.comma_separated_endpoints();
+
+    let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
+        .args(&[
+            format!("--data-dir={}", etcd_data_dir.display()),
+            format!("--listen-client-urls={client_urls}"),
+            format!("--advertise-client-urls={client_urls}"),
+            // Set --quota-backend-bytes to keep the etcd virtual memory
+            // size smaller. Our test etcd clusters are very small.
+            // See https://github.com/etcd-io/etcd/issues/7910
+            "--quota-backend-bytes=100000000".to_string(),
+        ])
+        .stdout(Stdio::from(etcd_stdout_file))
+        .stderr(Stdio::from(etcd_stderr_file))
+        .spawn()
+        .context("Failed to spawn etcd subprocess")?;
+    let pid = etcd_process.id();
+
+    let etcd_pid_file_path = etcd_pid_file_path(env);
+    fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
+        format!(
+            "Failed to create etcd pid file at {}",
+            etcd_pid_file_path.display()
+        )
+    })?;
+
+    Ok(())
+}
+
+pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+    let etcd_path = &env.etcd_broker.etcd_binary_path;
+    println!("Stopping etcd broker at {}", etcd_path.display());
+
+    let etcd_pid_file_path = etcd_pid_file_path(env);
+    let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
+        format!(
+            "Failed to read etcd pid filea at {}",
+            etcd_pid_file_path.display()
+        )
+    })?);
+
+    kill(pid, Signal::SIGTERM).with_context(|| {
+        format!(
+            "Failed to stop etcd with pid {pid} at {}",
+            etcd_pid_file_path.display()
+        )
+    })?;
+
+    Ok(())
+}
+
+fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
+    env.base_data_dir.join("etcd.pid")
+}
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -12,6 +12,7 @@ use std::path::Path;
 use std::process::Command;

 pub mod compute;
+pub mod etcd;
 pub mod local_env;
 pub mod postgresql_conf;
 pub mod safekeeper;
@@ -48,3 +49,12 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
        cmd
    }
 }
+
+fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
+    for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
+        if let Ok(value) = std::env::var(env_key) {
+            cmd = cmd.env(env_key, value);
+        }
+    }
+    cmd
+}
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -4,6 +4,7 @@
 //! script which will use local paths.

 use anyhow::{bail, ensure, Context};
+use reqwest::Url;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
@@ -14,7 +15,7 @@ use std::process::{Command, Stdio};
 use utils::{
    auth::{encode_from_key_file, Claims, Scope},
    postgres_backend::AuthType,
-    zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
+    zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
 };

 use crate::safekeeper::SafekeeperNode;
@@ -59,9 +60,7 @@ pub struct LocalEnv {
    #[serde(default)]
    pub private_key_path: PathBuf,

-    // A comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
-    #[serde(default)]
-    pub broker_endpoints: Option<String>,
+    pub etcd_broker: EtcdBroker,

    pub pageserver: PageServerConf,

@@ -77,11 +76,67 @@ pub struct LocalEnv {
    branch_name_mappings: HashMap<String, Vec<(ZTenantId, ZTimelineId)>>,
 }

+/// Etcd broker config for cluster internal communication.
+#[serde_as]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+pub struct EtcdBroker {
+    /// A prefix to all to any key when pushing/polling etcd from a node.
+    #[serde(default)]
+    pub broker_etcd_prefix: Option<String>,
+
+    /// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
+    #[serde(default)]
+    #[serde_as(as = "Vec<DisplayFromStr>")]
+    pub broker_endpoints: Vec<Url>,
+
+    /// Etcd binary path to use.
+    #[serde(default)]
+    pub etcd_binary_path: PathBuf,
+}
+
+impl EtcdBroker {
+    pub fn locate_etcd() -> anyhow::Result<PathBuf> {
+        let which_output = Command::new("which")
+            .arg("etcd")
+            .output()
+            .context("Failed to run 'which etcd' command")?;
+        let stdout = String::from_utf8_lossy(&which_output.stdout);
+        ensure!(
+            which_output.status.success(),
+            "'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
+            which_output.status,
+            String::from_utf8_lossy(&which_output.stderr)
+        );
+
+        let etcd_path = PathBuf::from(stdout.trim());
+        ensure!(
+            etcd_path.is_file(),
+            "'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
+            etcd_path.display()
+        );
+
+        Ok(etcd_path)
+    }
+
+    pub fn comma_separated_endpoints(&self) -> String {
+        self.broker_endpoints.iter().map(Url::as_str).fold(
+            String::new(),
+            |mut comma_separated_urls, url| {
+                if !comma_separated_urls.is_empty() {
+                    comma_separated_urls.push(',');
+                }
+                comma_separated_urls.push_str(url);
+                comma_separated_urls
+            },
+        )
+    }
+}
+
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
 pub struct PageServerConf {
    // node id
-    pub id: ZNodeId,
+    pub id: NodeId,
    // Pageserver connection settings
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
@@ -96,7 +151,7 @@ pub struct PageServerConf {
 impl Default for PageServerConf {
    fn default() -> Self {
        Self {
-            id: ZNodeId(0),
+            id: NodeId(0),
            listen_pg_addr: String::new(),
            listen_http_addr: String::new(),
            auth_type: AuthType::Trust,
@@ -108,19 +163,23 @@ impl Default for PageServerConf {
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
 pub struct SafekeeperConf {
-    pub id: ZNodeId,
+    pub id: NodeId,
    pub pg_port: u16,
    pub http_port: u16,
    pub sync: bool,
+    pub remote_storage: Option<String>,
+    pub backup_threads: Option<u32>,
 }

 impl Default for SafekeeperConf {
    fn default() -> Self {
        Self {
-            id: ZNodeId(0),
+            id: NodeId(0),
            pg_port: 0,
            http_port: 0,
            sync: true,
+            remote_storage: None,
+            backup_threads: None,
        }
    }
 }
@@ -180,12 +239,7 @@ impl LocalEnv {
            if old_timeline_id == &timeline_id {
                Ok(())
            } else {
-                bail!(
-                    "branch '{}' is already mapped to timeline {}, cannot map to another timeline {}",
-                    branch_name,
-                    old_timeline_id,
-                    timeline_id
-                );
+                bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
            }
        } else {
            existing_values.push((tenant_id, timeline_id));
@@ -221,7 +275,7 @@ impl LocalEnv {
    ///
    /// Unlike 'load_config', this function fills in any defaults that are missing
    /// from the config file.
-    pub fn create_config(toml: &str) -> anyhow::Result<Self> {
+    pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
        let mut env: LocalEnv = toml::from_str(toml)?;

        // Find postgres binaries.
@@ -234,26 +288,11 @@ impl LocalEnv {
                env.pg_distrib_dir = cwd.join("tmp_install")
            }
        }
-        if !env.pg_distrib_dir.join("bin/postgres").exists() {
-            bail!(
-                "Can't find postgres binary at {}",
-                env.pg_distrib_dir.display()
-            );
-        }

        // Find zenith binaries.
        if env.zenith_distrib_dir == Path::new("") {
            env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
        }
-        for binary in ["pageserver", "safekeeper"] {
-            if !env.zenith_distrib_dir.join(binary).exists() {
-                bail!(
-                    "Can't find binary '{}' in zenith distrib dir '{}'",
-                    binary,
-                    env.zenith_distrib_dir.display()
-                );
-            }
-        }

        // If no initial tenant ID was given, generate it.
        if env.default_tenant_id.is_none() {
@@ -342,11 +381,42 @@ impl LocalEnv {
            base_path != Path::new(""),
            "repository base path is missing"
        );
+
        ensure!(
            !base_path.exists(),
            "directory '{}' already exists. Perhaps already initialized?",
            base_path.display()
        );
+        if !self.pg_distrib_dir.join("bin/postgres").exists() {
+            bail!(
+                "Can't find postgres binary at {}",
+                self.pg_distrib_dir.display()
+            );
+        }
+        for binary in ["pageserver", "safekeeper"] {
+            if !self.zenith_distrib_dir.join(binary).exists() {
+                bail!(
+                    "Can't find binary '{}' in zenith distrib dir '{}'",
+                    binary,
+                    self.zenith_distrib_dir.display()
+                );
+            }
+        }
+
+        for binary in ["pageserver", "safekeeper"] {
+            if !self.zenith_distrib_dir.join(binary).exists() {
+                bail!(
+                    "Can't find binary '{binary}' in zenith distrib dir '{}'",
+                    self.zenith_distrib_dir.display()
+                );
+            }
+        }
+        if !self.pg_distrib_dir.join("bin/postgres").exists() {
+            bail!(
+                "Can't find postgres binary at {}",
+                self.pg_distrib_dir.display()
+            );
+        }

        fs::create_dir(&base_path)?;

@@ -404,7 +474,35 @@ impl LocalEnv {

 fn base_path() -> PathBuf {
    match std::env::var_os("ZENITH_REPO_DIR") {
-        Some(val) => PathBuf::from(val.to_str().unwrap()),
-        None => ".zenith".into(),
+        Some(val) => PathBuf::from(val),
+        None => PathBuf::from(".zenith"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn simple_conf_parsing() {
+        let simple_conf_toml = include_str!("../simple.conf");
+        let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
+        assert!(
+            simple_conf_parse_result.is_ok(),
+            "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
+        );
+
+        let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']";
+        let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']";
+        let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
+        assert!(
+            spoiled_url_toml.contains(spoiled_url_str),
+            "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
+        );
+        let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
+        assert!(
+            spoiled_url_parse_result.is_err(),
+            "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
+        );
    }
 }
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -18,12 +18,12 @@ use thiserror::Error;
 use utils::{
    connstring::connection_address,
    http::error::HttpErrorBody,
-    zid::{ZNodeId, ZTenantId, ZTimelineId},
+    zid::{NodeId, ZTenantId, ZTimelineId},
 };

 use crate::local_env::{LocalEnv, SafekeeperConf};
 use crate::storage::PageServerNode;
-use crate::{fill_rust_env_vars, read_pidfile};
+use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};

 #[derive(Error, Debug)]
 pub enum SafekeeperHttpError {
@@ -52,7 +52,7 @@ impl ResponseErrorMessageExt for Response {
        Err(SafekeeperHttpError::Response(
            match self.json::<HttpErrorBody>() {
                Ok(err_body) => format!("Error: {}", err_body.msg),
-                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
+                Err(_) => format!("Http error ({}) at {url}.", status.as_u16()),
            },
        ))
    }
@@ -65,7 +65,7 @@ impl ResponseErrorMessageExt for Response {
 //
 #[derive(Debug)]
 pub struct SafekeeperNode {
-    pub id: ZNodeId,
+    pub id: NodeId,

    pub conf: SafekeeperConf,

@@ -75,16 +75,12 @@ pub struct SafekeeperNode {
    pub http_base_url: String,

    pub pageserver: Arc<PageServerNode>,
-
-    broker_endpoints: Option<String>,
 }

 impl SafekeeperNode {
    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
        let pageserver = Arc::new(PageServerNode::from_env(env));

-        println!("initializing for sk {} for {}", conf.id, conf.http_port);
-
        SafekeeperNode {
            id: conf.id,
            conf: conf.clone(),
@@ -93,7 +89,6 @@ impl SafekeeperNode {
            http_client: Client::new(),
            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
            pageserver,
-            broker_endpoints: env.broker_endpoints.clone(),
        }
    }

@@ -105,7 +100,7 @@ impl SafekeeperNode {
            .unwrap()
    }

-    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf {
+    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
        env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
    }

@@ -140,9 +135,22 @@ impl SafekeeperNode {
        if !self.conf.sync {
            cmd.arg("--no-sync");
        }
-        if let Some(ref ep) = self.broker_endpoints {
-            cmd.args(&["--broker-endpoints", ep]);
+
+        let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
+        if !comma_separated_endpoints.is_empty() {
+            cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
        }
+        if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
+            cmd.args(&["--broker-etcd-prefix", prefix]);
+        }
+        if let Some(threads) = self.conf.backup_threads {
+            cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
+        }
+        if let Some(ref remote_storage) = self.conf.remote_storage {
+            cmd.args(&["--remote-storage", remote_storage]);
+        }
+
+        fill_aws_secrets_vars(&mut cmd);

        if !cmd.status()?.success() {
            bail!(
@@ -205,12 +213,13 @@ impl SafekeeperNode {
        let pid = Pid::from_raw(pid);

        let sig = if immediate {
-            println!("Stop safekeeper immediately");
+            print!("Stopping safekeeper {} immediately..", self.id);
            Signal::SIGQUIT
        } else {
-            println!("Stop safekeeper gracefully");
+            print!("Stopping safekeeper {} gracefully..", self.id);
            Signal::SIGTERM
        };
+        io::stdout().flush().unwrap();
        match kill(pid, sig) {
            Ok(_) => (),
            Err(Errno::ESRCH) => {
@@ -232,25 +241,35 @@ impl SafekeeperNode {
        // TODO Remove this "timeout" and handle it on caller side instead.
        // Shutting down may take a long time,
        // if safekeeper flushes a lot of data
+        let mut tcp_stopped = false;
        for _ in 0..100 {
-            if let Err(_e) = TcpStream::connect(&address) {
-                println!("Safekeeper stopped receiving connections");
-
-                //Now check status
-                match self.check_status() {
-                    Ok(_) => {
-                        println!("Safekeeper status is OK. Wait a bit.");
-                        thread::sleep(Duration::from_secs(1));
-                    }
-                    Err(err) => {
-                        println!("Safekeeper status is: {}", err);
-                        return Ok(());
+            if !tcp_stopped {
+                if let Err(err) = TcpStream::connect(&address) {
+                    tcp_stopped = true;
+                    if err.kind() != io::ErrorKind::ConnectionRefused {
+                        eprintln!("\nSafekeeper connection failed with error: {err}");
                    }
                }
-            } else {
-                println!("Safekeeper still receives connections");
-                thread::sleep(Duration::from_secs(1));
            }
+            if tcp_stopped {
+                // Also check status on the HTTP port
+                match self.check_status() {
+                    Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
+                        println!("done!");
+                        return Ok(());
+                    }
+                    Err(err) => {
+                        eprintln!("\nSafekeeper status check failed with error: {err}");
+                        return Ok(());
+                    }
+                    Ok(()) => {
+                        // keep waiting
+                    }
+                }
+            }
+            print!(".");
+            io::stdout().flush().unwrap();
+            thread::sleep(Duration::from_secs(1));
        }

        bail!("Failed to stop safekeeper with pid {}", pid);
@@ -275,7 +294,7 @@ impl SafekeeperNode {
        &self,
        tenant_id: ZTenantId,
        timeline_id: ZTimelineId,
-        peer_ids: Vec<ZNodeId>,
+        peer_ids: Vec<NodeId>,
    ) -> Result<()> {
        Ok(self
            .http_request(
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -25,7 +25,7 @@ use utils::{
 };

 use crate::local_env::LocalEnv;
-use crate::{fill_rust_env_vars, read_pidfile};
+use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
 use pageserver::tenant_mgr::TenantInfo;

 #[derive(Error, Debug)]
@@ -121,6 +121,16 @@ impl PageServerNode {
        );
        let listen_pg_addr_param =
            format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
+        let broker_endpoints_param = format!(
+            "broker_endpoints=[{}]",
+            self.env
+                .etcd_broker
+                .broker_endpoints
+                .iter()
+                .map(|url| format!("'{url}'"))
+                .collect::<Vec<_>>()
+                .join(",")
+        );
        let mut args = Vec::with_capacity(20);

        args.push("--init");
@@ -129,8 +139,19 @@ impl PageServerNode {
        args.extend(["-c", &authg_type_param]);
        args.extend(["-c", &listen_http_addr_param]);
        args.extend(["-c", &listen_pg_addr_param]);
+        args.extend(["-c", &broker_endpoints_param]);
        args.extend(["-c", &id]);

+        let broker_etcd_prefix_param = self
+            .env
+            .etcd_broker
+            .broker_etcd_prefix
+            .as_ref()
+            .map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
+        if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
+            args.extend(["-c", broker_etcd_prefix_param]);
+        }
+
        for config_override in config_overrides {
            args.extend(["-c", config_override]);
        }
@@ -167,6 +188,9 @@ impl PageServerNode {
            );
        }

+        // echo the captured output of the init command
+        println!("{}", String::from_utf8_lossy(&init_output.stdout));
+
        Ok(initial_timeline_id)
    }

@@ -186,8 +210,6 @@ impl PageServerNode {
        );
        io::stdout().flush().unwrap();

-        let mut cmd = Command::new(self.env.pageserver_bin()?);
-
        let repo_path = self.repo_path();
        let mut args = vec!["-D", repo_path.to_str().unwrap()];

@@ -195,9 +217,11 @@ impl PageServerNode {
            args.extend(["-c", config_override]);
        }

-        fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
+        let mut cmd = Command::new(self.env.pageserver_bin()?);
+        let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
+        filled_cmd = fill_aws_secrets_vars(filled_cmd);

-        if !cmd.status()?.success() {
+        if !filled_cmd.status()?.success() {
            bail!(
                "Pageserver failed to start. See '{}' for details.",
                self.repo_path().join("pageserver.log").display()
@@ -257,12 +281,13 @@ impl PageServerNode {
        let pid = Pid::from_raw(read_pidfile(&pid_file)?);

        let sig = if immediate {
-            println!("Stop pageserver immediately");
+            print!("Stopping pageserver immediately..");
            Signal::SIGQUIT
        } else {
-            println!("Stop pageserver gracefully");
+            print!("Stopping pageserver gracefully..");
            Signal::SIGTERM
        };
+        io::stdout().flush().unwrap();
        match kill(pid, sig) {
            Ok(_) => (),
            Err(Errno::ESRCH) => {
@@ -284,25 +309,36 @@ impl PageServerNode {
        // TODO Remove this "timeout" and handle it on caller side instead.
        // Shutting down may take a long time,
        // if pageserver checkpoints a lot of data
+        let mut tcp_stopped = false;
        for _ in 0..100 {
-            if let Err(_e) = TcpStream::connect(&address) {
-                println!("Pageserver stopped receiving connections");
-
-                //Now check status
-                match self.check_status() {
-                    Ok(_) => {
-                        println!("Pageserver status is OK. Wait a bit.");
-                        thread::sleep(Duration::from_secs(1));
-                    }
-                    Err(err) => {
-                        println!("Pageserver status is: {}", err);
-                        return Ok(());
+            if !tcp_stopped {
+                if let Err(err) = TcpStream::connect(&address) {
+                    tcp_stopped = true;
+                    if err.kind() != io::ErrorKind::ConnectionRefused {
+                        eprintln!("\nPageserver connection failed with error: {err}");
                    }
                }
-            } else {
-                println!("Pageserver still receives connections");
-                thread::sleep(Duration::from_secs(1));
            }
+            if tcp_stopped {
+                // Also check status on the HTTP port
+
+                match self.check_status() {
+                    Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
+                        println!("done!");
+                        return Ok(());
+                    }
+                    Err(err) => {
+                        eprintln!("\nPageserver status check failed with error: {err}");
+                        return Ok(());
+                    }
+                    Ok(()) => {
+                        // keep waiting
+                    }
+                }
+            }
+            print!(".");
+            io::stdout().flush().unwrap();
+            thread::sleep(Duration::from_secs(1));
        }

        bail!("Failed to stop pageserver with pid {}", pid);
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -1,13 +1,20 @@
 #!/bin/sh
 set -eux

+broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
+if [ "$broker_endpoints_param" != "absent" ]; then
+    broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
+else
+    broker_endpoints_param=''
+fi
+
 if [ "$1" = 'pageserver' ]; then
    if [ ! -d "/data/tenants" ]; then
        echo "Initializing pageserver data directory"
-        pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10"
+        pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
    fi
    echo "Staring pageserver at 0.0.0.0:6400"
-    pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data
+    pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data
 else
    "$@"
 fi
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -1,20 +1,20 @@
-# Docker images of Zenith
+# Docker images of Neon

 ## Images

 Currently we build two main images:

- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
+- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
+- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).

-And additional intermediate images:
+And additional intermediate image:

- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.
+- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.

 ## Building pipeline

-1. Image `zenithdb/compute-tools` is re-built automatically.
+We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs

-2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
+1. `neondatabase/compute-tools` and `neondatabase/compute-node`

-3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.
+2. `neondatabase/neon`
--- a/docs/glossary.md
+++ b/docs/glossary.md
@@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup.

 ### Branch

-We can create branch at certain LSN using `zenith timeline branch` command.
+We can create branch at certain LSN using `neon_local timeline branch` command.
 Each Branch lives in a corresponding timeline[] and has an ancestor[].


@@ -91,7 +91,7 @@ The layer map tracks what layers exist in a timeline.

 ### Layered repository

-Zenith repository implementation that keeps data in layers.
+Neon repository implementation that keeps data in layers.
 ### LSN

 The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log.
@@ -101,7 +101,7 @@ It is printed as two hexadecimal numbers of up to 8 digits each, separated by a
 Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html)
 Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery.

-In postgres and Zenith lsns are used to describe certain points in WAL handling.
+In Postgres and Neon LSNs are used to describe certain points in WAL handling.

 PostgreSQL LSNs and functions to monitor them:
 * `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location.
@@ -111,13 +111,13 @@ PostgreSQL LSNs and functions to monitor them:
 * `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
 [source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):

-Zenith safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
+Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
 * `CommitLSN`: position in WAL confirmed by quorum safekeepers.
 * `RestartLSN`: position in WAL confirmed by all safekeepers.
 * `FlushLSN`: part of WAL persisted to the disk by safekeeper.
 * `VCL`: the largerst LSN for which we can guarantee availablity of all prior records.

-Zenith pageserver LSNs:
+Neon pageserver LSNs:
 * `last_record_lsn` - the end of last processed WAL record.
 * `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN.
 * `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash.
@@ -132,7 +132,7 @@ This is the unit of data exchange between compute node and pageserver.

 ### Pageserver

-Zenith storage engine: repositories + wal receiver + page service + wal redo.
+Neon storage engine: repositories + wal receiver + page service + wal redo.

 ### Page service

@@ -184,10 +184,10 @@ relation exceeds that size, it is split into multiple segments.
 SLRUs include pg_clog, pg_multixact/members, and
 pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
 they don't need to be stored permanently (e.g. pg_subtrans),
-or we do not support them in zenith yet (pg_commit_ts).
+or we do not support them in neon yet (pg_commit_ts).

 ### Tenant (Multitenancy)
-Tenant represents a single customer, interacting with Zenith.
+Tenant represents a single customer, interacting with Neon.
 Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
 One pageserver[] can serve multiple tenants at once.
 One safekeeper
--- a/docs/rfcs/004-durability.md
+++ b/docs/rfcs/004-durability.md
@@ -22,7 +22,7 @@ In addition to the WAL safekeeper nodes, the WAL is archived in
 S3. WAL that has been archived to S3 can be removed from the
 safekeepers, so the safekeepers don't need a lot of disk space.

-
+```
                                +----------------+
                        +-----> | WAL safekeeper |
                        |       +----------------+
@@ -42,23 +42,23 @@ safekeepers, so the safekeepers don't need a lot of disk space.
                  \
                   \
                    \
-                     \      +--------+
-					  \		|        |
-					   +-->	|   S3   |
-							|        |
-                            +--------+
-
+                     \          +--------+
+                      \         |        |
+                       +------> |   S3   |
+                                |        |
+                                +--------+

+```
 Every WAL safekeeper holds a section of WAL, and a VCL value.
 The WAL can be divided into three portions:

-
+```
                                    VCL                   LSN
                                     |                     |
                                     V                     V
 .................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
 Archived WAL       Completed WAL          In-flight WAL
-
+```

 Note that all this WAL kept in a safekeeper is a contiguous section.
 This is different from Aurora: In Aurora, there can be holes in the
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -6,7 +6,6 @@ If there's no such file during `init` phase of the server, it creates the file i
 There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override
 the values in the config file, if any are specified for the same key and get into the final config during init phase.

-
 ### Config example

 ```toml
@@ -26,18 +25,22 @@ max_file_descriptors = '100'
 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'zenith_admin'

+broker_etcd_prefix = 'neon'
+broker_endpoints = ['some://etcd']
+
 # [remote_storage]
 ```

-The config above shows default values for all basic pageserver settings.
+The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, 
+see the corresponding section below.
 Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank.
 Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start.

 Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and

-* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
+- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`

-* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
+- or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`

 ### Config values

@@ -47,6 +50,17 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage=

 Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.

+#### broker_endpoints
+
+A list of endpoints (etcd currently) to connect and pull the information from.
+Mandatory, does not have a default, since requires etcd to be started as a separate process,
+and its connection url should be specified separately. 
+
+#### broker_etcd_prefix
+
+A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster.
+Default is `neon`.
+
 #### checkpoint_distance

 `checkpoint_distance` is the amount of incoming WAL that is held in
@@ -57,7 +71,7 @@ but it will trigger a checkpoint operation to get it back below the
 limit.

 `checkpoint_distance` also determines how much WAL needs to be kept
-durable in the safekeeper.  The safekeeper must have capacity to hold
+durable in the safekeeper. The safekeeper must have capacity to hold
 this much WAL, with some headroom, otherwise you can get stuck in a
 situation where the safekeeper is full and stops accepting new WAL,
 but the pageserver is not flushing out and releasing the space in the
@@ -72,7 +86,7 @@ The unit is # of bytes.

 Every `compaction_period` seconds, the page server checks if
 maintenance operations, like compaction, are needed on the layer
-files.  Default is 1 s, which should be fine.
+files. Default is 1 s, which should be fine.

 #### compaction_target_size

@@ -163,16 +177,12 @@ bucket_region = 'eu-north-1'
 # Optional, pageserver uses entire bucket if the prefix is not specified.
 prefix_in_bucket = '/some/prefix/'

-# Access key to connect to the bucket ("login" part of the credentials)
-access_key_id = 'SOMEKEYAAAAASADSAH*#'
-
-# Secret access key to connect to the bucket ("password" part of the credentials)
-secret_access_key = 'SOMEsEcReTsd292v'
-
 # S3 API query limit to avoid getting errors/throttling from AWS.
 concurrency_limit = 100
 ```

+If no IAM bucket access is used during the remote storage usage, use the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to set the access credentials.
+
 ###### General remote storage configuration

 Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used.
@@ -183,13 +193,12 @@ Besides, there are parameters common for all types of remote storage that can be
 ```toml
 [remote_storage]
 # Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time.
-max_concurrent_timelines_sync = 50
+max_concurrent_syncs = 50

 # Max number of errors a single task can have before it's considered failed and not attempted to run anymore.
 max_sync_errors = 10
 ```

-
 ## safekeeper

 TODO
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -91,18 +91,22 @@ so manual installation of dependencies is not recommended.
 A single virtual environment with all dependencies is described in the single `Pipfile`.

 ### Prerequisites
- Install Python 3.7 (the minimal supported version) or greater.
+- Install Python 3.9 (the minimal supported version) or greater.
    - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected.
-    - If you have some trouble with other version you can resolve it by installing Python 3.7 separately, via pyenv or via system package manager e.g.:
+    - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.:
      ```bash
      # In Ubuntu
      sudo add-apt-repository ppa:deadsnakes/ppa
      sudo apt update
-      sudo apt install python3.7
+      sudo apt install python3.9
      ```
 - Install `poetry`
    - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`.
- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7 so if you have different version some linting tools can yield different result locally vs in the CI.
+- Install dependencies via `./scripts/pysync`.
+    - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile))
+      so if you have different version some linting tools can yield different result locally vs in the CI.
+    - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`.
+      This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning.

 Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
--- a/libs/etcd_broker/Cargo.toml
+++ b/libs/etcd_broker/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+ name = "etcd_broker"
+ version = "0.1.0"
+ edition = "2021"
+
+ [dependencies]
+ etcd-client = "0.9.0"
+ regex = "1.4.5"
+ serde = { version = "1.0", features = ["derive"] }
+ serde_json = "1"
+ serde_with = "1.12.0"
+
+ utils = { path = "../utils" }
+ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+ tokio = "1"
+ tracing = "0.1"
+ thiserror = "1"
--- a/libs/etcd_broker/src/lib.rs
+++ b/libs/etcd_broker/src/lib.rs
@@ -0,0 +1,348 @@
+//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
+//! Intended to connect services to each other, not to store their data.
+use std::{
+    collections::{hash_map, HashMap},
+    fmt::Display,
+    str::FromStr,
+};
+
+use regex::{Captures, Regex};
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+
+pub use etcd_client::*;
+
+use tokio::{sync::mpsc, task::JoinHandle};
+use tracing::*;
+use utils::{
+    lsn::Lsn,
+    zid::{NodeId, ZTenantId, ZTenantTimelineId},
+};
+
+/// Default value to use for prefixing to all etcd keys with.
+/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
+pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
+
+#[derive(Debug, Deserialize, Serialize)]
+struct SafekeeperTimeline {
+    safekeeper_id: NodeId,
+    info: SkTimelineInfo,
+}
+
+/// Published data about safekeeper's timeline. Fields made optional for easy migrations.
+#[serde_as]
+#[derive(Debug, Deserialize, Serialize)]
+pub struct SkTimelineInfo {
+    /// Term of the last entry.
+    pub last_log_term: Option<u64>,
+    /// LSN of the last record.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub flush_lsn: Option<Lsn>,
+    /// Up to which LSN safekeeper regards its WAL as committed.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub commit_lsn: Option<Lsn>,
+    /// LSN up to which safekeeper has backed WAL.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub backup_lsn: Option<Lsn>,
+    /// LSN of last checkpoint uploaded by pageserver.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub remote_consistent_lsn: Option<Lsn>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub peer_horizon_lsn: Option<Lsn>,
+    #[serde(default)]
+    pub safekeeper_connection_string: Option<String>,
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum BrokerError {
+    #[error("Etcd client error: {0}. Context: {1}")]
+    EtcdClient(etcd_client::Error, String),
+    #[error("Error during parsing etcd data: {0}")]
+    ParsingError(String),
+    #[error("Internal error: {0}")]
+    InternalError(String),
+}
+
+/// A way to control the data retrieval from a certain subscription.
+pub struct SkTimelineSubscription {
+    safekeeper_timeline_updates:
+        mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>>,
+    kind: SkTimelineSubscriptionKind,
+    watcher_handle: JoinHandle<Result<(), BrokerError>>,
+    watcher: Watcher,
+}
+
+impl SkTimelineSubscription {
+    /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
+    pub async fn fetch_data(
+        &mut self,
+    ) -> Option<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>> {
+        self.safekeeper_timeline_updates.recv().await
+    }
+
+    /// Cancels the subscription, stopping the data poller and waiting for it to shut down.
+    pub async fn cancel(mut self) -> Result<(), BrokerError> {
+        self.watcher.cancel().await.map_err(|e| {
+            BrokerError::EtcdClient(
+                e,
+                format!(
+                    "Failed to cancel timeline subscription, kind: {:?}",
+                    self.kind
+                ),
+            )
+        })?;
+        self.watcher_handle.await.map_err(|e| {
+            BrokerError::InternalError(format!(
+                "Failed to join the timeline updates task, kind: {:?}, error: {e}",
+                self.kind
+            ))
+        })?
+    }
+}
+
+/// The subscription kind to the timeline updates from safekeeper.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct SkTimelineSubscriptionKind {
+    broker_etcd_prefix: String,
+    kind: SubscriptionKind,
+}
+
+impl SkTimelineSubscriptionKind {
+    pub fn all(broker_etcd_prefix: String) -> Self {
+        Self {
+            broker_etcd_prefix,
+            kind: SubscriptionKind::All,
+        }
+    }
+
+    pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self {
+        Self {
+            broker_etcd_prefix,
+            kind: SubscriptionKind::Tenant(tenant),
+        }
+    }
+
+    pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self {
+        Self {
+            broker_etcd_prefix,
+            kind: SubscriptionKind::Timeline(timeline),
+        }
+    }
+
+    fn watch_regex(&self) -> Regex {
+        match self.kind {
+            SubscriptionKind::All => Regex::new(&format!(
+                r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
+                self.broker_etcd_prefix
+            ))
+            .expect("wrong regex for 'everything' subscription"),
+            SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!(
+                r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
+                self.broker_etcd_prefix
+            ))
+            .expect("wrong regex for 'tenant' subscription"),
+            SubscriptionKind::Timeline(ZTenantTimelineId {
+                tenant_id,
+                timeline_id,
+            }) => Regex::new(&format!(
+                r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$",
+                self.broker_etcd_prefix
+            ))
+            .expect("wrong regex for 'timeline' subscription"),
+        }
+    }
+
+    /// Etcd key to use for watching a certain timeline updates from safekeepers.
+    pub fn watch_key(&self) -> String {
+        match self.kind {
+            SubscriptionKind::All => self.broker_etcd_prefix.to_string(),
+            SubscriptionKind::Tenant(tenant_id) => {
+                format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix)
+            }
+            SubscriptionKind::Timeline(ZTenantTimelineId {
+                tenant_id,
+                timeline_id,
+            }) => format!(
+                "{}/{tenant_id}/{timeline_id}/safekeeper",
+                self.broker_etcd_prefix
+            ),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+enum SubscriptionKind {
+    /// Get every timeline update.
+    All,
+    /// Get certain tenant timelines' updates.
+    Tenant(ZTenantId),
+    /// Get certain timeline updates.
+    Timeline(ZTenantTimelineId),
+}
+
+/// Creates a background task to poll etcd for timeline updates from safekeepers.
+/// Stops and returns `Err` on any error during etcd communication.
+/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
+/// exiting normally in such cases.
+pub async fn subscribe_to_safekeeper_timeline_updates(
+    client: &mut Client,
+    subscription: SkTimelineSubscriptionKind,
+) -> Result<SkTimelineSubscription, BrokerError> {
+    info!("Subscribing to timeline updates, subscription kind: {subscription:?}");
+
+    let (watcher, mut stream) = client
+        .watch(
+            subscription.watch_key(),
+            Some(WatchOptions::new().with_prefix()),
+        )
+        .await
+        .map_err(|e| {
+            BrokerError::EtcdClient(
+                e,
+                format!("Failed to init the watch for subscription {subscription:?}"),
+            )
+        })?;
+
+    let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel();
+
+    let subscription_kind = subscription.kind;
+    let regex = subscription.watch_regex();
+    let watcher_handle = tokio::spawn(async move {
+        while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
+            "Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}"
+        )))? {
+            if resp.canceled() {
+                info!("Watch for timeline updates subscription was canceled, exiting");
+                break;
+            }
+
+            let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>> = HashMap::new();
+            // Keep track that the timeline data updates from etcd arrive in the right order.
+            // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
+            // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
+            let mut timeline_etcd_versions: HashMap<ZTenantTimelineId, i64> = HashMap::new();
+
+
+            let events = resp.events();
+            debug!("Processing {} events", events.len());
+
+            for event in events {
+                if EventType::Put == event.event_type() {
+                    if let Some(new_etcd_kv) = event.kv() {
+                        let new_kv_version = new_etcd_kv.version();
+
+                        match parse_etcd_key_value(subscription_kind, &regex, new_etcd_kv) {
+                            Ok(Some((zttid, timeline))) => {
+                                match timeline_updates
+                                    .entry(zttid)
+                                    .or_default()
+                                    .entry(timeline.safekeeper_id)
+                                {
+                                    hash_map::Entry::Occupied(mut o) => {
+                                        let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN);
+                                        if old_etcd_kv_version < new_kv_version {
+                                            o.insert(timeline.info);
+                                            timeline_etcd_versions.insert(zttid,new_kv_version);
+                                        }
+                                    }
+                                    hash_map::Entry::Vacant(v) => {
+                                        v.insert(timeline.info);
+                                        timeline_etcd_versions.insert(zttid,new_kv_version);
+                                    }
+                                }
+                            }
+                            Ok(None) => {}
+                            Err(e) => error!("Failed to parse timeline update: {e}"),
+                        };
+                    }
+                }
+            }
+
+            if let Err(e) = timeline_updates_sender.send(timeline_updates) {
+                info!("Timeline updates sender got dropped, exiting: {e}");
+                break;
+            }
+        }
+
+        Ok(())
+    });
+
+    Ok(SkTimelineSubscription {
+        kind: subscription,
+        safekeeper_timeline_updates,
+        watcher_handle,
+        watcher,
+    })
+}
+
+fn parse_etcd_key_value(
+    subscription_kind: SubscriptionKind,
+    regex: &Regex,
+    kv: &KeyValue,
+) -> Result<Option<(ZTenantTimelineId, SafekeeperTimeline)>, BrokerError> {
+    let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| {
+        BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str"))
+    })?) {
+        caps
+    } else {
+        return Ok(None);
+    };
+
+    let (zttid, safekeeper_id) = match subscription_kind {
+        SubscriptionKind::All => (
+            ZTenantTimelineId::new(
+                parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
+                parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?,
+            ),
+            NodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
+        ),
+        SubscriptionKind::Tenant(tenant_id) => (
+            ZTenantTimelineId::new(
+                tenant_id,
+                parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
+            ),
+            NodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
+        ),
+        SubscriptionKind::Timeline(zttid) => (
+            zttid,
+            NodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
+        ),
+    };
+
+    let info_str = kv.value_str().map_err(|e| {
+        BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str"))
+    })?;
+    Ok(Some((
+        zttid,
+        SafekeeperTimeline {
+            safekeeper_id,
+            info: serde_json::from_str(info_str).map_err(|e| {
+                BrokerError::ParsingError(format!(
+                    "Failed to parse '{info_str}' as safekeeper timeline info: {e}"
+                ))
+            })?,
+        },
+    )))
+}
+
+fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
+where
+    T: FromStr,
+    <T as FromStr>::Err: Display,
+{
+    let capture_match = caps
+        .get(index)
+        .ok_or_else(|| format!("Failed to get capture match at index {index}"))?
+        .as_str();
+    capture_match.parse().map_err(|e| {
+        format!(
+            "Failed to parse {} from {capture_match}: {e}",
+            std::any::type_name::<T>()
+        )
+    })
+}
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
-prometheus = {version = "0.13", default_features=false} # removes protobuf dependency
+prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 libc = "0.2"
 lazy_static = "1.4"
 once_cell = "1.8.0"
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -3,7 +3,6 @@
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
 use lazy_static::lazy_static;
-use once_cell::race::OnceBox;
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_gauge, Gauge};
 pub use prometheus::{register_gauge_vec, GaugeVec};
@@ -27,48 +26,15 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
    prometheus::gather()
 }

-static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new();
-
-/// Sets a prefix which will be used for all common metrics, typically a service
-/// name like 'pageserver'. Should be executed exactly once in the beginning of
-/// any executable which uses common metrics.
-pub fn set_common_metrics_prefix(prefix: &'static str) {
-    // Not unwrap() because metrics may be initialized after multiple threads have been started.
-    COMMON_METRICS_PREFIX
-        .set(prefix.into())
-        .unwrap_or_else(|_| {
-            eprintln!(
-                "set_common_metrics_prefix() was called second time with '{}', exiting",
-                prefix
-            );
-            std::process::exit(1);
-        });
-}
-
-/// Prepends a prefix to a common metric name so they are distinguished between
-/// different services, see <https://github.com/zenithdb/zenith/pull/681>
-/// A call to set_common_metrics_prefix() is necessary prior to calling this.
-pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String {
-    // Not unwrap() because metrics may be initialized after multiple threads have been started.
-    format!(
-        "{}_{}",
-        COMMON_METRICS_PREFIX.get().unwrap_or_else(|| {
-            eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting");
-            std::process::exit(1);
-        }),
-        unprefixed_metric_name
-    )
-}
-
 lazy_static! {
    static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
-        new_common_metric_name("disk_io_bytes"),
+        "libmetrics_disk_io_bytes_total",
        "Bytes written and read from disk, grouped by the operation (read|write)",
        &["io_operation"]
    )
    .expect("Failed to register disk i/o bytes int gauge vec");
    static ref MAXRSS_KB: IntGauge = register_int_gauge!(
-        new_common_metric_name("maxrss_kb"),
+        "libmetrics_maxrss_kb",
        "Memory usage (Maximum Resident Set Size)"
    )
    .expect("Failed to register maxrss_kb int gauge");
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -20,5 +20,10 @@ serde = { version = "1.0", features = ["derive"] }
 utils = { path = "../utils" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

+[dev-dependencies]
+env_logger = "0.9"
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+wal_generate = { path = "wal_generate" }
+
 [build-dependencies]
 bindgen = "0.59.1"
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 #![allow(deref_nullptr)]

 use serde::{Deserialize, Serialize};
+use utils::lsn::Lsn;

 include!(concat!(env!("OUT_DIR"), "/bindings.rs"));

@@ -37,3 +38,21 @@ pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) ->
    let diff = id1.wrapping_sub(id2) as i32;
    diff < 0
 }
+
+// Check if page is not yet initialized (port of Postgres PageIsInit() macro)
+pub fn page_is_new(pg: &[u8]) -> bool {
+    pg[14] == 0 && pg[15] == 0 // pg_upper == 0
+}
+
+// ExtractLSN from page header
+pub fn page_get_lsn(pg: &[u8]) -> Lsn {
+    Lsn(
+        ((u32::from_le_bytes(pg[0..4].try_into().unwrap()) as u64) << 32)
+            | u32::from_le_bytes(pg[4..8].try_into().unwrap()) as u64,
+    )
+}
+
+pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
+    pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
+    pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
+}
--- a/libs/postgres_ffi/src/waldecoder.rs
+++ b/libs/postgres_ffi/src/waldecoder.rs
@@ -89,7 +89,12 @@ impl WalStreamDecoder {
                    return Ok(None);
                }

-                let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
+                let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
+                    WalDecodeError {
+                        msg: format!("long header deserialization failed {}", e),
+                        lsn: self.lsn,
+                    }
+                })?;

                if hdr.std.xlp_pageaddr != self.lsn.0 {
                    return Err(WalDecodeError {
@@ -106,7 +111,12 @@ impl WalStreamDecoder {
                    return Ok(None);
                }

-                let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
+                let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
+                    WalDecodeError {
+                        msg: format!("header deserialization failed {}", e),
+                        lsn: self.lsn,
+                    }
+                })?;

                if hdr.xlp_pageaddr != self.lsn.0 {
                    return Err(WalDecodeError {
@@ -188,7 +198,13 @@ impl WalStreamDecoder {
        }

        // We now have a record in the 'recordbuf' local variable.
-        let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
+        let xlogrec =
+            XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
+                WalDecodeError {
+                    msg: format!("xlog record deserialization failed {}", e),
+                    lsn: self.lsn,
+                }
+            })?;

        let mut crc = 0;
        crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -15,7 +15,7 @@ use crate::XLogPageHeaderData;
 use crate::XLogRecord;
 use crate::XLOG_PAGE_MAGIC;

-use anyhow::{bail, Result};
+use anyhow::{bail, ensure};
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::BytesMut;
 use bytes::{Buf, Bytes};
@@ -28,6 +28,9 @@ use std::io::prelude::*;
 use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
 use std::time::SystemTime;
+use utils::bin_ser::DeserializeError;
+use utils::bin_ser::SerializeError;
+use utils::const_assert;
 use utils::lsn::Lsn;

 pub const XLOG_FNAME_LEN: usize = 24;
@@ -144,11 +147,12 @@ fn find_end_of_wal_segment(
    tli: TimeLineID,
    wal_seg_size: usize,
    start_offset: usize, // start reading at this point
-) -> Result<u32> {
+) -> anyhow::Result<u32> {
    // step back to the beginning of the page to read it in...
    let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ;
+    let mut skipping_first_contrecord: bool = false;
    let mut contlen: usize = 0;
-    let mut wal_crc: u32 = 0;
+    let mut xl_crc: u32 = 0;
    let mut crc: u32 = 0;
    let mut rec_offs: usize = 0;
    let mut buf = [0u8; XLOG_BLCKSZ];
@@ -156,11 +160,15 @@ fn find_end_of_wal_segment(
    let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
    let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
    file.seek(SeekFrom::Start(offs as u64))?;
+    // xl_crc is the last field in XLogRecord, will not be read into rec_hdr
+    const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
    let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];

+    trace!("find_end_of_wal_segment(data_dir={}, segno={}, tli={}, wal_seg_size={}, start_offset=0x{:x})", data_dir.display(), segno, tli, wal_seg_size, start_offset);
    while offs < wal_seg_size {
        // we are at the beginning of the page; read it in
        if offs % XLOG_BLCKSZ == 0 {
+            trace!("offs=0x{:x}: new page", offs);
            let bytes_read = file.read(&mut buf)?;
            if bytes_read != buf.len() {
                bail!(
@@ -174,30 +182,49 @@ fn find_end_of_wal_segment(
            let xlp_magic = LittleEndian::read_u16(&buf[0..2]);
            let xlp_info = LittleEndian::read_u16(&buf[2..4]);
            let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]);
+            trace!(
+                "  xlp_magic=0x{:x}, xlp_info=0x{:x}, xlp_rem_len={}",
+                xlp_magic,
+                xlp_info,
+                xlp_rem_len
+            );
            // this is expected in current usage when valid WAL starts after page header
            if xlp_magic != XLOG_PAGE_MAGIC as u16 {
                trace!(
-                    "invalid WAL file {}.partial magic {} at {:?}",
+                    "  invalid WAL file {}.partial magic {} at {:?}",
                    file_name,
                    xlp_magic,
                    Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
                );
            }
            if offs == 0 {
-                offs = XLOG_SIZE_OF_XLOG_LONG_PHD;
+                offs += XLOG_SIZE_OF_XLOG_LONG_PHD;
                if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 {
-                    offs += ((xlp_rem_len + 7) & !7) as usize;
+                    trace!("  first record is contrecord");
+                    skipping_first_contrecord = true;
+                    contlen = xlp_rem_len as usize;
+                    if offs < start_offset {
+                        // Pre-condition failed: the beginning of the segment is unexpectedly corrupted.
+                        ensure!(start_offset - offs >= contlen,
+                            "start_offset is in the middle of the first record (which happens to be a contrecord), \
+                             expected to be on a record boundary. Is beginning of the segment corrupted?");
+                        contlen = 0;
+                        // keep skipping_first_contrecord to avoid counting the contrecord as valid, we did not check it.
+                    }
+                } else {
+                    trace!("  first record is not contrecord");
                }
            } else {
                offs += XLOG_SIZE_OF_XLOG_SHORT_PHD;
            }
            // ... and step forward again if asked
+            trace!("  skipped header to 0x{:x}", offs);
            offs = max(offs, start_offset);
-
        // beginning of the next record
        } else if contlen == 0 {
            let page_offs = offs % XLOG_BLCKSZ;
            let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
+            trace!("offs=0x{:x}: new record, xl_tot_len={}", offs, xl_tot_len);
            if xl_tot_len == 0 {
                info!(
                    "find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}",
@@ -210,10 +237,25 @@ fn find_end_of_wal_segment(
                );
                break; // zeros, reached the end
            }
-            last_valid_rec_pos = offs;
+            if skipping_first_contrecord {
+                skipping_first_contrecord = false;
+                trace!("  first contrecord has been just completed");
+            } else {
+                trace!(
+                    "  updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
+                    last_valid_rec_pos,
+                    offs
+                );
+                last_valid_rec_pos = offs;
+            }
            offs += 4;
            rec_offs = 4;
            contlen = xl_tot_len - 4;
+            trace!(
+                "  reading rec_hdr[0..4] <-- [0x{:x}; 0x{:x})",
+                page_offs,
+                page_offs + 4
+            );
            rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]);
        } else {
            // we're continuing a record, possibly from previous page.
@@ -222,42 +264,118 @@ fn find_end_of_wal_segment(

            // read the rest of the record, or as much as fits on this page.
            let n = min(contlen, pageleft);
-            // fill rec_hdr (header up to (but not including) xl_crc field)
+            trace!(
+                "offs=0x{:x}, record continuation, pageleft={}, contlen={}",
+                offs,
+                pageleft,
+                contlen
+            );
+            // fill rec_hdr header up to (but not including) xl_crc field
+            trace!(
+                "  rec_offs={}, XLOG_RECORD_CRC_OFFS={}, XLOG_SIZE_OF_XLOG_RECORD={}",
+                rec_offs,
+                XLOG_RECORD_CRC_OFFS,
+                XLOG_SIZE_OF_XLOG_RECORD
+            );
            if rec_offs < XLOG_RECORD_CRC_OFFS {
                let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n);
+                trace!(
+                    "  reading rec_hdr[{}..{}] <-- [0x{:x}; 0x{:x})",
+                    rec_offs,
+                    rec_offs + len,
+                    page_offs,
+                    page_offs + len
+                );
                rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]);
            }
            if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD {
                let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS;
-                wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
+                // All records are aligned on 8-byte boundary, so their 8-byte frames
+                // cannot be split between pages. As xl_crc is the last field,
+                // its content is always on the same page.
+                const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4);
+                // We should always start reading aligned records even in incorrect WALs so if
+                // the condition is false it is likely a bug. However, it is localized somewhere
+                // in this function, hence we do not crash and just report failure instead.
+                ensure!(crc_offs % 8 == 4, "Record is not aligned properly (bug?)");
+                xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
+                trace!(
+                    "  reading xl_crc: [0x{:x}; 0x{:x}) = 0x{:x}",
+                    crc_offs,
+                    crc_offs + 4,
+                    xl_crc
+                );
                crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]);
-            } else {
-                crc ^= 0xFFFFFFFFu32;
+                trace!(
+                    "  initializing crc: [0x{:x}; 0x{:x}); crc = 0x{:x}",
+                    crc_offs + 4,
+                    page_offs + n,
+                    crc
+                );
+            } else if rec_offs > XLOG_RECORD_CRC_OFFS {
+                // As all records are 8-byte aligned, the header is already fully read and `crc` is initialized in the branch above.
+                ensure!(rec_offs >= XLOG_SIZE_OF_XLOG_RECORD);
+                let old_crc = crc;
                crc = crc32c_append(crc, &buf[page_offs..page_offs + n]);
+                trace!(
+                    "  appending to crc: [0x{:x}; 0x{:x}); 0x{:x} --> 0x{:x}",
+                    page_offs,
+                    page_offs + n,
+                    old_crc,
+                    crc
+                );
+            } else {
+                // Correct because of the way conditions are written above.
+                assert!(rec_offs + n < XLOG_SIZE_OF_XLOG_RECORD);
+                // If `skipping_first_contrecord == true`, we may be reading from a middle of a record
+                // which started in the previous segment. Hence there is no point in validating the header.
+                if !skipping_first_contrecord && rec_offs + n > XLOG_RECORD_CRC_OFFS {
+                    info!(
+                        "Curiously corrupted WAL: a record stops inside the header; \
+                             offs=0x{:x}, record continuation, pageleft={}, contlen={}",
+                        offs, pageleft, contlen
+                    );
+                    break;
+                }
+                // Do nothing: we are still reading the header. It's accounted in CRC in the end of the record.
            }
-            crc = !crc;
            rec_offs += n;
            offs += n;
            contlen -= n;

            if contlen == 0 {
-                crc = !crc;
+                trace!("  record completed at 0x{:x}", offs);
                crc = crc32c_append(crc, &rec_hdr);
                offs = (offs + 7) & !7; // pad on 8 bytes boundary */
-                if crc == wal_crc {
+                trace!(
+                    "  padded offs to 0x{:x}, crc is {:x}, expected crc is {:x}",
+                    offs,
+                    crc,
+                    xl_crc
+                );
+                if skipping_first_contrecord {
+                    // do nothing, the flag will go down on next iteration when we're reading new record
+                    trace!("  first conrecord has been just completed");
+                } else if crc == xl_crc {
                    // record is valid, advance the result to its end (with
                    // alignment to the next record taken into account)
+                    trace!(
+                        "  updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
+                        last_valid_rec_pos,
+                        offs
+                    );
                    last_valid_rec_pos = offs;
                } else {
                    info!(
                        "CRC mismatch {} vs {} at {}",
-                        crc, wal_crc, last_valid_rec_pos
+                        crc, xl_crc, last_valid_rec_pos
                    );
                    break;
                }
            }
        }
    }
+    trace!("last_valid_rec_pos=0x{:x}", last_valid_rec_pos);
    Ok(last_valid_rec_pos as u32)
 }

@@ -272,7 +390,7 @@ pub fn find_end_of_wal(
    wal_seg_size: usize,
    precise: bool,
    start_lsn: Lsn, // start reading WAL at this point or later
-) -> Result<(XLogRecPtr, TimeLineID)> {
+) -> anyhow::Result<(XLogRecPtr, TimeLineID)> {
    let mut high_segno: XLogSegNo = 0;
    let mut high_tli: TimeLineID = 0;
    let mut high_ispartial = false;
@@ -354,19 +472,19 @@ pub fn main() {
 }

 impl XLogRecord {
-    pub fn from_slice(buf: &[u8]) -> XLogRecord {
+    pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
        use utils::bin_ser::LeSer;
-        XLogRecord::des(buf).unwrap()
+        XLogRecord::des(buf)
    }

-    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
+    pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogRecord, DeserializeError> {
        use utils::bin_ser::LeSer;
-        XLogRecord::des_from(&mut buf.reader()).unwrap()
+        XLogRecord::des_from(&mut buf.reader())
    }

-    pub fn encode(&self) -> Bytes {
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
        use utils::bin_ser::LeSer;
-        self.ser().unwrap().into()
+        Ok(self.ser()?.into())
    }

    // Is this record an XLOG_SWITCH record? They need some special processing,
@@ -376,35 +494,35 @@ impl XLogRecord {
 }

 impl XLogPageHeaderData {
-    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogPageHeaderData {
+    pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogPageHeaderData, DeserializeError> {
        use utils::bin_ser::LeSer;
-        XLogPageHeaderData::des_from(&mut buf.reader()).unwrap()
+        XLogPageHeaderData::des_from(&mut buf.reader())
    }
 }

 impl XLogLongPageHeaderData {
-    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogLongPageHeaderData {
+    pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogLongPageHeaderData, DeserializeError> {
        use utils::bin_ser::LeSer;
-        XLogLongPageHeaderData::des_from(&mut buf.reader()).unwrap()
+        XLogLongPageHeaderData::des_from(&mut buf.reader())
    }

-    pub fn encode(&self) -> Bytes {
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
        use utils::bin_ser::LeSer;
-        self.ser().unwrap().into()
+        self.ser().map(|b| b.into())
    }
 }

 pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();

 impl CheckPoint {
-    pub fn encode(&self) -> Bytes {
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
        use utils::bin_ser::LeSer;
-        self.ser().unwrap().into()
+        Ok(self.ser()?.into())
    }

-    pub fn decode(buf: &[u8]) -> Result<CheckPoint, anyhow::Error> {
+    pub fn decode(buf: &[u8]) -> Result<CheckPoint, DeserializeError> {
        use utils::bin_ser::LeSer;
-        Ok(CheckPoint::des(buf)?)
+        CheckPoint::des(buf)
    }

    /// Update next XID based on provided new_xid and stored epoch.
@@ -442,7 +560,7 @@ impl CheckPoint {
 // Generate new, empty WAL segment.
 // We need this segment to start compute node.
 //
-pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
+pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
    let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);

    let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
@@ -462,90 +580,138 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
        xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
    };

-    let hdr_bytes = hdr.encode();
+    let hdr_bytes = hdr.encode()?;
    seg_buf.extend_from_slice(&hdr_bytes);

    //zero out the rest of the file
    seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
-    seg_buf.freeze()
+    Ok(seg_buf.freeze())
 }

 #[cfg(test)]
 mod tests {
    use super::*;
    use regex::Regex;
-    use std::{env, process::Command, str::FromStr};
+    use std::{env, str::FromStr};

-    // Run find_end_of_wal against file in test_wal dir
-    // Ensure that it finds last record correctly
-    #[test]
-    pub fn test_find_end_of_wal() {
-        // 1. Run initdb to generate some WAL
+    fn init_logging() {
+        let _ = env_logger::Builder::from_env(
+            env_logger::Env::default()
+                .default_filter_or("wal_generate=info,postgres_ffi::xlog_utils=trace"),
+        )
+        .is_test(true)
+        .try_init();
+    }
+
+    fn test_end_of_wal(
+        test_name: &str,
+        generate_wal: impl Fn(&mut postgres::Client) -> anyhow::Result<postgres::types::PgLsn>,
+        expected_end_of_wal_non_partial: Lsn,
+        last_segment: &str,
+    ) {
+        use wal_generate::*;
+        // 1. Generate some WAL
        let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("..")
            .join("..");
-        let data_dir = top_path.join("test_output/test_find_end_of_wal");
-        let initdb_path = top_path.join("tmp_install/bin/initdb");
-        let lib_path = top_path.join("tmp_install/lib");
-        if data_dir.exists() {
-            fs::remove_dir_all(&data_dir).unwrap();
+        let cfg = Conf {
+            pg_distrib_dir: top_path.join("tmp_install"),
+            datadir: top_path.join(format!("test_output/{}", test_name)),
+        };
+        if cfg.datadir.exists() {
+            fs::remove_dir_all(&cfg.datadir).unwrap();
        }
-        println!("Using initdb from '{}'", initdb_path.display());
-        println!("Data directory '{}'", data_dir.display());
-        let initdb_output = Command::new(initdb_path)
-            .args(&["-D", data_dir.to_str().unwrap()])
-            .arg("--no-instructions")
-            .arg("--no-sync")
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &lib_path)
-            .env("DYLD_LIBRARY_PATH", &lib_path)
-            .output()
-            .unwrap();
-        assert!(
-            initdb_output.status.success(),
-            "initdb failed. Status: '{}', stdout: '{}', stderr: '{}'",
-            initdb_output.status,
-            String::from_utf8_lossy(&initdb_output.stdout),
-            String::from_utf8_lossy(&initdb_output.stderr),
-        );
+        cfg.initdb().unwrap();
+        let mut srv = cfg.start_server().unwrap();
+        let expected_wal_end: Lsn =
+            u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into();
+        srv.kill();

        // 2. Pick WAL generated by initdb
-        let wal_dir = data_dir.join("pg_wal");
+        let wal_dir = cfg.datadir.join("pg_wal");
        let wal_seg_size = 16 * 1024 * 1024;

        // 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
        let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
        let wal_end = Lsn(wal_end);
-        println!("wal_end={}, tli={}", wal_end, tli);
-        assert_eq!(wal_end, "0/2000000".parse::<Lsn>().unwrap());
+        info!(
+            "find_end_of_wal returned (wal_end={}, tli={})",
+            wal_end, tli
+        );
+        assert_eq!(wal_end, expected_end_of_wal_non_partial);

        // 4. Get the actual end of WAL by pg_waldump
-        let waldump_path = top_path.join("tmp_install/bin/pg_waldump");
-        let waldump_output = Command::new(waldump_path)
-            .arg(wal_dir.join("000000010000000000000001"))
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &lib_path)
-            .env("DYLD_LIBRARY_PATH", &lib_path)
-            .output()
-            .unwrap();
-        let waldump_output = std::str::from_utf8(&waldump_output.stderr).unwrap();
-        println!("waldump_output = '{}'", &waldump_output);
-        let re = Regex::new(r"invalid record length at (.+):").unwrap();
-        let caps = re.captures(waldump_output).unwrap();
+        let waldump_output = cfg
+            .pg_waldump("000000010000000000000001", last_segment)
+            .unwrap()
+            .stderr;
+        let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
+        let caps = match Regex::new(r"invalid record length at (.+):")
+            .unwrap()
+            .captures(waldump_output)
+        {
+            Some(caps) => caps,
+            None => {
+                error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
+                panic!();
+            }
+        };
        let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
+        info!(
+            "waldump erred on {}, expected wal end at {}",
+            waldump_wal_end, expected_wal_end
+        );
+        assert_eq!(waldump_wal_end, expected_wal_end);

        // 5. Rename file to partial to actually find last valid lsn
        fs::rename(
-            wal_dir.join("000000010000000000000001"),
-            wal_dir.join("000000010000000000000001.partial"),
+            wal_dir.join(last_segment),
+            wal_dir.join(format!("{}.partial", last_segment)),
        )
        .unwrap();
        let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
        let wal_end = Lsn(wal_end);
-        println!("wal_end={}, tli={}", wal_end, tli);
+        info!(
+            "find_end_of_wal returned (wal_end={}, tli={})",
+            wal_end, tli
+        );
        assert_eq!(wal_end, waldump_wal_end);
    }

+    #[test]
+    pub fn test_find_end_of_wal_simple() {
+        init_logging();
+        test_end_of_wal(
+            "test_find_end_of_wal_simple",
+            wal_generate::generate_simple,
+            "0/2000000".parse::<Lsn>().unwrap(),
+            "000000010000000000000001",
+        );
+    }
+
+    #[test]
+    pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
+        init_logging();
+        test_end_of_wal(
+            "test_find_end_of_wal_crossing_segment_followed_by_small_one",
+            wal_generate::generate_wal_record_crossing_segment_followed_by_small_one,
+            "0/3000000".parse::<Lsn>().unwrap(),
+            "000000010000000000000002",
+        );
+    }
+
+    #[test]
+    #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO
+    pub fn test_find_end_of_wal_last_crossing_segment() {
+        init_logging();
+        test_end_of_wal(
+            "test_find_end_of_wal_last_crossing_segment",
+            wal_generate::generate_last_wal_record_crossing_segment,
+            "0/3000000".parse::<Lsn>().unwrap(),
+            "000000010000000000000002",
+        );
+    }
+
    /// Check the math in update_next_xid
    ///
    /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
--- a/libs/postgres_ffi/wal_generate/Cargo.toml
+++ b/libs/postgres_ffi/wal_generate/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "wal_generate"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow = "1.0"
+clap = "3.0"
+env_logger = "0.9"
+log = "0.4"
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tempfile = "3.2"
--- a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs
+++ b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs
@@ -0,0 +1,58 @@
+use anyhow::*;
+use clap::{App, Arg};
+use wal_generate::*;
+
+fn main() -> Result<()> {
+    env_logger::Builder::from_env(
+        env_logger::Env::default().default_filter_or("wal_generate=info"),
+    )
+    .init();
+    let arg_matches = App::new("Postgres WAL generator")
+        .about("Generates Postgres databases with specific WAL properties")
+        .arg(
+            Arg::new("datadir")
+                .short('D')
+                .long("datadir")
+                .takes_value(true)
+                .help("Data directory for the Postgres server")
+                .required(true)
+        )
+        .arg(
+            Arg::new("pg-distrib-dir")
+                .long("pg-distrib-dir")
+                .takes_value(true)
+                .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
+                .default_value("/usr/local")
+        )
+        .arg(
+            Arg::new("type")
+                .long("type")
+                .takes_value(true)
+                .help("Type of WAL to generate")
+                .possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"])
+                .required(true)
+        )
+        .get_matches();
+
+    let cfg = Conf {
+        pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
+        datadir: arg_matches.value_of("datadir").unwrap().into(),
+    };
+    cfg.initdb()?;
+    let mut srv = cfg.start_server()?;
+    let lsn = match arg_matches.value_of("type").unwrap() {
+        "simple" => generate_simple(&mut srv.connect_with_timeout()?)?,
+        "last_wal_record_crossing_segment" => {
+            generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)?
+        }
+        "wal_record_crossing_segment_followed_by_small_one" => {
+            generate_wal_record_crossing_segment_followed_by_small_one(
+                &mut srv.connect_with_timeout()?,
+            )?
+        }
+        a => panic!("Unknown --type argument: {}", a),
+    };
+    println!("end_of_wal = {}", lsn);
+    srv.kill();
+    Ok(())
+}
--- a/libs/postgres_ffi/wal_generate/src/lib.rs
+++ b/libs/postgres_ffi/wal_generate/src/lib.rs
@@ -0,0 +1,278 @@
+use anyhow::*;
+use core::time::Duration;
+use log::*;
+use postgres::types::PgLsn;
+use postgres::Client;
+use std::cmp::Ordering;
+use std::path::{Path, PathBuf};
+use std::process::{Command, Stdio};
+use std::time::Instant;
+use tempfile::{tempdir, TempDir};
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Conf {
+    pub pg_distrib_dir: PathBuf,
+    pub datadir: PathBuf,
+}
+
+pub struct PostgresServer {
+    process: std::process::Child,
+    _unix_socket_dir: TempDir,
+    client_config: postgres::Config,
+}
+
+impl Conf {
+    fn pg_bin_dir(&self) -> PathBuf {
+        self.pg_distrib_dir.join("bin")
+    }
+
+    fn pg_lib_dir(&self) -> PathBuf {
+        self.pg_distrib_dir.join("lib")
+    }
+
+    fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
+        let path = self.pg_bin_dir().join(command);
+        ensure!(path.exists(), "Command {:?} does not exist", path);
+        let mut cmd = Command::new(path);
+        cmd.env_clear()
+            .env("LD_LIBRARY_PATH", self.pg_lib_dir())
+            .env("DYLD_LIBRARY_PATH", self.pg_lib_dir());
+        Ok(cmd)
+    }
+
+    pub fn initdb(&self) -> Result<()> {
+        if let Some(parent) = self.datadir.parent() {
+            info!("Pre-creating parent directory {:?}", parent);
+            // Tests may be run concurrently and there may be a race to create `test_output/`.
+            // std::fs::create_dir_all is guaranteed to have no races with another thread creating directories.
+            std::fs::create_dir_all(parent)?;
+        }
+        info!(
+            "Running initdb in {:?} with user \"postgres\"",
+            self.datadir
+        );
+        let output = self
+            .new_pg_command("initdb")?
+            .arg("-D")
+            .arg(self.datadir.as_os_str())
+            .args(&["-U", "postgres", "--no-instructions", "--no-sync"])
+            .output()?;
+        debug!("initdb output: {:?}", output);
+        ensure!(
+            output.status.success(),
+            "initdb failed, stdout and stderr follow:\n{}{}",
+            String::from_utf8_lossy(&output.stdout),
+            String::from_utf8_lossy(&output.stderr),
+        );
+        Ok(())
+    }
+
+    pub fn start_server(&self) -> Result<PostgresServer> {
+        info!("Starting Postgres server in {:?}", self.datadir);
+        let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
+        let unix_socket_dir_path = unix_socket_dir.path().to_owned();
+        let server_process = self
+            .new_pg_command("postgres")?
+            .args(&["-c", "listen_addresses="])
+            .arg("-k")
+            .arg(unix_socket_dir_path.as_os_str())
+            .arg("-D")
+            .arg(self.datadir.as_os_str())
+            .args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed
+            .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
+            .args(&["-c", "shared_preload_libraries=zenith"]) // can only be loaded at startup
+            // Disable background processes as much as possible
+            .args(&["-c", "wal_writer_delay=10s"])
+            .args(&["-c", "autovacuum=off"])
+            .stderr(Stdio::null())
+            .spawn()?;
+        let server = PostgresServer {
+            process: server_process,
+            _unix_socket_dir: unix_socket_dir,
+            client_config: {
+                let mut c = postgres::Config::new();
+                c.host_path(&unix_socket_dir_path);
+                c.user("postgres");
+                c.connect_timeout(Duration::from_millis(1000));
+                c
+            },
+        };
+        Ok(server)
+    }
+
+    pub fn pg_waldump(
+        &self,
+        first_segment_name: &str,
+        last_segment_name: &str,
+    ) -> Result<std::process::Output> {
+        let first_segment_file = self.datadir.join(first_segment_name);
+        let last_segment_file = self.datadir.join(last_segment_name);
+        info!(
+            "Running pg_waldump for {} .. {}",
+            first_segment_file.display(),
+            last_segment_file.display()
+        );
+        let output = self
+            .new_pg_command("pg_waldump")?
+            .args(&[
+                &first_segment_file.as_os_str(),
+                &last_segment_file.as_os_str(),
+            ])
+            .output()?;
+        debug!("waldump output: {:?}", output);
+        Ok(output)
+    }
+}
+
+impl PostgresServer {
+    pub fn connect_with_timeout(&self) -> Result<Client> {
+        let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap();
+        while Instant::now() < retry_until {
+            use std::result::Result::Ok;
+            if let Ok(client) = self.client_config.connect(postgres::NoTls) {
+                return Ok(client);
+            }
+            std::thread::sleep(Duration::from_millis(100));
+        }
+        bail!("Connection timed out");
+    }
+
+    pub fn kill(&mut self) {
+        self.process.kill().unwrap();
+        self.process.wait().unwrap();
+    }
+}
+
+impl Drop for PostgresServer {
+    fn drop(&mut self) {
+        use std::result::Result::Ok;
+        match self.process.try_wait() {
+            Ok(Some(_)) => return,
+            Ok(None) => {
+                warn!("Server was not terminated, will be killed");
+            }
+            Err(e) => {
+                error!("Unable to get status of the server: {}, will be killed", e);
+            }
+        }
+        let _ = self.process.kill();
+    }
+}
+
+pub trait PostgresClientExt: postgres::GenericClient {
+    fn pg_current_wal_insert_lsn(&mut self) -> Result<PgLsn> {
+        Ok(self
+            .query_one("SELECT pg_current_wal_insert_lsn()", &[])?
+            .get(0))
+    }
+    fn pg_current_wal_flush_lsn(&mut self) -> Result<PgLsn> {
+        Ok(self
+            .query_one("SELECT pg_current_wal_flush_lsn()", &[])?
+            .get(0))
+    }
+}
+
+impl<C: postgres::GenericClient> PostgresClientExt for C {}
+
+fn generate_internal<C: postgres::GenericClient>(
+    client: &mut C,
+    f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
+) -> Result<PgLsn> {
+    client.execute("create extension if not exists zenith_test_utils", &[])?;
+
+    let wal_segment_size = client.query_one(
+        "select cast(setting as bigint) as setting, unit \
+         from pg_settings where name = 'wal_segment_size'",
+        &[],
+    )?;
+    ensure!(
+        wal_segment_size.get::<_, String>("unit") == "B",
+        "Unexpected wal_segment_size unit"
+    );
+    ensure!(
+        wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024,
+        "Unexpected wal_segment_size in bytes"
+    );
+
+    let initial_lsn = client.pg_current_wal_insert_lsn()?;
+    info!("LSN initial = {}", initial_lsn);
+
+    let last_lsn = match f(client, initial_lsn)? {
+        None => client.pg_current_wal_insert_lsn()?,
+        Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
+            Ordering::Less => bail!("Some records were inserted after the generated WAL"),
+            Ordering::Equal => last_lsn,
+            Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
+        },
+    };
+
+    // Some records may be not flushed, e.g. non-transactional logical messages.
+    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
+    match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
+        Ordering::Less => bail!("Some records were flushed after the generated WAL"),
+        Ordering::Equal => {}
+        Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
+    }
+    Ok(last_lsn)
+}
+
+pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
+    generate_internal(client, |client, _| {
+        client.execute("CREATE table t(x int)", &[])?;
+        Ok(None)
+    })
+}
+
+fn generate_single_logical_message(
+    client: &mut impl postgres::GenericClient,
+    transactional: bool,
+) -> Result<PgLsn> {
+    generate_internal(client, |client, initial_lsn| {
+        ensure!(
+            initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
+            "Initial LSN is too far in the future"
+        );
+
+        let message_lsn: PgLsn = client
+            .query_one(
+                "select pg_logical_emit_message($1, 'big-16mb-msg', \
+                 concat(repeat('abcd', 16 * 256 * 1024), 'end')) as message_lsn",
+                &[&transactional],
+            )?
+            .get("message_lsn");
+        ensure!(
+            message_lsn > PgLsn::from(0x0200_0000 + 4 * 8192),
+            "Logical message did not cross the segment boundary"
+        );
+        ensure!(
+            message_lsn < PgLsn::from(0x0400_0000),
+            "Logical message crossed two segments"
+        );
+
+        if transactional {
+            // Transactional logical messages are part of a transaction, so the one above is
+            // followed by a small COMMIT record.
+
+            let after_message_lsn = client.pg_current_wal_insert_lsn()?;
+            ensure!(
+                message_lsn < after_message_lsn,
+                "No record found after the emitted message"
+            );
+            Ok(Some(after_message_lsn))
+        } else {
+            Ok(Some(message_lsn))
+        }
+    })
+}
+
+pub fn generate_wal_record_crossing_segment_followed_by_small_one(
+    client: &mut impl postgres::GenericClient,
+) -> Result<PgLsn> {
+    generate_single_logical_message(client, true)
+}
+
+pub fn generate_last_wal_record_crossing_segment<C: postgres::GenericClient>(
+    client: &mut C,
+) -> Result<PgLsn> {
+    generate_single_logical_message(client, false)
+}
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "remote_storage"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+anyhow = { version = "1.0", features = ["backtrace"] }
+async-trait = "0.1"
+metrics = { version = "0.1", path = "../metrics" }
+once_cell = "1.8.0"
+rusoto_core = "0.48"
+rusoto_s3 = "0.48"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1"
+tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
+tokio-util = { version = "0.7", features = ["io"] }
+toml_edit = { version = "0.13", features = ["easy"] }
+tracing = "0.1.27"
+
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[dev-dependencies]
+tempfile = "3.2"
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -0,0 +1,319 @@
+//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
+//! No other modules from this tree are supposed to be used directly by the external code.
+//!
+//! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
+//!   * [`local_fs`] allows to use local file system as an external storage
+//!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
+//!
+mod local_fs;
+mod s3_bucket;
+
+use std::{
+    borrow::Cow,
+    collections::HashMap,
+    ffi::OsStr,
+    num::{NonZeroU32, NonZeroUsize},
+    path::{Path, PathBuf},
+};
+
+use anyhow::{bail, Context};
+
+use tokio::io;
+use toml_edit::Item;
+use tracing::info;
+
+pub use self::{
+    local_fs::LocalFs,
+    s3_bucket::{S3Bucket, S3ObjectKey},
+};
+
+/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
+/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
+/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
+/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
+pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
+pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
+/// Currently, sync happens with AWS S3, that has two limits on requests per second:
+/// ~200 RPS for IAM services
+/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
+/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
+/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
+pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
+
+/// Storage (potentially remote) API to manage its state.
+/// This storage tries to be unaware of any layered repository context,
+/// providing basic CRUD operations for storage files.
+#[async_trait::async_trait]
+pub trait RemoteStorage: Send + Sync {
+    /// A way to uniquely reference a file in the remote storage.
+    type RemoteObjectId;
+
+    /// Attempts to derive the storage path out of the local path, if the latter is correct.
+    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
+
+    /// Gets the download path of the given storage file.
+    fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result<PathBuf>;
+
+    /// Lists all items the storage has right now.
+    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
+
+    /// Streams the local file contents into remote into the remote storage entry.
+    async fn upload(
+        &self,
+        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        // S3 PUT request requires the content length to be specified,
+        // otherwise it starts to fail with the concurrent connection count increasing.
+        from_size_bytes: usize,
+        to: &Self::RemoteObjectId,
+        metadata: Option<StorageMetadata>,
+    ) -> anyhow::Result<()>;
+
+    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Returns the metadata, if any was stored with the file previously.
+    async fn download(
+        &self,
+        from: &Self::RemoteObjectId,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<Option<StorageMetadata>>;
+
+    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Returns the metadata, if any was stored with the file previously.
+    async fn download_byte_range(
+        &self,
+        from: &Self::RemoteObjectId,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<Option<StorageMetadata>>;
+
+    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
+}
+
+/// Every storage, currently supported.
+/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
+pub enum GenericRemoteStorage {
+    Local(LocalFs),
+    S3(S3Bucket),
+}
+
+impl GenericRemoteStorage {
+    pub fn new(
+        working_directory: PathBuf,
+        storage_config: &RemoteStorageConfig,
+    ) -> anyhow::Result<Self> {
+        match &storage_config.storage {
+            RemoteStorageKind::LocalFs(root) => {
+                info!("Using fs root '{}' as a remote storage", root.display());
+                LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local)
+            }
+            RemoteStorageKind::AwsS3(s3_config) => {
+                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
+                    s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
+                S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3)
+            }
+        }
+    }
+}
+
+/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
+/// Immutable, cannot be changed once the file is created.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct StorageMetadata(HashMap<String, String>);
+
+fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
+    if prefix == path {
+        anyhow::bail!(
+            "Prefix and the path are equal, cannot strip: '{}'",
+            prefix.display()
+        )
+    } else {
+        path.strip_prefix(prefix).with_context(|| {
+            format!(
+                "Path '{}' is not prefixed with '{}'",
+                path.display(),
+                prefix.display(),
+            )
+        })
+    }
+}
+
+/// External backup storage configuration, enough for creating a client for that storage.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct RemoteStorageConfig {
+    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
+    pub max_concurrent_syncs: NonZeroUsize,
+    /// Max allowed errors before the sync task is considered failed and evicted.
+    pub max_sync_errors: NonZeroU32,
+    /// The storage connection configuration.
+    pub storage: RemoteStorageKind,
+}
+
+/// A kind of a remote storage to connect to, with its connection configuration.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum RemoteStorageKind {
+    /// Storage based on local file system.
+    /// Specify a root folder to place all stored files into.
+    LocalFs(PathBuf),
+    /// AWS S3 based storage, storing all files in the S3 bucket
+    /// specified by the config
+    AwsS3(S3Config),
+}
+
+/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
+#[derive(Clone, PartialEq, Eq)]
+pub struct S3Config {
+    /// Name of the bucket to connect to.
+    pub bucket_name: String,
+    /// The region where the bucket is located at.
+    pub bucket_region: String,
+    /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
+    pub prefix_in_bucket: Option<String>,
+    /// A base URL to send S3 requests to.
+    /// By default, the endpoint is derived from a region name, assuming it's
+    /// an AWS S3 region name, erroring on wrong region name.
+    /// Endpoint provides a way to support other S3 flavors and their regions.
+    ///
+    /// Example: `http://127.0.0.1:5000`
+    pub endpoint: Option<String>,
+    /// AWS S3 has various limits on its API calls, we need not to exceed those.
+    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
+    pub concurrency_limit: NonZeroUsize,
+}
+
+impl std::fmt::Debug for S3Config {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("S3Config")
+            .field("bucket_name", &self.bucket_name)
+            .field("bucket_region", &self.bucket_region)
+            .field("prefix_in_bucket", &self.prefix_in_bucket)
+            .field("concurrency_limit", &self.concurrency_limit)
+            .finish()
+    }
+}
+
+pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
+    let new_extension = match original_path
+        .as_ref()
+        .extension()
+        .map(OsStr::to_string_lossy)
+    {
+        Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
+        None => Cow::Borrowed(suffix),
+    };
+    original_path
+        .as_ref()
+        .with_extension(new_extension.as_ref())
+}
+
+impl RemoteStorageConfig {
+    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
+        let local_path = toml.get("local_path");
+        let bucket_name = toml.get("bucket_name");
+        let bucket_region = toml.get("bucket_region");
+
+        let max_concurrent_syncs = NonZeroUsize::new(
+            parse_optional_integer("max_concurrent_syncs", toml)?
+                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
+        )
+        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
+
+        let max_sync_errors = NonZeroU32::new(
+            parse_optional_integer("max_sync_errors", toml)?
+                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
+        )
+        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
+
+        let concurrency_limit = NonZeroUsize::new(
+            parse_optional_integer("concurrency_limit", toml)?
+                .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
+        )
+        .context("Failed to parse 'concurrency_limit' as a positive integer")?;
+
+        let storage = match (local_path, bucket_name, bucket_region) {
+            (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
+            (_, Some(_), None) => {
+                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
+            }
+            (_, None, Some(_)) => {
+                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
+            }
+            (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
+                bucket_name: parse_toml_string("bucket_name", bucket_name)?,
+                bucket_region: parse_toml_string("bucket_region", bucket_region)?,
+                prefix_in_bucket: toml
+                    .get("prefix_in_bucket")
+                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
+                    .transpose()?,
+                endpoint: toml
+                    .get("endpoint")
+                    .map(|endpoint| parse_toml_string("endpoint", endpoint))
+                    .transpose()?,
+                concurrency_limit,
+            }),
+            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
+                parse_toml_string("local_path", local_path)?,
+            )),
+            (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
+        };
+
+        Ok(RemoteStorageConfig {
+            max_concurrent_syncs,
+            max_sync_errors,
+            storage,
+        })
+    }
+}
+
+// Helper functions to parse a toml Item
+fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
+where
+    I: TryFrom<i64, Error = E>,
+    E: std::error::Error + Send + Sync + 'static,
+{
+    let toml_integer = match item.get(name) {
+        Some(item) => item
+            .as_integer()
+            .with_context(|| format!("configure option {name} is not an integer"))?,
+        None => return Ok(None),
+    };
+
+    I::try_from(toml_integer)
+        .map(Some)
+        .with_context(|| format!("configure option {name} is too large"))
+}
+
+fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
+    let s = item
+        .as_str()
+        .with_context(|| format!("configure option {name} is not a string"))?;
+    Ok(s.to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_path_with_suffix_extension() {
+        let p = PathBuf::from("/foo/bar");
+        assert_eq!(
+            &path_with_suffix_extension(&p, "temp").to_string_lossy(),
+            "/foo/bar.temp"
+        );
+        let p = PathBuf::from("/foo/bar");
+        assert_eq!(
+            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            "/foo/bar.temp.temp"
+        );
+        let p = PathBuf::from("/foo/bar.baz");
+        assert_eq!(
+            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            "/foo/bar.baz.temp.temp"
+        );
+        let p = PathBuf::from("/foo/bar.baz");
+        assert_eq!(
+            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
+            "/foo/bar.baz..temp"
+        );
+    }
+}
--- a/pageserver/src/remote_storage/local_fs.rs
+++ b/pageserver/src/remote_storage/local_fs.rs
@@ -1,7 +1,7 @@
 //! Local filesystem acting as a remote storage.
-//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
+//! Multiple API users can use the same "storage" of this kind by using different storage roots.
 //!
-//! This storage used in pageserver tests, but can also be used in cases when a certain persistent
+//! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

 use std::{
@@ -17,18 +17,18 @@ use tokio::{
 };
 use tracing::*;

-use crate::remote_storage::storage_sync::path_with_suffix_extension;
+use crate::path_with_suffix_extension;

 use super::{strip_path_prefix, RemoteStorage, StorageMetadata};

 pub struct LocalFs {
-    pageserver_workdir: &'static Path,
-    root: PathBuf,
+    working_directory: PathBuf,
+    storage_root: PathBuf,
 }

 impl LocalFs {
    /// Attempts to create local FS storage, along with its root directory.
-    pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
+    pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result<Self> {
        if !root.exists() {
            std::fs::create_dir_all(&root).with_context(|| {
                format!(
@@ -38,15 +38,15 @@ impl LocalFs {
            })?;
        }
        Ok(Self {
-            pageserver_workdir,
-            root,
+            working_directory,
+            storage_root: root,
        })
    }

    fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
        if path.is_relative() {
-            Ok(self.root.join(path))
-        } else if path.starts_with(&self.root) {
+            Ok(self.storage_root.join(path))
+        } else if path.starts_with(&self.storage_root) {
            Ok(path.to_path_buf())
        } else {
            bail!(
@@ -85,30 +85,30 @@ impl LocalFs {

 #[async_trait::async_trait]
 impl RemoteStorage for LocalFs {
-    type StoragePath = PathBuf;
+    type RemoteObjectId = PathBuf;

-    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
-        Ok(self.root.join(
-            strip_path_prefix(self.pageserver_workdir, local_path)
+    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
+        Ok(self.storage_root.join(
+            strip_path_prefix(&self.working_directory, local_path)
                .context("local path does not belong to this storage")?,
        ))
    }

-    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
-        let relative_path = strip_path_prefix(&self.root, storage_path)
+    fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
+        let relative_path = strip_path_prefix(&self.storage_root, storage_path)
            .context("local path does not belong to this storage")?;
-        Ok(self.pageserver_workdir.join(relative_path))
+        Ok(self.working_directory.join(relative_path))
    }

-    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
-        get_all_files(&self.root).await
+    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
+        get_all_files(&self.storage_root).await
    }

    async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
        from_size_bytes: usize,
-        to: &Self::StoragePath,
+        to: &Self::RemoteObjectId,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
        let target_file_path = self.resolve_in_storage(to)?;
@@ -194,7 +194,7 @@ impl RemoteStorage for LocalFs {

    async fn download(
        &self,
-        from: &Self::StoragePath,
+        from: &Self::RemoteObjectId,
        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
    ) -> anyhow::Result<Option<StorageMetadata>> {
        let file_path = self.resolve_in_storage(from)?;
@@ -229,9 +229,9 @@ impl RemoteStorage for LocalFs {
        }
    }

-    async fn download_range(
+    async fn download_byte_range(
        &self,
-        from: &Self::StoragePath,
+        from: &Self::RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
@@ -288,7 +288,7 @@ impl RemoteStorage for LocalFs {
        }
    }

-    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
+    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
        let file_path = self.resolve_in_storage(path)?;
        if file_path.exists() && file_path.is_file() {
            Ok(fs::remove_file(file_path).await?)
@@ -354,29 +354,30 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>

 #[cfg(test)]
 mod pure_tests {
-    use crate::{
-        layered_repository::metadata::METADATA_FILE_NAME,
-        repository::repo_harness::{RepoHarness, TIMELINE_ID},
-    };
+    use tempfile::tempdir;

    use super::*;

    #[test]
    fn storage_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("storage_path_positive")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root.clone(),
+            working_directory: workdir.clone(),
+            storage_root: storage_root.clone(),
        };

-        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name");
-        let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
+        let local_path = workdir
+            .join("timelines")
+            .join("some_timeline")
+            .join("file_name");
+        let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?);

        assert_eq!(
            expected_path,
-            storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
-            "File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
+            storage.remote_object_id(&local_path).expect("Matching path should map to storage path normally"),
+            "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir"
        );

        Ok(())
@@ -386,7 +387,7 @@ mod pure_tests {
    fn storage_path_negatives() -> anyhow::Result<()> {
        #[track_caller]
        fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
-            match storage.storage_path(mismatching_path) {
+            match storage.remote_object_id(mismatching_path) {
                Ok(wrong_path) => panic!(
                    "Expected path '{}' to error, but got storage path: {:?}",
                    mismatching_path.display(),
@@ -396,16 +397,16 @@ mod pure_tests {
            }
        }

-        let repo_harness = RepoHarness::create("storage_path_negatives")?;
+        let workdir = tempdir()?.path().to_owned();
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root,
+            working_directory: workdir.clone(),
+            storage_root,
        };

-        let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
+        let error_string = storage_path_error(&storage, &workdir);
        assert!(error_string.contains("does not belong to this storage"));
-        assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
+        assert!(error_string.contains(workdir.to_str().unwrap()));

        let mismatching_path_str = "/something/else";
        let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
@@ -414,7 +415,7 @@ mod pure_tests {
            "Error should mention wrong path"
        );
        assert!(
-            error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
+            error_message.contains(workdir.to_str().unwrap()),
            "Error should mention server workdir"
        );
        assert!(error_message.contains("does not belong to this storage"));
@@ -424,29 +425,28 @@ mod pure_tests {

    #[test]
    fn local_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("local_path_positive")?;
+        let workdir = tempdir()?.path().to_owned();
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root.clone(),
+            working_directory: workdir.clone(),
+            storage_root: storage_root.clone(),
        };

        let name = "not a metadata";
-        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
+        let local_path = workdir.join("timelines").join("some_timeline").join(name);
        assert_eq!(
            local_path,
            storage
-                .local_path(
-                    &storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?)
-                )
+                .local_path(&storage_root.join(local_path.strip_prefix(&workdir)?))
                .expect("For a valid input, valid local path should be parsed"),
            "Should be able to parse metadata out of the correctly named remote delta file"
        );

-        let local_metadata_path = repo_harness
-            .timeline_path(&TIMELINE_ID)
-            .join(METADATA_FILE_NAME);
-        let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
+        let local_metadata_path = workdir
+            .join("timelines")
+            .join("some_timeline")
+            .join("metadata");
+        let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?;
        assert_eq!(
            local_metadata_path,
            storage
@@ -472,11 +472,10 @@ mod pure_tests {
            }
        }

-        let repo_harness = RepoHarness::create("local_path_negatives")?;
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root,
+            working_directory: tempdir()?.path().to_owned(),
+            storage_root,
        };

        let totally_wrong_path = "wrong_wrong_wrong";
@@ -488,16 +487,19 @@ mod pure_tests {

    #[test]
    fn download_destination_matches_original_path() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
-        let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
+        let workdir = tempdir()?.path().to_owned();
+        let original_path = workdir
+            .join("timelines")
+            .join("some_timeline")
+            .join("some name");

        let storage_root = PathBuf::from("somewhere").join("else");
        let dummy_storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root,
+            working_directory: workdir,
+            storage_root,
        };

-        let storage_path = dummy_storage.storage_path(&original_path)?;
+        let storage_path = dummy_storage.remote_object_id(&original_path)?;
        let download_destination = dummy_storage.local_path(&storage_path)?;

        assert_eq!(
@@ -512,18 +514,17 @@ mod pure_tests {
 #[cfg(test)]
 mod fs_tests {
    use super::*;
-    use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};

    use std::{collections::HashMap, io::Write};
    use tempfile::tempdir;

    #[tokio::test]
    async fn upload_file() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("upload_file")?;
+        let workdir = tempdir()?.path().to_owned();
        let storage = create_storage()?;

        let (file, size) = create_file_for_upload(
-            &storage.pageserver_workdir.join("whatever"),
+            &storage.working_directory.join("whatever"),
            "whatever_contents",
        )
        .await?;
@@ -538,14 +539,14 @@ mod fs_tests {
        }
        assert!(storage.list().await?.is_empty());

-        let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1", None).await?;
+        let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?;
        assert_eq!(
            storage.list().await?,
            vec![target_path_1.clone()],
            "Should list a single file after first upload"
        );

-        let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2", None).await?;
+        let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?;
        assert_eq!(
            list_files_sorted(&storage).await?,
            vec![target_path_1.clone(), target_path_2.clone()],
@@ -556,17 +557,16 @@ mod fs_tests {
    }

    fn create_storage() -> anyhow::Result<LocalFs> {
-        let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
-        let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
-        Ok(storage)
+        LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
    }

    #[tokio::test]
    async fn download_file() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let metadata = storage.download(&upload_target, &mut content_bytes).await?;
@@ -597,14 +597,15 @@ mod fs_tests {

    #[tokio::test]
    async fn download_file_range_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file_range_positive")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

        let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let metadata = storage
-            .download_range(&upload_target, 0, None, &mut full_range_bytes)
+            .download_byte_range(&upload_target, 0, None, &mut full_range_bytes)
            .await?;
        assert!(
            metadata.is_none(),
@@ -620,7 +621,7 @@ mod fs_tests {
        let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let same_byte = 1_000_000_000;
        let metadata = storage
-            .download_range(
+            .download_byte_range(
                &upload_target,
                same_byte,
                Some(same_byte + 1), // exclusive end
@@ -642,7 +643,7 @@ mod fs_tests {

        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let metadata = storage
-            .download_range(
+            .download_byte_range(
                &upload_target,
                0,
                Some(first_part_local.len() as u64),
@@ -664,7 +665,7 @@ mod fs_tests {

        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let metadata = storage
-            .download_range(
+            .download_byte_range(
                &upload_target,
                first_part_local.len() as u64,
                Some((first_part_local.len() + second_part_local.len()) as u64),
@@ -689,16 +690,17 @@ mod fs_tests {

    #[tokio::test]
    async fn download_file_range_negative() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file_range_negative")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

        let start = 10000;
        let end = 234;
        assert!(start > end, "Should test an incorrect range");
        match storage
-            .download_range(&upload_target, start, Some(end), &mut io::sink())
+            .download_byte_range(&upload_target, start, Some(end), &mut io::sink())
            .await
        {
            Ok(_) => panic!("Should not allow downloading wrong ranges"),
@@ -712,7 +714,7 @@ mod fs_tests {

        let non_existing_path = PathBuf::from("somewhere").join("else");
        match storage
-            .download_range(&non_existing_path, 1, Some(3), &mut io::sink())
+            .download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink())
            .await
        {
            Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
@@ -727,10 +729,11 @@ mod fs_tests {

    #[tokio::test]
    async fn delete_file() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("delete_file")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

        storage.delete(&upload_target).await?;
        assert!(storage.list().await?.is_empty());
@@ -748,7 +751,8 @@ mod fs_tests {

    #[tokio::test]
    async fn file_with_metadata() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
        let metadata = StorageMetadata(HashMap::from([
@@ -756,7 +760,7 @@ mod fs_tests {
            ("two".to_string(), "2".to_string()),
        ]));
        let upload_target =
-            upload_dummy_file(&repo_harness, &storage, upload_name, Some(metadata.clone())).await?;
+            upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;

        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
@@ -780,7 +784,7 @@ mod fs_tests {

        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let partial_download_metadata = storage
-            .download_range(
+            .download_byte_range(
                &upload_target,
                0,
                Some(first_part_local.len() as u64),
@@ -805,16 +809,16 @@ mod fs_tests {
    }

    async fn upload_dummy_file(
-        harness: &RepoHarness<'_>,
+        workdir: &Path,
        storage: &LocalFs,
        name: &str,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<PathBuf> {
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-        let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
-        let storage_path = storage.root.join(relative_timeline_path).join(name);
+        let timeline_path = workdir.join("timelines").join("some_timeline");
+        let relative_timeline_path = timeline_path.strip_prefix(&workdir)?;
+        let storage_path = storage.storage_root.join(relative_timeline_path).join(name);

-        let from_path = storage.pageserver_workdir.join(name);
+        let from_path = storage.working_directory.join(name);
        let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?;
        storage.upload(file, size, &storage_path, metadata).await?;
        Ok(storage_path)
--- a/pageserver/src/remote_storage/s3_bucket.rs
+++ b/pageserver/src/remote_storage/s3_bucket.rs
@@ -1,7 +1,7 @@
 //! AWS S3 storage wrapper around `rusoto` library.
 //!
 //! Respects `prefix_in_bucket` property from [`S3Config`],
-//! allowing multiple pageservers to independently work with the same S3 bucket, if
+//! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

 use std::path::{Path, PathBuf};
@@ -19,16 +19,78 @@ use tokio::{io, sync::Semaphore};
 use tokio_util::io::ReaderStream;
 use tracing::debug;

-use crate::{
-    config::S3Config,
-    remote_storage::{strip_path_prefix, RemoteStorage},
-};
+use crate::{strip_path_prefix, RemoteStorage, S3Config};

 use super::StorageMetadata;

-const S3_FILE_SEPARATOR: char = '/';
+pub(super) mod metrics {
+    use metrics::{register_int_counter_vec, IntCounterVec};
+    use once_cell::sync::Lazy;

-#[derive(Debug, Eq, PartialEq)]
+    static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "remote_storage_s3_requests_count",
+            "Number of s3 requests of particular type",
+            &["request_type"],
+        )
+        .expect("failed to define a metric")
+    });
+
+    static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "remote_storage_s3_failures_count",
+            "Number of failed s3 requests of particular type",
+            &["request_type"],
+        )
+        .expect("failed to define a metric")
+    });
+
+    pub fn inc_get_object() {
+        S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
+    }
+
+    pub fn inc_get_object_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["get_object"])
+            .inc();
+    }
+
+    pub fn inc_put_object() {
+        S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
+    }
+
+    pub fn inc_put_object_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["put_object"])
+            .inc();
+    }
+
+    pub fn inc_delete_object() {
+        S3_REQUESTS_COUNT
+            .with_label_values(&["delete_object"])
+            .inc();
+    }
+
+    pub fn inc_delete_object_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["delete_object"])
+            .inc();
+    }
+
+    pub fn inc_list_objects() {
+        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
+    }
+
+    pub fn inc_list_objects_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["list_objects"])
+            .inc();
+    }
+}
+
+const S3_PREFIX_SEPARATOR: char = '/';
+
+#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)]
 pub struct S3ObjectKey(String);

 impl S3ObjectKey {
@@ -36,11 +98,7 @@ impl S3ObjectKey {
        &self.0
    }

-    fn download_destination(
-        &self,
-        pageserver_workdir: &Path,
-        prefix_to_strip: Option<&str>,
-    ) -> PathBuf {
+    fn download_destination(&self, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf {
        let path_without_prefix = match prefix_to_strip {
            Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| {
                panic!(
@@ -51,9 +109,9 @@ impl S3ObjectKey {
            None => &self.0,
        };

-        pageserver_workdir.join(
+        workdir.join(
            path_without_prefix
-                .split(S3_FILE_SEPARATOR)
+                .split(S3_PREFIX_SEPARATOR)
                .collect::<PathBuf>(),
        )
    }
@@ -61,7 +119,7 @@ impl S3ObjectKey {

 /// AWS S3 storage.
 pub struct S3Bucket {
-    pageserver_workdir: &'static Path,
+    workdir: PathBuf,
    client: S3Client,
    bucket_name: String,
    prefix_in_bucket: Option<String>,
@@ -73,7 +131,7 @@ pub struct S3Bucket {

 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
+    pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result<Self> {
        debug!(
            "Creating s3 remote storage for S3 bucket {}",
            aws_config.bucket_name
@@ -89,8 +147,11 @@ impl S3Bucket {
                .context("Failed to parse the s3 region from config")?,
        };
        let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?;
-        let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none()
-        {
+
+        let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
+        let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
+
+        let client = if access_key_id.is_none() && secret_access_key.is_none() {
            debug!("Using IAM-based AWS access");
            S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
        } else {
@@ -98,8 +159,8 @@ impl S3Bucket {
            S3Client::new_with(
                request_dispatcher,
                StaticProvider::new_minimal(
-                    aws_config.access_key_id.clone().unwrap_or_default(),
-                    aws_config.secret_access_key.clone().unwrap_or_default(),
+                    access_key_id.unwrap_or_default(),
+                    secret_access_key.unwrap_or_default(),
                ),
                region,
            )
@@ -107,12 +168,12 @@ impl S3Bucket {

        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
            let mut prefix = prefix;
-            while prefix.starts_with(S3_FILE_SEPARATOR) {
+            while prefix.starts_with(S3_PREFIX_SEPARATOR) {
                prefix = &prefix[1..]
            }

            let mut prefix = prefix.to_string();
-            while prefix.ends_with(S3_FILE_SEPARATOR) {
+            while prefix.ends_with(S3_PREFIX_SEPARATOR) {
                prefix.pop();
            }
            prefix
@@ -120,7 +181,7 @@ impl S3Bucket {

        Ok(Self {
            client,
-            pageserver_workdir,
+            workdir,
            bucket_name: aws_config.bucket_name.clone(),
            prefix_in_bucket,
            concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
@@ -130,24 +191,23 @@ impl S3Bucket {

 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    type StoragePath = S3ObjectKey;
+    type RemoteObjectId = S3ObjectKey;

-    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
-        let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
+    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
+        let relative_path = strip_path_prefix(&self.workdir, local_path)?;
        let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
        for segment in relative_path {
-            key.push(S3_FILE_SEPARATOR);
+            key.push(S3_PREFIX_SEPARATOR);
            key.push_str(&segment.to_string_lossy());
        }
        Ok(S3ObjectKey(key))
    }

-    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
-        Ok(storage_path
-            .download_destination(self.pageserver_workdir, self.prefix_in_bucket.as_deref()))
+    fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
+        Ok(storage_path.download_destination(&self.workdir, self.prefix_in_bucket.as_deref()))
    }

-    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
+    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
        let mut document_keys = Vec::new();

        let mut continuation_token = None;
@@ -157,6 +217,9 @@ impl RemoteStorage for S3Bucket {
                .acquire()
                .await
                .context("Concurrency limiter semaphore got closed during S3 list")?;
+
+            metrics::inc_list_objects();
+
            let fetch_response = self
                .client
                .list_objects_v2(ListObjectsV2Request {
@@ -165,7 +228,11 @@ impl RemoteStorage for S3Bucket {
                    continuation_token,
                    ..ListObjectsV2Request::default()
                })
-                .await?;
+                .await
+                .map_err(|e| {
+                    metrics::inc_list_objects_fail();
+                    e
+                })?;
            document_keys.extend(
                fetch_response
                    .contents
@@ -187,7 +254,7 @@ impl RemoteStorage for S3Bucket {
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
        from_size_bytes: usize,
-        to: &Self::StoragePath,
+        to: &Self::RemoteObjectId,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
        let _guard = self
@@ -195,6 +262,8 @@ impl RemoteStorage for S3Bucket {
            .acquire()
            .await
            .context("Concurrency limiter semaphore got closed during S3 upload")?;
+
+        metrics::inc_put_object();
        self.client
            .put_object(PutObjectRequest {
                body: Some(StreamingBody::new_with_size(
@@ -206,13 +275,17 @@ impl RemoteStorage for S3Bucket {
                metadata: metadata.map(|m| m.0),
                ..PutObjectRequest::default()
            })
-            .await?;
+            .await
+            .map_err(|e| {
+                metrics::inc_put_object_fail();
+                e
+            })?;
        Ok(())
    }

    async fn download(
        &self,
-        from: &Self::StoragePath,
+        from: &Self::RemoteObjectId,
        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
    ) -> anyhow::Result<Option<StorageMetadata>> {
        let _guard = self
@@ -220,6 +293,9 @@ impl RemoteStorage for S3Bucket {
            .acquire()
            .await
            .context("Concurrency limiter semaphore got closed during S3 download")?;
+
+        metrics::inc_get_object();
+
        let object_output = self
            .client
            .get_object(GetObjectRequest {
@@ -227,7 +303,11 @@ impl RemoteStorage for S3Bucket {
                key: from.key().to_owned(),
                ..GetObjectRequest::default()
            })
-            .await?;
+            .await
+            .map_err(|e| {
+                metrics::inc_get_object_fail();
+                e
+            })?;

        if let Some(body) = object_output.body {
            let mut from = io::BufReader::new(body.into_async_read());
@@ -237,9 +317,9 @@ impl RemoteStorage for S3Bucket {
        Ok(object_output.metadata.map(StorageMetadata))
    }

-    async fn download_range(
+    async fn download_byte_range(
        &self,
-        from: &Self::StoragePath,
+        from: &Self::RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
@@ -256,6 +336,9 @@ impl RemoteStorage for S3Bucket {
            .acquire()
            .await
            .context("Concurrency limiter semaphore got closed during S3 range download")?;
+
+        metrics::inc_get_object();
+
        let object_output = self
            .client
            .get_object(GetObjectRequest {
@@ -264,7 +347,11 @@ impl RemoteStorage for S3Bucket {
                range,
                ..GetObjectRequest::default()
            })
-            .await?;
+            .await
+            .map_err(|e| {
+                metrics::inc_get_object_fail();
+                e
+            })?;

        if let Some(body) = object_output.body {
            let mut from = io::BufReader::new(body.into_async_read());
@@ -274,53 +361,56 @@ impl RemoteStorage for S3Bucket {
        Ok(object_output.metadata.map(StorageMetadata))
    }

-    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
+    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
        let _guard = self
            .concurrency_limiter
            .acquire()
            .await
            .context("Concurrency limiter semaphore got closed during S3 delete")?;
+
+        metrics::inc_delete_object();
+
        self.client
            .delete_object(DeleteObjectRequest {
                bucket: self.bucket_name.clone(),
                key: path.key().to_owned(),
                ..DeleteObjectRequest::default()
            })
-            .await?;
+            .await
+            .map_err(|e| {
+                metrics::inc_delete_object_fail();
+                e
+            })?;
        Ok(())
    }
 }

 #[cfg(test)]
 mod tests {
-    use crate::{
-        layered_repository::metadata::METADATA_FILE_NAME,
-        repository::repo_harness::{RepoHarness, TIMELINE_ID},
-    };
+    use tempfile::tempdir;

    use super::*;

    #[test]
    fn download_destination() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_destination")?;
-
-        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
-        let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
+        let workdir = tempdir()?.path().to_owned();
+        let local_path = workdir.join("one").join("two").join("test_name");
+        let relative_path = local_path.strip_prefix(&workdir)?;

        let key = S3ObjectKey(format!(
            "{}{}",
-            S3_FILE_SEPARATOR,
+            S3_PREFIX_SEPARATOR,
            relative_path
                .iter()
                .map(|segment| segment.to_str().unwrap())
                .collect::<Vec<_>>()
-                .join(&S3_FILE_SEPARATOR.to_string()),
+                .join(&S3_PREFIX_SEPARATOR.to_string()),
        ));

        assert_eq!(
            local_path,
-            key.download_destination(&repo_harness.conf.workdir, None),
-            "Download destination should consist of s3 path joined with the pageserver workdir prefix"
+            key.download_destination(&workdir, None),
+            "Download destination should consist of s3 path joined with the workdir prefix"
        );

        Ok(())
@@ -328,24 +418,21 @@ mod tests {

    #[test]
    fn storage_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("storage_path_positive")?;
+        let workdir = tempdir()?.path().to_owned();

        let segment_1 = "matching";
        let segment_2 = "file";
-        let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
+        let local_path = &workdir.join(segment_1).join(segment_2);

-        let storage = dummy_storage(&repo_harness.conf.workdir);
+        let storage = dummy_storage(workdir);

        let expected_key = S3ObjectKey(format!(
-            "{}{SEPARATOR}{}{SEPARATOR}{}",
+            "{}{S3_PREFIX_SEPARATOR}{segment_1}{S3_PREFIX_SEPARATOR}{segment_2}",
            storage.prefix_in_bucket.as_deref().unwrap_or_default(),
-            segment_1,
-            segment_2,
-            SEPARATOR = S3_FILE_SEPARATOR,
        ));

        let actual_key = storage
-            .storage_path(local_path)
+            .remote_object_id(local_path)
            .expect("Matching path should map to S3 path normally");
        assert_eq!(
            expected_key,
@@ -360,7 +447,7 @@ mod tests {
    fn storage_path_negatives() -> anyhow::Result<()> {
        #[track_caller]
        fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String {
-            match storage.storage_path(mismatching_path) {
+            match storage.remote_object_id(mismatching_path) {
                Ok(wrong_key) => panic!(
                    "Expected path '{}' to error, but got S3 key: {:?}",
                    mismatching_path.display(),
@@ -370,10 +457,10 @@ mod tests {
            }
        }

-        let repo_harness = RepoHarness::create("storage_path_negatives")?;
-        let storage = dummy_storage(&repo_harness.conf.workdir);
+        let workdir = tempdir()?.path().to_owned();
+        let storage = dummy_storage(workdir.clone());

-        let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
+        let error_message = storage_path_error(&storage, &workdir);
        assert!(
            error_message.contains("Prefix and the path are equal"),
            "Message '{}' does not contain the required string",
@@ -387,7 +474,7 @@ mod tests {
            "Error should mention wrong path"
        );
        assert!(
-            error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
+            error_message.contains(workdir.to_str().unwrap()),
            "Error should mention server workdir"
        );
        assert!(
@@ -401,20 +488,17 @@ mod tests {

    #[test]
    fn local_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("local_path_positive")?;
-        let storage = dummy_storage(&repo_harness.conf.workdir);
-        let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
-        let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?;
+        let workdir = tempdir()?.path().to_owned();
+        let storage = dummy_storage(workdir.clone());
+        let timeline_dir = workdir.join("timelines").join("test_timeline");
+        let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?;

        let s3_key = create_s3_key(
            &relative_timeline_path.join("not a metadata"),
            storage.prefix_in_bucket.as_deref(),
        );
        assert_eq!(
-            s3_key.download_destination(
-                &repo_harness.conf.workdir,
-                storage.prefix_in_bucket.as_deref()
-            ),
+            s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
            storage
                .local_path(&s3_key)
                .expect("For a valid input, valid S3 info should be parsed"),
@@ -422,14 +506,11 @@ mod tests {
        );

        let s3_key = create_s3_key(
-            &relative_timeline_path.join(METADATA_FILE_NAME),
+            &relative_timeline_path.join("metadata"),
            storage.prefix_in_bucket.as_deref(),
        );
        assert_eq!(
-            s3_key.download_destination(
-                &repo_harness.conf.workdir,
-                storage.prefix_in_bucket.as_deref()
-            ),
+            s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
            storage
                .local_path(&s3_key)
                .expect("For a valid input, valid S3 info should be parsed"),
@@ -441,12 +522,15 @@ mod tests {

    #[test]
    fn download_destination_matches_original_path() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
-        let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
+        let workdir = tempdir()?.path().to_owned();
+        let original_path = workdir
+            .join("timelines")
+            .join("some_timeline")
+            .join("some name");

-        let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
+        let dummy_storage = dummy_storage(workdir);

-        let key = dummy_storage.storage_path(&original_path)?;
+        let key = dummy_storage.remote_object_id(&original_path)?;
        let download_destination = dummy_storage.local_path(&key)?;

        assert_eq!(
@@ -457,9 +541,9 @@ mod tests {
        Ok(())
    }

-    fn dummy_storage(pageserver_workdir: &'static Path) -> S3Bucket {
+    fn dummy_storage(workdir: PathBuf) -> S3Bucket {
        S3Bucket {
-            pageserver_workdir,
+            workdir,
            client: S3Client::new("us-east-1".parse().unwrap()),
            bucket_name: "dummy-bucket".to_string(),
            prefix_in_bucket: Some("dummy_prefix/".to_string()),
@@ -471,7 +555,7 @@ mod tests {
        S3ObjectKey(relative_file_path.iter().fold(
            prefix.unwrap_or_default().to_string(),
            |mut path_string, segment| {
-                path_string.push(S3_FILE_SEPARATOR);
+                path_string.push(S3_PREFIX_SEPARATOR);
                path_string.push_str(segment.to_str().unwrap());
                path_string
            },
--- a/libs/utils/build.rs
+++ b/libs/utils/build.rs
@@ -1,3 +0,0 @@
-fn main() {
-    println!("cargo:rerun-if-env-changed=GIT_VERSION");
-}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -5,7 +5,7 @@ use anyhow::anyhow;
 use hyper::header::AUTHORIZATION;
 use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
 use lazy_static::lazy_static;
-use metrics::{new_common_metric_name, register_int_counter, Encoder, IntCounter, TextEncoder};
+use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use routerify::ext::RequestExt;
 use routerify::RequestInfo;
 use routerify::{Middleware, Router, RouterBuilder, RouterService};
@@ -18,7 +18,7 @@ use super::error::ApiError;

 lazy_static! {
    static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
-        new_common_metric_name("serve_metrics_count"),
+        "libmetrics_metric_handler_requests_total",
        "Number of metric requests made"
    )
    .expect("failed to define a metric");
--- a/libs/utils/src/http/request.rs
+++ b/libs/utils/src/http/request.rs
@@ -1,7 +1,7 @@
 use std::str::FromStr;

 use super::error::ApiError;
-use hyper::{Body, Request};
+use hyper::{body::HttpBody, Body, Request};
 use routerify::ext::RequestExt;

 pub fn get_request_param<'a>(
@@ -31,3 +31,10 @@ pub fn parse_request_param<T: FromStr>(
        ))),
    }
 }
+
+pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
+    match request.body_mut().data().await {
+        Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())),
+        None => Ok(()),
+    }
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -54,31 +54,52 @@ pub mod nonblock;
 // Default signal handling
 pub mod signals;

-// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
-//
-// we have several cases:
-// * building locally from git repo
-// * building in CI from git repo
-// * building in docker (either in CI or locally)
-//
-// One thing to note is that .git is not available in docker (and it is bad to include it there).
-// So everything becides docker build is covered by git_version crate.
-// For docker use environment variable to pass git version, which is then retrieved by buildscript (build.rs).
-// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
-// Git version received from environment variable used as a fallback in git_version invokation.
-// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
-// So the build script will be run only when GIT_VERSION envvar has changed.
-//
-// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
-// Caching and workspaces complicates that. In case `utils` is not
-// recompiled due to caching then version may become outdated.
-// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro,
-// so if we changed the index state git_version will pick that up and rerun the macro.
-//
-// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
-use git_version::git_version;
-pub const GIT_VERSION: &str = git_version!(
-    prefix = "git:",
-    fallback = concat!("git-env:", env!("GIT_VERSION")),
-    args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
-);
+/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
+///
+/// we have several cases:
+/// * building locally from git repo
+/// * building in CI from git repo
+/// * building in docker (either in CI or locally)
+///
+/// One thing to note is that .git is not available in docker (and it is bad to include it there).
+/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required.
+/// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
+/// Git version received from environment variable used as a fallback in git_version invokation.
+/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
+/// So the build script will be run only when GIT_VERSION envvar has changed.
+///
+/// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
+/// Caching and workspaces complicates that. In case `utils` is not
+/// recompiled due to caching then version may become outdated.
+/// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro,
+/// so if we changed the index state git_version will pick that up and rerun the macro.
+///
+/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
+///
+/// #############################################################################################
+/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
+/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
+/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
+/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
+/// The problem needs further investigation and regular `const` declaration instead of a macro.
+#[macro_export]
+macro_rules! project_git_version {
+    ($const_identifier:ident) => {
+        const $const_identifier: &str = git_version::git_version!(
+            prefix = "git:",
+            fallback = concat!(
+                "git-env:",
+                env!("GIT_VERSION", "Missing GIT_VERSION envvar")
+            ),
+            args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
+        );
+    };
+}
+
+/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
+#[macro_export]
+macro_rules! const_assert {
+    ($($args:tt)*) => {
+        const _: () = assert!($($args)*);
+    };
+}
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -26,6 +26,9 @@ impl Lsn {
    /// Maximum possible value for an LSN
    pub const MAX: Lsn = Lsn(u64::MAX);

+    /// Invalid value for InvalidXLogRecPtr, as defined in xlogdefs.h
+    pub const INVALID: Lsn = Lsn(0);
+
    /// Subtract a number, returning None on overflow.
    pub fn checked_sub<T: Into<u64>>(self, other: T) -> Option<Lsn> {
        let other: u64 = other.into();
@@ -103,6 +106,12 @@ impl Lsn {
    pub fn is_aligned(&self) -> bool {
        *self == self.align()
    }
+
+    /// Return if the LSN is valid
+    /// mimics postgres XLogRecPtrIsInvalid macro
+    pub fn is_valid(self) -> bool {
+        self != Lsn::INVALID
+    }
 }

 impl From<u64> for Lsn {
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -433,7 +433,12 @@ impl PostgresBackend {
                    // full cause of the error, not just the top-level context + its trace.
                    // We don't want to send that in the ErrorResponse though,
                    // because it's not relevant to the compute node logs.
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
+                    if query_string.starts_with("callmemaybe") {
+                        // FIXME avoid printing a backtrace for tenant x not found errors until this is properly fixed
+                        error!("query handler for '{}' failed: {}", query_string, e);
+                    } else {
+                        error!("query handler for '{}' failed: {:?}", query_string, e);
+                    }
                    self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?;
                    // TODO: untangle convoluted control flow
                    if e.to_string().contains("failed to run") {
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,11 +1,9 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};

-use serde::{Deserialize, Serialize};
-
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
-#[derive(Clone, Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug)]
 pub struct VecMap<K, V>(Vec<(K, V)>);

 impl<K, V> Default for VecMap<K, V> {
--- a/libs/utils/src/zid.rs
+++ b/libs/utils/src/zid.rs
@@ -218,17 +218,17 @@ impl ZTenantTimelineId {

 impl fmt::Display for ZTenantTimelineId {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "{}-{}", self.tenant_id, self.timeline_id)
+        write!(f, "{}/{}", self.tenant_id, self.timeline_id)
    }
 }

 // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued
 // by the console.
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Debug, Serialize, Deserialize)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
 #[serde(transparent)]
-pub struct ZNodeId(pub u64);
+pub struct NodeId(pub u64);

-impl fmt::Display for ZNodeId {
+impl fmt::Display for NodeId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
--- a/monitoring/docker-compose.yml
+++ b/monitoring/docker-compose.yml
@@ -1,25 +0,0 @@
-version: "3"
-services:
-
-  prometheus:
-    container_name: prometheus
-    image: prom/prometheus:latest
-    volumes:
-      - ./prometheus.yaml:/etc/prometheus/prometheus.yml
-    # ports:
-    #   - "9090:9090"
-    # TODO: find a proper portable solution
-    network_mode: "host"
-
-  grafana:
-    image: grafana/grafana:latest
-    volumes:
-      - ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
-    environment:
-      - GF_AUTH_ANONYMOUS_ENABLED=true
-      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
-      - GF_AUTH_DISABLE_LOGIN_FORM=true
-    # ports:
-    #   - "3000:3000"
-    # TODO: find a proper portable solution
-    network_mode: "host"
--- a/monitoring/grafana.yaml
+++ b/monitoring/grafana.yaml
@@ -1,12 +0,0 @@
-apiVersion: 1
-
-datasources:
- name: Prometheus
-  type: prometheus
-  access: proxy
-  orgId: 1
-  url: http://localhost:9090
-  basicAuth: false
-  isDefault: false
-  version: 1
-  editable: false
--- a/monitoring/prometheus.yaml
+++ b/monitoring/prometheus.yaml
@@ -1,5 +0,0 @@
-scrape_configs:
-  - job_name: 'default'
-    scrape_interval: 10s
-    static_configs:
-      - targets: ['localhost:9898']
--- a/neon_local/Cargo.toml
+++ b/neon_local/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "zenith"
+name = "neon_local"
 version = "0.1.0"
 edition = "2021"

@@ -7,7 +7,9 @@ edition = "2021"
 clap = "3.0"
 anyhow = "1.0"
 serde_json = "1"
+comfy-table = "5.0.1"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+git-version = "0.3.5"

 # FIXME: 'pageserver' is needed for BranchInfo. Refactor
 pageserver = { path = "../pageserver" }
--- a/neon_local/src/main.rs
+++ b/neon_local/src/main.rs
@@ -1,10 +1,10 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{App, AppSettings, Arg, ArgMatches};
 use control_plane::compute::ComputeControlPlane;
-use control_plane::local_env;
-use control_plane::local_env::LocalEnv;
+use control_plane::local_env::{EtcdBroker, LocalEnv};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage::PageServerNode;
+use control_plane::{etcd, local_env};
 use pageserver::config::defaults::{
    DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
    DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
@@ -14,45 +14,46 @@ use safekeeper::defaults::{
    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
 use std::collections::{BTreeSet, HashMap};
+use std::path::Path;
 use std::process::exit;
 use std::str::FromStr;
 use utils::{
    auth::{Claims, Scope},
    lsn::Lsn,
    postgres_backend::AuthType,
-    zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
-    GIT_VERSION,
+    project_git_version,
+    zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
 };

 use pageserver::timelines::TimelineInfo;

 // Default id of a safekeeper node, if not specified on the command line.
-const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1);
-const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1);
+const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1);
+const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
+project_git_version!(GIT_VERSION);

-fn default_conf() -> String {
+fn default_conf(etcd_binary_path: &Path) -> String {
    format!(
        r#"
 # Default built-in configuration, defined in main.rs
+[etcd_broker]
+broker_endpoints = ['http://localhost:2379']
+etcd_binary_path = '{etcd_binary_path}'
+
 [pageserver]
-id = {pageserver_id}
-listen_pg_addr = '{pageserver_pg_addr}'
-listen_http_addr = '{pageserver_http_addr}'
+id = {DEFAULT_PAGESERVER_ID}
+listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
+listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
 auth_type = '{pageserver_auth_type}'

 [[safekeepers]]
-id = {safekeeper_id}
-pg_port = {safekeeper_pg_port}
-http_port = {safekeeper_http_port}
+id = {DEFAULT_SAFEKEEPER_ID}
+pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
+http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
 "#,
-        pageserver_id = DEFAULT_PAGESERVER_ID,
-        pageserver_pg_addr = DEFAULT_PAGESERVER_PG_ADDR,
-        pageserver_http_addr = DEFAULT_PAGESERVER_HTTP_ADDR,
+        etcd_binary_path = etcd_binary_path.display(),
        pageserver_auth_type = AuthType::Trust,
-        safekeeper_id = DEFAULT_SAFEKEEPER_ID,
-        safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT,
-        safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT,
    )
 }

@@ -62,15 +63,15 @@ http_port = {safekeeper_http_port}
 struct TimelineTreeEl {
    /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call.
    pub info: TimelineInfo,
-    /// Name, recovered from zenith config mappings
+    /// Name, recovered from neon config mappings
    pub name: Option<String>,
    /// Holds all direct children of this timeline referenced using `timeline_id`.
    pub children: BTreeSet<ZTimelineId>,
 }

-// Main entry point for the 'zenith' CLI utility
+// Main entry point for the 'neon_local' CLI utility
 //
-// This utility helps to manage zenith installation. That includes following:
+// This utility helps to manage neon installation. That includes following:
 //   * Management of local postgres installations running on top of the
 //     pageserver.
 //   * Providing CLI api to the pageserver
@@ -125,12 +126,12 @@ fn main() -> Result<()> {
        .takes_value(true)
        .required(false);

-    let matches = App::new("Zenith CLI")
+    let matches = App::new("Neon CLI")
        .setting(AppSettings::ArgRequiredElseHelp)
        .version(GIT_VERSION)
        .subcommand(
            App::new("init")
-                .about("Initialize a new Zenith repository")
+                .about("Initialize a new Neon repository")
                .arg(pageserver_config_args.clone())
                .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
                .arg(
@@ -166,12 +167,12 @@ fn main() -> Result<()> {
            .subcommand(App::new("create")
                .arg(tenant_id_arg.clone())
                .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
-				.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
-				)
+                .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
+                )
            .subcommand(App::new("config")
                .arg(tenant_id_arg.clone())
-				.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
-				)
+                .arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
+                )
        )
        .subcommand(
            App::new("pageserver")
@@ -258,7 +259,7 @@ fn main() -> Result<()> {
        None => bail!("no subcommand provided"),
    };

-    // Check for 'zenith init' command first.
+    // Check for 'neon init' command first.
    let subcommand_result = if sub_name == "init" {
        handle_init(sub_args).map(Some)
    } else {
@@ -274,7 +275,7 @@ fn main() -> Result<()> {
            "pageserver" => handle_pageserver(sub_args, &env),
            "pg" => handle_pg(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
-            _ => bail!("unexpected subcommand {}", sub_name),
+            _ => bail!("unexpected subcommand {sub_name}"),
        };

        if original_env != env {
@@ -288,7 +289,7 @@ fn main() -> Result<()> {
        Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
        Ok(None) => (),
        Err(e) => {
-            eprintln!("command failed: {:?}", e);
+            eprintln!("command failed: {e:?}");
            exit(1);
        }
    }
@@ -467,23 +468,22 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<ZTimelineI
        .context("Failed to parse timeline id from the argument string")
 }

-fn handle_init(init_match: &ArgMatches) -> Result<LocalEnv> {
+fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
    let initial_timeline_id_arg = parse_timeline_id(init_match)?;

    // Create config file
    let toml_file: String = if let Some(config_path) = init_match.value_of("config") {
        // load and parse the file
        std::fs::read_to_string(std::path::Path::new(config_path))
-            .with_context(|| format!("Could not read configuration file \"{}\"", config_path))?
+            .with_context(|| format!("Could not read configuration file '{config_path}'"))?
    } else {
        // Built-in default config
-        default_conf()
+        default_conf(&EtcdBroker::locate_etcd()?)
    };

    let mut env =
-        LocalEnv::create_config(&toml_file).context("Failed to create zenith configuration")?;
-    env.init()
-        .context("Failed to initialize zenith repository")?;
+        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
+    env.init().context("Failed to initialize neon repository")?;

    // default_tenantid was generated by the `env.init()` call above
    let initial_tenant_id = env.default_tenant_id.unwrap();
@@ -497,7 +497,7 @@ fn handle_init(init_match: &ArgMatches) -> Result<LocalEnv> {
            &pageserver_config_overrides(init_match),
        )
        .unwrap_or_else(|e| {
-            eprintln!("pageserver init failed: {}", e);
+            eprintln!("pageserver init failed: {e}");
            exit(1);
        });

@@ -518,7 +518,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
        .collect()
 }

-fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
+fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
    let pageserver = PageServerNode::from_env(env);
    match tenant_match.subcommand() {
        Some(("list", _)) => {
@@ -541,6 +541,29 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re
                "tenant {} successfully created on the pageserver",
                new_tenant_id
            );
+
+            // Create an initial timeline for the new tenant
+            let new_timeline_id = parse_timeline_id(create_match)?;
+            let timeline = pageserver
+                .timeline_create(new_tenant_id, new_timeline_id, None, None)?
+                .context(format!(
+                    "Failed to create initial timeline for tenant {new_tenant_id}"
+                ))?;
+            let new_timeline_id = timeline.timeline_id;
+            let last_record_lsn = timeline
+                .local
+                .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))?
+                .last_record_lsn;
+
+            env.register_branch_mapping(
+                DEFAULT_BRANCH_NAME.to_string(),
+                new_tenant_id,
+                new_timeline_id,
+            )?;
+
+            println!(
+                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
+            );
        }
        Some(("config", create_match)) => {
            let tenant_id = get_tenant_id(create_match, env)?;
@@ -551,17 +574,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re

            pageserver
                .tenant_config(tenant_id, tenant_conf)
-                .unwrap_or_else(|e| {
-                    anyhow!(
-                        "Tenant config failed for tenant with id {} : {}",
-                        tenant_id,
-                        e
-                    );
-                });
-            println!(
-                "tenant {} successfully configured on the pageserver",
-                tenant_id
-            );
+                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
+            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
@@ -665,35 +679,56 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {

            let timeline_name_mappings = env.timeline_name_mappings();

-            println!("NODE\tADDRESS\tTIMELINE\tBRANCH NAME\tLSN\t\tSTATUS");
+            let mut table = comfy_table::Table::new();
+
+            table.load_preset(comfy_table::presets::NOTHING);
+
+            table.set_header(&[
+                "NODE",
+                "ADDRESS",
+                "TIMELINE",
+                "BRANCH NAME",
+                "LSN",
+                "STATUS",
+            ]);
+
            for ((_, node_name), node) in cplane
                .nodes
                .iter()
                .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id)
            {
-                // FIXME: This shows the LSN at the end of the timeline. It's not the
-                // right thing to do for read-only nodes that might be anchored at an
-                // older point in time, or following but lagging behind the primary.
-                let lsn_str = timeline_infos
-                    .get(&node.timeline_id)
-                    .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string()))
-                    .unwrap_or_else(|| "?".to_string());
+                let lsn_str = match node.lsn {
+                    None => {
+                        // -> primary node
+                        // Use the LSN at the end of the timeline.
+                        timeline_infos
+                            .get(&node.timeline_id)
+                            .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string()))
+                            .unwrap_or_else(|| "?".to_string())
+                    }
+                    Some(lsn) => {
+                        // -> read-only node
+                        // Use the node's LSN.
+                        lsn.to_string()
+                    }
+                };

                let branch_name = timeline_name_mappings
                    .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id))
                    .map(|name| name.as_str())
                    .unwrap_or("?");

-                println!(
-                    "{}\t{}\t{}\t{}\t{}\t{}",
-                    node_name,
-                    node.address,
-                    node.timeline_id,
+                table.add_row(&[
+                    node_name.as_str(),
+                    &node.address.to_string(),
+                    &node.timeline_id.to_string(),
                    branch_name,
-                    lsn_str,
+                    lsn_str.as_str(),
                    node.status(),
-                );
+                ]);
            }
+
+            println!("{table}");
        }
        "create" => {
            let branch_name = sub_args
@@ -825,7 +860,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
    Ok(())
 }

-fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result<SafekeeperNode> {
+fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNode> {
    if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
        Ok(SafekeeperNode::from_env(env, node))
    } else {
@@ -841,7 +876,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul

    // All the commands take an optional safekeeper name argument
    let sk_id = if let Some(id_str) = sub_args.value_of("id") {
-        ZNodeId(id_str.parse().context("while parsing safekeeper id")?)
+        NodeId(id_str.parse().context("while parsing safekeeper id")?)
    } else {
        DEFAULT_SAFEKEEPER_ID
    };
@@ -885,20 +920,23 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
    Ok(())
 }

-fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
+    etcd::start_etcd_process(env)?;
    let pageserver = PageServerNode::from_env(env);

    // Postgres nodes are not started automatically

    if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
-        eprintln!("pageserver start failed: {}", e);
+        eprintln!("pageserver start failed: {e}");
+        try_stop_etcd_process(env);
        exit(1);
    }

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
        if let Err(e) = safekeeper.start() {
-            eprintln!("safekeeper '{}' start failed: {}", safekeeper.id, e);
+            eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id);
+            try_stop_etcd_process(env);
            exit(1);
        }
    }
@@ -928,5 +966,14 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
            eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e);
        }
    }
+
+    try_stop_etcd_process(env);
+
    Ok(())
 }
+
+fn try_stop_etcd_process(env: &local_env::LocalEnv) {
+    if let Err(e) = etcd::stop_etcd_process(env) {
+        eprintln!("etcd stop failed: {e}");
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"

 [features]
 # It is simpler infra-wise to have failpoints enabled by default
-# It shouldnt affect perf in any way because failpoints 
+# It shouldn't affect perf in any way because failpoints
 # are not placed in hot code paths
 default = ["failpoints"]
 profiling = ["pprof"]
@@ -25,7 +25,6 @@ lazy_static = "1.4.0"
 clap = "3.0"
 daemonize = "0.4.1"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-tokio-util = { version = "0.7", features = ["io"] }
 postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
@@ -53,14 +52,13 @@ nix = "0.23"
 once_cell = "1.8.0"
 crossbeam-utils = "0.8.5"
 fail = "0.5.0"
-
-rusoto_core = "0.47"
-rusoto_s3 = "0.47"
-async-trait = "0.1"
+git-version = "0.3.5"

 postgres_ffi = { path = "../libs/postgres_ffi" }
+etcd_broker = { path = "../libs/etcd_broker" }
 metrics = { path = "../libs/metrics" }
 utils = { path = "../libs/utils" }
+remote_storage = { path = "../libs/remote_storage" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }

 [dev-dependencies]
--- a/pageserver/README.md
+++ b/pageserver/README.md
@@ -135,7 +135,7 @@ The backup service is disabled by default and can be enabled to interact with a

 CLI examples:
 * Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
-* AWS S3  : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"`
+* AWS S3  : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`

 For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
 For local S3 installations, refer to the their documentation for name format and credentials.
@@ -155,11 +155,9 @@ or
 bucket_name = 'some-sample-bucket'
 bucket_region = 'eu-north-1'
 prefix_in_bucket = '/test_prefix/'
-access_key_id = 'SOMEKEYAAAAASADSAH*#'
-secret_access_key = 'SOMEsEcReTsd292v'
 ```

-Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.
+`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.

 TODO: Sharding
 --------------------
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,8 +10,9 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{BufMut, BytesMut};
+use fail::fail_point;
 use std::fmt::Write as FmtWrite;
 use std::io;
 use std::io::Write;
@@ -30,11 +31,16 @@ use utils::lsn::Lsn;
 /// This is short-living object only for the time of tarball creation,
 /// created mostly to avoid passing a lot of parameters between various functions
 /// used for constructing tarball.
-pub struct Basebackup<'a> {
-    ar: Builder<&'a mut dyn Write>,
+pub struct Basebackup<'a, W>
+where
+    W: Write,
+{
+    ar: Builder<AbortableWrite<W>>,
    timeline: &'a Arc<DatadirTimelineImpl>,
    pub lsn: Lsn,
    prev_record_lsn: Lsn,
+
+    finished: bool,
 }

 // Create basebackup with non-rel data in it. Omit relational data.
@@ -44,12 +50,15 @@ pub struct Basebackup<'a> {
 //  * When working without safekeepers. In this situation it is important to match the lsn
 //    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
 //    to start the replication.
-impl<'a> Basebackup<'a> {
+impl<'a, W> Basebackup<'a, W>
+where
+    W: Write,
+{
    pub fn new(
-        write: &'a mut dyn Write,
+        write: W,
        timeline: &'a Arc<DatadirTimelineImpl>,
        req_lsn: Option<Lsn>,
-    ) -> Result<Basebackup<'a>> {
+    ) -> Result<Basebackup<'a, W>> {
        // Compute postgres doesn't have any previous WAL files, but the first
        // record that it's going to write needs to include the LSN of the
        // previous record (xl_prev). We include prev_record_lsn in the
@@ -90,14 +99,15 @@ impl<'a> Basebackup<'a> {
        );

        Ok(Basebackup {
-            ar: Builder::new(write),
+            ar: Builder::new(AbortableWrite::new(write)),
            timeline,
            lsn: backup_lsn,
            prev_record_lsn: backup_prev,
+            finished: false,
        })
    }

-    pub fn send_tarball(&mut self) -> anyhow::Result<()> {
+    pub fn send_tarball(mut self) -> anyhow::Result<()> {
        // Create pgdata subdirs structure
        for dir in pg_constants::PGDATA_SUBDIRS.iter() {
            let header = new_tar_header_dir(*dir)?;
@@ -135,9 +145,14 @@ impl<'a> Basebackup<'a> {
            self.add_twophase_file(xid)?;
        }

+        fail_point!("basebackup-before-control-file", |_| {
+            bail!("failpoint basebackup-before-control-file")
+        });
+
        // Generate pg_control and bootstrap WAL segment.
        self.add_pgcontrol_file()?;
        self.ar.finish()?;
+        self.finished = true;
        debug!("all tarred up!");
        Ok(())
    }
@@ -323,13 +338,27 @@ impl<'a> Basebackup<'a> {
        let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
        let wal_file_path = format!("pg_wal/{}", wal_file_name);
        let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
-        let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
+        let wal_seg = generate_wal_segment(segno, pg_control.system_identifier)
+            .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
        ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
        self.ar.append(&header, &wal_seg[..])?;
        Ok(())
    }
 }

+impl<'a, W> Drop for Basebackup<'a, W>
+where
+    W: Write,
+{
+    /// If the basebackup was not finished, prevent the Archive::drop() from
+    /// writing the end-of-archive marker.
+    fn drop(&mut self) {
+        if !self.finished {
+            self.ar.get_mut().abort();
+        }
+    }
+}
+
 //
 // Create new tarball entry header
 //
@@ -365,3 +394,49 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
    header.set_cksum();
    Ok(header)
 }
+
+/// A wrapper that passes through all data to the underlying Write,
+/// until abort() is called.
+///
+/// tar::Builder has an annoying habit of finishing the archive with
+/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
+/// even if an error occurs and we don't finish building the archive.
+/// We'd rather abort writing the tarball immediately than construct
+/// a seemingly valid but incomplete archive. This wrapper allows us
+/// to swallow the end-of-archive marker that Builder::drop() emits,
+/// without writing it to the underlying sink.
+///
+struct AbortableWrite<W> {
+    w: W,
+    aborted: bool,
+}
+
+impl<W> AbortableWrite<W> {
+    pub fn new(w: W) -> Self {
+        AbortableWrite { w, aborted: false }
+    }
+
+    pub fn abort(&mut self) {
+        self.aborted = true;
+    }
+}
+
+impl<W> Write for AbortableWrite<W>
+where
+    W: Write,
+{
+    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
+        if self.aborted {
+            Ok(data.len())
+        } else {
+            self.w.write(data)
+        }
+    }
+    fn flush(&mut self) -> io::Result<()> {
+        if self.aborted {
+            Ok(())
+        } else {
+            self.w.flush()
+        }
+    }
+}
--- a/pageserver/src/bin/dump_layerfile.rs
+++ b/pageserver/src/bin/dump_layerfile.rs
@@ -7,7 +7,9 @@ use pageserver::layered_repository::dump_layerfile_from_path;
 use pageserver::page_cache;
 use pageserver::virtual_file;
 use std::path::PathBuf;
-use utils::GIT_VERSION;
+use utils::project_git_version;
+
+project_git_version!(GIT_VERSION);

 fn main() -> Result<()> {
    let arg_matches = App::new("Zenith dump_layerfile utility")
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -20,24 +20,24 @@ use utils::{
    http::endpoint,
    logging,
    postgres_backend::AuthType,
+    project_git_version,
    shutdown::exit_now,
    signals::{self, Signal},
    tcp_listener,
    zid::{ZTenantId, ZTimelineId},
-    GIT_VERSION,
 };

+project_git_version!(GIT_VERSION);
+
 fn version() -> String {
    format!(
-        "{} profiling:{} failpoints:{}",
-        GIT_VERSION,
+        "{GIT_VERSION} profiling:{} failpoints:{}",
        cfg!(feature = "profiling"),
        fail::has_failpoints()
    )
 }

 fn main() -> anyhow::Result<()> {
-    metrics::set_common_metrics_prefix("pageserver");
    let arg_matches = App::new("Zenith page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(&*version())
@@ -97,6 +97,8 @@ fn main() -> anyhow::Result<()> {
        let features: &[&str] = &[
            #[cfg(feature = "failpoints")]
            "failpoints",
+            #[cfg(feature = "profiling")]
+            "profiling",
        ];
        println!("{{\"features\": {features:?} }}");
        return Ok(());
@@ -182,13 +184,8 @@ fn main() -> anyhow::Result<()> {
    // as a ref.
    let conf: &'static PageServerConf = Box::leak(Box::new(conf));

-    // If failpoints are used, terminate the whole pageserver process if they are hit.
+    // Initialize up failpoints support
    let scenario = FailScenario::setup();
-    if fail::has_failpoints() {
-        std::panic::set_hook(Box::new(|_| {
-            std::process::exit(1);
-        }));
-    }

    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
@@ -217,7 +214,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
    // Initialize logger
    let log_file = logging::init(LOG_FILE_NAME, daemonize)?;

-    info!("version: {}", GIT_VERSION);
+    info!("version: {GIT_VERSION}");

    // TODO: Check that it looks like a valid repository before going further

@@ -257,7 +254,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
        // Otherwise, the coverage data will be damaged.
        match daemonize.exit_action(|| exit_now(0)).start() {
            Ok(_) => info!("Success, daemonized"),
-            Err(err) => error!(%err, "could not daemonize"),
+            Err(err) => bail!("{err}. could not daemonize. bailing."),
        }
    }

@@ -287,7 +284,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
        None,
        None,
        "http_endpoint_thread",
-        false,
+        true,
        move || {
            let router = http::make_router(conf, auth_cloned, remote_index)?;
            endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
@@ -301,7 +298,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
        None,
        None,
        "libpq endpoint thread",
-        false,
+        true,
        move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
    )?;

--- a/pageserver/src/bin/update_metadata.rs
+++ b/pageserver/src/bin/update_metadata.rs
@@ -6,7 +6,9 @@ use clap::{App, Arg};
 use pageserver::layered_repository::metadata::TimelineMetadata;
 use std::path::PathBuf;
 use std::str::FromStr;
-use utils::{lsn::Lsn, GIT_VERSION};
+use utils::{lsn::Lsn, project_git_version};
+
+project_git_version!(GIT_VERSION);

 fn main() -> Result<()> {
    let arg_matches = App::new("Zenith update metadata utility")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -5,16 +5,18 @@
 //! See also `settings.md` for better description on every parameter.

 use anyhow::{anyhow, bail, ensure, Context, Result};
+use remote_storage::RemoteStorageConfig;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
 use std::time::Duration;
 use toml_edit;
 use toml_edit::{Document, Item};
+use url::Url;
 use utils::{
    postgres_backend::AuthType,
-    zid::{ZNodeId, ZTenantId, ZTimelineId},
+    zid::{NodeId, ZTenantId, ZTimelineId},
 };

 use crate::layered_repository::TIMELINES_SEGMENT_NAME;
@@ -33,18 +35,6 @@ pub mod defaults {
    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

    pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
-    /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
-    /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
-    /// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
-    /// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
-    pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC: usize = 50;
-    pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
-    /// Currently, sync happens with AWS S3, that has two limits on requests per second:
-    /// ~200 RPS for IAM services
-    /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
-    /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
-    /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
-    pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;

    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
@@ -88,7 +78,7 @@ pub mod defaults {
 pub struct PageServerConf {
    // Identifier of that particular pageserver so e g safekeepers
    // can safely distinguish different pageservers
-    pub id: ZNodeId,
+    pub id: NodeId,

    /// Example (default): 127.0.0.1:64000
    pub listen_pg_addr: String,
@@ -122,6 +112,13 @@ pub struct PageServerConf {

    pub profiling: ProfilingConfig,
    pub default_tenant_conf: TenantConf,
+
+    /// A prefix to add in etcd brokers before every key.
+    /// Can be used for isolating different pageserver groups withing the same etcd cluster.
+    pub broker_etcd_prefix: String,
+
+    /// Etcd broker endpoints to connect to.
+    pub broker_endpoints: Vec<Url>,
 }

 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -183,9 +180,11 @@ struct PageServerConfigBuilder {
    auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,

-    id: BuilderValue<ZNodeId>,
+    id: BuilderValue<NodeId>,

    profiling: BuilderValue<ProfilingConfig>,
+    broker_etcd_prefix: BuilderValue<String>,
+    broker_endpoints: BuilderValue<Vec<Url>>,
 }

 impl Default for PageServerConfigBuilder {
@@ -211,6 +210,8 @@ impl Default for PageServerConfigBuilder {
            remote_storage_config: Set(None),
            id: NotSet,
            profiling: Set(ProfilingConfig::Disabled),
+            broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
+            broker_endpoints: Set(Vec::new()),
        }
    }
 }
@@ -267,7 +268,15 @@ impl PageServerConfigBuilder {
        self.remote_storage_config = BuilderValue::Set(remote_storage_config)
    }

-    pub fn id(&mut self, node_id: ZNodeId) {
+    pub fn broker_endpoints(&mut self, broker_endpoints: Vec<Url>) {
+        self.broker_endpoints = BuilderValue::Set(broker_endpoints)
+    }
+
+    pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) {
+        self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
+    }
+
+    pub fn id(&mut self, node_id: NodeId) {
        self.id = BuilderValue::Set(node_id)
    }

@@ -275,7 +284,11 @@ impl PageServerConfigBuilder {
        self.profiling = BuilderValue::Set(profiling)
    }

-    pub fn build(self) -> Result<PageServerConf> {
+    pub fn build(self) -> anyhow::Result<PageServerConf> {
+        let broker_endpoints = self
+            .broker_endpoints
+            .ok_or(anyhow!("No broker endpoints provided"))?;
+
        Ok(PageServerConf {
            listen_pg_addr: self
                .listen_pg_addr
@@ -311,71 +324,14 @@ impl PageServerConfigBuilder {
            profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
            // TenantConf is handled separately
            default_tenant_conf: TenantConf::default(),
+            broker_endpoints,
+            broker_etcd_prefix: self
+                .broker_etcd_prefix
+                .ok_or(anyhow!("missing broker_etcd_prefix"))?,
        })
    }
 }

-/// External backup storage configuration, enough for creating a client for that storage.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct RemoteStorageConfig {
-    /// Max allowed number of concurrent sync operations between pageserver and the remote storage.
-    pub max_concurrent_timelines_sync: NonZeroUsize,
-    /// Max allowed errors before the sync task is considered failed and evicted.
-    pub max_sync_errors: NonZeroU32,
-    /// The storage connection configuration.
-    pub storage: RemoteStorageKind,
-}
-
-/// A kind of a remote storage to connect to, with its connection configuration.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum RemoteStorageKind {
-    /// Storage based on local file system.
-    /// Specify a root folder to place all stored files into.
-    LocalFs(PathBuf),
-    /// AWS S3 based storage, storing all files in the S3 bucket
-    /// specified by the config
-    AwsS3(S3Config),
-}
-
-/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq)]
-pub struct S3Config {
-    /// Name of the bucket to connect to.
-    pub bucket_name: String,
-    /// The region where the bucket is located at.
-    pub bucket_region: String,
-    /// A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once.
-    pub prefix_in_bucket: Option<String>,
-    /// "Login" to use when connecting to bucket.
-    /// Can be empty for cases like AWS k8s IAM
-    /// where we can allow certain pods to connect
-    /// to the bucket directly without any credentials.
-    pub access_key_id: Option<String>,
-    /// "Password" to use when connecting to bucket.
-    pub secret_access_key: Option<String>,
-    /// A base URL to send S3 requests to.
-    /// By default, the endpoint is derived from a region name, assuming it's
-    /// an AWS S3 region name, erroring on wrong region name.
-    /// Endpoint provides a way to support other S3 flavors and their regions.
-    ///
-    /// Example: `http://127.0.0.1:5000`
-    pub endpoint: Option<String>,
-    /// AWS S3 has various limits on its API calls, we need not to exceed those.
-    /// See [`defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
-    pub concurrency_limit: NonZeroUsize,
-}
-
-impl std::fmt::Debug for S3Config {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("S3Config")
-            .field("bucket_name", &self.bucket_name)
-            .field("bucket_region", &self.bucket_region)
-            .field("prefix_in_bucket", &self.prefix_in_bucket)
-            .field("concurrency_limit", &self.concurrency_limit)
-            .finish()
-    }
-}
-
 impl PageServerConf {
    //
    // Repository paths, relative to workdir.
@@ -413,7 +369,7 @@ impl PageServerConf {
    /// validating the input and failing on errors.
    ///
    /// This leaves any options not present in the file in the built-in defaults.
-    pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result<Self> {
+    pub fn parse_and_validate(toml: &Document, workdir: &Path) -> anyhow::Result<Self> {
        let mut builder = PageServerConfigBuilder::default();
        builder.workdir(workdir.to_owned());

@@ -438,13 +394,24 @@ impl PageServerConf {
                )),
                "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
                "remote_storage" => {
-                    builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?))
+                    builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?))
                }
                "tenant_config" => {
                    t_conf = Self::parse_toml_tenant_conf(item)?;
                }
-                "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
+                "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
                "profiling" => builder.profiling(parse_toml_from_str(key, item)?),
+                "broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?),
+                "broker_endpoints" => builder.broker_endpoints(
+                    parse_toml_array(key, item)?
+                        .into_iter()
+                        .map(|endpoint_str| {
+                            endpoint_str.parse::<Url>().with_context(|| {
+                                format!("Array item {endpoint_str} for key {key} is not a valid url endpoint")
+                            })
+                        })
+                        .collect::<anyhow::Result<_>>()?,
+                ),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -517,74 +484,6 @@ impl PageServerConf {
        Ok(t_conf)
    }

-    /// subroutine of parse_config(), to parse the `[remote_storage]` table.
-    fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
-        let local_path = toml.get("local_path");
-        let bucket_name = toml.get("bucket_name");
-        let bucket_region = toml.get("bucket_region");
-
-        let max_concurrent_timelines_sync = NonZeroUsize::new(
-            parse_optional_integer("max_concurrent_timelines_sync", toml)?
-                .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC),
-        )
-        .context("Failed to parse 'max_concurrent_timelines_sync' as a positive integer")?;
-
-        let max_sync_errors = NonZeroU32::new(
-            parse_optional_integer("max_sync_errors", toml)?
-                .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
-        )
-        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
-
-        let concurrency_limit = NonZeroUsize::new(
-            parse_optional_integer("concurrency_limit", toml)?
-                .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
-        )
-        .context("Failed to parse 'concurrency_limit' as a positive integer")?;
-
-        let storage = match (local_path, bucket_name, bucket_region) {
-            (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
-            (_, Some(_), None) => {
-                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
-            }
-            (_, None, Some(_)) => {
-                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
-            }
-            (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
-                bucket_name: parse_toml_string("bucket_name", bucket_name)?,
-                bucket_region: parse_toml_string("bucket_region", bucket_region)?,
-                access_key_id: toml
-                    .get("access_key_id")
-                    .map(|access_key_id| parse_toml_string("access_key_id", access_key_id))
-                    .transpose()?,
-                secret_access_key: toml
-                    .get("secret_access_key")
-                    .map(|secret_access_key| {
-                        parse_toml_string("secret_access_key", secret_access_key)
-                    })
-                    .transpose()?,
-                prefix_in_bucket: toml
-                    .get("prefix_in_bucket")
-                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
-                    .transpose()?,
-                endpoint: toml
-                    .get("endpoint")
-                    .map(|endpoint| parse_toml_string("endpoint", endpoint))
-                    .transpose()?,
-                concurrency_limit,
-            }),
-            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
-                parse_toml_string("local_path", local_path)?,
-            )),
-            (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
-        };
-
-        Ok(RemoteStorageConfig {
-            max_concurrent_timelines_sync,
-            max_sync_errors,
-            storage,
-        })
-    }
-
    #[cfg(test)]
    pub fn test_repo_dir(test_name: &str) -> PathBuf {
        PathBuf::from(format!("../tmp_check/test_{test_name}"))
@@ -593,7 +492,7 @@ impl PageServerConf {
    #[cfg(test)]
    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
        PageServerConf {
-            id: ZNodeId(0),
+            id: NodeId(0),
            wait_lsn_timeout: Duration::from_secs(60),
            wal_redo_timeout: Duration::from_secs(60),
            page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
@@ -608,6 +507,8 @@ impl PageServerConf {
            remote_storage_config: None,
            profiling: ProfilingConfig::Disabled,
            default_tenant_conf: TenantConf::dummy_conf(),
+            broker_endpoints: Vec::new(),
+            broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
        }
    }
 }
@@ -633,23 +534,6 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
    Ok(i as u64)
 }

-fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
-where
-    I: TryFrom<i64, Error = E>,
-    E: std::error::Error + Send + Sync + 'static,
-{
-    let toml_integer = match item.get(name) {
-        Some(item) => item
-            .as_integer()
-            .with_context(|| format!("configure option {name} is not an integer"))?,
-        None => return Ok(None),
-    };
-
-    I::try_from(toml_integer)
-        .map(Some)
-        .with_context(|| format!("configure option {name} is too large"))
-}
-
 fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
    let s = item
        .as_str()
@@ -658,20 +542,46 @@ fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
    Ok(humantime::parse_duration(s)?)
 }

-fn parse_toml_from_str<T>(name: &str, item: &Item) -> Result<T>
+fn parse_toml_from_str<T>(name: &str, item: &Item) -> anyhow::Result<T>
 where
-    T: FromStr<Err = anyhow::Error>,
+    T: FromStr,
+    <T as FromStr>::Err: std::fmt::Display,
 {
    let v = item
        .as_str()
        .with_context(|| format!("configure option {name} is not a string"))?;
-    T::from_str(v)
+    T::from_str(v).map_err(|e| {
+        anyhow!(
+            "Failed to parse string as {parse_type} for configure option {name}: {e}",
+            parse_type = stringify!(T)
+        )
+    })
+}
+
+fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
+    let array = item
+        .as_array()
+        .with_context(|| format!("configure option {name} is not an array"))?;
+
+    array
+        .iter()
+        .map(|value| {
+            value
+                .as_str()
+                .map(str::to_string)
+                .with_context(|| format!("Array item {value:?} for key {name} is not a string"))
+        })
+        .collect()
 }

 #[cfg(test)]
 mod tests {
-    use std::fs;
+    use std::{
+        fs,
+        num::{NonZeroU32, NonZeroUsize},
+    };

+    use remote_storage::{RemoteStorageKind, S3Config};
    use tempfile::{tempdir, TempDir};

    use super::*;
@@ -698,17 +608,21 @@ id = 10
    fn parse_defaults() -> anyhow::Result<()> {
        let tempdir = tempdir()?;
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-        // we have to create dummy pathes to overcome the validation errors
-        let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display());
+        let broker_endpoint = "http://127.0.0.1:7777";
+        // we have to create dummy values to overcome the validation errors
+        let config_string = format!(
+            "pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']",
+            pg_distrib_dir.display()
+        );
        let toml = config_string.parse()?;

        let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
-            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}"));
+            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));

        assert_eq!(
            parsed_config,
            PageServerConf {
-                id: ZNodeId(10),
+                id: NodeId(10),
                listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
                listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
                wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
@@ -723,6 +637,10 @@ id = 10
                remote_storage_config: None,
                profiling: ProfilingConfig::Disabled,
                default_tenant_conf: TenantConf::default(),
+                broker_endpoints: vec![broker_endpoint
+                    .parse()
+                    .expect("Failed to parse a valid broker endpoint URL")],
+                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -734,20 +652,21 @@ id = 10
    fn parse_basic_config() -> anyhow::Result<()> {
        let tempdir = tempdir()?;
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+        let broker_endpoint = "http://127.0.0.1:7777";

        let config_string = format!(
-            "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'",
+            "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']",
            pg_distrib_dir.display()
        );
        let toml = config_string.parse()?;

        let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
-            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}"));
+            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));

        assert_eq!(
            parsed_config,
            PageServerConf {
-                id: ZNodeId(10),
+                id: NodeId(10),
                listen_pg_addr: "127.0.0.1:64000".to_string(),
                listen_http_addr: "127.0.0.1:9898".to_string(),
                wait_lsn_timeout: Duration::from_secs(111),
@@ -762,6 +681,10 @@ id = 10
                remote_storage_config: None,
                profiling: ProfilingConfig::Disabled,
                default_tenant_conf: TenantConf::default(),
+                broker_endpoints: vec![broker_endpoint
+                    .parse()
+                    .expect("Failed to parse a valid broker endpoint URL")],
+                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -773,6 +696,7 @@ id = 10
    fn parse_remote_fs_storage_config() -> anyhow::Result<()> {
        let tempdir = tempdir()?;
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+        let broker_endpoint = "http://127.0.0.1:7777";

        let local_storage_path = tempdir.path().join("local_remote_storage");

@@ -792,6 +716,7 @@ local_path = '{}'"#,
            let config_string = format!(
                r#"{ALL_BASE_VALUES_TOML}
 pg_distrib_dir='{}'
+broker_endpoints = ['{broker_endpoint}']

 {remote_storage_config_str}"#,
                pg_distrib_dir.display(),
@@ -800,18 +725,20 @@ pg_distrib_dir='{}'
            let toml = config_string.parse()?;

            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
-                .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}"))
+                .unwrap_or_else(|e| {
+                    panic!("Failed to parse config '{config_string}', reason: {e:?}")
+                })
                .remote_storage_config
                .expect("Should have remote storage config for the local FS");

            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_timelines_sync: NonZeroUsize::new(
-                        defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC
+                    max_concurrent_syncs: NonZeroUsize::new(
+                        remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
                    )
-                    .unwrap(),
-                    max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
+                        .unwrap(),
+                    max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
                        .unwrap(),
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                },
@@ -829,29 +756,26 @@ pg_distrib_dir='{}'
        let bucket_name = "some-sample-bucket".to_string();
        let bucket_region = "eu-north-1".to_string();
        let prefix_in_bucket = "test_prefix".to_string();
-        let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string();
-        let secret_access_key = "SOMEsEcReTsd292v".to_string();
        let endpoint = "http://localhost:5000".to_string();
-        let max_concurrent_timelines_sync = NonZeroUsize::new(111).unwrap();
+        let max_concurrent_syncs = NonZeroUsize::new(111).unwrap();
        let max_sync_errors = NonZeroU32::new(222).unwrap();
        let s3_concurrency_limit = NonZeroUsize::new(333).unwrap();
+        let broker_endpoint = "http://127.0.0.1:7777";

        let identical_toml_declarations = &[
            format!(
                r#"[remote_storage]
-max_concurrent_timelines_sync = {max_concurrent_timelines_sync}
+max_concurrent_syncs = {max_concurrent_syncs}
 max_sync_errors = {max_sync_errors}
 bucket_name = '{bucket_name}'
 bucket_region = '{bucket_region}'
 prefix_in_bucket = '{prefix_in_bucket}'
-access_key_id = '{access_key_id}'
-secret_access_key = '{secret_access_key}'
 endpoint = '{endpoint}'
 concurrency_limit = {s3_concurrency_limit}"#
            ),
            format!(
-                "remote_storage={{max_concurrent_timelines_sync={max_concurrent_timelines_sync}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
-                bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', access_key_id='{access_key_id}', secret_access_key='{secret_access_key}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
+                "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
+                bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
            ),
        ];

@@ -859,6 +783,7 @@ concurrency_limit = {s3_concurrency_limit}"#
            let config_string = format!(
                r#"{ALL_BASE_VALUES_TOML}
 pg_distrib_dir='{}'
+broker_endpoints = ['{broker_endpoint}']

 {remote_storage_config_str}"#,
                pg_distrib_dir.display(),
@@ -867,20 +792,20 @@ pg_distrib_dir='{}'
            let toml = config_string.parse()?;

            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
-                .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}"))
+                .unwrap_or_else(|e| {
+                    panic!("Failed to parse config '{config_string}', reason: {e:?}")
+                })
                .remote_storage_config
                .expect("Should have remote storage config for S3");

            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_timelines_sync,
+                    max_concurrent_syncs,
                    max_sync_errors,
                    storage: RemoteStorageKind::AwsS3(S3Config {
                        bucket_name: bucket_name.clone(),
                        bucket_region: bucket_region.clone(),
-                        access_key_id: Some(access_key_id.clone()),
-                        secret_access_key: Some(secret_access_key.clone()),
                        prefix_in_bucket: Some(prefix_in_bucket.clone()),
                        endpoint: Some(endpoint.clone()),
                        concurrency_limit: s3_concurrency_limit,
--- a/pageserver/src/http/models.rs
+++ b/pageserver/src/http/models.rs
@@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::{
    lsn::Lsn,
-    zid::{ZNodeId, ZTenantId, ZTimelineId},
+    zid::{NodeId, ZTenantId, ZTimelineId},
 };

 #[serde_as]
@@ -42,7 +42,7 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId

 #[derive(Serialize)]
 pub struct StatusResponse {
-    pub id: ZNodeId,
+    pub id: NodeId,
 }

 impl TenantCreateRequest {
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -123,6 +123,53 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

+  /v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: Get wal receiver's data attached to the timeline
+      responses:
+        "200":
+          description: WalReceiverEntry
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/WalReceiverEntry"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "404":
+          description: Error when no wal receiver is running or found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/attach:
    parameters:
@@ -520,6 +567,21 @@ components:
          type: integer
        current_logical_size_non_incremental:
          type: integer
+    WalReceiverEntry:
+      type: object
+      required:
+        - thread_id
+        - wal_producer_connstr
+      properties:
+        thread_id:
+          type: integer
+        wal_producer_connstr:
+          type: string
+        last_received_msg_lsn:
+          type: string
+          format: hex
+        last_received_msg_ts:
+          type: integer

    Error:
      type: object
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3,17 +3,16 @@ use std::sync::Arc;
 use anyhow::{Context, Result};
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
+use remote_storage::GenericRemoteStorage;
 use tracing::*;

 use super::models::{
    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse,
    TimelineCreateRequest,
 };
-use crate::config::RemoteStorageKind;
-use crate::remote_storage::{
-    download_index_part, schedule_timeline_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket,
-};
 use crate::repository::Repository;
+use crate::storage_sync;
+use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
 use crate::tenant_config::TenantConfOpt;
 use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
 use crate::{config::PageServerConf, tenant_mgr, timelines};
@@ -37,11 +36,6 @@ struct State {
    remote_storage: Option<GenericRemoteStorage>,
 }

-enum GenericRemoteStorage {
-    Local(LocalFs),
-    S3(S3Bucket),
-}
-
 impl State {
    fn new(
        conf: &'static PageServerConf,
@@ -57,14 +51,7 @@ impl State {
        let remote_storage = conf
            .remote_storage_config
            .as_ref()
-            .map(|storage_config| match &storage_config.storage {
-                RemoteStorageKind::LocalFs(root) => {
-                    LocalFs::new(root.clone(), &conf.workdir).map(GenericRemoteStorage::Local)
-                }
-                RemoteStorageKind::AwsS3(s3_config) => {
-                    S3Bucket::new(s3_config, &conf.workdir).map(GenericRemoteStorage::S3)
-                }
-            })
+            .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
            .transpose()
            .context("Failed to init generic remote storage")?;

@@ -237,6 +224,30 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
    json_response(StatusCode::OK, timeline_info)
 }

+async fn wal_receiver_get_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let wal_receiver = tokio::task::spawn_blocking(move || {
+        let _enter =
+            info_span!("wal_receiver_get", tenant = %tenant_id, timeline = %timeline_id).entered();
+
+        crate::walreceiver::get_wal_receiver_entry(tenant_id, timeline_id)
+    })
+    .await
+    .map_err(ApiError::from_err)?
+    .ok_or_else(|| {
+        ApiError::NotFound(format!(
+            "WAL receiver not found for tenant {} and timeline {}",
+            tenant_id, timeline_id
+        ))
+    })?;
+
+    json_response(StatusCode::OK, wal_receiver)
+}
+
 async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -273,14 +284,14 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
        }

        remote_timeline.awaits_download = true;
-        schedule_timeline_download(tenant_id, timeline_id);
+        storage_sync::schedule_layer_download(tenant_id, timeline_id);
        return json_response(StatusCode::ACCEPTED, ());
    } else {
        // no timeline in the index, release the lock to make the potentially lengthy download opetation
        drop(index_accessor);
    }

-    let new_timeline = match try_download_shard_data(state, sync_id).await {
+    let new_timeline = match try_download_index_part_data(state, sync_id).await {
        Ok(Some(mut new_timeline)) => {
            tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
                .await
@@ -309,35 +320,32 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
        }
        None => index_accessor.add_timeline_entry(sync_id, new_timeline),
    }
-    schedule_timeline_download(tenant_id, timeline_id);
+    storage_sync::schedule_layer_download(tenant_id, timeline_id);
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn try_download_shard_data(
+async fn try_download_index_part_data(
    state: &State,
    sync_id: ZTenantTimelineId,
 ) -> anyhow::Result<Option<RemoteTimeline>> {
-    let shard = match state.remote_storage.as_ref() {
+    let index_part = match state.remote_storage.as_ref() {
        Some(GenericRemoteStorage::Local(local_storage)) => {
-            download_index_part(state.conf, local_storage, sync_id).await
+            storage_sync::download_index_part(state.conf, local_storage, sync_id).await
        }
        Some(GenericRemoteStorage::S3(s3_storage)) => {
-            download_index_part(state.conf, s3_storage, sync_id).await
+            storage_sync::download_index_part(state.conf, s3_storage, sync_id).await
        }
        None => return Ok(None),
    }
-    .with_context(|| format!("Failed to download index shard for timeline {}", sync_id))?;
+    .with_context(|| format!("Failed to download index part for timeline {sync_id}"))?;

    let timeline_path = state
        .conf
        .timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
-    RemoteTimeline::from_index_part(&timeline_path, shard)
+    RemoteTimeline::from_index_part(&timeline_path, index_part)
        .map(Some)
        .with_context(|| {
-            format!(
-                "Failed to convert index shard into remote timeline for timeline {}",
-                sync_id
-            )
+            format!("Failed to convert index part into remote timeline for timeline {sync_id}")
        })
 }

@@ -501,6 +509,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id",
            timeline_detail_handler,
        )
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver",
+            wal_receiver_get_handler,
+        )
        .post(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
            timeline_attach_handler,
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -274,7 +274,7 @@ fn import_control_file<R: Repository>(

    // Extract the checkpoint record and import it separately.
    let pg_control = ControlFileData::decode(&buffer)?;
-    let checkpoint_bytes = pg_control.checkPointCopy.encode();
+    let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
    modification.put_checkpoint(checkpoint_bytes)?;

    Ok(pg_control)
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -20,8 +20,8 @@ use tracing::*;

 use std::cmp::{max, min, Ordering};
 use std::collections::hash_map::Entry;
-use std::collections::BTreeSet;
 use std::collections::HashMap;
+use std::collections::{BTreeSet, HashSet};
 use std::fs;
 use std::fs::{File, OpenOptions};
 use std::io::Write;
@@ -34,10 +34,9 @@ use std::time::{Duration, Instant, SystemTime};
 use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME};
 use crate::config::PageServerConf;
 use crate::keyspace::KeySpace;
+use crate::storage_sync::index::RemoteIndex;
 use crate::tenant_config::{TenantConf, TenantConfOpt};

-use crate::page_cache;
-use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteIndex};
 use crate::repository::{
    GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter,
 };
@@ -48,6 +47,7 @@ use crate::virtual_file::VirtualFile;
 use crate::walreceiver::IS_WAL_RECEIVER;
 use crate::walredo::WalRedoManager;
 use crate::CheckpointConfig;
+use crate::{page_cache, storage_sync};

 use metrics::{
    register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec,
@@ -74,6 +74,7 @@ pub mod metadata;
 mod par_fsync;
 mod storage_layer;

+use crate::pgdatadir_mapping::LsnForTimestamp;
 use delta_layer::{DeltaLayer, DeltaLayerWriter};
 use ephemeral_file::is_ephemeral_file;
 use filename::{DeltaFileName, ImageFileName};
@@ -81,6 +82,7 @@ use image_layer::{ImageLayer, ImageLayerWriter};
 use inmemory_layer::InMemoryLayer;
 use layer_map::LayerMap;
 use layer_map::SearchResult;
+use postgres_ffi::xlog_utils::to_pg_timestamp;
 use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};

 // re-export this function so that page_cache.rs can use it.
@@ -89,7 +91,7 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme
 // Metrics collected on operations on the storage repository.
 lazy_static! {
    static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
-        "pageserver_storage_time",
+        "pageserver_storage_operations_seconds",
        "Time spent on storage operations",
        &["operation", "tenant_id", "timeline_id"]
    )
@@ -99,8 +101,8 @@ lazy_static! {
 // Metrics collected on operations on the storage repository.
 lazy_static! {
    static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
-        "pageserver_getpage_reconstruct_time",
-        "Time spent on storage operations",
+        "pageserver_getpage_reconstruct_seconds",
+        "Time spent in reconstruct_value",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric");
@@ -108,13 +110,13 @@ lazy_static! {

 lazy_static! {
    static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!(
-        "materialize_page_cache_hits",
+        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric");
    static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!(
-        "wait_lsn_time",
+        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
        &["tenant_id", "timeline_id"]
    )
@@ -134,12 +136,12 @@ lazy_static! {
 // or in testing they estimate how much we would upload if we did.
 lazy_static! {
    static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!(
-        "pageserver_num_persistent_files_created",
+        "pageserver_created_persistent_files_total",
        "Number of files created that are meant to be uploaded to cloud storage",
    )
    .expect("failed to define a metric");
    static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!(
-        "pageserver_persistent_bytes_written",
+        "pageserver_written_persistent_bytes_total",
        "Total bytes written that are meant to be uploaded to cloud storage",
    )
    .expect("failed to define a metric");
@@ -393,9 +395,22 @@ impl Repository for LayeredRepository {

    fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
        let mut timelines = self.timelines.lock().unwrap();
+        // check no child timelines, because detach will remove files, which will brake child branches
+        // FIXME this can still be violated because we do not guarantee
+        //   that all ancestors are downloaded/attached to the same pageserver
+        let num_children = timelines
+            .iter()
+            .filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id))
+            .count();
+
+        ensure!(
+            num_children == 0,
+            "Cannot detach timeline which has child timelines"
+        );
+
        ensure!(
            timelines.remove(&timeline_id).is_some(),
-            "cannot detach timeline {timeline_id} that is not available locally"
+            "Cannot detach timeline {timeline_id} that is not available locally"
        );
        Ok(())
    }
@@ -415,7 +430,7 @@ impl Repository for LayeredRepository {
                    Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
                    Entry::Vacant(entry) => {
                        // we need to get metadata of a timeline, another option is to pass it along with Downloaded status
-                        let metadata = Self::load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?;
+                        let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?;
                        // finally we make newly downloaded timeline visible to repository
                        entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, })
                    },
@@ -442,7 +457,7 @@ enum LayeredTimelineEntry {
 impl LayeredTimelineEntry {
    fn timeline_id(&self) -> ZTimelineId {
        match self {
-            LayeredTimelineEntry::Loaded(timeline) => timeline.timelineid,
+            LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id,
            LayeredTimelineEntry::Unloaded { id, .. } => *id,
        }
    }
@@ -602,21 +617,17 @@ impl LayeredRepository {

    fn load_local_timeline(
        &self,
-        timelineid: ZTimelineId,
+        timeline_id: ZTimelineId,
        timelines: &mut HashMap<ZTimelineId, LayeredTimelineEntry>,
    ) -> anyhow::Result<Arc<LayeredTimeline>> {
-        let metadata = Self::load_metadata(self.conf, timelineid, self.tenant_id)
+        let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
            .context("failed to load metadata")?;
        let disk_consistent_lsn = metadata.disk_consistent_lsn();

        let ancestor = metadata
            .ancestor_timeline()
            .map(|ancestor_timeline_id| {
-                trace!(
-                    "loading {}'s ancestor {}",
-                    timelineid,
-                    &ancestor_timeline_id
-                );
+                trace!("loading {timeline_id}'s ancestor {}", &ancestor_timeline_id);
                self.get_timeline_load_internal(ancestor_timeline_id, timelines)
            })
            .transpose()
@@ -630,7 +641,7 @@ impl LayeredRepository {
            Arc::clone(&self.tenant_conf),
            metadata,
            ancestor,
-            timelineid,
+            timeline_id,
            self.tenant_id,
            Arc::clone(&self.walredo_mgr),
            self.upload_layers,
@@ -763,17 +774,6 @@ impl LayeredRepository {
        Ok(())
    }

-    fn load_metadata(
-        conf: &'static PageServerConf,
-        timelineid: ZTimelineId,
-        tenantid: ZTenantId,
-    ) -> Result<TimelineMetadata> {
-        let path = metadata_path(conf, timelineid, tenantid);
-        info!("loading metadata from {}", path.display());
-        let metadata_bytes = std::fs::read(&path)?;
-        TimelineMetadata::from_bytes(&metadata_bytes)
-    }
-
    //
    // How garbage collection works:
    //
@@ -900,8 +900,8 @@ pub struct LayeredTimeline {
    conf: &'static PageServerConf,
    tenant_conf: Arc<RwLock<TenantConfOpt>>,

-    tenantid: ZTenantId,
-    timelineid: ZTimelineId,
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,

    layers: RwLock<LayerMap>,

@@ -1175,50 +1175,50 @@ impl LayeredTimeline {
        tenant_conf: Arc<RwLock<TenantConfOpt>>,
        metadata: TimelineMetadata,
        ancestor: Option<LayeredTimelineEntry>,
-        timelineid: ZTimelineId,
-        tenantid: ZTenantId,
+        timeline_id: ZTimelineId,
+        tenant_id: ZTenantId,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        upload_layers: bool,
    ) -> LayeredTimeline {
        let reconstruct_time_histo = RECONSTRUCT_TIME
-            .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
            .unwrap();
        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
-            .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
            .unwrap();
        let flush_time_histo = STORAGE_TIME
            .get_metric_with_label_values(&[
                "layer flush",
-                &tenantid.to_string(),
-                &timelineid.to_string(),
+                &tenant_id.to_string(),
+                &timeline_id.to_string(),
            ])
            .unwrap();
        let compact_time_histo = STORAGE_TIME
            .get_metric_with_label_values(&[
                "compact",
-                &tenantid.to_string(),
-                &timelineid.to_string(),
+                &tenant_id.to_string(),
+                &timeline_id.to_string(),
            ])
            .unwrap();
        let create_images_time_histo = STORAGE_TIME
            .get_metric_with_label_values(&[
                "create images",
-                &tenantid.to_string(),
-                &timelineid.to_string(),
+                &tenant_id.to_string(),
+                &timeline_id.to_string(),
            ])
            .unwrap();
        let last_record_gauge = LAST_RECORD_LSN
-            .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
            .unwrap();
        let wait_lsn_time_histo = WAIT_LSN_TIME
-            .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
            .unwrap();

        LayeredTimeline {
            conf,
            tenant_conf,
-            timelineid,
-            tenantid,
+            timeline_id,
+            tenant_id,
            layers: RwLock::new(LayerMap::default()),

            walredo_mgr,
@@ -1230,7 +1230,7 @@ impl LayeredTimeline {
            }),
            disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),

-            last_freeze_at: AtomicLsn::new(0),
+            last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0),

            ancestor_timeline: ancestor,
            ancestor_lsn: metadata.ancestor_lsn(),
@@ -1270,7 +1270,7 @@ impl LayeredTimeline {

        // Scan timeline directory and create ImageFileName and DeltaFilename
        // structs representing all files on disk
-        let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid);
+        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);

        for direntry in fs::read_dir(timeline_path)? {
            let direntry = direntry?;
@@ -1282,7 +1282,7 @@ impl LayeredTimeline {
                if imgfilename.lsn > disk_consistent_lsn {
                    warn!(
                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
-                        imgfilename, self.timelineid, disk_consistent_lsn
+                        imgfilename, self.timeline_id, disk_consistent_lsn
                    );

                    rename_to_backup(direntry.path())?;
@@ -1290,7 +1290,7 @@ impl LayeredTimeline {
                }

                let layer =
-                    ImageLayer::new(self.conf, self.timelineid, self.tenantid, &imgfilename);
+                    ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename);

                trace!("found layer {}", layer.filename().display());
                layers.insert_historic(Arc::new(layer));
@@ -1305,7 +1305,7 @@ impl LayeredTimeline {
                if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
                    warn!(
                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
-                        deltafilename, self.timelineid, disk_consistent_lsn
+                        deltafilename, self.timeline_id, disk_consistent_lsn
                    );

                    rename_to_backup(direntry.path())?;
@@ -1313,7 +1313,7 @@ impl LayeredTimeline {
                }

                let layer =
-                    DeltaLayer::new(self.conf, self.timelineid, self.tenantid, &deltafilename);
+                    DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename);

                trace!("found layer {}", layer.filename().display());
                layers.insert_historic(Arc::new(layer));
@@ -1357,7 +1357,9 @@ impl LayeredTimeline {
        let mut timeline_owned;
        let mut timeline = self;

-        let mut path: Vec<(ValueReconstructResult, Lsn, Arc<dyn Layer>)> = Vec::new();
+        // For debugging purposes, collect the path of layers that we traversed
+        // through. It's included in the error message if we fail to find the key.
+        let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc<dyn Layer>)> = Vec::new();

        let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
            *cached_lsn
@@ -1387,32 +1389,24 @@ impl LayeredTimeline {
                    if prev_lsn <= cont_lsn {
                        // Didn't make any progress in last iteration. Error out to avoid
                        // getting stuck in the loop.
-
-                        // For debugging purposes, print the path of layers that we traversed
-                        // through.
-                        for (r, c, l) in path {
-                            error!(
-                                "PATH: result {:?}, cont_lsn {}, layer: {}",
-                                r,
-                                c,
-                                l.filename().display()
-                            );
-                        }
-                        bail!("could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
-                          key,
-                          Lsn(cont_lsn.0 - 1),
-                              request_lsn,
-                        timeline.ancestor_lsn)
+                        return layer_traversal_error(format!(
+                            "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
+                            key,
+                            Lsn(cont_lsn.0 - 1),
+                            request_lsn,
+                            timeline.ancestor_lsn
+                        ), traversal_path);
                    }
                    prev_lsn = cont_lsn;
                }
                ValueReconstructResult::Missing => {
-                    bail!(
-                        "could not find data for key {} at LSN {}, for request at LSN {}",
-                        key,
-                        cont_lsn,
-                        request_lsn
-                    )
+                    return layer_traversal_error(
+                        format!(
+                            "could not find data for key {} at LSN {}, for request at LSN {}",
+                            key, cont_lsn, request_lsn
+                        ),
+                        traversal_path,
+                    );
                }
            }

@@ -1447,7 +1441,7 @@ impl LayeredTimeline {
                        reconstruct_state,
                    )?;
                    cont_lsn = lsn_floor;
-                    path.push((result, cont_lsn, open_layer.clone()));
+                    traversal_path.push((result, cont_lsn, open_layer.clone()));
                    continue;
                }
            }
@@ -1462,7 +1456,7 @@ impl LayeredTimeline {
                        reconstruct_state,
                    )?;
                    cont_lsn = lsn_floor;
-                    path.push((result, cont_lsn, frozen_layer.clone()));
+                    traversal_path.push((result, cont_lsn, frozen_layer.clone()));
                    continue 'outer;
                }
            }
@@ -1477,7 +1471,7 @@ impl LayeredTimeline {
                    reconstruct_state,
                )?;
                cont_lsn = lsn_floor;
-                path.push((result, cont_lsn, layer));
+                traversal_path.push((result, cont_lsn, layer));
            } else if timeline.ancestor_timeline.is_some() {
                // Nothing on this timeline. Traverse to parent
                result = ValueReconstructResult::Continue;
@@ -1495,7 +1489,7 @@ impl LayeredTimeline {
        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
        // We should look at the key to determine if it's a cacheable object
        let (lsn, read_guard) =
-            cache.lookup_materialized_page(self.tenantid, self.timelineid, key, lsn)?;
+            cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?;
        let img = Bytes::from(read_guard.to_vec());
        Some((lsn, img))
    }
@@ -1507,15 +1501,15 @@ impl LayeredTimeline {
            .with_context(|| {
                format!(
                    "Ancestor is missing. Timeline id: {} Ancestor id {:?}",
-                    self.timelineid,
+                    self.timeline_id,
                    self.get_ancestor_timeline_id(),
                )
            })?
            .ensure_loaded()
            .with_context(|| {
                format!(
-                    "Ancestor timeline is not is not loaded. Timeline id: {} Ancestor id {:?}",
-                    self.timelineid,
+                    "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}",
+                    self.timeline_id,
                    self.get_ancestor_timeline_id(),
                )
            })?;
@@ -1552,12 +1546,12 @@ impl LayeredTimeline {

            trace!(
                "creating layer for write at {}/{} for record at {}",
-                self.timelineid,
+                self.timeline_id,
                start_lsn,
                lsn
            );
            let new_layer =
-                InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn)?;
+                InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?;
            let layer_rc = Arc::new(new_layer);

            layers.open_layer = Some(Arc::clone(&layer_rc));
@@ -1621,22 +1615,30 @@ impl LayeredTimeline {
    pub fn check_checkpoint_distance(self: &Arc<LayeredTimeline>) -> Result<()> {
        let last_lsn = self.get_last_record_lsn();

+        // Has more than 'checkpoint_distance' of WAL been accumulated?
        let distance = last_lsn.widening_sub(self.last_freeze_at.load());
        if distance >= self.get_checkpoint_distance().into() {
+            // Yes. Freeze the current in-memory layer.
            self.freeze_inmem_layer(true);
            self.last_freeze_at.store(last_lsn);
-        }
-        if let Ok(guard) = self.layer_flush_lock.try_lock() {
-            drop(guard);
-            let self_clone = Arc::clone(self);
-            thread_mgr::spawn(
-                thread_mgr::ThreadKind::LayerFlushThread,
-                Some(self.tenantid),
-                Some(self.timelineid),
-                "layer flush thread",
-                false,
-                move || self_clone.flush_frozen_layers(false),
-            )?;
+
+            // Launch a thread to flush the frozen layer to disk, unless
+            // a thread was already running. (If the thread was running
+            // at the time that we froze the layer, it must've seen the
+            // the layer we just froze before it exited; see comments
+            // in flush_frozen_layers())
+            if let Ok(guard) = self.layer_flush_lock.try_lock() {
+                drop(guard);
+                let self_clone = Arc::clone(self);
+                thread_mgr::spawn(
+                    thread_mgr::ThreadKind::LayerFlushThread,
+                    Some(self.tenant_id),
+                    Some(self.timeline_id),
+                    "layer flush thread",
+                    false,
+                    move || self_clone.flush_frozen_layers(false),
+                )?;
+            }
        }
        Ok(())
    }
@@ -1701,7 +1703,7 @@ impl LayeredTimeline {
        // them all in parallel.
        par_fsync::par_fsync(&[
            new_delta_path.clone(),
-            self.conf.timeline_path(&self.timelineid, &self.tenantid),
+            self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
        ])?;
        fail_point!("checkpoint-before-sync");

@@ -1773,8 +1775,8 @@ impl LayeredTimeline {

            LayeredRepository::save_metadata(
                self.conf,
-                self.timelineid,
-                self.tenantid,
+                self.timeline_id,
+                self.tenant_id,
                &metadata,
                false,
            )?;
@@ -1783,11 +1785,11 @@ impl LayeredTimeline {
            PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len());

            if self.upload_layers.load(atomic::Ordering::Relaxed) {
-                schedule_timeline_checkpoint_upload(
-                    self.tenantid,
-                    self.timelineid,
-                    new_delta_path,
-                    metadata,
+                storage_sync::schedule_layer_upload(
+                    self.tenant_id,
+                    self.timeline_id,
+                    HashSet::from([new_delta_path]),
+                    Some(metadata),
                );
            }

@@ -1838,7 +1840,8 @@ impl LayeredTimeline {
        let target_file_size = self.get_checkpoint_distance();

        // Define partitioning schema if needed
-        if let Ok(pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenantid, self.timelineid)
+        if let Ok(pgdir) =
+            tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
        {
            let (partitioning, lsn) = pgdir.repartition(
                self.get_last_record_lsn(),
@@ -1847,11 +1850,21 @@ impl LayeredTimeline {
            let timer = self.create_images_time_histo.start_timer();
            // 2. Create new image layers for partitions that have been modified
            // "enough".
+            let mut layer_paths_to_upload = HashSet::with_capacity(partitioning.parts.len());
            for part in partitioning.parts.iter() {
                if self.time_for_new_image_layer(part, lsn)? {
-                    self.create_image_layer(part, lsn)?;
+                    let new_path = self.create_image_layer(part, lsn)?;
+                    layer_paths_to_upload.insert(new_path);
                }
            }
+            if self.upload_layers.load(atomic::Ordering::Relaxed) {
+                storage_sync::schedule_layer_upload(
+                    self.tenant_id,
+                    self.timeline_id,
+                    layer_paths_to_upload,
+                    None,
+                );
+            }
            timer.stop_and_record();

            // 3. Compact
@@ -1872,7 +1885,7 @@ impl LayeredTimeline {
        for part_range in &partition.ranges {
            let image_coverage = layers.image_coverage(part_range, lsn)?;
            for (img_range, last_img) in image_coverage {
-                let img_lsn = if let Some(ref last_img) = last_img {
+                let img_lsn = if let Some(last_img) = last_img {
                    last_img.get_lsn_range().end
                } else {
                    Lsn(0)
@@ -1893,11 +1906,11 @@ impl LayeredTimeline {
        Ok(false)
    }

-    fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<()> {
+    fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<PathBuf> {
        let img_range =
            partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
        let mut image_layer_writer =
-            ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?;
+            ImageLayerWriter::new(self.conf, self.timeline_id, self.tenant_id, &img_range, lsn)?;

        for range in &partition.ranges {
            let mut key = range.start;
@@ -1920,53 +1933,100 @@ impl LayeredTimeline {
        // and fsync them all in parallel.
        par_fsync::par_fsync(&[
            image_layer.path(),
-            self.conf.timeline_path(&self.timelineid, &self.tenantid),
+            self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
        ])?;

        // FIXME: Do we need to do something to upload it to remote storage here?

        let mut layers = self.layers.write().unwrap();
+        let new_path = image_layer.path();
        layers.insert_historic(Arc::new(image_layer));
        drop(layers);

-        Ok(())
+        Ok(new_path)
    }

+    ///
+    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
+    /// as Level 1 files.
+    ///
    fn compact_level0(&self, target_file_size: u64) -> Result<()> {
        let layers = self.layers.read().unwrap();
-
-        let level0_deltas = layers.get_level0_deltas()?;
-
-        // We compact or "shuffle" the level-0 delta layers when they've
-        // accumulated over the compaction threshold.
-        if level0_deltas.len() < self.get_compaction_threshold() {
-            return Ok(());
-        }
+        let mut level0_deltas = layers.get_level0_deltas()?;
        drop(layers);

-        // FIXME: this function probably won't work correctly if there's overlap
-        // in the deltas.
-        let lsn_range = level0_deltas
-            .iter()
-            .map(|l| l.get_lsn_range())
-            .reduce(|a, b| min(a.start, b.start)..max(a.end, b.end))
-            .unwrap();
+        // Only compact if enough layers have accumulated.
+        if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() {
+            return Ok(());
+        }

-        let all_values_iter = level0_deltas.iter().map(|l| l.iter()).kmerge_by(|a, b| {
-            if let Ok((a_key, a_lsn, _)) = a {
-                if let Ok((b_key, b_lsn, _)) = b {
-                    match a_key.cmp(b_key) {
-                        Ordering::Less => true,
-                        Ordering::Equal => a_lsn <= b_lsn,
-                        Ordering::Greater => false,
+        // Gather the files to compact in this iteration.
+        //
+        // Start with the oldest Level 0 delta file, and collect any other
+        // level 0 files that form a contiguous sequence, such that the end
+        // LSN of previous file matches the start LSN of the next file.
+        //
+        // Note that if the files don't form such a sequence, we might
+        // "compact" just a single file. That's a bit pointless, but it allows
+        // us to get rid of the level 0 file, and compact the other files on
+        // the next iteration. This could probably made smarter, but such
+        // "gaps" in the sequence of level 0 files should only happen in case
+        // of a crash, partial download from cloud storage, or something like
+        // that, so it's not a big deal in practice.
+        level0_deltas.sort_by_key(|l| l.get_lsn_range().start);
+        let mut level0_deltas_iter = level0_deltas.iter();
+
+        let first_level0_delta = level0_deltas_iter.next().unwrap();
+        let mut prev_lsn_end = first_level0_delta.get_lsn_range().end;
+        let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)];
+        for l in level0_deltas_iter {
+            let lsn_range = l.get_lsn_range();
+
+            if lsn_range.start != prev_lsn_end {
+                break;
+            }
+            deltas_to_compact.push(Arc::clone(l));
+            prev_lsn_end = lsn_range.end;
+        }
+        let lsn_range = Range {
+            start: deltas_to_compact.first().unwrap().get_lsn_range().start,
+            end: deltas_to_compact.last().unwrap().get_lsn_range().end,
+        };
+
+        info!(
+            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
+            lsn_range.start,
+            lsn_range.end,
+            deltas_to_compact.len(),
+            level0_deltas.len()
+        );
+        for l in deltas_to_compact.iter() {
+            info!("compact includes {}", l.filename().display());
+        }
+        // We don't need the original list of layers anymore. Drop it so that
+        // we don't accidentally use it later in the function.
+        drop(level0_deltas);
+
+        // This iterator walks through all key-value pairs from all the layers
+        // we're compacting, in key, LSN order.
+        let all_values_iter = deltas_to_compact
+            .iter()
+            .map(|l| l.iter())
+            .kmerge_by(|a, b| {
+                if let Ok((a_key, a_lsn, _)) = a {
+                    if let Ok((b_key, b_lsn, _)) = b {
+                        match a_key.cmp(b_key) {
+                            Ordering::Less => true,
+                            Ordering::Equal => a_lsn <= b_lsn,
+                            Ordering::Greater => false,
+                        }
+                    } else {
+                        false
                    }
                } else {
-                    false
+                    true
                }
-            } else {
-                true
-            }
-        });
+            });

        // Merge the contents of all the input delta layers into a new set
        // of delta layers, based on the current partitioning.
@@ -1995,8 +2055,8 @@ impl LayeredTimeline {
            if writer.is_none() {
                writer = Some(DeltaLayerWriter::new(
                    self.conf,
-                    self.timelineid,
-                    self.tenantid,
+                    self.timeline_id,
+                    self.tenant_id,
                    key,
                    lsn_range.clone(),
                )?);
@@ -2014,7 +2074,7 @@ impl LayeredTimeline {
            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();

            // also sync the directory
-            layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid));
+            layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));

            // Fsync all the layer files and directory using multiple threads to
            // minimize latency.
@@ -2024,18 +2084,38 @@ impl LayeredTimeline {
        }

        let mut layers = self.layers.write().unwrap();
+        let mut new_layer_paths = HashSet::with_capacity(new_layers.len());
        for l in new_layers {
+            new_layer_paths.insert(l.path());
            layers.insert_historic(Arc::new(l));
        }

        // Now that we have reshuffled the data to set of new delta layers, we can
        // delete the old ones
-        for l in level0_deltas {
+        let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len());
+        for l in deltas_to_compact {
            l.delete()?;
-            layers.remove_historic(l.clone());
+            if let Some(path) = l.local_path() {
+                layer_paths_do_delete.insert(path);
+            }
+            layers.remove_historic(l);
        }
        drop(layers);

+        if self.upload_layers.load(atomic::Ordering::Relaxed) {
+            storage_sync::schedule_layer_upload(
+                self.tenant_id,
+                self.timeline_id,
+                new_layer_paths,
+                None,
+            );
+            storage_sync::schedule_layer_delete(
+                self.tenant_id,
+                self.timeline_id,
+                layer_paths_do_delete,
+            );
+        }
+
        Ok(())
    }

@@ -2085,20 +2165,63 @@ impl LayeredTimeline {

        let gc_info = self.gc_info.read().unwrap();
        let retain_lsns = &gc_info.retain_lsns;
-        let cutoff = gc_info.cutoff;
+        let cutoff = min(gc_info.cutoff, disk_consistent_lsn);
        let pitr = gc_info.pitr;

-        let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered();
+        // Calculate pitr cutoff point.
+        // If we cannot determine a cutoff LSN, be conservative and don't GC anything.
+        let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn();
+
+        if let Ok(timeline) =
+            tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
+        {
+            // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
+            // If we don't have enough data to convert to LSN,
+            // play safe and don't remove any layers.
+            if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
+                let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
+
+                match timeline.find_lsn_for_timestamp(pitr_timestamp)? {
+                    LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn,
+                    LsnForTimestamp::Future(lsn) => {
+                        debug!("future({})", lsn);
+                        pitr_cutoff_lsn = cutoff;
+                    }
+                    LsnForTimestamp::Past(lsn) => {
+                        debug!("past({})", lsn);
+                    }
+                }
+                debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
+            }
+        } else if cfg!(test) {
+            // We don't have local timeline in mocked cargo tests.
+            // So, just ignore pitr_interval setting in this case.
+            pitr_cutoff_lsn = cutoff;
+        }
+
+        let new_gc_cutoff = Lsn::min(cutoff, pitr_cutoff_lsn);
+
+        // Nothing to GC. Return early.
+        if *self.get_latest_gc_cutoff_lsn() >= new_gc_cutoff {
+            info!(
+                "Nothing to GC for timeline {}. cutoff_lsn {}",
+                self.timeline_id, new_gc_cutoff
+            );
+            result.elapsed = now.elapsed()?;
+            return Ok(result);
+        }
+
+        let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered();

        // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn.
        // See branch_timeline() for details.
-        *self.latest_gc_cutoff_lsn.write().unwrap() = cutoff;
+        *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff;

        info!("GC starting");

        debug!("retain_lsns: {:?}", retain_lsns);

-        let mut layers_to_remove: Vec<Arc<dyn Layer>> = Vec::new();
+        let mut layers_to_remove = Vec::new();

        // Scan all on-disk layers in the timeline.
        //
@@ -2132,30 +2255,18 @@ impl LayeredTimeline {
                result.layers_needed_by_cutoff += 1;
                continue 'outer;
            }
-            // 2. It is newer than PiTR interval?
-            // We use modification time of layer file to estimate update time.
-            // This estimation is not quite precise but maintaining LSN->timestamp map seems to be overkill.
-            // It is not expected that users will need high precision here. And this estimation
-            // is conservative: modification time of file is always newer than actual time of version
-            // creation. So it is safe for users.
-            // TODO A possible "bloat" issue still persists here.
-            // If modification time changes because of layer upload/download, we will keep these files
-            // longer than necessary.
-            // https://github.com/neondatabase/neon/issues/1554
-            //
-            if let Ok(metadata) = fs::metadata(&l.filename()) {
-                let last_modified = metadata.modified()?;
-                if now.duration_since(last_modified)? < pitr {
-                    debug!(
-                        "keeping {} because it's modification time {:?} is newer than PITR {:?}",
-                        l.filename().display(),
-                        last_modified,
-                        pitr
-                    );
-                    result.layers_needed_by_pitr += 1;
-                    continue 'outer;
-                }
+
+            // 2. It is newer than PiTR cutoff point?
+            if l.get_lsn_range().end > pitr_cutoff_lsn {
+                debug!(
+                    "keeping {} because it's newer than pitr_cutoff_lsn {}",
+                    l.filename().display(),
+                    pitr_cutoff_lsn
+                );
+                result.layers_needed_by_pitr += 1;
+                continue 'outer;
            }
+
            // 3. Is it needed by a child branch?
            // NOTE With that wee would keep data that
            // might be referenced by child branches forever.
@@ -2183,12 +2294,20 @@ impl LayeredTimeline {
            // is 102, then it might not have been fully flushed to disk
            // before crash.
            //
-            // FIXME: This logic is wrong. See https://github.com/zenithdb/zenith/issues/707
-            if !layers.newer_image_layer_exists(
-                &l.get_key_range(),
-                l.get_lsn_range().end,
-                disk_consistent_lsn + 1,
-            )? {
+            // For example, imagine that the following layers exist:
+            //
+            // 1000      - image (A)
+            // 1000-2000 - delta (B)
+            // 2000      - image (C)
+            // 2000-3000 - delta (D)
+            // 3000      - image (E)
+            //
+            // If GC horizon is at 2500, we can remove layers A and B, but
+            // we cannot remove C, even though it's older than 2500, because
+            // the delta layer 2000-3000 depends on it.
+            if !layers
+                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
+            {
                debug!(
                    "keeping {} because it is the latest layer",
                    l.filename().display()
@@ -2209,13 +2328,24 @@ impl LayeredTimeline {
        // Actually delete the layers from disk and remove them from the map.
        // (couldn't do this in the loop above, because you cannot modify a collection
        // while iterating it. BTreeMap::retain() would be another option)
+        let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len());
        for doomed_layer in layers_to_remove {
            doomed_layer.delete()?;
-            layers.remove_historic(doomed_layer.clone());
-
+            if let Some(path) = doomed_layer.local_path() {
+                layer_paths_to_delete.insert(path);
+            }
+            layers.remove_historic(doomed_layer);
            result.layers_removed += 1;
        }

+        if self.upload_layers.load(atomic::Ordering::Relaxed) {
+            storage_sync::schedule_layer_delete(
+                self.tenant_id,
+                self.timeline_id,
+                layer_paths_to_delete,
+            );
+        }
+
        result.elapsed = now.elapsed()?;
        Ok(result)
    }
@@ -2279,8 +2409,8 @@ impl LayeredTimeline {
                if img.len() == page_cache::PAGE_SZ {
                    let cache = page_cache::get();
                    cache.memorize_materialized_page(
-                        self.tenantid,
-                        self.timelineid,
+                        self.tenant_id,
+                        self.timeline_id,
                        key,
                        last_rec_lsn,
                        &img,
@@ -2293,6 +2423,32 @@ impl LayeredTimeline {
    }
 }

+/// Helper function for get_reconstruct_data() to add the path of layers traversed
+/// to an error, as anyhow context information.
+fn layer_traversal_error(
+    msg: String,
+    path: Vec<(ValueReconstructResult, Lsn, Arc<dyn Layer>)>,
+) -> anyhow::Result<()> {
+    // We want the original 'msg' to be the outermost context. The outermost context
+    // is the most high-level information, which also gets propagated to the client.
+    let mut msg_iter = path
+        .iter()
+        .map(|(r, c, l)| {
+            format!(
+                "layer traversal: result {:?}, cont_lsn {}, layer: {}",
+                r,
+                c,
+                l.filename().display()
+            )
+        })
+        .chain(std::iter::once(msg));
+    // Construct initial message from the first traversed layer
+    let err = anyhow!(msg_iter.next().unwrap());
+
+    // Append all subsequent traversals, and the error message 'msg', as contexts.
+    Err(msg_iter.fold(err, |err, msg| err.context(msg)))
+}
+
 struct LayeredTimelineWriter<'a> {
    tl: &'a LayeredTimeline,
    _write_guard: MutexGuard<'a, ()>,
@@ -2362,6 +2518,26 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
    bail!("couldn't find an unused backup number for {:?}", path)
 }

+fn load_metadata(
+    conf: &'static PageServerConf,
+    timeline_id: ZTimelineId,
+    tenant_id: ZTenantId,
+) -> anyhow::Result<TimelineMetadata> {
+    let metadata_path = metadata_path(conf, timeline_id, tenant_id);
+    let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
+        format!(
+            "Failed to read metadata bytes from path {}",
+            metadata_path.display()
+        )
+    })?;
+    TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
+        format!(
+            "Failed to parse metadata bytes from path {}",
+            metadata_path.display()
+        )
+    })
+}
+
 ///
 /// Tests that are specific to the layered storage format.
 ///
@@ -2396,9 +2572,19 @@ pub mod tests {

        let err = harness.try_load().err().expect("should fail");
        assert_eq!(err.to_string(), "failed to load local metadata");
-        assert_eq!(
-            err.source().unwrap().to_string(),
-            "metadata checksum mismatch"
+
+        let mut found_error_message = false;
+        let mut err_source = err.source();
+        while let Some(source) = err_source {
+            if source.to_string() == "metadata checksum mismatch" {
+                found_error_message = true;
+                break;
+            }
+            err_source = source.source();
+        }
+        assert!(
+            found_error_message,
+            "didn't find the corrupted metadata error"
        );

        Ok(())
--- a/pageserver/src/layered_repository/README.md
+++ b/pageserver/src/layered_repository/README.md
@@ -23,6 +23,7 @@ distribution depends on the workload: the updates could be totally random, or
 there could be a long stream of updates to a single relation when data is bulk
 loaded, for example, or something in between.

+```
 Cloud Storage                   Page Server                           Safekeeper
                        L1               L0             Memory            WAL

@@ -37,6 +38,7 @@ Cloud Storage                   Page Server                           Safekeeper
 +----+----+          +----+----+      |   |     |
 |EEEE|               |EEEE|EEEE|      +---+-----+
 +----+               +----+----+
+```

 In this illustration, WAL is received as a stream from the Safekeeper, from the
 right.  It is immediately captured by the page server and stored quickly in
@@ -47,7 +49,7 @@ the same page and relation close to each other.
 From the page server memory, whenever enough WAL has been accumulated, it is flushed
 to disk into a new L0 layer file, and the memory is released.

-When enough L0 files have been accumulated, they are merged together rand sliced
+When enough L0 files have been accumulated, they are merged together and sliced
 per key-space, producing a new set of files where each file contains a more
 narrow key range, but larger LSN range.

@@ -121,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
 a range of LSNs (or a single LSN, in case of image layers). You can think of it
 as a rectangle in the two-dimensional key-LSN space. The layer files for each
 timeline are stored in the timeline's subdirectory under
-.zenith/tenants/<tenantid>/timelines.
+`.zenith/tenants/<tenantid>/timelines`.

 There are two kind of layer files: images, and delta layers. An image file
 contains a snapshot of all keys at a particular LSN, whereas a delta file
@@ -130,8 +132,11 @@ range of LSN.

 image file:

+```
    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
              start key                          end key                           LSN
+```
+

 The first parts define the key range that the layer covers. See
 pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
@@ -140,8 +145,10 @@ delta file:

 Delta files are named similarly, but they cover a range of LSNs:

+```
    000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
              start key                          end key                          start LSN     end LSN
+```

 A delta file contains all the key-values in the key-range that were updated in
 the LSN range. If a key has not been modified, there is no trace of it in the
@@ -151,7 +158,9 @@ delta layer.
 A delta layer file can cover a part of the overall key space, as in the previous
 example, or the whole key range like this:

+```
    000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051
+```

 A file that covers the whole key range is called a L0 file (Level 0), while a
 file that covers only part of the key range is called a L1 file. The "level" of
@@ -168,7 +177,9 @@ version, and how branching and GC works is still valid.

 The full path of a delta file looks like this:

+```
    .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
+```

 For simplicity, the examples below use a simplified notation for the
 paths.  The tenant ID is left out, the timeline ID is replaced with
@@ -177,8 +188,10 @@ with a human-readable table name. The LSNs are also shorter. For
 example, a base image file at LSN 100 and a delta file between 100-200
 for 'orders' table on 'main' branch is represented like this:

+```
    main/orders_100
    main/orders_100_200
+```


 # Creating layer files
@@ -188,12 +201,14 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end
 of WAL is currently at LSN 250. In this starting situation, you would
 have these files on disk:

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
 	main/customers_100
 	main/customers_100_200
 	main/customers_200
+```

 In addition to those files, the recent changes between LSN 200 and the
 end of WAL at 250 are kept in memory. If the page server crashes, the
@@ -224,6 +239,7 @@ If the customers table is modified later, a new file is created for it
 at the next checkpoint. The new file will cover the "gap" from the
 last layer file, so the LSN ranges are always contiguous:

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
@@ -236,6 +252,7 @@ last layer file, so the LSN ranges are always contiguous:
 	main/customers_200
 	main/customers_200_500
 	main/customers_500
+```

 ## Reading page versions

@@ -259,15 +276,18 @@ involves replaying any WAL records applicable to the page between LSNs

 Imagine that a child branch is created at LSN 250:

+```
            @250
    ----main--+-------------------------->
               \
                +---child-------------->
+```


 Then, the 'orders' table is updated differently on the 'main' and
 'child' branches. You now have this situation on disk:

+```
    main/orders_100
    main/orders_100_200
    main/orders_200
@@ -282,6 +302,7 @@ Then, the 'orders' table is updated differently on the 'main' and
    child/orders_300
    child/orders_300_400
    child/orders_400
+```

 Because the 'customers' table hasn't been modified on the child
 branch, there is no file for it there. If you request a page for it on
@@ -294,6 +315,7 @@ is linear, and the request's LSN identifies unambiguously which file
 you need to look at. For example, the history for the 'orders' table
 on the 'main' branch consists of these files:

+```
    main/orders_100
    main/orders_100_200
    main/orders_200
@@ -301,10 +323,12 @@ on the 'main' branch consists of these files:
    main/orders_300
    main/orders_300_400
    main/orders_400
+```

 And from the 'child' branch's point of view, it consists of these
 files:

+```
    main/orders_100
    main/orders_100_200
    main/orders_200
@@ -313,6 +337,7 @@ files:
    child/orders_300
    child/orders_300_400
    child/orders_400
+```

 The branch metadata includes the point where the child branch was
 created, LSN 250. If a page request comes with LSN 275, we read the
@@ -345,6 +370,7 @@ Let's look at the single branch scenario again. Imagine that the end
 of the branch is LSN 525, so that the GC horizon is currently at
 525-150 = 375

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
@@ -357,11 +383,13 @@ of the branch is LSN 525, so that the GC horizon is currently at
 	main/customers_100
 	main/customers_100_200
 	main/customers_200
+```

 We can remove the following files because the end LSNs of those files are
 older than GC horizon 375, and there are more recent layer files for the
 table:

+```
 	main/orders_100       DELETE
 	main/orders_100_200   DELETE
 	main/orders_200       DELETE
@@ -374,8 +402,9 @@ table:
 	main/customers_100      DELETE
 	main/customers_100_200  DELETE
 	main/customers_200      KEEP, NO NEWER VERSION
+```

-'main/customers_100_200' is old enough, but it cannot be
+'main/customers_200' is old enough, but it cannot be
 removed because there is no newer layer file for the table.

 Things get slightly more complicated with multiple branches. All of
@@ -384,6 +413,7 @@ retain older shapshot files that are still needed by child branches.
 For example, if child branch is created at LSN 150, and the 'customers'
 table is updated on the branch, you would have these files:

+```
 	main/orders_100        KEEP, NEEDED BY child BRANCH
 	main/orders_100_200    KEEP, NEEDED BY child BRANCH
 	main/orders_200        DELETE
@@ -398,6 +428,7 @@ table is updated on the branch, you would have these files:
 	main/customers_200       KEEP, NO NEWER VERSION
 	child/customers_150_300  DELETE
 	child/customers_300      KEEP, NO NEWER VERSION
+```

 In this situation, 'main/orders_100' and 'main/orders_100_200' cannot
 be removed, even though they are older than the GC horizon, because
@@ -407,6 +438,7 @@ and 'main/orders_200_300' can still be removed.
 If 'orders' is modified later on the 'child' branch, we will create a
 new base image and delta file for it on the child:

+```
 	main/orders_100
 	main/orders_100_200

@@ -419,6 +451,7 @@ new base image and delta file for it on the child:
 	child/customers_300
 	child/orders_150_400
 	child/orders_400
+```

 After this, the 'main/orders_100' and 'main/orders_100_200' file could
 be removed. It is no longer needed by the child branch, because there
@@ -434,6 +467,7 @@ Describe GC and checkpoint interval settings.
 In principle, each relation can be checkpointed separately, i.e. the
 LSN ranges of the files don't need to line up. So this would be legal:

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
@@ -446,6 +480,7 @@ LSN ranges of the files don't need to line up. So this would be legal:
 	main/customers_250
 	main/customers_250_500
 	main/customers_500
+```

 However, the code currently always checkpoints all relations together.
 So that situation doesn't arise in practice.
@@ -468,11 +503,13 @@ does that.  It could be useful, however, as a transient state when
 garbage collecting around branch points, or explicit recovery
 points. For example, if we start with this:

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
 	main/orders_200_300
 	main/orders_300
+```

 And there is a branch or explicit recovery point at LSN 150, we could
 replace 'main/orders_100_200' with 'main/orders_150' to keep a
--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -37,11 +37,8 @@ use crate::virtual_file::VirtualFile;
 use crate::walrecord;
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
+use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use tracing::*;
-// avoid binding to Write (conflicts with std::io::Write)
-// while being able to use std::fmt::Write's methods
-use std::fmt::Write as _;
 use std::fs;
 use std::io::{BufWriter, Write};
 use std::io::{Seek, SeekFrom};
@@ -49,6 +46,7 @@ use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
+use tracing::*;

 use utils::{
    bin_ser::BeSer,
@@ -218,6 +216,10 @@ impl Layer for DeltaLayer {
        PathBuf::from(self.layer_name().to_string())
    }

+    fn local_path(&self) -> Option<PathBuf> {
+        Some(self.path())
+    }
+
    fn get_value_reconstruct_data(
        &self,
        key: Key,
@@ -250,6 +252,9 @@ impl Layer for DeltaLayer {
                    return false;
                }
                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                if entry_lsn < lsn_range.start {
+                    return false;
+                }
                offsets.push((entry_lsn, blob_ref.pos()));

                !blob_ref.will_init()
@@ -358,6 +363,28 @@ impl Layer for DeltaLayer {
        tree_reader.dump()?;

        let mut cursor = file.block_cursor();
+
+        // A subroutine to dump a single blob
+        let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
+            let buf = cursor.read_blob(blob_ref.pos())?;
+            let val = Value::des(&buf)?;
+            let desc = match val {
+                Value::Image(img) => {
+                    format!(" img {} bytes", img.len())
+                }
+                Value::WalRecord(rec) => {
+                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        buf.len(),
+                        rec.will_init(),
+                        wal_desc
+                    )
+                }
+            };
+            Ok(desc)
+        };
+
        tree_reader.visit(
            &[0u8; DELTA_KEY_SIZE],
            VisitDirection::Forwards,
@@ -366,34 +393,10 @@ impl Layer for DeltaLayer {
                let key = DeltaKey::extract_key_from_buf(delta_key);
                let lsn = DeltaKey::extract_lsn_from_buf(delta_key);

-                let mut desc = String::new();
-                match cursor.read_blob(blob_ref.pos()) {
-                    Ok(buf) => {
-                        let val = Value::des(&buf);
-                        match val {
-                            Ok(Value::Image(img)) => {
-                                write!(&mut desc, " img {} bytes", img.len()).unwrap();
-                            }
-                            Ok(Value::WalRecord(rec)) => {
-                                let wal_desc = walrecord::describe_wal_record(&rec);
-                                write!(
-                                    &mut desc,
-                                    " rec {} bytes will_init: {} {}",
-                                    buf.len(),
-                                    rec.will_init(),
-                                    wal_desc
-                                )
-                                .unwrap();
-                            }
-                            Err(err) => {
-                                write!(&mut desc, " DESERIALIZATION ERROR: {}", err).unwrap();
-                            }
-                        }
-                    }
-                    Err(err) => {
-                        write!(&mut desc, " READ ERROR: {}", err).unwrap();
-                    }
-                }
+                let desc = match dump_blob(blob_ref) {
+                    Ok(desc) => desc,
+                    Err(err) => format!("ERROR: {}", err),
+                };
                println!("  key {} at {}: {}", key, lsn, desc);
                true
            },
@@ -418,6 +421,28 @@ impl DeltaLayer {
        }
    }

+    fn temp_path_for(
+        conf: &PageServerConf,
+        timelineid: ZTimelineId,
+        tenantid: ZTenantId,
+        key_start: Key,
+        lsn_range: &Range<Lsn>,
+    ) -> PathBuf {
+        let rand_string: String = rand::thread_rng()
+            .sample_iter(&Alphanumeric)
+            .take(8)
+            .map(char::from)
+            .collect();
+
+        conf.timeline_path(&timelineid, &tenantid).join(format!(
+            "{}-XXX__{:016X}-{:016X}.{}.temp",
+            key_start,
+            u64::from(lsn_range.start),
+            u64::from(lsn_range.end),
+            rand_string
+        ))
+    }
+
    ///
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
@@ -605,12 +630,8 @@ impl DeltaLayerWriter {
        //
        // Note: This overwrites any existing file. There shouldn't be any.
        // FIXME: throw an error instead?
-        let path = conf.timeline_path(&timelineid, &tenantid).join(format!(
-            "{}-XXX__{:016X}-{:016X}.temp",
-            key_start,
-            u64::from(lsn_range.start),
-            u64::from(lsn_range.end)
-        ));
+        let path = DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, &lsn_range);
+
        let mut file = VirtualFile::create(&path)?;
        // make room for the header block
        file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
@@ -703,6 +724,8 @@ impl DeltaLayerWriter {
            }),
        };

+        // fsync the file
+        file.sync_all()?;
        // Rename the file to its final name
        //
        // Note: This overwrites any existing file. There shouldn't be any.
--- a/pageserver/src/layered_repository/disk_btree.rs
+++ b/pageserver/src/layered_repository/disk_btree.rs
@@ -444,6 +444,13 @@ where
    ///
    /// stack[0] is the current root page, stack.last() is the leaf.
    ///
+    /// We maintain the length of the stack to be always greater than zero.
+    /// Two exceptions are:
+    /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
+    ///   So because other methods cannot see the intermediate state invariant still holds.
+    /// 2. `Self::finish`. It consumes self and does not return it back,
+    ///  which means that this is where the structure is destroyed.
+    ///  Thus stack of zero length cannot be observed by other methods.
    stack: Vec<BuildNode<L>>,

    /// Last key that was appended to the tree. Used to sanity check that append
@@ -482,7 +489,10 @@ where

    fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> {
        // Try to append to the current leaf buffer
-        let last = self.stack.last_mut().unwrap();
+        let last = self
+            .stack
+            .last_mut()
+            .expect("should always have at least one item");
        let level = last.level;
        if last.push(key, value) {
            return Ok(());
@@ -512,19 +522,25 @@ where
        Ok(())
    }

+    /// Flush the bottommost node in the stack to disk. Appends a downlink to its parent,
+    /// and recursively flushes the parent too, if it becomes full. If the root page becomes full,
+    /// creates a new root page, increasing the height of the tree.
    fn flush_node(&mut self) -> Result<()> {
-        let last = self.stack.pop().unwrap();
+        // Get the current bottommost node in the stack and flush it to disk.
+        let last = self
+            .stack
+            .pop()
+            .expect("should always have at least one item");
        let buf = last.pack();
        let downlink_key = last.first_key();
        let downlink_ptr = self.writer.write_blk(buf)?;

-        // Append the downlink to the parent
+        // Append the downlink to the parent. If there is no parent, ie. this was the root page,
+        // create a new root page, increasing the height of the tree.
        if self.stack.is_empty() {
            self.stack.push(BuildNode::new(last.level + 1));
        }
-        self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))?;
-
-        Ok(())
+        self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))
    }

    ///
@@ -540,7 +556,10 @@ where
            self.flush_node()?;
        }

-        let root = self.stack.first().unwrap();
+        let root = self
+            .stack
+            .first()
+            .expect("by the check above we left one item there");
        let buf = root.pack();
        let root_blknum = self.writer.write_blk(buf)?;

--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -34,6 +34,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use hex;
+use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs;
 use std::io::Write;
@@ -125,6 +126,10 @@ impl Layer for ImageLayer {
        PathBuf::from(self.layer_name().to_string())
    }

+    fn local_path(&self) -> Option<PathBuf> {
+        Some(self.path())
+    }
+
    fn get_tenant_id(&self) -> ZTenantId {
        self.tenantid
    }
@@ -237,6 +242,22 @@ impl ImageLayer {
        }
    }

+    fn temp_path_for(
+        conf: &PageServerConf,
+        timelineid: ZTimelineId,
+        tenantid: ZTenantId,
+        fname: &ImageFileName,
+    ) -> PathBuf {
+        let rand_string: String = rand::thread_rng()
+            .sample_iter(&Alphanumeric)
+            .take(8)
+            .map(char::from)
+            .collect();
+
+        conf.timeline_path(&timelineid, &tenantid)
+            .join(format!("{}.{}.temp", fname, rand_string))
+    }
+
    ///
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
@@ -394,7 +415,7 @@ impl ImageLayer {
 ///
 pub struct ImageLayerWriter {
    conf: &'static PageServerConf,
-    _path: PathBuf,
+    path: PathBuf,
    timelineid: ZTimelineId,
    tenantid: ZTenantId,
    key_range: Range<Key>,
@@ -412,12 +433,10 @@ impl ImageLayerWriter {
        key_range: &Range<Key>,
        lsn: Lsn,
    ) -> anyhow::Result<ImageLayerWriter> {
-        // Create the file
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let path = ImageLayer::path_for(
-            &PathOrConf::Conf(conf),
+        // Create the file initially with a temporary filename.
+        // We'll atomically rename it to the final name when we're done.
+        let path = ImageLayer::temp_path_for(
+            conf,
            timelineid,
            tenantid,
            &ImageFileName {
@@ -437,7 +456,7 @@ impl ImageLayerWriter {

        let writer = ImageLayerWriter {
            conf,
-            _path: path,
+            path,
            timelineid,
            tenantid,
            key_range: key_range.clone(),
@@ -508,6 +527,25 @@ impl ImageLayerWriter {
                index_root_blk,
            }),
        };
+
+        // fsync the file
+        file.sync_all()?;
+
+        // Rename the file to its final name
+        //
+        // Note: This overwrites any existing file. There shouldn't be any.
+        // FIXME: throw an error instead?
+        let final_path = ImageLayer::path_for(
+            &PathOrConf::Conf(self.conf),
+            self.timelineid,
+            self.tenantid,
+            &ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn,
+            },
+        );
+        std::fs::rename(self.path, &final_path)?;
+
        trace!("created image layer {}", layer.path().display());

        Ok(layer)
--- a/pageserver/src/layered_repository/inmemory_layer.rs
+++ b/pageserver/src/layered_repository/inmemory_layer.rs
@@ -85,6 +85,10 @@ impl Layer for InMemoryLayer {
        ))
    }

+    fn local_path(&self) -> Option<PathBuf> {
+        None
+    }
+
    fn get_tenant_id(&self) -> ZTenantId {
        self.tenantid
    }
@@ -207,7 +211,7 @@ impl Layer for InMemoryLayer {
                        write!(&mut desc, " img {} bytes", img.len())?;
                    }
                    Ok(Value::WalRecord(rec)) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec);
+                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
                        write!(
                            &mut desc,
                            " rec {} bytes will_init: {} {}",
--- a/pageserver/src/layered_repository/layer_map.rs
+++ b/pageserver/src/layered_repository/layer_map.rs
@@ -132,17 +132,15 @@ impl LayerMap {
                // this layer contains the requested point in the key/lsn space.
                // No need to search any further
                trace!(
-                    "found layer {} for request on {} at {}",
+                    "found layer {} for request on {key} at {end_lsn}",
                    l.filename().display(),
-                    key,
-                    end_lsn
                );
                latest_delta.replace(Arc::clone(l));
                break;
            }
            // this layer's end LSN is smaller than the requested point. If there's
            // nothing newer, this is what we need to return. Remember this.
-            if let Some(ref old_candidate) = latest_delta {
+            if let Some(old_candidate) = &latest_delta {
                if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
                    latest_delta.replace(Arc::clone(l));
                }
@@ -152,10 +150,8 @@ impl LayerMap {
        }
        if let Some(l) = latest_delta {
            trace!(
-                "found (old) layer {} for request on {} at {}",
+                "found (old) layer {} for request on {key} at {end_lsn}",
                l.filename().display(),
-                key,
-                end_lsn
            );
            let lsn_floor = std::cmp::max(
                Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
@@ -166,17 +162,13 @@ impl LayerMap {
                layer: l,
            }))
        } else if let Some(l) = latest_img {
-            trace!(
-                "found img layer and no deltas for request on {} at {}",
-                key,
-                end_lsn
-            );
+            trace!("found img layer and no deltas for request on {key} at {end_lsn}");
            Ok(Some(SearchResult {
                lsn_floor: latest_img_lsn.unwrap(),
                layer: l,
            }))
        } else {
-            trace!("no layer found for request on {} at {}", key, end_lsn);
+            trace!("no layer found for request on {key} at {end_lsn}");
            Ok(None)
        }
    }
@@ -194,7 +186,6 @@ impl LayerMap {
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    #[allow(dead_code)]
    pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
        let len_before = self.historic_layers.len();

@@ -210,18 +201,14 @@ impl LayerMap {
        NUM_ONDISK_LAYERS.dec();
    }

-    /// Is there a newer image layer for given key-range?
+    /// Is there a newer image layer for given key- and LSN-range?
    ///
    /// This is used for garbage collection, to determine if an old layer can
    /// be deleted.
-    /// We ignore layers newer than disk_consistent_lsn because they will be removed at restart
-    /// We also only look at historic layers
-    //#[allow(dead_code)]
-    pub fn newer_image_layer_exists(
+    pub fn image_layer_exists(
        &self,
        key_range: &Range<Key>,
-        lsn: Lsn,
-        disk_consistent_lsn: Lsn,
+        lsn_range: &Range<Lsn>,
    ) -> Result<bool> {
        let mut range_remain = key_range.clone();

@@ -234,8 +221,7 @@ impl LayerMap {
                let img_lsn = l.get_lsn_range().start;
                if !l.is_incremental()
                    && l.get_key_range().contains(&range_remain.start)
-                    && img_lsn > lsn
-                    && img_lsn < disk_consistent_lsn
+                    && lsn_range.contains(&img_lsn)
                {
                    made_progress = true;
                    let img_key_end = l.get_key_range().end;
@@ -253,7 +239,7 @@ impl LayerMap {
        }
    }

-    pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
+    pub fn iter_historic_layers(&self) -> impl Iterator<Item = &Arc<dyn Layer>> {
        self.historic_layers.iter()
    }

--- a/pageserver/src/layered_repository/storage_layer.rs
+++ b/pageserver/src/layered_repository/storage_layer.rs
@@ -105,6 +105,9 @@ pub trait Layer: Send + Sync {
    /// log messages, even though they're never not on disk.)
    fn filename(&self) -> PathBuf;

+    /// If a layer has a corresponding file on a local filesystem, return its absolute path.
+    fn local_path(&self) -> Option<PathBuf>;
+
    ///
    /// Return data needed to reconstruct given page at LSN.
    ///
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -9,8 +9,8 @@ pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod profiling;
 pub mod reltag;
-pub mod remote_storage;
 pub mod repository;
+pub mod storage_sync;
 pub mod tenant_config;
 pub mod tenant_mgr;
 pub mod tenant_threads;
@@ -45,7 +45,7 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

 lazy_static! {
    static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
-        "pageserver_live_connections_count",
+        "pageserver_live_connections",
        "Number of live network connections",
        &["pageserver_connection_kind"]
    )
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -19,7 +19,6 @@ use std::net::TcpListener;
 use std::str;
 use std::str::FromStr;
 use std::sync::{Arc, RwLockReadGuard};
-use std::time::Duration;
 use tracing::*;
 use utils::{
    auth::{self, Claims, JwtAuth, Scope},
@@ -44,11 +43,14 @@ use crate::CheckpointConfig;
 use metrics::{register_histogram_vec, HistogramVec};
 use postgres_ffi::xlog_utils::to_pg_timestamp;

+use postgres_ffi::pg_constants;
+
 // Wrapped in libpq CopyData
 enum PagestreamFeMessage {
    Exists(PagestreamExistsRequest),
    Nblocks(PagestreamNblocksRequest),
    GetPage(PagestreamGetPageRequest),
+    DbSize(PagestreamDbSizeRequest),
 }

 // Wrapped in libpq CopyData
@@ -57,6 +59,7 @@ enum PagestreamBeMessage {
    Nblocks(PagestreamNblocksResponse),
    GetPage(PagestreamGetPageResponse),
    Error(PagestreamErrorResponse),
+    DbSize(PagestreamDbSizeResponse),
 }

 #[derive(Debug)]
@@ -81,6 +84,13 @@ struct PagestreamGetPageRequest {
    blkno: u32,
 }

+#[derive(Debug)]
+struct PagestreamDbSizeRequest {
+    latest: bool,
+    lsn: Lsn,
+    dbnode: u32,
+}
+
 #[derive(Debug)]
 struct PagestreamExistsResponse {
    exists: bool,
@@ -101,6 +111,11 @@ struct PagestreamErrorResponse {
    message: String,
 }

+#[derive(Debug)]
+struct PagestreamDbSizeResponse {
+    db_size: i64,
+}
+
 impl PagestreamFeMessage {
    fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
        // TODO these gets can fail
@@ -142,6 +157,11 @@ impl PagestreamFeMessage {
                },
                blkno: body.get_u32(),
            })),
+            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                latest: body.get_u8() != 0,
+                lsn: Lsn::from(body.get_u64()),
+                dbnode: body.get_u32(),
+            })),
            _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
        }
    }
@@ -172,6 +192,10 @@ impl PagestreamBeMessage {
                bytes.put(resp.message.as_bytes());
                bytes.put_u8(0); // null terminator
            }
+            Self::DbSize(resp) => {
+                bytes.put_u8(104); /* tag from pagestore_client.h */
+                bytes.put_i64(resp.db_size);
+            }
        }

        bytes.into()
@@ -281,7 +305,29 @@ fn page_service_conn_main(

    let mut conn_handler = PageServerHandler::new(conf, auth);
    let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
-    pgbackend.run(&mut conn_handler)
+    match pgbackend.run(&mut conn_handler) {
+        Ok(()) => {
+            // we've been requested to shut down
+            Ok(())
+        }
+        Err(err) => {
+            let root_cause_io_err_kind = err
+                .root_cause()
+                .downcast_ref::<io::Error>()
+                .map(|e| e.kind());
+
+            // `ConnectionReset` error happens when the Postgres client closes the connection.
+            // As this disconnection happens quite often and is expected,
+            // we decided to downgrade the logging level to `INFO`.
+            // See: https://github.com/neondatabase/neon/issues/1683.
+            if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) {
+                info!("Postgres client disconnected");
+                Ok(())
+            } else {
+                Err(err)
+            }
+        }
+    }
 }

 #[derive(Debug)]
@@ -301,7 +347,7 @@ const TIME_BUCKETS: &[f64] = &[

 lazy_static! {
    static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
-        "pageserver_smgr_query_time",
+        "pageserver_smgr_query_seconds",
        "Time spent on smgr query handling",
        &["smgr_query_type", "tenant_id", "timeline_id"],
        TIME_BUCKETS.into()
@@ -367,6 +413,11 @@ impl PageServerHandler {
                                .observe_closure_duration(|| {
                                    self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
                                }),
+                            PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME
+                                .with_label_values(&["get_db_size", &tenant_id, &timeline_id])
+                                .observe_closure_duration(|| {
+                                    self.handle_db_size_request(timeline.as_ref(), &req)
+                                }),
                        };

                        let response = response.unwrap_or_else(|e| {
@@ -487,6 +538,32 @@ impl PageServerHandler {
        }))
    }

+    fn handle_db_size_request<R: Repository>(
+        &self,
+        timeline: &DatadirTimeline<R>,
+        req: &PagestreamDbSizeRequest,
+    ) -> Result<PagestreamBeMessage> {
+        let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered();
+        let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
+
+        let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?;
+        let mut total_blocks: i64 = 0;
+
+        for rel in all_rels {
+            if rel.forknum == 0 {
+                let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0);
+                total_blocks += n_blocks as i64;
+            }
+        }
+
+        let db_size = total_blocks * pg_constants::BLCKSZ as i64;
+
+        Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
+            db_size,
+        }))
+    }
+
    fn handle_get_page_at_lsn_request<R: Repository>(
        &self,
        timeline: &DatadirTimeline<R>,
@@ -538,7 +615,8 @@ impl PageServerHandler {
        /* Send a tarball of the latest layer on the timeline */
        {
            let mut writer = CopyDataSink { pgb };
-            let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
+
+            let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
            span.record("lsn", &basebackup.lsn.to_string().as_str());
            basebackup.send_tarball()?;
        }
@@ -675,7 +753,18 @@ impl postgres_backend::Handler for PageServerHandler {
            for failpoint in failpoints.split(';') {
                if let Some((name, actions)) = failpoint.split_once('=') {
                    info!("cfg failpoint: {} {}", name, actions);
-                    fail::cfg(name, actions).unwrap();
+
+                    // We recognize one extra "action" that's not natively recognized
+                    // by the failpoints crate: exit, to immediately kill the process
+                    if actions == "exit" {
+                        fail::cfg_callback(name, || {
+                            info!("Exit requested by failpoint");
+                            std::process::exit(1);
+                        })
+                        .unwrap();
+                    } else {
+                        fail::cfg(name, actions).unwrap();
+                    }
                } else {
                    bail!("Invalid failpoints format");
                }
@@ -740,7 +829,9 @@ impl postgres_backend::Handler for PageServerHandler {
                .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?;

            let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-            let result = repo.gc_iteration(Some(timelineid), gc_horizon, Duration::ZERO, true)?;
+            // Use tenant's pitr setting
+            let pitr = repo.get_pitr_interval();
+            let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"layers_total"),
                RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
--- a/pageserver/src/remote_storage.rs
+++ b/pageserver/src/remote_storage.rs
@@ -1,412 +0,0 @@
-//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
-//! This particular module serves as a public API border between pageserver and the internal storage machinery.
-//! No other modules from this tree are supposed to be used directly by the external code.
-//!
-//! There are a few components the storage machinery consists of:
-//! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
-//!     * [`local_fs`] allows to use local file system as an external storage
-//!     * [`s3_bucket`] uses AWS S3 bucket as an external storage
-//!
-//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
-//! Synchronization internals are split into submodules
-//!     * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files
-//!     * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively
-//!
-//! * public API via to interact with the external world:
-//!     * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization
-//!     * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks,
-//!       to be processed by the async loop
-//!
-//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform:
-//!
-//! +------------------------+                                    +--------->-------+
-//! |                        |  - - - (init async loop) - - - ->  |                 |
-//! |                        |                                    |                 |
-//! |                        |  ------------------------------->  |      async      |
-//! |       pageserver       |    (enqueue timeline sync task)    | upload/download |
-//! |                        |                                    |      loop       |
-//! |                        |  <-------------------------------  |                 |
-//! |                        |  (apply new timeline sync states)  |                 |
-//! +------------------------+                                    +---------<-------+
-//!                                                                         |
-//!                                                                         |
-//!                                          CRUD layer file operations     |
-//!                                     (upload/download/delete/list, etc.) |
-//!                                                                         V
-//!                                                            +------------------------+
-//!                                                            |                        |
-//!                                                            | [`RemoteStorage`] impl |
-//!                                                            |                        |
-//!                                                            | pageserver assumes it  |
-//!                                                            | owns exclusive write   |
-//!                                                            | access to this storage |
-//!                                                            +------------------------+
-//!
-//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
-//! The loop inits the storage connection and checks the remote files stored.
-//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
-//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can
-//! query their downloads later if they are accessed.
-//!
-//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
-//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint.
-//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either).
-//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
-//!
-//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`],
-//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
-//! Such submissions happen in two cases:
-//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
-//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
-//!
-//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits.
-//!
-//! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file).
-//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
-//! by the storage upload, if enabled.
-//! Yet timeline cannot alter already existing files, and cannot remove those too: only a GC process is capable of removing unused files.
-//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable":
-//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state
-//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
-//! when the newer image is downloaded
-//!
-//! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure.
-//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files.
-//! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download.
-//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`],
-//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files.
-//!
-//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed.
-//! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only,
-//! when a new timeline is scheduled for the download.
-//!
-//! NOTES:
-//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage
-//! (i.e. using different directories in the local filesystem external storage), but totally up to the storage implementation and not covered with the trait API.
-//!
-//! * the sync tasks may not processed immediately after the submission: if they error and get re-enqueued, their execution might be backed off to ensure error cap is not exceeded too fast.
-//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.
-
-mod local_fs;
-mod s3_bucket;
-mod storage_sync;
-
-use std::{
-    collections::{HashMap, HashSet},
-    ffi, fs,
-    path::{Path, PathBuf},
-};
-
-use anyhow::{bail, Context};
-use tokio::io;
-use tracing::{debug, error, info};
-
-use self::storage_sync::TEMP_DOWNLOAD_EXTENSION;
-pub use self::{
-    local_fs::LocalFs,
-    s3_bucket::S3Bucket,
-    storage_sync::{
-        download_index_part,
-        index::{IndexPart, RemoteIndex, RemoteTimeline},
-        schedule_timeline_checkpoint_upload, schedule_timeline_download,
-    },
-};
-use crate::{
-    config::{PageServerConf, RemoteStorageKind},
-    layered_repository::{
-        ephemeral_file::is_ephemeral_file,
-        metadata::{TimelineMetadata, METADATA_FILE_NAME},
-    },
-};
-use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
-
-/// A timeline status to share with pageserver's sync counterpart,
-/// after comparing local and remote timeline state.
-#[derive(Clone, Copy, Debug)]
-pub enum LocalTimelineInitStatus {
-    /// The timeline has every remote layer present locally.
-    /// There could be some layers requiring uploading,
-    /// but this does not block the timeline from any user interaction.
-    LocallyComplete,
-    /// A timeline has some files remotely, that are not present locally and need downloading.
-    /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only,
-    /// so the data needs to be downloaded first before the timeline can be used.
-    NeedsSync,
-}
-
-type LocalTimelineInitStatuses = HashMap<ZTenantId, HashMap<ZTimelineId, LocalTimelineInitStatus>>;
-
-/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
-/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still,
-/// to simplify the received code.
-pub struct SyncStartupData {
-    pub remote_index: RemoteIndex,
-    pub local_timeline_init_statuses: LocalTimelineInitStatuses,
-}
-
-/// Based on the config, initiates the remote storage connection and starts a separate thread
-/// that ensures that pageserver and the remote storage are in sync with each other.
-/// If no external configuration connection given, no thread or storage initialization is done.
-/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states.
-pub fn start_local_timeline_sync(
-    config: &'static PageServerConf,
-) -> anyhow::Result<SyncStartupData> {
-    let local_timeline_files = local_tenant_timeline_files(config)
-        .context("Failed to collect local tenant timeline files")?;
-
-    match &config.remote_storage_config {
-        Some(storage_config) => match &storage_config.storage {
-            RemoteStorageKind::LocalFs(root) => {
-                info!("Using fs root '{}' as a remote storage", root.display());
-                storage_sync::spawn_storage_sync_thread(
-                    config,
-                    local_timeline_files,
-                    LocalFs::new(root.clone(), &config.workdir)?,
-                    storage_config.max_concurrent_timelines_sync,
-                    storage_config.max_sync_errors,
-                )
-            },
-            RemoteStorageKind::AwsS3(s3_config) => {
-                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
-                    s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                storage_sync::spawn_storage_sync_thread(
-                    config,
-                    local_timeline_files,
-                    S3Bucket::new(s3_config, &config.workdir)?,
-                    storage_config.max_concurrent_timelines_sync,
-                    storage_config.max_sync_errors,
-                )
-            },
-        }
-        .context("Failed to spawn the storage sync thread"),
-        None => {
-            info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
-            let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
-            for (ZTenantTimelineId { tenant_id, timeline_id }, _) in
-                local_timeline_files
-            {
-                local_timeline_init_statuses
-                    .entry(tenant_id)
-                    .or_default()
-                    .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete);
-            }
-            Ok(SyncStartupData {
-                local_timeline_init_statuses,
-                remote_index: RemoteIndex::empty(),
-            })
-        }
-    }
-}
-
-fn local_tenant_timeline_files(
-    config: &'static PageServerConf,
-) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
-    let mut local_tenant_timeline_files = HashMap::new();
-    let tenants_dir = config.tenants_path();
-    for tenants_dir_entry in fs::read_dir(&tenants_dir)
-        .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
-    {
-        match &tenants_dir_entry {
-            Ok(tenants_dir_entry) => {
-                match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) {
-                    Ok(collected_files) => {
-                        local_tenant_timeline_files.extend(collected_files.into_iter())
-                    }
-                    Err(e) => error!(
-                        "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
-                        tenants_dir.display(),
-                        tenants_dir_entry,
-                        e
-                    ),
-                }
-            }
-            Err(e) => error!(
-                "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
-                tenants_dir_entry,
-                tenants_dir.display(),
-                e
-            ),
-        }
-    }
-
-    Ok(local_tenant_timeline_files)
-}
-
-fn collect_timelines_for_tenant(
-    config: &'static PageServerConf,
-    tenant_path: &Path,
-) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
-    let mut timelines = HashMap::new();
-    let tenant_id = tenant_path
-        .file_name()
-        .and_then(ffi::OsStr::to_str)
-        .unwrap_or_default()
-        .parse::<ZTenantId>()
-        .context("Could not parse tenant id out of the tenant dir name")?;
-    let timelines_dir = config.timelines_path(&tenant_id);
-
-    for timelines_dir_entry in fs::read_dir(&timelines_dir).with_context(|| {
-        format!(
-            "Failed to list timelines dir entry for tenant {}",
-            tenant_id
-        )
-    })? {
-        match timelines_dir_entry {
-            Ok(timelines_dir_entry) => {
-                let timeline_path = timelines_dir_entry.path();
-                match collect_timeline_files(&timeline_path) {
-                    Ok((timeline_id, metadata, timeline_files)) => {
-                        timelines.insert(
-                            ZTenantTimelineId {
-                                tenant_id,
-                                timeline_id,
-                            },
-                            (metadata, timeline_files),
-                        );
-                    }
-                    Err(e) => error!(
-                        "Failed to process timeline dir contents at '{}', reason: {:?}",
-                        timeline_path.display(),
-                        e
-                    ),
-                }
-            }
-            Err(e) => error!(
-                "Failed to list timelines for entry tenant {}, reason: {:?}",
-                tenant_id, e
-            ),
-        }
-    }
-
-    Ok(timelines)
-}
-
-// discover timeline files and extract timeline metadata
-//  NOTE: ephemeral files are excluded from the list
-fn collect_timeline_files(
-    timeline_dir: &Path,
-) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet<PathBuf>)> {
-    let mut timeline_files = HashSet::new();
-    let mut timeline_metadata_path = None;
-
-    let timeline_id = timeline_dir
-        .file_name()
-        .and_then(ffi::OsStr::to_str)
-        .unwrap_or_default()
-        .parse::<ZTimelineId>()
-        .context("Could not parse timeline id out of the timeline dir name")?;
-    let timeline_dir_entries =
-        fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
-    for entry in timeline_dir_entries {
-        let entry_path = entry.context("Failed to list timeline dir entry")?.path();
-        if entry_path.is_file() {
-            if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) {
-                timeline_metadata_path = Some(entry_path);
-            } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
-                debug!("skipping ephemeral file {}", entry_path.display());
-                continue;
-            } else if entry_path.extension().and_then(ffi::OsStr::to_str)
-                == Some(TEMP_DOWNLOAD_EXTENSION)
-            {
-                info!("removing temp download file at {}", entry_path.display());
-                fs::remove_file(&entry_path).with_context(|| {
-                    format!(
-                        "failed to remove temp download file at {}",
-                        entry_path.display()
-                    )
-                })?;
-            } else {
-                timeline_files.insert(entry_path);
-            }
-        }
-    }
-
-    // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed
-    //   then attach is lost. There would be no retries for that,
-    //   initial collect will fail because there is no metadata.
-    //   We either need to start download if we see empty dir after restart or attach caller should
-    //   be aware of that and retry attach if awaits_download for timeline switched from true to false
-    //   but timelinne didnt appear locally.
-    //   Check what happens with remote index in that case.
-    let timeline_metadata_path = match timeline_metadata_path {
-        Some(path) => path,
-        None => bail!("No metadata file found in the timeline directory"),
-    };
-    let metadata = TimelineMetadata::from_bytes(
-        &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
-    )
-    .context("Failed to parse timeline metadata file bytes")?;
-
-    Ok((timeline_id, metadata, timeline_files))
-}
-
-/// Storage (potentially remote) API to manage its state.
-/// This storage tries to be unaware of any layered repository context,
-/// providing basic CRUD operations for storage files.
-#[async_trait::async_trait]
-pub trait RemoteStorage: Send + Sync {
-    /// A way to uniquely reference a file in the remote storage.
-    type StoragePath;
-
-    /// Attempts to derive the storage path out of the local path, if the latter is correct.
-    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;
-
-    /// Gets the download path of the given storage file.
-    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf>;
-
-    /// Lists all items the storage has right now.
-    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;
-
-    /// Streams the local file contents into remote into the remote storage entry.
-    async fn upload(
-        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
-        // S3 PUT request requires the content length to be specified,
-        // otherwise it starts to fail with the concurrent connection count increasing.
-        from_size_bytes: usize,
-        to: &Self::StoragePath,
-        metadata: Option<StorageMetadata>,
-    ) -> anyhow::Result<()>;
-
-    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
-    /// Returns the metadata, if any was stored with the file previously.
-    async fn download(
-        &self,
-        from: &Self::StoragePath,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>>;
-
-    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
-    /// Returns the metadata, if any was stored with the file previously.
-    async fn download_range(
-        &self,
-        from: &Self::StoragePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>>;
-
-    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
-}
-
-/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
-/// Immutable, cannot be changed once the file is created.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct StorageMetadata(HashMap<String, String>);
-
-fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
-    if prefix == path {
-        anyhow::bail!(
-            "Prefix and the path are equal, cannot strip: '{}'",
-            prefix.display()
-        )
-    } else {
-        path.strip_prefix(prefix).with_context(|| {
-            format!(
-                "Path '{}' is not prefixed with '{}'",
-                path.display(),
-                prefix.display(),
-            )
-        })
-    }
-}
--- a/pageserver/src/remote_storage/storage_sync.rs
+++ b/pageserver/src/remote_storage/storage_sync.rs
--- a/Show More
+++ b/Show More