Compare commits


79 Commits

Author SHA1 Message Date
Joonas Koivunen
ca1ed3dc3b drive by typo fix 2022-11-02 21:11:05 +02:00
Joonas Koivunen
dc2554dff6 chore: remove no longer needed empty rel fix
This seems to have been fixed long enough ago.
2022-11-02 21:10:44 +02:00
Joonas Koivunen
5112142997 fix: use different port for temporary postgres (#2743)
`test_tenant_relocation` ends up starting a temporary postgres instance with a fixed port. The change makes the port configurable in scripts/export_import_between_pageservers.py and uses that in test_tenant_relocation.
2022-11-02 18:37:48 +00:00
bojanserafimov
a0a74868a4 Fix clippy (#2742) 2022-11-02 12:30:09 -04:00
Christian Schwarz
b154992510 timeline_list_handler: avoid spawn_blocking
As per https://github.com/neondatabase/neon/issues/2731#issuecomment-1299335813

refs https://github.com/neondatabase/neon/issues/2731
2022-11-02 16:22:58 +01:00
Christian Schwarz
a86a38c96e README: fix instructions on how to run tests
The `make debug` target doesn't exist, and I can't find it in the Git
history.
2022-11-02 16:22:58 +01:00
Christian Schwarz
590f894db8 tenant_status: remove unnecessary spawn_blocking
The spawn_blocking is pointless in this case: get_tenant is not
expected to block for any meaningful amount of time. There are
get_tenant calls in most other functions in the file too, and they don't
bother with spawn_blocking. Let's remove the spawn_blocking from
tenant_status, too, to be consistent.

fixes https://github.com/neondatabase/neon/issues/2731
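
A minimal Rust sketch of the kind of change this describes (the handler and `get_tenant` here are simplified stand-ins, not the real pageserver code):

```
// Before: the cheap lookup is shipped to the blocking thread pool for no benefit.
async fn tenant_status_before(tenant_id: u64) -> anyhow::Result<String> {
    tokio::task::spawn_blocking(move || get_tenant(tenant_id)).await?
}

// After: just call it on the async task; get_tenant never blocks for long.
async fn tenant_status_after(tenant_id: u64) -> anyhow::Result<String> {
    get_tenant(tenant_id)
}

// Stand-in for a fast, in-memory lookup.
fn get_tenant(tenant_id: u64) -> anyhow::Result<String> {
    Ok(format!("tenant {tenant_id}"))
}
```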
2022-11-02 16:22:58 +01:00
Alexander Bayandin
0a0595b98d test_backward_compatibility: assign random port to compute (#2738) 2022-11-02 15:22:38 +00:00
Dmitry Rodionov
e56d11c8e1 fix style if possible (cannot really split long lines in mermaid) 2022-11-02 17:15:49 +02:00
Dmitry Rodionov
ccdc3188ed update according to discussion and comments 2022-11-02 17:15:49 +02:00
Dmitry Rodionov
67401cbdb8 pageserver s3 coordination 2022-11-02 17:15:49 +02:00
Kirill Bulatov
d42700280f Remove daemonize from storage components (#2677)
Move daemonization logic into `control_plane`.
Storage binaries now only create a lockfile to avoid concurrent services running in the same directory.
2022-11-02 02:26:37 +02:00
Kirill Bulatov
6df4d5c911 Bump rustc to 1.62.1 (#2728)
Changelog: https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1621-2022-07-19
2022-11-02 01:21:33 +02:00
Dmitry Rodionov
32d14403bd remove wrong is_active filter for timelines in compaction/gc
GC needs to know about all branch points, not only the ones for
timelines that are active at the moment of GC. If a timeline is
inactive, we won't know about its branch point, and in that case
GC can delete data that is still needed by the child timeline.

For compaction this is less severe: delaying compaction only affects
performance, so it is still better to run it. There is logic to exit
quickly if there is nothing to compact.
2022-11-01 18:07:08 +02:00
Dmitry Ivanov
0df3467146 Refactoring: replace utils::connstring with Url-based APIs 2022-11-01 18:17:36 +03:00
Dmitry Rodionov
c64a121aa8 do not nest wal_connection_manager span inside parent one 2022-11-01 15:08:23 +02:00
Heikki Linnakangas
22cc8760b9 Move walredo process code under pgxn in the main 'neon' repository.
- Refactor the way the WalProposerMain function is called when started
  with --sync-safekeepers. The postgres binary now explicitly loads
  the 'neon.so' library and calls the WalProposerMain in it. This is
  simpler than the global function callback "hook" we previously used.

- Move the WAL redo process code to a new library, neon_walredo.so,
  and use the same mechanism as for --sync-safekeepers to call the
  WalRedoMain function, when launched with --walredo argument.

- Also move the seccomp code to the neon_walredo.so library. I kept the
  configure check on the postgres side for now, though.
2022-10-31 01:11:50 +01:00
Arseny Sher
596d622a82 Fix test_prepare_snapshot.
It should checkpoint the pageserver after waiting for all the data to arrive, not before.
2022-10-28 22:12:31 +04:00
Sergey Melnikov
7481fb082c Fix bugs in #2713 (#2716) 2022-10-28 14:12:49 +00:00
Arseny Sher
1eb9bd052a Bump vendor/postgres-v15 to fix XLP_FIRST_IS_CONTRECORD issue.
ref https://github.com/neondatabase/cloud/issues/2688
2022-10-28 16:45:11 +03:00
Sergey Melnikov
59a3ca4ec6 Deploy proxy to new prod regions (#2713)
* Refactor proxy deploy

* Test new prod deploy

* Remove assume role

* Add new values

* Add all regions
2022-10-28 16:25:28 +03:00
Sergey Melnikov
e86a9105a4 Deploy storage to new prod regions (#2709) 2022-10-28 10:17:27 +00:00
Stas Kelvich
d3c8749da5 Build compute postgres with openssl support
The main reason for this change is that Postgres 15 requires OpenSSL
for `pgcrypto` to work. It's also not a bad idea to have an SSL-enabled
Postgres in general.
2022-10-28 10:39:22 +03:00
Alexander Bayandin
128dc8d405 Nightly Benchmarks: fix workflow (#2708) 2022-10-27 19:26:10 +03:00
Alexander Bayandin
0cbae6e8f3 test_backward_compatibility: friendlier error message (#2707) 2022-10-27 15:54:49 +00:00
Alexander Stanovoy
78e412b84b The fix of #2650. (#2686)
* Wrappers and drop implementations for image and delta layer writers.
* Two regression tests for the image and delta layer files.
2022-10-27 14:02:55 +00:00
Rory de Zoete
6dbf202e0d Update crane copy target (#2704)
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
2022-10-27 16:00:40 +02:00
Arseny Sher
b42bf9265a Enable etcd compaction in neon_local. 2022-10-27 10:47:08 +03:00
Stas Kelvich
1f08ba5790 Avoid debian-testing packages in compute Dockerfiles
plv8 can only be built with a fairly new version of the gold linker. We used to install
it via binutils packages from testing, but that also updates libc, which causes
trouble in the resulting image as different extensions end up built against
different libc versions. We could either use libc from debian-testing everywhere
or refrain from using testing packages and install the necessary programs manually.
This patch takes the latter approach: gold for plv8 and cmake for h3 are
installed manually.

In passing, declare h3_postgis as a safe extension (a previous omission).
2022-10-27 09:44:16 +03:00
bojanserafimov
0c54eb65fb Move pagestream api to libs/pageserver_api (#2698) 2022-10-26 17:32:31 -04:00
mikecaat
259a5f356e Add a docker-compose example file (#1943) (#2666)
Co-authored-by: Masahiro Ikeda <masahiro.ikeda.us@hco.ntt.co.jp>
2022-10-26 13:59:25 +03:00
Sergey Melnikov
a3cb8c11e0 Do not release to new staging proxies on release (#2685) 2022-10-25 23:51:23 +00:00
bojanserafimov
9fb2287f87 Add draw_timeline binary (#2688) 2022-10-25 11:25:22 -04:00
Alexander Bayandin
834ffe1bac Add data format backward compatibility tests (#2626) 2022-10-25 16:41:50 +02:00
Stas Kelvich
df18b041c0 Use apt version pinning instead of repo priorities
A higher `bullseye` priority doesn't work for packages installed
via `bullseye-updates`, e.g.:

```
libc-bin:
  Installed: 2.31-13+deb11u5
  Candidate: 2.35-3
  Version table:
     2.35-3 500
        500 http://ftp.debian.org/debian testing/main amd64 Packages
 *** 2.31-13+deb11u5 500
        500 http://deb.debian.org/debian bullseye-updates/main amd64 Packages
        100 /var/lib/dpkg/status
     2.31-13+deb11u4 990
        990 http://deb.debian.org/debian bullseye/main amd64 Packages
```

Try version pinning instead
2022-10-25 14:29:11 +03:00
Anastasia Lubennikova
39897105b2 Check postgres version and ensure that public schema exists
before running the GRANT query on it
2022-10-25 09:55:24 +03:00
Stas Kelvich
2f399f08b2 Hotfix to disable grant create on public schema
`GRANT CREATE ON SCHEMA public` fails if there is no schema `public`.
Disable it in release for now and make a better fix later (it is
needed for v15 support).
2022-10-25 09:55:24 +03:00
Arseny Sher
9f49605041 Fix division by zero panic in determine_offloader. 2022-10-22 18:25:12 +03:00
Konstantin Knizhnik
7b6431cbd7 Disable wal_log_hints by default (#2598)
* Disable wal_log_hints by default

* Remove obsolete comment about wal_log_hints
2022-10-22 14:59:18 +03:00
Lassi Pölönen
321aeac3d4 Json logging capability (#2624)
* Support configuring the log format as json or plain (a minimal sketch of such a switch follows after this list).

Test the json and plain loggers separately; otherwise they would compete
for the same global subscriber.

* Implement log_format for pageserver config

* Implement configurable log format for safekeeper.
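
A minimal Rust sketch of what such a switch can look like with tracing-subscriber (assuming its `json` feature; the function and parameter names are illustrative, not the actual pageserver/safekeeper code):

```
// Pick the log format at startup; "json" selects structured output,
// anything else falls back to the plain human-readable format.
fn init_logging(log_format: &str) {
    if log_format == "json" {
        tracing_subscriber::fmt().json().init();
    } else {
        tracing_subscriber::fmt().init();
    }
}
```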
2022-10-21 17:30:20 +00:00
Andrés
71ef7b6663 Remove cached_property package (#2673)
Co-authored-by: andres <andres.rodriguez@outlook.es>
2022-10-21 20:02:31 +03:00
Kirill Bulatov
5928cb33c5 Introduce timeline state (#2651)
Similar to https://github.com/neondatabase/neon/pull/2395, this introduces a state field in Timeline that can be subscribed to.

Adjusts:

* the walreceiver to not hold any connections if the timeline is not Active
* remote storage sync to not schedule uploads if the timeline is Broken
* timeline creation to be refused if the tenant/timeline is broken
* timelines' states to be switched automatically based on the tenant state

Does not adjust the timeline's gc, checkpointing and layer flush behaviour much, since it's not safe to cancel these processes abruptly and task_mgr::shutdown_tasks already does something similar.
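
A rough Rust sketch of a subscribable state field, assuming a tokio watch channel underneath (the enum variants and field names here may differ from the PR):

```
use tokio::sync::watch;

// Hypothetical state set; the PR's actual variants may differ.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum TimelineState {
    Active,
    Broken,
    Stopping,
}

struct Timeline {
    state: watch::Sender<TimelineState>,
}

impl Timeline {
    fn set_state(&self, new_state: TimelineState) {
        // Ignore the case where no subscriber is listening anymore.
        let _ = self.state.send(new_state);
    }

    // Walreceiver, remote storage sync, etc. keep a receiver and react to changes.
    fn subscribe(&self) -> watch::Receiver<TimelineState> {
        self.state.subscribe()
    }
}
```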
2022-10-21 15:51:48 +00:00
Sergey Melnikov
6ff2c61ae0 Refactor safekeeper s3 config and change it for new account (#2672) 2022-10-21 13:44:08 +00:00
Arseny Sher
7480a0338a Determine safekeeper for offloading WAL without etcd election API.
This API is rather pointless: a sane choice requires knowledge of the peers'
status anyway, and leader lifetimes can intersect in any case, which is fine
for us, so manual election is straightforward. Here, we deterministically
choose among the reasonably caught-up safekeepers, shifting by timeline id
to spread the load.

A step towards custom broker https://github.com/neondatabase/neon/issues/2394
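
The shift-by-timeline-id idea might look roughly like this in Rust (the candidate list and id types are simplified; only the `determine_offloader` name comes from this log):

```
// Pick one safekeeper deterministically among those that are reasonably
// caught up, offsetting by the timeline id so different timelines spread
// their offloading work across the fleet.
fn determine_offloader(caught_up: &[u64], timeline_id: u128) -> Option<u64> {
    if caught_up.is_empty() {
        // Guards against the division-by-zero panic fixed later in 9f49605041.
        return None;
    }
    let idx = (timeline_id % caught_up.len() as u128) as usize;
    Some(caught_up[idx])
}
```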
2022-10-21 15:33:27 +03:00
Sergey Melnikov
2709878b8b Deploy scram proxies into new account (#2643) 2022-10-21 14:21:22 +03:00
Kirill Bulatov
39e4bdb99e Actualize tenant and timeline API modifiers (#2661)
* Actualize tenant and timeline API modifiers
* Use anyhow::Result explicitly
2022-10-21 10:58:43 +00:00
Anastasia Lubennikova
52e75fead9 Use anyhow::Result explicitly 2022-10-21 12:47:06 +03:00
Anastasia Lubennikova
a347d2b6ac #2616 handle 'Unsupported pg_version' error properly 2022-10-21 12:47:06 +03:00
Heikki Linnakangas
fc4ea3553e test_gc_cutoff.py fixes (#2655)
* Fix bogus early exit from GC.

Commit 91411c415a added this failpoint, but the early exit was not
intentional.

* Cleanup test_gc_cutoff.py test.

- Remove the 'scale' parameter; this isn't a benchmark
- Tweak pgbench and pageserver options to create garbage faster than the
  GC can collect it away. The test used to take just under 5 minutes,
  which was uncomfortably close to the default 5 minute test timeout, and
  annoyingly long even without the hard limit. These changes bring it down to
  about 1-2 minutes.
- Improve comments, fix typos
- Rename the failpoint. The old name, 'gc-before-save-metadata', implied
  that the failpoint was before the metadata update, but it was in fact
  much later in the function.
- Move the call to persist the metadata outside the lock, to avoid
  holding it for too long.

To verify that this test still covers the original bug,
https://github.com/neondatabase/neon/issues/2539, I commented out
updating the metadata file like this:
```
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1e857a9a..f8a9f34a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1962,7 +1962,7 @@ impl Timeline {
         }
         // Persist the new GC cutoff value in the metadata file, before
         // we actually remove anything.
-        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
+        //self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;

         info!("GC starting");

```
It doesn't fail every time with that, but it did fail after about 5
runs.
2022-10-21 02:39:55 +03:00
Dmitry Rodionov
cca1ace651 make launch_wal_receiver infallible 2022-10-21 00:40:12 +03:00
Sergey Melnikov
30984c163c Fix race between pushing image to ECR and copying to dockerhub (#2662) 2022-10-20 23:01:01 +03:00
Konstantin Knizhnik
7404777efc Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged (#2657)
* Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged

refer #2587

* Bump postgres versions
2022-10-20 20:06:05 +03:00
Heikki Linnakangas
eb1bdcc6cf If an FSM or VM page cannot be reconstructed, fill it with zeros.
If we cannot reconstruct an FSM or VM page, while creating image
layers, fill it with zeros instead. That should always be safe, for
the FSM and VM, in the sense that you won't lose actual user data. It
will get cleaned up by VACUUM later.

We had a bug with FSM/VM truncation, where we truncated the FSM and VM
at WAL replay to a smaller size than PostgreSQL originally did. We
thought that was harmless, as the FSM and VM are not critical for
correctness and can be zeroed out or truncated without affecting user
data. However, it led to a situation where PostgreSQL created
incremental WAL records for pages that we had already truncated away
in the pageserver, and when we tried to replay those WAL records, that
failed. That led to a permanent error in image layer creation, and
prevented it from ever finishing. See
https://github.com/neondatabase/neon/issues/2601. With this patch,
those pages will be filled with zeros in the image layer, which allows
the image layer creation to finish.
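
A hedged Rust sketch of the fallback described above (the fork type and function signature are illustrative, not the pageserver's actual API):

```
const BLCKSZ: usize = 8192;

#[derive(PartialEq)]
enum Fork {
    Main,
    Fsm,
    VisibilityMap,
}

// If reconstructing an FSM or VM page fails during image layer creation,
// substitute an all-zeros page instead of failing the whole layer.
fn page_for_image_layer(
    fork: Fork,
    reconstructed: anyhow::Result<[u8; BLCKSZ]>,
) -> anyhow::Result<[u8; BLCKSZ]> {
    match reconstructed {
        Ok(page) => Ok(page),
        // Zeroing FSM/VM pages loses no user data; VACUUM rebuilds them later.
        Err(_) if fork == Fork::Fsm || fork == Fork::VisibilityMap => Ok([0u8; BLCKSZ]),
        Err(err) => Err(err),
    }
}
```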
2022-10-20 17:27:01 +03:00
Arthur Petukhovsky
f5ab9f761b Remove flaky checks in test_delete_force (#2567) 2022-10-20 17:14:32 +04:00
Kirill Bulatov
306a47c4fa Use uninit mark files during timeline init for atomic creation (#2489)
Part of https://github.com/neondatabase/neon/pull/2239

Regular, from-scratch timeline creation involves running initdb in a separate directory, importing the data from that directory into the pageserver and, finally, starting timeline-related background tasks.

This PR ensures we don't leave behind any directories that are not marked as temporary, and that the pageserver removes such directories on restart, allowing timeline creation to be retried with the same IDs, if needed.

It would be good to later rewrite the logic to use a temporary directory, similar to what tenant creation does. Currently that is harder than this change, so it is not done here.
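
A simplified Rust sketch of the uninit-mark idea (the file naming and error handling here are assumptions, not the exact pageserver layout):

```
use std::fs;
use std::path::{Path, PathBuf};

fn uninit_mark(timeline_dir: &Path) -> PathBuf {
    // A mark file placed next to the timeline directory, e.g. "<timeline_id>.___uninit".
    timeline_dir.with_extension("___uninit")
}

fn create_timeline(timeline_dir: &Path) -> anyhow::Result<()> {
    // Put the mark down first, then create and populate the directory.
    fs::File::create(uninit_mark(timeline_dir))?;
    fs::create_dir_all(timeline_dir)?;
    // ... run initdb, import the data, write metadata ...
    // Only once everything succeeded is the mark removed.
    fs::remove_file(uninit_mark(timeline_dir))?;
    Ok(())
}

// On pageserver restart: a directory whose mark still exists was never fully
// initialized, so delete both and allow creation to be retried with the same IDs.
fn cleanup_leftovers(timeline_dir: &Path) -> anyhow::Result<()> {
    if uninit_mark(timeline_dir).exists() {
        if timeline_dir.exists() {
            fs::remove_dir_all(timeline_dir)?;
        }
        fs::remove_file(uninit_mark(timeline_dir))?;
    }
    Ok(())
}
```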
2022-10-20 14:19:17 +03:00
Kirill Bulatov
84c5f681b0 Fix test feature detection (#2659)
Follow-up of #2636 and #2654, fixing the test feature detection.

The pageserver currently outputs its features as

```
/target/debug/pageserver --version
Neon page server git:7734929a8202c8cc41596a861ffbe0b51b5f3cb9 failpoints: true, features: ["testing", "profiling"]
```
2022-10-20 13:44:03 +03:00
Kirill Bulatov
50297bef9f RFC about Tenant / Timeline guard objects (#2660)
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2022-10-20 12:49:54 +03:00
Andrés
9211923bef Pageserver Python tests should not fail if the server is built with no testing feature (#2636)
Co-authored-by: andres <andres.rodriguez@outlook.es>
2022-10-20 10:46:57 +03:00
bojanserafimov
7734929a82 Remove stale todos (#2630) 2022-10-19 22:59:22 +00:00
Heikki Linnakangas
bc5ec43056 Fix flaky physical-size tests in test_timeline_size.py.
These two tests, test_timeline_physical_size_post_compaction and
test_timeline_physical_size_post_gc, assumed that after you have
waited for the WAL from a bulk insertion to arrive, and you run a
cycle of checkpoint and compaction, no new layer files are created.
That matters because if a new layer file is created while we are
calculating the incremental and non-incremental physical sizes, they
might differ.

However, the tests used a very small checkpoint_distance, so even a
small amount of WAL generated in PostgreSQL could cause a new layer
file to be created. Autovacuum can kick in at any time and do just
that. That caused occasional failures in the test. I was able to
reproduce it reliably by adding a long delay between the incremental
and non-incremental size calculations:

```
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -129,6 +129,9 @@ async fn build_timeline_info(
         }
     };
     let current_physical_size = Some(timeline.get_physical_size());
+    if include_non_incremental_physical_size {
+        std::thread::sleep(std::time::Duration::from_millis(60000));
+    }

     let info = TimelineInfo {
         tenant_id: timeline.tenant_id,
```

To fix, disable autovacuum for the table. Autovacuum could still kick
in for other tables, e.g. catalog tables, but that seems less likely
to generate enough WAL to cause a new layer file to be flushed.

If this continues to be a problem in the future, we could simply retry
the physical size call a few times if there's a mismatch. A mismatch
could happen every once in a while, but it's very unlikely to happen
more than once or twice in a row.

Fixes https://github.com/neondatabase/neon/issues/2212
2022-10-19 23:50:21 +03:00
MMeent
b237feedab Add more redo metrics: (#2645)
- Measure size of redo WAL (new histogram), with bounds between 24B-32kB
- Add 2 more buckets at the upper end of the redo time histogram
  We often (>0.1% of several hours each day) take more than 250ms to do the
  redo round-trip to the postgres process. We need to measure these redo
  times more precisely.
2022-10-19 22:47:11 +02:00
Alexey Kondratov
4d1e48f3b9 [compute_ctl] Use postgres::config to properly escape database names (#2652)
We've got at least one user in production that cannot create a
database with a trailing space in the name.

This happens because we use the `url` crate for manipulating the
DATABASE_URL, but it follows a standard that doesn't really fit
well with Postgres. For example, it trims all trailing spaces
from the path:

  > Remove any leading and trailing C0 control or space from input.
  > See: https://url.spec.whatwg.org/#url-parsing

But we used `set_path()` to set the database name, and it's perfectly valid
to have trailing spaces in a database name in Postgres.

Thus, use `postgres::config::Config` to modify the database name in the
connection details.
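
A small Rust sketch of the direction of the fix (illustrative values only, not the actual compute_ctl code):

```
// Build the connection settings with postgres::config::Config; dbname() stores
// the database name verbatim, trailing spaces included, unlike the previous
// url::Url::set_path()-based approach described above.
fn connection_config(dbname: &str) -> postgres::config::Config {
    let mut cfg = postgres::config::Config::new();
    cfg.host("localhost") // illustrative host/port/user
        .port(5432)
        .user("cloud_admin")
        .dbname(dbname); // e.g. "mydb " keeps its trailing space
    cfg
}
```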
2022-10-19 19:20:06 +02:00
Anastasia Lubennikova
7576b18b14 [compute_tools] fix GRANT CREATE ON SCHEMA public -
run the grant query in each database
2022-10-19 18:37:52 +03:00
Konstantin Knizhnik
6b49b370fc Fix build after applying PR #2558 2022-10-19 13:55:30 +03:00
Konstantin Knizhnik
91411c415a Persists latest_gc_cutoff_lsn before performing GC (#2558)
* Persists latest_gc_cutoff_lsn before performing GC

* Perform some refactoring and code deduplication

refer #2539

* Add test for persisting GC cutoff

* Fix python test style warnings

* Bump postgres version

* Reduce number of iterations in test_gc_cutoff test

* Bump postgres version

* Undo bumping postgres version
2022-10-19 12:32:03 +03:00
Kirill Bulatov
c67cf34040 Update GH Action version (#2646) 2022-10-19 11:16:36 +03:00
bojanserafimov
8fbe437768 Improve pageserver IO metrics (#2629) 2022-10-18 11:53:28 -04:00
Heikki Linnakangas
989d78aac8 Buffer the TCP incoming stream on libpq connections.
Reduces the number of syscalls needed to read the commands from the
compute.

Here's a snippet of strace output from the pageserver, when performing
a sequential scan on a table, with prefetch:

    3084934 recvfrom(47, "d", 1, 0, NULL, NULL) = 1
    3084934 recvfrom(47, "\0\0\0\37", 4, 0, NULL, NULL) = 4
    3084934 recvfrom(47, "\2\1\0\0\0\0\362\302\360\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\0\3", 27, 0, NULL, NULL) = 27
    3084934 pread64(28, "\0\0\0\1\0\0\0\0\0\0\0\253                    "..., 8192, 25190400) = 8192
    3084934 write(45, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\3A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3084934 poll([{fd=46, events=POLLIN}, {fd=48, events=POLLIN}], 2, 60000) = 1 ([{fd=46, revents=POLLIN}])
    3084934 read(46, "\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3084934 sendto(47, "d\0\0 \5f\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3084934 recvfrom(47, "d", 1, 0, NULL, NULL) = 1
    3084934 recvfrom(47, "\0\0\0\37", 4, 0, NULL, NULL) = 4
    3084934 recvfrom(47, "\2\1\0\0\0\0\362\302\360\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\0\4", 27, 0, NULL, NULL) = 27
    3084934 pread64(28, "    \0=\0L\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0;;\0\0\0\4\4\0"..., 8192, 25198592) = 8192
    3084934 write(45, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\4A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3084934 poll([{fd=46, events=POLLIN}, {fd=48, events=POLLIN}], 2, 60000) = 1 ([{fd=46, revents=POLLIN}])
    3084934 read(46, "\0\0\0\0\260\344q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3084934 sendto(47, "d\0\0 \5f\0\0\0\0\260\344q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3084934 recvfrom(47, "d", 1, 0, NULL, NULL) = 1
    3084934 recvfrom(47, "\0\0\0\37", 4, 0, NULL, NULL) = 4
    3084934 recvfrom(47, "\2\1\0\0\0\0\362\302\360\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\0\5", 27, 0, NULL, NULL) = 27
    3084934 write(45, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\5A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3084934 poll([{fd=46, events=POLLIN}, {fd=48, events=POLLIN}], 2, 60000) = 1 ([{fd=46, revents=POLLIN}])
    3084934 read(46, "\0\0\0\0\330\377q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3084934 sendto(47, "d\0\0 \5f\0\0\0\0\330\377q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198

This shows the interaction for three get_page_at_lsn requests. For
each request, the pageserver performs three recvfrom syscalls to read
the incoming request from the socket. After this patch, those recvfrom
calls are gone:

    3086123 read(47, "\0\0\0\0\360\222q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3086123 sendto(45, "d\0\0 \5f\0\0\0\0\360\222q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3086123 pread64(29, "                                "..., 8192, 25182208) = 8192
    3086123 write(46, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\2A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3086123 poll([{fd=47, events=POLLIN}, {fd=49, events=POLLIN}], 2, 60000) = 1 ([{fd=47, revents=POLLIN}])
    3086123 read(47, "\0\0\0\0000\256q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3086123 sendto(45, "d\0\0 \5f\0\0\0\0000\256q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3086123 pread64(29, "\0\0\0\1\0\0\0\0\0\0\0\253                    "..., 8192, 25190400) = 8192
    3086123 write(46, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\3A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3086123 poll([{fd=47, events=POLLIN}, {fd=49, events=POLLIN}], 2, 60000) = 1 ([{fd=47, revents=POLLIN}])
    3086123 read(47, "\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237\362\0\0\237\362\0"..., 8192) = 8192
    3086123 sendto(45, "d\0\0 \5f\0\0\0\0p\311q\1\0\0\4\0\f\1\200\1\0 \4 \0\0\0\0\200\237"..., 8198, MSG_NOSIGNAL, NULL, 0) = 8198
    3086123 pread64(29, "    \0=\0L\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0;;\0\0\0\4\4\0"..., 8192, 25198592) = 8192
    3086123 write(46, "B\0\0\0\25\0\0\0\6\177\0\0002\276\0\0@\f\0\0\0\4A\0\0\32\355\0\0\0\0\1"..., 7010) = 7010
    3086123 poll([{fd=47, events=POLLIN}, {fd=49, events=POLLIN}], 2, 60000) = 1 ([{fd=47, revents=POLLIN}])

In this test, the compute sends a batch of prefetch requests, and they
are read from the socket in one syscall. That syscall was not captured
by the strace snippet above, but there are far fewer of them than
before.
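
The gist of the change, sketched in Rust with a buffered reader (the real pageserver I/O path differs, and the message framing here is simplified to the libpq "type byte + length + body" shape):

```
use std::io::{BufReader, Read};
use std::net::TcpStream;

// With a buffer in front of the socket, a batch of small frontend messages is
// pulled from the kernel in one read() and then parsed out of the in-process
// buffer, instead of issuing several recvfrom() calls per message.
fn read_message(stream: &mut BufReader<TcpStream>) -> std::io::Result<(u8, Vec<u8>)> {
    let mut tag = [0u8; 1];
    stream.read_exact(&mut tag)?; // usually served from the buffer

    let mut len = [0u8; 4];
    stream.read_exact(&mut len)?;
    let body_len = (u32::from_be_bytes(len) as usize).saturating_sub(4); // length includes itself

    let mut body = vec![0u8; body_len];
    stream.read_exact(&mut body)?;
    Ok((tag[0], body))
}
```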
2022-10-18 18:46:07 +03:00
Stas Kelvich
7ca72578f9 Enable plv8 again
Now with quickfix for https://github.com/plv8/plv8/issues/503
2022-10-18 18:34:27 +03:00
Heikki Linnakangas
41550ec8bf Remove unnecessary indirections of libpqwalproposer functions
In the Postgres backend, we cannot link directly with libpq (check the
pgsql-hackers archive for all kinds of fun that ensued when we tried to
do that). Therefore, the libpq functions are used through the thin
wrapper functions in libpqwalreceiver.so, and libpqwalreceiver.so is
loaded dynamically. To hide the dynamic loading and make the calls
look like regular functions, we use macros to hide the function
pointers.

We had inherited the same indirections in libpqwalproposer, but they are
not needed since the neon extension is already a shared library that's
loaded dynamically. There's no problem calling the functions directly
there. Remove the indirections.
2022-10-18 18:25:30 +03:00
Sergey Melnikov
0cd2d91b9d Fix deploy-new job by installing sivel.toiletwater (#2641) 2022-10-18 14:44:19 +00:00
Sergey Melnikov
546e9bdbec Deploy storage into new account and migrate to management API v2 (#2619)
Deploy storage into new account
Migrate safekeeper and pageserver initialisation to management api v2
2022-10-18 15:52:15 +03:00
Heikki Linnakangas
59bc7e67e0 Use an optimized version of amplify_num.
Speeds up layer_map::search somewhat. I also opened a PR in the upstream
rust-amplify repository with these changes,
see https://github.com/rust-amplify/rust-amplify/pull/148. We can switch
back to the upstream version when that's merged.
2022-10-18 15:00:10 +03:00
Heikki Linnakangas
2418e72649 Speed up layer_map::search, by remembering the "envelope" for each layer.
Lookups in the R-tree call the "envelope" function for every comparison,
and our envelope function isn't very cheap, so that overhead adds up.
Create the envelope once, when the layer is inserted into the tree, and
store it along with the layer. That uses some more memory per layer, but
that's not very significant.

Speeds up the search operation 2x
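
In sketch form (types heavily simplified, in Rust), the change amounts to storing a precomputed envelope next to each layer rather than recomputing it on every comparison:

```
#[derive(Clone, Copy)]
struct Envelope {
    key_min: u64,
    key_max: u64,
    lsn_min: u64,
    lsn_max: u64,
}

// The envelope is computed once when the layer is inserted into the R-tree
// and cached here, so searches only compare ready-made bounds.
struct LayerEntry<L> {
    layer: L,
    envelope: Envelope,
}

impl<L> LayerEntry<L> {
    fn overlaps(&self, key: u64, lsn: u64) -> bool {
        (self.envelope.key_min..=self.envelope.key_max).contains(&key)
            && (self.envelope.lsn_min..=self.envelope.lsn_max).contains(&lsn)
    }
}
```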
2022-10-18 15:00:10 +03:00
Heikki Linnakangas
80746b1c7a Add micro-benchmark for layer map search function
The test data was extracted from our pgbench benchmark project on the
captest environment, the one we use for the 'neon-captest-reuse' test.
2022-10-18 15:00:10 +03:00
Dmitry Rodionov
129f7c82b7 remove redundant expect_tenant_to_download_timeline 2022-10-18 11:21:48 +03:00
Anastasia Lubennikova
0ec5ddea0b GRANT CREATE ON SCHEMA public TO web_access 2022-10-17 22:42:51 +03:00
Kirill Bulatov
c4ee62d427 Bump clap and other minor dependencies (#2623) 2022-10-17 12:58:40 +03:00
Joonas Koivunen
c709354579 Add layer sizes to index_part.json (#2582)
This is the first step in verifying layer files. Next up on the road is
hashing the files and verifying the hashes.

The metadata additions do not require any migration. The idea is that
the change is backward and forward-compatible with regard to
`index_part.json` due to the softness of JSON schema and the
deserialization options in use.

New types added:

- LayerFileMetadata for tracking the file metadata
    - starting with only the file size
    - in future hopefully a sha256 as well
- IndexLayerMetadata, the serialized counterpart of LayerFileMetadata

LayerFileMetadata needing to have all of its fields be Option is a problem,
but that is not possible to handle without conflicting a lot more with
other ongoing work.
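
A serde sketch in Rust of why the addition stays compatible in both directions (the field and type names follow the commit's description; the exact definitions are assumptions):

```
use serde::{Deserialize, Serialize};

// Unknown fields are ignored on deserialize and a missing file_size becomes
// None, so old and new pageservers can read each other's index_part.json.
#[derive(Serialize, Deserialize)]
struct IndexLayerMetadata {
    #[serde(default)]
    file_size: Option<u64>,
    // a content hash (e.g. sha256) could be added the same way later
}
```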

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
2022-10-17 12:21:04 +03:00
163 changed files with 14313 additions and 5582 deletions


@@ -73,6 +73,14 @@ runs:
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
- name: Download compatibility snapshot for Postgres 14
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest
- name: Run pytest
env:
NEON_BIN: /tmp/neon/bin
@@ -80,6 +88,8 @@ runs:
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -154,6 +164,15 @@ runs:
scripts/generate_and_push_perf_report.sh
fi
- name: Upload compatibility snapshot for Postgres 14
if: github.ref_name == 'release'
uses: ./.github/actions/upload
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
# The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
prefix: latest
- name: Create Allure report
if: always()
uses: ./.github/actions/allure-report


@@ -1,7 +1,7 @@
- name: Upload Neon binaries
hosts: storage
gather_facts: False
remote_user: admin
remote_user: "{{ remote_user }}"
tasks:
@@ -36,7 +36,7 @@
- name: Deploy pageserver
hosts: pageservers
gather_facts: False
remote_user: admin
remote_user: "{{ remote_user }}"
tasks:
@@ -124,7 +124,7 @@
- name: Deploy safekeeper
hosts: safekeepers
gather_facts: False
remote_user: admin
remote_user: "{{ remote_user }}"
tasks:


@@ -3,7 +3,6 @@ storage:
bucket_name: neon-storage-ireland
bucket_region: eu-west-1
console_mgmt_base_url: http://neon-stress-console.local
env_name: neon-stress
etcd_endpoints: neon-stress-etcd.local:2379
safekeeper_enable_s3_offload: 'false'
pageserver_config_stub:
@@ -12,19 +11,21 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: neon-stress/wal
hostname_suffix: ".local"
remote_user: admin
children:
pageservers:
hosts:
neon-stress-ps-1:
console_region_id: 1
console_region_id: aws-eu-west-1
neon-stress-ps-2:
console_region_id: 1
console_region_id: aws-eu-west-1
safekeepers:
hosts:
neon-stress-sk-1:
console_region_id: 1
console_region_id: aws-eu-west-1
neon-stress-sk-2:
console_region_id: 1
console_region_id: aws-eu-west-1
neon-stress-sk-3:
console_region_id: 1
console_region_id: aws-eu-west-1


@@ -0,0 +1,35 @@
storage:
vars:
bucket_name: neon-prod-storage-ap-southeast-1
bucket_region: ap-southeast-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: ap-southeast-1
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
console_region_id: aws-ap-southeast-1
children:
pageservers:
hosts:
pageserver-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-064de8ea28bdb495b
pageserver-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0b180defcaeeb6b93
safekeepers:
hosts:
safekeeper-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-0d6f1dc5161eef894
safekeeper-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0e338adda8eb2d19f
safekeeper-2.ap-southeast-1.aws.neon.tech:
ansible_host: i-04fb63634e4679eb9


@@ -0,0 +1,35 @@
storage:
vars:
bucket_name: neon-prod-storage-eu-central-1
bucket_region: eu-central-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: eu-central-1
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
console_region_id: aws-eu-central-1
children:
pageservers:
hosts:
pageserver-0.eu-central-1.aws.neon.tech:
ansible_host: i-0cd8d316ecbb715be
pageserver-1.eu-central-1.aws.neon.tech:
ansible_host: i-090044ed3d383fef0
safekeepers:
hosts:
safekeeper-0.eu-central-1.aws.neon.tech:
ansible_host: i-0b238612d2318a050
safekeeper-1.eu-central-1.aws.neon.tech:
ansible_host: i-07b9c45e5c2637cd4
safekeeper-2.eu-central-1.aws.neon.tech:
ansible_host: i-020257302c3c93d88


@@ -0,0 +1,36 @@
storage:
vars:
bucket_name: neon-prod-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
console_region_id: aws-us-east-2
children:
pageservers:
hosts:
pageserver-0.us-east-2.aws.neon.tech:
ansible_host: i-062227ba7f119eb8c
pageserver-1.us-east-2.aws.neon.tech:
ansible_host: i-0b3ec0afab5968938
safekeepers:
hosts:
safekeeper-0.us-east-2.aws.neon.tech:
ansible_host: i-0e94224750c57d346
safekeeper-1.us-east-2.aws.neon.tech:
ansible_host: i-06d113fb73bfddeb0
safekeeper-2.us-east-2.aws.neon.tech:
ansible_host: i-09f66c8e04afff2e8


@@ -1,7 +1,6 @@
---
storage:
vars:
env_name: prod-1
console_mgmt_base_url: http://console-release.local
bucket_name: zenith-storage-oregon
bucket_region: us-west-2
@@ -12,20 +11,23 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: prod-1/wal
hostname_suffix: ".local"
remote_user: admin
children:
pageservers:
hosts:
zenith-1-ps-2:
console_region_id: 1
console_region_id: aws-us-west-2
zenith-1-ps-3:
console_region_id: 1
console_region_id: aws-us-west-2
safekeepers:
hosts:
zenith-1-sk-1:
console_region_id: 1
console_region_id: aws-us-west-2
zenith-1-sk-2:
console_region_id: 1
console_region_id: aws-us-west-2
zenith-1-sk-3:
console_region_id: 1
console_region_id: aws-us-west-2


@@ -12,18 +12,19 @@ cat <<EOF | tee /tmp/payload
"version": 1,
"host": "${HOST}",
"port": 6400,
"region_id": {{ console_region_id }},
"region_id": "{{ console_region_id }}",
"instance_id": "${INSTANCE_ID}",
"http_host": "${HOST}",
"http_port": 9898
"http_port": 9898,
"active": false
}
EOF
# check if pageserver already registered or not
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/pageservers/${INSTANCE_ID} -o /dev/null; then
if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/${INSTANCE_ID} -o /dev/null; then
# not registered, so register it now
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/pageservers -d@/tmp/payload | jq -r '.ID')
ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id')
# init pageserver
sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data


@@ -14,18 +14,18 @@ cat <<EOF | tee /tmp/payload
"host": "${HOST}",
"port": 6500,
"http_port": 7676,
"region_id": {{ console_region_id }},
"region_id": "{{ console_region_id }}",
"instance_id": "${INSTANCE_ID}",
"availability_zone_id": "${AZ_ID}"
"availability_zone_id": "${AZ_ID}",
"active": false
}
EOF
# check if safekeeper already registered or not
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/safekeepers/${INSTANCE_ID} -o /dev/null; then
if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/${INSTANCE_ID} -o /dev/null; then
# not registered, so register it now
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers -d@/tmp/payload | jq -r '.ID')
ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id')
# init safekeeper
sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data
fi

.github/ansible/ssm_config (new vendored file)

@@ -0,0 +1,2 @@
ansible_connection: aws_ssm
ansible_python_interpreter: /usr/bin/python3


@@ -3,7 +3,6 @@ storage:
bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local
env_name: us-stage
etcd_endpoints: zenith-us-stage-etcd.local:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
@@ -11,30 +10,25 @@ storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
safekeeper_s3_prefix: us-stage/wal
hostname_suffix: ".local"
remote_user: admin
children:
pageservers:
hosts:
zenith-us-stage-ps-2:
console_region_id: 27
console_region_id: aws-us-east-1
zenith-us-stage-ps-3:
console_region_id: 27
console_region_id: aws-us-east-1
zenith-us-stage-ps-4:
console_region_id: 27
zenith-us-stage-test-ps-1:
console_region_id: 28
console_region_id: aws-us-east-1
safekeepers:
hosts:
zenith-us-stage-sk-4:
console_region_id: 27
console_region_id: aws-us-east-1
zenith-us-stage-sk-5:
console_region_id: 27
console_region_id: aws-us-east-1
zenith-us-stage-sk-6:
console_region_id: 27
zenith-us-stage-test-sk-1:
console_region_id: 28
zenith-us-stage-test-sk-2:
console_region_id: 28
zenith-us-stage-test-sk-3:
console_region_id: 28
console_region_id: aws-us-east-1


@@ -0,0 +1,33 @@
storage:
vars:
bucket_name: neon-staging-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
console_region_id: aws-us-east-2
children:
pageservers:
hosts:
pageserver-0.us-east-2.aws.neon.build:
ansible_host: i-0c3e70929edb5d691
safekeepers:
hosts:
safekeeper-0.us-east-2.aws.neon.build:
ansible_host: i-027662bd552bf5db0
safekeeper-1.us-east-2.aws.neon.build:
ansible_host: i-0171efc3604a7b907
safekeeper-2.us-east-2.aws.neon.build:
ansible_host: i-0de0b03a51676a6ce


@@ -1,5 +1,5 @@
[Unit]
Description=Zenith pageserver
Description=Neon pageserver
After=network.target auditd.service
[Service]


@@ -1,12 +1,12 @@
[Unit]
Description=Zenith safekeeper
Description=Neon safekeeper
After=network.target auditd.service
[Service]
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT


@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.us-east-2.aws.neon.build"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: dev
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack


@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: ap-southeast-1
zenith_region_slug: ap-southeast-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack


@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: eu-central-1
zenith_region_slug: eu-central-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack


@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack


@@ -127,8 +127,8 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-
- name: Cache postgres v14 build
id: cache_pg_14
@@ -389,7 +389,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -481,6 +481,7 @@ jobs:
neon-image:
runs-on: dev
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
steps:
@@ -494,10 +495,11 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build neon
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
compute-tools-image:
runs-on: dev
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
steps:
@@ -508,11 +510,12 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute tools
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
compute-node-image:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -527,11 +530,12 @@ jobs:
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
- name: Kaniko build compute node with extensions v14 (compatibility)
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}
compute-node-image-v14:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -543,12 +547,13 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions v14
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
compute-node-image-v15:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
@@ -560,11 +565,11 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions v15
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
promote-images:
runs-on: dev
needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
@@ -577,8 +582,9 @@ jobs:
steps:
- name: Promote image to latest
run:
MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
run: |
export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text)
aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
push-docker-hub:
runs-on: dev
@@ -597,19 +603,19 @@ jobs:
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Pull neon image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon
- name: Pull compute tools image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
- name: Pull compute node image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node
- name: Pull compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
- name: Pull compute node v15 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest compute-node-v15
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
- name: Pull rust image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
@@ -619,11 +625,11 @@ jobs:
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
- name: Configure Docker Hub login
run: |
@@ -671,11 +677,11 @@ jobs:
- id: set-matrix
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
@@ -735,7 +741,87 @@ jobs:
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }}
rm -f neon_install.tar.gz .neon_current_version
deploy-new:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
@@ -779,3 +865,94 @@ jobs:
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-new:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
promote-compatibility-test-snapshot:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ deploy, deploy-proxy ]
if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
steps:
- name: Promote compatibility snapshot for the release
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/latest
run: |
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done


@@ -36,7 +36,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 2
@@ -106,7 +106,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
- name: Run cargo clippy
run: ./run_clippy.sh

Cargo.lock generated

@@ -40,8 +40,7 @@ dependencies = [
[[package]]
name = "amplify_num"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f27d3d00d3d115395a7a8a4dc045feb7aa82b641e485f7e15f4e67ac16f4f56d"
source = "git+https://github.com/hlinnaka/rust-amplify.git?branch=unsigned-int-perf#bd49b737c2e6e623ab8e9ba5ceaed5712d3a3940"
[[package]]
name = "android_system_properties"
@@ -75,9 +74,9 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
[[package]]
name = "asn1-rs"
version = "0.3.1"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30ff05a702273012438132f449575dbc804e27b2f3cbe3069aa237d26c98fa33"
checksum = "cf6690c370453db30743b373a60ba498fc0d6d83b11f4abfd87a84a075db5dd4"
dependencies = [
"asn1-rs-derive",
"asn1-rs-impl",
@@ -91,9 +90,9 @@ dependencies = [
[[package]]
name = "asn1-rs-derive"
version = "0.1.0"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db8b7511298d5b7784b40b092d9e9dcd3a627a5707e4b5e507931ab0d44eeebf"
checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c"
dependencies = [
"proc-macro2",
"quote",
@@ -262,15 +261,13 @@ dependencies = [
[[package]]
name = "bindgen"
version = "0.60.1"
version = "0.61.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6"
checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a"
dependencies = [
"bitflags",
"cexpr",
"clang-sys",
"clap",
"env_logger",
"lazy_static",
"lazycell",
"log",
@@ -280,6 +277,7 @@ dependencies = [
"regex",
"rustc-hash",
"shlex",
"syn",
"which",
]
@@ -319,21 +317,16 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boxfnonce"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
[[package]]
name = "bstr"
version = "0.2.17"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
checksum = "fca0852af221f458706eb0725c03e4ed6c46af9ac98e6a689d5e634215d594dd"
dependencies = [
"lazy_static",
"memchr",
"once_cell",
"regex-automata",
"serde",
]
[[package]]
@@ -450,28 +443,23 @@ version = "3.2.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750"
dependencies = [
"atty",
"bitflags",
"clap_derive",
"clap_lex",
"clap_lex 0.2.4",
"indexmap",
"once_cell",
"strsim",
"termcolor",
"textwrap",
]
[[package]]
name = "clap_derive"
version = "3.2.18"
name = "clap"
version = "4.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65"
checksum = "6bf8832993da70a4c6d13c581f4463c2bdda27b9bf1c5498dc4365543abe6d6f"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
"atty",
"bitflags",
"clap_lex 0.3.0",
"strsim",
"termcolor",
]
[[package]]
@@ -483,6 +471,15 @@ dependencies = [
"os_str_bytes",
]
[[package]]
name = "clap_lex"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8"
dependencies = [
"os_str_bytes",
]
[[package]]
name = "close_fds"
version = "0.3.2"
@@ -540,7 +537,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"clap",
"clap 4.0.15",
"env_logger",
"futures",
"hyper",
@@ -582,7 +579,7 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"clap 4.0.15",
"comfy-table",
"git-version",
"nix 0.25.0",
@@ -597,6 +594,7 @@ dependencies = [
"tar",
"thiserror",
"toml",
"url",
"utils",
"workspace_hack",
]
@@ -675,7 +673,7 @@ dependencies = [
"atty",
"cast",
"ciborium",
"clap",
"clap 3.2.22",
"criterion-plot",
"itertools",
"lazy_static",
@@ -743,7 +741,7 @@ dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"memoffset",
"memoffset 0.6.5",
"scopeguard",
]
@@ -846,16 +844,6 @@ dependencies = [
"syn",
]
[[package]]
name = "daemonize"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
dependencies = [
"boxfnonce",
"libc",
]
[[package]]
name = "darling"
version = "0.14.1"
@@ -903,14 +891,14 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730"
dependencies = [
"uuid",
"uuid 0.8.2",
]
[[package]]
name = "der-parser"
version = "7.0.0"
version = "8.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe398ac75057914d7d07307bf67dc7f3f574a26783b4fc7805a20ffa9f506e82"
checksum = "42d4bc9b0db0a0df9ae64634ac5bdefb7afcb534e182275ca0beadbe486701c1"
dependencies = [
"asn1-rs",
"displaydoc",
@@ -1017,11 +1005,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fb8664f6ea68aba5503d42dd1be786b0f1bd9b7972e7f40208c83ef74db91bf"
dependencies = [
"http",
"prost 0.10.4",
"prost",
"tokio",
"tokio-stream",
"tonic 0.7.2",
"tonic-build 0.7.2",
"tonic",
"tonic-build",
"tower",
"tower-service",
]
@@ -1219,6 +1207,12 @@ version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1"
[[package]]
name = "futures-timer"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
[[package]]
name = "futures-util"
version = "0.3.24"
@@ -1805,6 +1799,15 @@ dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
dependencies = [
"autocfg",
]
[[package]]
name = "metrics"
version = "0.1.0"
@@ -1887,22 +1890,6 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae"
[[package]]
name = "neon_broker"
version = "0.1.0"
dependencies = [
"async-stream",
"clap",
"futures-core",
"futures-util",
"prost 0.11.0",
"tokio",
"tokio-stream",
"tonic 0.8.2",
"tonic-build 0.8.2",
"utils",
]
[[package]]
name = "nix"
version = "0.23.1"
@@ -1913,7 +1900,7 @@ dependencies = [
"cc",
"cfg-if",
"libc",
"memoffset",
"memoffset 0.6.5",
]
[[package]]
@@ -1926,7 +1913,7 @@ dependencies = [
"bitflags",
"cfg-if",
"libc",
"memoffset",
"memoffset 0.6.5",
"pin-utils",
]
@@ -2039,9 +2026,9 @@ dependencies = [
[[package]]
name = "oid-registry"
version = "0.4.0"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38e20717fa0541f39bd146692035c37bedfa532b3e5071b35761082407546b2a"
checksum = "7d4bda43fd1b844cbc6e6e54b5444e2b1bc7838bce59ad205902cccbb26d6761"
dependencies = [
"asn1-rs",
]
@@ -2132,12 +2119,12 @@ dependencies = [
"byteorder",
"bytes",
"chrono",
"clap",
"clap 4.0.15",
"close_fds",
"const_format",
"crc32c",
"criterion",
"crossbeam-utils",
"daemonize",
"etcd_broker",
"fail",
"futures",
@@ -2167,6 +2154,7 @@ dependencies = [
"serde_json",
"serde_with",
"signal-hook",
"svg_fmt",
"tar",
"tempfile",
"thiserror",
@@ -2185,7 +2173,10 @@ dependencies = [
name = "pageserver_api"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"const_format",
"postgres_ffi",
"serde",
"serde_with",
"utils",
@@ -2408,7 +2399,7 @@ dependencies = [
"env_logger",
"hex",
"log",
"memoffset",
"memoffset 0.7.1",
"once_cell",
"postgres",
"rand",
@@ -2455,30 +2446,6 @@ dependencies = [
"syn",
]
[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2",
"quote",
"version_check",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.19"
@@ -2530,17 +2497,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71adf41db68aa0daaefc69bb30bcd68ded9b9abaad5d1fbb6304c4fb390e083e"
dependencies = [
"bytes",
"prost-derive 0.10.1",
]
[[package]]
name = "prost"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "399c3c31cdec40583bb68f0b18403400d01ec4289c383aa047560439952c4dd7"
dependencies = [
"bytes",
"prost-derive 0.11.0",
"prost-derive",
]
[[package]]
@@ -2558,28 +2515,8 @@ dependencies = [
"log",
"multimap",
"petgraph",
"prost 0.10.4",
"prost-types 0.10.1",
"regex",
"tempfile",
"which",
]
[[package]]
name = "prost-build"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f835c582e6bd972ba8347313300219fed5bfa52caf175298d860b61ff6069bb"
dependencies = [
"bytes",
"heck",
"itertools",
"lazy_static",
"log",
"multimap",
"petgraph",
"prost 0.11.0",
"prost-types 0.11.1",
"prost",
"prost-types",
"regex",
"tempfile",
"which",
@@ -2598,19 +2535,6 @@ dependencies = [
"syn",
]
[[package]]
name = "prost-derive"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7345d5f0e08c0536d7ac7229952590239e77abf0a0100a1b1d890add6ea96364"
dependencies = [
"anyhow",
"itertools",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "prost-types"
version = "0.10.1"
@@ -2618,17 +2542,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68"
dependencies = [
"bytes",
"prost 0.10.4",
]
[[package]]
name = "prost-types"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dfaa718ad76a44b3415e6c4d53b17c8f99160dcb3a99b10470fce8ad43f6e3e"
dependencies = [
"bytes",
"prost 0.11.0",
"prost",
]
[[package]]
@@ -2641,7 +2555,7 @@ dependencies = [
"base64",
"bstr",
"bytes",
"clap",
"clap 4.0.15",
"futures",
"git-version",
"hashbrown",
@@ -2675,7 +2589,7 @@ dependencies = [
"tracing-subscriber",
"url",
"utils",
"uuid",
"uuid 1.2.1",
"workspace_hack",
"x509-parser",
]
@@ -2764,13 +2678,13 @@ dependencies = [
[[package]]
name = "rcgen"
version = "0.8.14"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5911d1403f4143c9d56a702069d593e8d0f3fab880a85e103604d0893ea31ba7"
checksum = "ffbe84efe2f38dea12e9bfc1f65377fdf03e53a18cb3b995faedf7934c7e785b"
dependencies = [
"chrono",
"pem",
"ring",
"time 0.3.15",
"yasna",
]
@@ -2960,9 +2874,21 @@ dependencies = [
[[package]]
name = "rstest"
version = "0.12.0"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d912f35156a3f99a66ee3e11ac2e0b3f34ac85a07e05263d05a7e2c8810d616f"
checksum = "e9c9dc66cc29792b663ffb5269be669f1613664e69ad56441fdb895c2347b930"
dependencies = [
"futures",
"futures-timer",
"rstest_macros",
"rustc_version 0.4.0",
]
[[package]]
name = "rstest_macros"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5015e68a0685a95ade3eee617ff7101ab6a3fc689203101ca16ebc16f2b89c66"
dependencies = [
"cfg-if",
"proc-macro2",
@@ -3142,10 +3068,9 @@ dependencies = [
"async-trait",
"byteorder",
"bytes",
"clap",
"clap 4.0.15",
"const_format",
"crc32c",
"daemonize",
"etcd_broker",
"fs2",
"git-version",
@@ -3153,6 +3078,7 @@ dependencies = [
"humantime",
"hyper",
"metrics",
"nix 0.25.0",
"once_cell",
"parking_lot 0.12.1",
"postgres",
@@ -3523,6 +3449,12 @@ version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
[[package]]
name = "svg_fmt"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
[[package]]
name = "symbolic-common"
version = "8.8.0"
@@ -3532,7 +3464,7 @@ dependencies = [
"debugid",
"memmap2",
"stable_deref_trait",
"uuid",
"uuid 0.8.2",
]
[[package]]
@@ -3863,40 +3795,8 @@ dependencies = [
"hyper-timeout",
"percent-encoding",
"pin-project",
"prost 0.10.4",
"prost-derive 0.10.1",
"tokio",
"tokio-stream",
"tokio-util",
"tower",
"tower-layer",
"tower-service",
"tracing",
"tracing-futures",
]
[[package]]
name = "tonic"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55b9af819e54b8f33d453655bef9b9acc171568fb49523078d0cc4e7484200ec"
dependencies = [
"async-stream",
"async-trait",
"axum",
"base64",
"bytes",
"futures-core",
"futures-util",
"h2",
"http",
"http-body",
"hyper",
"hyper-timeout",
"percent-encoding",
"pin-project",
"prost 0.11.0",
"prost-derive 0.11.0",
"prost",
"prost-derive",
"tokio",
"tokio-stream",
"tokio-util",
@@ -3915,20 +3815,7 @@ checksum = "d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1"
dependencies = [
"prettyplease",
"proc-macro2",
"prost-build 0.10.4",
"quote",
"syn",
]
[[package]]
name = "tonic-build"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c6fd7c2581e36d63388a9e04c350c21beb7a8b059580b2e93993c526899ddc"
dependencies = [
"prettyplease",
"proc-macro2",
"prost-build 0.11.1",
"prost-build",
"quote",
"syn",
]
@@ -4039,6 +3926,16 @@ dependencies = [
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.16"
@@ -4049,12 +3946,15 @@ dependencies = [
"nu-ansi-term",
"once_cell",
"regex",
"serde",
"serde_json",
"sharded-slab",
"smallvec",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]]
@@ -4149,6 +4049,8 @@ dependencies = [
"serde_json",
"serde_with",
"signal-hook",
"strum",
"strum_macros",
"tempfile",
"thiserror",
"tokio",
@@ -4163,6 +4065,12 @@ name = "uuid"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
[[package]]
name = "uuid"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83"
dependencies = [
"getrandom",
"serde",
@@ -4212,7 +4120,7 @@ name = "wal_craft"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"clap 4.0.15",
"env_logger",
"log",
"once_cell",
@@ -4452,7 +4360,7 @@ dependencies = [
"anyhow",
"bytes",
"chrono",
"clap",
"clap 4.0.15",
"crossbeam-utils",
"either",
"fail",
@@ -4460,11 +4368,12 @@ dependencies = [
"indexmap",
"libc",
"log",
"memchr",
"nom",
"num-bigint",
"num-integer",
"num-traits",
"prost 0.10.4",
"prost",
"rand",
"regex",
"regex-syntax",
@@ -4477,14 +4386,13 @@ dependencies = [
"tokio-util",
"tracing",
"tracing-core",
"uuid",
]
[[package]]
name = "x509-parser"
version = "0.13.2"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fb9bace5b5589ffead1afb76e43e34cff39cd0f3ce7e170ae0c29e53b88eb1c"
checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8"
dependencies = [
"asn1-rs",
"base64",
@@ -4515,11 +4423,11 @@ checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3"
[[package]]
name = "yasna"
version = "0.4.0"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e262a29d0e61ccf2b6190d7050d4b237535fc76ce4c1210d9caa316f71dffa75"
checksum = "346d34a236c9d3e5f3b9b74563f238f955bbd05fa0b8b4efa53c130c43982f4c"
dependencies = [
"chrono",
"time 0.3.15",
]
[[package]]


@@ -11,7 +11,6 @@ cargo-features = ["named-profiles"]
[workspace]
members = [
"broker",
"compute_tools",
"control_plane",
"pageserver",


@@ -44,7 +44,7 @@ COPY . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \
&& cachepot -s
# Build final image
@@ -65,6 +65,7 @@ RUN set -e \
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin


@@ -1,50 +1,50 @@
ARG TAG=pinned
# apparently, ARGs don't get replaced in RUN commands in kaniko
# ARG POSTGIS_VERSION=3.3.0
# ARG PLV8_VERSION=3.1.4
# ARG PG_VERSION=v14
#
# This file is identical to the Dockerfile.compute-node-v15 file
# except for the version of Postgres that is built.
#
ARG TAG=pinned
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
#########################################################################################
#
# Layer "pg-build"
# Build Postgres from the neon postgres repository.
#
#########################################################################################
FROM build-deps AS pg-build
COPY vendor/postgres-v14 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
#########################################################################################
#
# Layer "postgis-build"
# Build PostGIS from the upstream PostGIS mirror.
#
# PostGIS compiles against neon postgres sources without changes. Perhaps we
# could even use the upstream binaries, compiled against vanilla Postgres, but
# it would require some investigation to check that it works, and also keeps
# working in the future. So for now, we compile our own binaries.
#########################################################################################
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
tar xvzf postgis-3.3.0.tar.gz && \
cd postgis-3.3.0 && \
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
tar xvzf postgis-3.3.1.tar.gz && \
cd postgis-3.3.1 && \
./autogen.sh && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
./configure && \
@@ -57,39 +57,55 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
#########################################################################################
#
# Layer "plv8-build"
# Build plv8
#
#########################################################################################
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN apt update && \
apt install -y --no-install-recommends -t testing binutils
# https://github.com/plv8/plv8/issues/475:
# v8 uses gold for linking and sets `--thread-count=4` which breaks
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
# Install a newer gold version manually, as the debian-testing binutils package updates
# the libc version, which in turn breaks other extensions built against the non-testing libc.
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
tar xvzf binutils-2.38.tar.gz && \
cd binutils-2.38 && \
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
cd ../bfd && ./configure && make bfdver.h && \
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
cp /usr/local/bin/ld.gold /usr/bin/gold
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
tar xvzf v3.1.4.tar.gz && \
cd plv8-3.1.4 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
#########################################################################################
#
# Layer "h3-pg-build"
# Build h3_pg
#
#########################################################################################
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# packaged cmake is too old
RUN apt update && \
apt install -y --no-install-recommends -t testing cmake
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
-q -O /tmp/cmake-install.sh \
&& chmod u+x /tmp/cmake-install.sh \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
@@ -108,16 +124,18 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
# plv8 still sometimes crashes during the creation
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY pgxn/ pgxn/
@@ -127,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
-C pgxn/neon \
-s install
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
@@ -154,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -174,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libreadline8 for psql
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
# GLIBC 2.34 for plv8.
# Debian bullseye provides GLIBC 2.31, so we install the library from testing
#
# Lastly, link compute_ctl into zenith_ctl while we're at it,
# so that we don't need to put this in another layer.
@@ -188,12 +212,6 @@ RUN apt update && \
libproj19 \
libprotobuf-c1 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
echo "Installing GLIBC 2.34" && \
echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
apt install -y --no-install-recommends -t testing libc6 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
USER postgres


@@ -4,44 +4,39 @@
#
ARG TAG=pinned
# apparently, ARGs don't get replaced in RUN commands in kaniko
# ARG POSTGIS_VERSION=3.3.1
# ARG PLV8_VERSION=3.1.4
# ARG PG_VERSION=v15
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
#########################################################################################
#
# Layer "pg-build"
# Build Postgres from the neon postgres repository.
#
#########################################################################################
FROM build-deps AS pg-build
COPY vendor/postgres-v15 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
#########################################################################################
#
# Layer "postgis-build"
# Build PostGIS from the upstream PostGIS mirror.
#
# PostGIS compiles against neon postgres sources without changes. Perhaps we
# could even use the upstream binaries, compiled against vanilla Postgres, but
# it would require some investigation to check that it works, and also keeps
# working in the future. So for now, we compile our own binaries.
#########################################################################################
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
@@ -62,39 +57,55 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
#########################################################################################
#
# Layer "plv8-build"
# Build plv8
#
#########################################################################################
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN apt update && \
apt install -y --no-install-recommends -t testing binutils
# https://github.com/plv8/plv8/issues/475:
# v8 uses gold for linking and sets `--thread-count=4` which breaks
# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
# Install a newer gold version manually, as the debian-testing binutils package updates
# the libc version, which in turn breaks other extensions built against the non-testing libc.
RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
tar xvzf binutils-2.38.tar.gz && \
cd binutils-2.38 && \
cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
cd ../bfd && ./configure && make bfdver.h && \
cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
cp /usr/local/bin/ld.gold /usr/bin/gold
# Sed is used to patch for https://github.com/plv8/plv8/issues/503
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
tar xvzf v3.1.4.tar.gz && \
cd plv8-3.1.4 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
#########################################################################################
#
# Layer "h3-pg-build"
# Build h3_pg
#
#########################################################################################
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# packaged cmake is too old
RUN apt update && \
apt install -y --no-install-recommends -t testing cmake
RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
-q -O /tmp/cmake-install.sh \
&& chmod u+x /tmp/cmake-install.sh \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
@@ -113,16 +124,18 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
# plv8 still sometimes crashes during the creation
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY pgxn/ pgxn/
@@ -132,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
-C pgxn/neon \
-s install
#########################################################################################
#
# Compile and run the Neon-specific `compute_ctl` binary
#
#########################################################################################
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
#########################################################################################
#
# Clean up postgres folder before inclusion
#
#########################################################################################
FROM neon-pg-ext-build AS postgres-cleanup-layer
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
@@ -159,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Final layer
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -179,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libreadline8 for psql
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
# GLIBC 2.34 for plv8.
# Debian bullseye provides GLIBC 2.31, so we install the library from testing
#
# Lastly, link compute_ctl into zenith_ctl while we're at it,
# so that we don't need to put this in another layer.
@@ -193,12 +212,6 @@ RUN apt update && \
libproj19 \
libprotobuf-c1 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
echo "Installing GLIBC 2.34" && \
echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
apt install -y --no-install-recommends -t testing libc6 && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
USER postgres


@@ -151,6 +151,11 @@ neon-pg-ext-v14: postgres-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v14"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v14
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
@@ -163,6 +168,11 @@ neon-pg-ext-v15: postgres-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v15"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v15
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \


@@ -223,10 +223,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
```sh
git clone --recursive https://github.com/neondatabase/neon.git
# either:
CARGO_BUILD_FLAGS="--features=testing" make
# or:
make debug
./scripts/pytest
```

broker/Cargo.lock generated

File diff suppressed because it is too large

@@ -1,35 +0,0 @@
[package]
name = "neon_broker"
version = "0.1.0"
edition = "2021"
[features]
bench = []
[[bin]]
name = "neon_broker"
path = "src/broker.rs"
[[bin]]
name = "neon_broker_bench"
path = "src/bench.rs"
# build benchmarking binary only if explicitly requested with '--feature bench'
# required-features = ["bench"]
[dependencies]
async-stream = "0.3"
futures-core = "0.3"
futures-util = "0.3"
tonic = "0.8"
prost = "0.11"
tokio = { version = "1.0", features = ["macros", "rt-multi-thread"] }
# for exploring with tokio-console
# tokio = { version = "1", features = ["full", "tracing"] }
# console-subscriber = "0.1.8"
tokio-stream = "0.1"
clap = { version = "3.2.17", features = ["derive"] }
utils = { path = "../libs/utils" }
[build-dependencies]
tonic-build = "0.8"


@@ -1,4 +0,0 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
tonic_build::compile_protos("proto/broker.proto")?;
Ok(())
}


@@ -1,38 +0,0 @@
syntax = "proto3";
package neon_broker;
service NeonBroker {
// Subscribe to safekeeper updates.
rpc SubscribeSafekeeperInfo(SubscribeSafekeeperInfoRequest) returns (stream SafekeeperTimelineInfo) {};
// Publish safekeeper updates.
rpc PublishSafekeeperInfo(stream SafekeeperTimelineInfo) returns (Empty) {};
}
message SubscribeSafekeeperInfoRequest {
oneof subscription_key {
Empty all = 1; // subscribe to everything
TenantTimelineId tenant_timeline_id = 2; // subscribe to specific timeline
}
}
message SafekeeperTimelineInfo {
uint64 safekeeper_id = 1;
TenantTimelineId tenant_timeline_id = 2;
uint64 last_log_term = 3;
uint64 flush_lsn = 4;
uint64 commit_lsn = 5;
uint64 backup_lsn = 6;
uint64 remote_consistent_lsn = 7;
uint64 peer_horizon_lsn = 8;
string safekeeper_connstr = 9;
}
message TenantTimelineId {
bytes tenant_id = 1;
bytes timeline_id = 2;
}
message Empty {
}


@@ -1,4 +0,0 @@
```
cargo build -r -p neon_broker --features bench && target/release/neon_broker
target/release/neon_broker_bench -s 1 -p 1
```


@@ -1,179 +0,0 @@
pub mod neon_broker {
tonic::include_proto!("neon_broker");
}
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
use clap::Parser;
use neon_broker::neon_broker_client::NeonBrokerClient;
use neon_broker::subscribe_safekeeper_info_request::SubscriptionKey;
use neon_broker::TenantTimelineId as ProtoTenantTimelineId;
use neon_broker::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
use tokio::time::{self, sleep};
use tonic::transport::Channel;
use tonic::Request;
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
/// Number of publishers
#[clap(short = 'p', long, value_parser, default_value_t = 1)]
num_pubs: u64,
/// Number of subscribers
#[clap(short = 's', long, value_parser, default_value_t = 1)]
num_subs: u64,
}
async fn progress_reporter(counters: Vec<Arc<AtomicU64>>) {
let mut interval = time::interval(Duration::from_millis(1000));
let mut c_old = counters.iter().map(|c| c.load(Ordering::Relaxed)).sum();
let mut c_min_old = counters
.iter()
.map(|c| c.load(Ordering::Relaxed))
.min()
.unwrap_or(0);
let mut started_at = None;
let mut skipped: u64 = 0;
loop {
interval.tick().await;
// print!(
// "cnts are {:?}",
// counters
// .iter()
// .map(|c| c.load(Ordering::Relaxed))
// .collect::<Vec<_>>()
// );
let c_new = counters.iter().map(|c| c.load(Ordering::Relaxed)).sum();
let c_min_new = counters
.iter()
.map(|c| c.load(Ordering::Relaxed))
.min()
.unwrap_or(0);
if c_new > 0 && started_at.is_none() {
started_at = Some(Instant::now());
skipped = c_new;
}
let avg_rps = started_at.map(|s| {
let dur = s.elapsed();
let dur_secs = dur.as_secs() as f64 + (dur.subsec_millis() as f64) / 1000.0;
let avg_rps = (c_new - skipped) as f64 / dur_secs;
(dur, avg_rps)
});
println!(
"sum rps {}, min rps {} total {}, total min {}, duration, avg sum rps {:?}",
c_new - c_old,
c_min_new - c_min_old,
c_new,
c_min_new,
avg_rps
);
c_old = c_new;
c_min_old = c_min_new;
}
}
fn tli_from_u64(i: u64) -> Vec<u8> {
let mut timeline_id = vec![0xFF; 8];
timeline_id.extend_from_slice(&i.to_be_bytes());
timeline_id
}
async fn subscribe(client: Option<NeonBrokerClient<Channel>>, counter: Arc<AtomicU64>, i: u64) {
let mut client = match client {
Some(c) => c,
None => NeonBrokerClient::connect("http://[::1]:50051")
.await
.unwrap(),
};
// let key = SubscriptionKey::All(Empty {});
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
tenant_id: vec![0xFF; 16],
timeline_id: tli_from_u64(i),
});
let request = SubscribeSafekeeperInfoRequest {
subscription_key: Some(key),
};
let mut stream = client
.subscribe_safekeeper_info(request)
.await
.unwrap()
.into_inner();
while let Some(_feature) = stream.message().await.unwrap() {
counter.fetch_add(1, Ordering::Relaxed);
// println!("info = {:?}, client {}", _feature, i);
}
}
async fn publish(client: Option<NeonBrokerClient<Channel>>, n_keys: u64) {
let mut client = match client {
Some(c) => c,
None => NeonBrokerClient::connect("http://[::1]:50051")
.await
.unwrap(),
};
let mut counter: u64 = 0;
// create stream producing new values
let outbound = async_stream::stream! {
loop {
let info = SafekeeperTimelineInfo {
safekeeper_id: 1,
tenant_timeline_id: Some(ProtoTenantTimelineId {
tenant_id: vec![0xFF; 16],
timeline_id: tli_from_u64(counter % n_keys),
}),
last_log_term: 0,
flush_lsn: counter,
commit_lsn: 2,
backup_lsn: 3,
remote_consistent_lsn: 4,
peer_horizon_lsn: 5,
safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
};
counter += 1;
// println!("sending info = {:?}", info);
// if counter >= 1000 {
// break;
// }
yield info;
// sleep(Duration::from_millis(100)).await;
}
};
let _response = client
.publish_safekeeper_info(Request::new(outbound))
.await
.unwrap();
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();
let mut counters = Vec::with_capacity(args.num_subs as usize);
for _ in 0..args.num_subs {
counters.push(Arc::new(AtomicU64::new(0)));
}
let h = tokio::spawn(progress_reporter(counters.clone()));
let c = NeonBrokerClient::connect("http://[::1]:50051")
.await
.unwrap();
for i in 0..args.num_subs {
let c = Some(c.clone());
// let c = None;
tokio::spawn(subscribe(c, counters[i as usize].clone(), i));
}
for _i in 0..args.num_pubs {
// let c = Some(c.clone());
tokio::spawn(publish(None, args.num_subs as u64));
}
h.await?;
Ok(())
}


@@ -1,526 +0,0 @@
//! Simple pub-sub based on grpc (tonic) and Tokio mpsc for storage nodes
//! messaging. The main design goal is to avoid central synchronization during
//! the normal flow, resorting to it only when a pub/sub change happens. Each
//! subscriber holds the rx end of an mpsc channel for its messages; the tx end is
//! sent to existing publishers and saved in shared state for new ones. Publishers
//! maintain locally the set of subscribers they stream messages to.
//!
//! Subscriptions to 1) a single timeline 2) everything are possible. We could add
//! subscription to a set of timelines to save grpc streams, but testing shows that
//! many individual streams are also ok.
//!
//! A message is dropped if its subscriber can't consume it, without affecting other
//! subscribers.
//!
//! Only the safekeeper message is supported, but it is not hard to add something
//! else with templating.
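// A minimal sketch of the drop-on-full fan-out described above, assuming
// tokio's bounded mpsc channels (illustrative only; `fan_out_sketch` is a
// hypothetical helper, not part of this module's API): try_send never blocks,
// a Full error just drops the message for that slow subscriber, and a Closed
// error means the subscriber is gone and can be pruned later.
fn fan_out_sketch<T: Clone>(subs: &[tokio::sync::mpsc::Sender<T>], msg: &T) {
    for tx in subs {
        match tx.try_send(msg.clone()) {
            Ok(()) => {}
            Err(tokio::sync::mpsc::error::TrySendError::Full(_)) => {
                // Queue is full: drop the message rather than block the publisher.
            }
            Err(tokio::sync::mpsc::error::TrySendError::Closed(_)) => {
                // Subscriber has disconnected; the caller cleans up closed channels.
            }
        }
    }
}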
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::fmt;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::Duration;
use futures_core::Stream;
use futures_util::StreamExt;
use tokio::sync::mpsc::error::TrySendError;
use tokio::sync::mpsc::{self, Receiver, Sender};
use tokio::{select, time};
use tokio_stream::wrappers::ReceiverStream;
use tonic::Code;
use tonic::{transport::Server, Request, Response, Status};
use neon_broker_proto::neon_broker_server::{NeonBroker, NeonBrokerServer};
use neon_broker_proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
use neon_broker_proto::TenantTimelineId as ProtoTenantTimelineId;
use neon_broker_proto::{Empty, SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
pub mod neon_broker_proto {
// The string specified here must match the proto package name.
// If you want to have a look at the generated code, it is at path similar to
// target/debug/build/neon_broker-0fde81d03bedc3b2/out/neon_broker.rs
tonic::include_proto!("neon_broker");
}
// Max size of the queue to the subscriber.
const CHAN_SIZE: usize = 256;
type PubId = u64; // id of publisher for registering in maps
type SubId = u64; // id of subscriber for registering in maps
#[derive(Copy, Clone)]
enum SubscriptionKey {
All,
Timeline(TenantTimelineId),
}
impl SubscriptionKey {
// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result<Self, Status> {
match key {
ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All),
ProtoSubscriptionKey::TenantTimelineId(proto_ttid) => {
Ok(SubscriptionKey::Timeline(parse_proto_ttid(&proto_ttid)?))
}
}
}
}
// Subscriber id + tx end of the channel for messages to it.
#[derive(Clone)]
struct SubSender(SubId, Sender<SafekeeperTimelineInfo>);
impl fmt::Debug for SubSender {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Subscription id {}", self.0)
}
}
// Announcements subscriber sends to publisher(s) asking it to stream to the
// provided channel, or forget about it, releasing memory.
#[derive(Clone)]
enum SubAnnounce {
AddAll(Sender<SafekeeperTimelineInfo>), // add subscription to all timelines
AddTimeline(TenantTimelineId, SubSender), // add subscription to the specific timeline
RemoveTimeline(TenantTimelineId, SubId), // remove subscription to the specific timeline
// RemoveAll is not needed as publisher will notice closed channel while
// trying to send the next message.
}
struct SharedState {
// Registered publishers. They sit on the rx end of these channels and
// receive through it tx handles of chans to subscribers.
//
// Note: publishers don't identify which keys they publish, so each
// publisher will receive channels to all subs and filter them before sending.
pub_txs: HashMap<PubId, Sender<SubAnnounce>>,
next_pub_id: PubId,
// Registered subscribers -- when publisher joins it walks over them,
// collecting txs to send messages.
subs_to_all: HashMap<SubId, Sender<SafekeeperTimelineInfo>>,
subs_to_timelines: HashMap<TenantTimelineId, Vec<SubSender>>,
next_sub_id: SubId,
}
// Utility func to remove subscription from the map
fn remove_sub(
subs_to_timelines: &mut HashMap<TenantTimelineId, Vec<SubSender>>,
ttid: &TenantTimelineId,
sub_id: SubId,
) {
if let Some(subsenders) = subs_to_timelines.get_mut(&ttid) {
subsenders.retain(|ss| ss.0 != sub_id);
if subsenders.len() == 0 {
subs_to_timelines.remove(&ttid);
}
}
// Note that the subscription might not be here if the subscriber task was aborted
// before it managed to notify the publisher about itself.
}
impl SharedState {
// Register new publisher.
pub fn register_publisher(&mut self, announce_tx: Sender<SubAnnounce>) -> PubId {
let pub_id = self.next_pub_id;
self.next_pub_id += 1;
assert!(!self.pub_txs.contains_key(&pub_id));
self.pub_txs.insert(pub_id, announce_tx);
pub_id
}
pub fn unregister_publisher(&mut self, pub_id: PubId) {
assert!(self.pub_txs.contains_key(&pub_id));
self.pub_txs.remove(&pub_id);
}
// Register new subscriber.
// Returns list of channels through which existing publishers must be notified
// about new subscriber; we can't do it here due to risk of deadlock.
pub fn register_subscriber(
&mut self,
sub_key: SubscriptionKey,
sub_tx: Sender<SafekeeperTimelineInfo>,
) -> (SubId, Vec<Sender<SubAnnounce>>, SubAnnounce) {
let sub_id = self.next_sub_id;
self.next_sub_id += 1;
let announce = match sub_key {
SubscriptionKey::All => {
assert!(!self.subs_to_all.contains_key(&sub_id));
self.subs_to_all.insert(sub_id, sub_tx.clone());
SubAnnounce::AddAll(sub_tx)
}
SubscriptionKey::Timeline(ttid) => {
match self.subs_to_timelines.entry(ttid) {
Entry::Occupied(mut o) => {
let subsenders = o.get_mut();
subsenders.push(SubSender(sub_id, sub_tx.clone()));
}
Entry::Vacant(v) => {
v.insert(vec![SubSender(sub_id, sub_tx.clone())]);
}
}
SubAnnounce::AddTimeline(ttid, SubSender(sub_id, sub_tx))
}
};
// Collect existing publishers to notify them after lock is released;
// TODO: the probability of channels being full here is tiny (publisher
// always blocks listening chan), we can try sending first and resort to
// cloning if needed.
//
// Deadlock is possible only if publisher tries to access shared state
// during its lifetime, i.e. if we add maintenance of the set of published
// tlis. Otherwise we can just await here (but the lock must be replaced
// with Tokio one).
//
// We could also just error out if some chan is full, but that needs
// cleanup of the incomplete job, and notifying publishers when unregistering
// is mandatory anyway.
(sub_id, self.pub_txs.values().cloned().collect(), announce)
}
// Unregister the subscriber. Similar to register_subscriber, returns list
// of channels through which publishers must be notified about the removal.
pub fn unregister_subscriber(
&mut self,
sub_id: SubId,
sub_key: SubscriptionKey,
) -> Option<(Vec<Sender<SubAnnounce>>, SubAnnounce)> {
// We need to notify existing publishers only about per timeline
// subscriptions, 'all' kind is detected on its own through closed
// channels.
let announce = match sub_key {
SubscriptionKey::All => {
assert!(self.subs_to_all.contains_key(&sub_id));
self.subs_to_all.remove(&sub_id);
None
}
SubscriptionKey::Timeline(ref ttid) => {
remove_sub(&mut self.subs_to_timelines, ttid, sub_id);
Some(SubAnnounce::RemoveTimeline(*ttid, sub_id))
}
};
announce.map(|a| (self.pub_txs.values().cloned().collect(), a))
}
pub fn report(&mut self) {
println!(
"registered {} publishers, {} subs to all, {} subs to timelines",
self.pub_txs.len(),
self.subs_to_all.len(),
self.subs_to_timelines.len(),
);
}
}
// SharedState wrapper for post-locking operations (sending to pub_tx chans).
#[derive(Clone)]
struct Registry {
shared_state: Arc<Mutex<SharedState>>,
}
impl Registry {
// Register new publisher in shared state.
pub fn register_publisher(&self) -> Publisher {
let (announce_tx, announce_rx) = mpsc::channel(128);
let mut ss = self.shared_state.lock().unwrap();
let id = ss.register_publisher(announce_tx);
let (subs_to_all, subs_to_timelines) = (
ss.subs_to_all.values().cloned().collect(),
ss.subs_to_timelines.clone(),
);
drop(ss);
// println!("registered publisher {}", id);
Publisher {
id,
announce_rx: announce_rx.into(),
subs_to_all,
subs_to_timelines,
registry: self.clone(),
}
}
pub fn unregister_publisher(&self, publisher: &Publisher) {
self.shared_state
.lock()
.unwrap()
.unregister_publisher(publisher.id);
// println!("unregistered publisher {}", publisher.id);
}
// Register new subscriber in shared state.
pub async fn register_subscriber(&self, sub_key: SubscriptionKey) -> Subscriber {
let (tx, rx) = mpsc::channel(CHAN_SIZE);
let id;
let mut pub_txs;
let announce;
{
let mut ss = self.shared_state.lock().unwrap();
(id, pub_txs, announce) = ss.register_subscriber(sub_key, tx);
}
// Note: it is important to create the Subscriber before the .await below.
// If the client disconnects during the await, terminating the future, we
// still need to run the Subscriber's drop() to unregister it from the
// shared state.
let subscriber = Subscriber {
id,
key: sub_key,
sub_rx: rx,
registry: self.clone(),
};
// Notify existing publishers about new subscriber.
for pub_tx in pub_txs.iter_mut() {
// Closed channel is fine; it means publisher has gone.
pub_tx.send(announce.clone()).await.ok();
}
// println!("registered subscriber {}", id);
subscriber
}
// Unregister the subscriber
pub fn unregister_subscriber(&self, sub: &Subscriber) {
let mut ss = self.shared_state.lock().unwrap();
let announce_pack = ss.unregister_subscriber(sub.id, sub.key);
drop(ss);
// Notify publishers about the removal. Apart from wanting to do it
// outside the lock, we also spawn a task here because a Drop impl can't be async.
if let Some((mut pub_txs, announce)) = announce_pack {
tokio::spawn(async move {
for pub_tx in pub_txs.iter_mut() {
// Closed channel is fine; it means publisher has gone.
pub_tx.send(announce.clone()).await.ok();
}
});
}
// println!("unregistered subscriber {}", sub.id);
}
pub async fn report(&self) {
let mut interval = time::interval(Duration::from_millis(1000));
loop {
interval.tick().await;
self.shared_state.lock().unwrap().report();
}
}
}
// Private subscriber state.
struct Subscriber {
id: SubId,
key: SubscriptionKey,
// Subscriber receives messages from publishers here.
sub_rx: Receiver<SafekeeperTimelineInfo>,
// to unregister itself from shared state in Drop
registry: Registry,
}
impl Drop for Subscriber {
fn drop(&mut self) {
self.registry.unregister_subscriber(self);
}
}
// Private publisher state
struct Publisher {
id: PubId,
// New subscribers request to send (or stop sending) msgs to them here.
// It could be just a Receiver, but oddly it doesn't implement the futures_core Stream trait directly.
announce_rx: ReceiverStream<SubAnnounce>,
subs_to_all: Vec<Sender<SafekeeperTimelineInfo>>,
subs_to_timelines: HashMap<TenantTimelineId, Vec<SubSender>>,
// to unregister itself from shared state in Drop
registry: Registry,
}
impl Publisher {
// Send msg to relevant subscribers.
pub fn send_msg(&mut self, msg: &SafekeeperTimelineInfo) -> Result<(), Status> {
// send message to subscribers for everything
let mut cleanup_subs_to_all = false;
for sub in self.subs_to_all.iter() {
match sub.try_send(msg.clone()) {
Err(TrySendError::Full(_)) => {
// println!("dropping message, channel is full");
}
Err(TrySendError::Closed(_)) => {
cleanup_subs_to_all = true;
}
_ => (),
}
}
// some channels got closed (subscriber gone), remove them
if cleanup_subs_to_all {
self.subs_to_all.retain(|tx| !tx.is_closed());
}
// send message to per timeline subscribers
let ttid = parse_proto_ttid(msg.tenant_timeline_id.as_ref().ok_or(Status::new(
Code::InvalidArgument,
"missing tenant_timeline_id",
))?)?;
if let Some(subs) = self.subs_to_timelines.get(&ttid) {
for tx in subs.iter().map(|sub_sender| &sub_sender.1) {
if let Err(TrySendError::Full(_)) = tx.try_send(msg.clone()) {
// println!("dropping message, channel is full");
}
// closed channel is ignored here; we will be notified and remove it soon
}
}
Ok(())
}
// Add/remove subscriber according to sub_announce.
pub fn update_sub(&mut self, sub_announce: SubAnnounce) {
match sub_announce {
SubAnnounce::AddAll(tx) => self.subs_to_all.push(tx),
SubAnnounce::AddTimeline(ttid, sub_sender) => {
match self.subs_to_timelines.entry(ttid) {
Entry::Occupied(mut o) => {
let subsenders = o.get_mut();
subsenders.push(sub_sender);
}
Entry::Vacant(v) => {
v.insert(vec![sub_sender]);
}
}
}
SubAnnounce::RemoveTimeline(ref ttid, sub_id) => {
remove_sub(&mut self.subs_to_timelines, ttid, sub_id);
}
}
}
}
impl Drop for Publisher {
fn drop(&mut self) {
self.registry.unregister_publisher(self);
}
}
struct NeonBrokerService {
registry: Registry,
}
#[tonic::async_trait]
impl NeonBroker for NeonBrokerService {
async fn publish_safekeeper_info(
&self,
request: Request<tonic::Streaming<SafekeeperTimelineInfo>>,
) -> Result<Response<Empty>, Status> {
let mut publisher = self.registry.register_publisher();
let mut stream = request.into_inner();
loop {
select! {
msg = stream.next() => {
match msg {
Some(Ok(msg)) => {publisher.send_msg(&msg)?;},
Some(Err(e)) => {return Err(e);}, // grpc error from the stream
None => {break;} // closed stream
}
}
Some(announce) = publisher.announce_rx.next() => {
publisher.update_sub(announce);
}
}
}
Ok(Response::new(Empty {}))
}
type SubscribeSafekeeperInfoStream =
Pin<Box<dyn Stream<Item = Result<SafekeeperTimelineInfo, Status>> + Send + 'static>>;
async fn subscribe_safekeeper_info(
&self,
request: Request<SubscribeSafekeeperInfoRequest>,
) -> Result<Response<Self::SubscribeSafekeeperInfoStream>, Status> {
let proto_key = request.into_inner().subscription_key.ok_or(Status::new(
Code::InvalidArgument,
"missing subscription key",
))?;
let sub_key = SubscriptionKey::from_proto_subscription_key(proto_key)?;
let mut subscriber = self.registry.register_subscriber(sub_key).await;
// transform rx into stream with item = Result, as method result demands
let output = async_stream::try_stream! {
while let Some(info) = subscriber.sub_rx.recv().await {
yield info
}
// internal generator
// let _ = subscriber.sub_rx.try_recv().ok();
// let mut counter = 0;
// loop {
// let info = SafekeeperTimelineInfo {
// safekeeper_id: 1,
// tenant_timeline_id: Some(ProtoTenantTimelineId {
// tenant_id: vec![0xFF; 16],
// timeline_id: vec![0xFF; 16],
// // timeline_id: tli_from_u64(counter),
// }),
// last_log_term: 0,
// flush_lsn: counter,
// commit_lsn: 2,
// backup_lsn: 3,
// remote_consistent_lsn: 4,
// peer_horizon_lsn: 5,
// safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
// };
// counter += 1;
// yield info;
// }
};
Ok(Response::new(
Box::pin(output) as Self::SubscribeSafekeeperInfoStream
))
}
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// console_subscriber::init();
let addr = "[::1]:50051".parse()?;
let registry = Registry {
shared_state: Arc::new(Mutex::new(SharedState {
pub_txs: HashMap::new(),
next_pub_id: 0,
subs_to_all: HashMap::new(),
subs_to_timelines: HashMap::new(),
next_sub_id: 0,
})),
};
let neon_broker_service = NeonBrokerService {
registry: registry.clone(),
};
tokio::spawn(async move { registry.report().await });
Server::builder()
.http2_keepalive_interval(Some(Duration::from_millis(5000)))
.add_service(NeonBrokerServer::new(neon_broker_service))
.serve(addr)
.await?;
Ok(())
}
// parse variable length bytes from protobuf
fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result<TenantTimelineId, Status> {
let tenant_id = TenantId::from_vec(&proto_ttid.tenant_id)
.map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?;
let timeline_id = TimelineId::from_vec(&proto_ttid.timeline_id).map_err(|e| {
Status::new(
Code::InvalidArgument,
format!("malformed timeline_id: {}", e),
)
})?;
Ok(TenantTimelineId {
tenant_id,
timeline_id,
})
}
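The comments in `Publisher::send_msg` above describe the broker's back-pressure policy: messages are delivered with `try_send`, dropped when a subscriber's channel is full, and a closed channel marks the subscriber for cleanup, so a slow or vanished subscriber can never stall publishers. The following standalone sketch (not part of the diff; the channel capacity and message type are made up for illustration) shows the same policy with plain `tokio::sync::mpsc` primitives:

use tokio::sync::mpsc::{channel, error::TrySendError};

#[tokio::main]
async fn main() {
    // Tiny capacity so the "full" case is easy to hit; the broker uses CHAN_SIZE.
    let (tx, mut rx) = channel::<u64>(2);
    for i in 0..4u64 {
        match tx.try_send(i) {
            Ok(()) => println!("queued message {i}"),
            // Subscriber is slow: the broker simply drops the message.
            Err(TrySendError::Full(msg)) => println!("dropped {msg}: channel full"),
            // Subscriber is gone: the broker schedules the sender for removal.
            Err(TrySendError::Closed(msg)) => println!("dropped {msg}: channel closed"),
        }
    }
    drop(tx);
    while let Some(i) = rx.recv().await {
        println!("subscriber received {i}");
    }
}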

View File

@@ -6,7 +6,7 @@ edition = "2021"
[dependencies]
anyhow = "1.0"
chrono = "0.4"
clap = "3.0"
clap = "4.0"
env_logger = "0.9"
futures = "0.3.13"
hyper = { version = "0.14", features = ["full"] }

View File

@@ -51,53 +51,19 @@ fn main() -> Result<()> {
// TODO: re-use `utils::logging` later
init_logger(DEFAULT_LOG_LEVEL)?;
// Env variable is set by `cargo`
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
let matches = clap::App::new("compute_ctl")
.version(version.unwrap_or("unknown"))
.arg(
Arg::new("connstr")
.short('C')
.long("connstr")
.value_name("DATABASE_URL")
.required(true),
)
.arg(
Arg::new("pgdata")
.short('D')
.long("pgdata")
.value_name("DATADIR")
.required(true),
)
.arg(
Arg::new("pgbin")
.short('b')
.long("pgbin")
.value_name("POSTGRES_PATH"),
)
.arg(
Arg::new("spec")
.short('s')
.long("spec")
.value_name("SPEC_JSON"),
)
.arg(
Arg::new("spec-path")
.short('S')
.long("spec-path")
.value_name("SPEC_PATH"),
)
.get_matches();
let matches = cli().get_matches();
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.value_of("connstr")
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec = matches.value_of("spec");
let spec_path = matches.value_of("spec-path");
let spec = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
// Try to use just 'postgres' if no path is provided
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap();
let spec: ComputeSpec = match spec {
// First, try to get cluster spec from the cli argument
@@ -173,3 +139,48 @@ fn main() -> Result<()> {
}
}
}
fn cli() -> clap::Command {
// Env variable is set by `cargo`
let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
clap::Command::new("compute_ctl")
.version(version)
.arg(
Arg::new("connstr")
.short('C')
.long("connstr")
.value_name("DATABASE_URL")
.required(true),
)
.arg(
Arg::new("pgdata")
.short('D')
.long("pgdata")
.value_name("DATADIR")
.required(true),
)
.arg(
Arg::new("pgbin")
.short('b')
.long("pgbin")
.default_value("postgres")
.value_name("POSTGRES_PATH"),
)
.arg(
Arg::new("spec")
.short('s')
.long("spec")
.value_name("SPEC_JSON"),
)
.arg(
Arg::new("spec-path")
.short('S')
.long("spec-path")
.value_name("SPEC_PATH"),
)
}
#[test]
fn verify_cli() {
cli().debug_assert()
}
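The hunks above migrate compute_ctl from the clap 3 builder API (`value_of`, `takes_value`, `multiple_occurrences`) to clap 4 (`get_one`, typed `value_parser`, `ArgAction::Append`), with the argument definitions pulled into a `cli()` function checked by `debug_assert()`. A minimal self-contained sketch of that pattern (the argument names below are invented for illustration, not taken from compute_ctl):

use clap::{value_parser, Arg, ArgAction, Command};

fn main() {
    let matches = Command::new("example")
        .arg(
            Arg::new("port")
                .long("port")
                // clap 4 parses into a typed value instead of handing back a raw &str.
                .value_parser(value_parser!(u16))
                .default_value("5432"),
        )
        .arg(
            Arg::new("override")
                .long("override")
                // Replaces clap 3's multiple_occurrences(true).
                .action(ArgAction::Append),
        )
        .get_matches();

    // get_one returns Option<&T>; copied() gives the owned primitive.
    let port: u16 = matches.get_one::<u16>("port").copied().unwrap();
    // get_many replaces values_of and yields typed references.
    let overrides: Vec<&String> = matches
        .get_many::<String>("override")
        .into_iter()
        .flatten()
        .collect();
    println!("port={port}, overrides={overrides:?}");
}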

View File

@@ -8,11 +8,10 @@ use std::process::Child;
use std::time::{Duration, Instant};
use anyhow::{bail, Result};
use notify::{RecursiveMode, Watcher};
use postgres::{Client, Transaction};
use serde::Deserialize;
use notify::{RecursiveMode, Watcher};
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
/// Rust representation of Postgres role info with only those fields
@@ -169,7 +168,7 @@ impl Database {
/// it may require a proper quoting too.
pub fn to_pg_options(&self) -> String {
let mut params: String = self.options.as_pg_options();
write!(params, " OWNER {}", &self.owner.quote())
write!(params, " OWNER {}", &self.owner.pg_quote())
.expect("String is documented to not to error during write operations");
params
@@ -180,18 +179,17 @@ impl Database {
/// intended to be used for DB / role names.
pub type PgIdent = String;
/// Generic trait used to provide quoting for strings used in the
/// Postgres SQL queries. Currently used only to implement quoting
/// of identifiers, but could be used for literals in the future.
pub trait PgQuote {
fn quote(&self) -> String;
/// Generic trait used to provide quoting / encoding for strings used in the
/// Postgres SQL queries and DATABASE_URL.
pub trait Escaping {
fn pg_quote(&self) -> String;
}
impl PgQuote for PgIdent {
impl Escaping for PgIdent {
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
/// always quotes provided string with `""` and escapes every `"`. Not idempotent,
/// i.e. if string is already escaped it will be escaped again.
fn quote(&self) -> String {
/// always quotes provided string with `""` and escapes every `"`.
/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
fn pg_quote(&self) -> String {
let result = format!("\"{}\"", self.replace('"', "\"\""));
result
}
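A short worked example of the renamed `pg_quote`, assuming `PgIdent` and the `Escaping` trait from the hunk above are in scope (it mirrors the unit test further down in this diff):

#[test]
fn pg_quote_example() {
    // Embedded double quotes are doubled, and the whole identifier is wrapped in quotes.
    let ident: PgIdent = PgIdent::from("my \"db\"");
    assert_eq!(ident.pg_quote(), "\"my \"\"db\"\"\"");
    // Not idempotent: quoting the result again would escape the quotes once more.
}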

View File

@@ -1,7 +1,9 @@
use std::path::Path;
use std::str::FromStr;
use anyhow::Result;
use log::{info, log_enabled, warn, Level};
use postgres::config::Config;
use postgres::{Client, NoTls};
use serde::Deserialize;
@@ -115,8 +117,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
if existing_roles.iter().any(|r| r.name == op.name) {
let query: String = format!(
"ALTER ROLE {} RENAME TO {}",
op.name.quote(),
new_name.quote()
op.name.pg_quote(),
new_name.pg_quote()
);
warn!("renaming role '{}' to '{}'", op.name, new_name);
@@ -162,7 +164,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
}
if update_role {
let mut query: String = format!("ALTER ROLE {} ", name.quote());
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
info_print!(" -> update");
query.push_str(&role.to_pg_options());
@@ -170,7 +172,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
}
} else {
info!("role name: '{}'", &name);
let mut query: String = format!("CREATE ROLE {} ", name.quote());
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
info!("role create query: '{}'", &query);
info_print!(" -> create");
@@ -179,7 +181,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let grant_query = format!(
"GRANT pg_read_all_data, pg_write_all_data TO {}",
name.quote()
name.pg_quote()
);
xact.execute(grant_query.as_str(), &[])?;
info!("role grant query: '{}'", &grant_query);
@@ -215,7 +217,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
// We do not check either role exists or not,
// Postgres will take care of it for us
if op.action == "delete_role" {
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote());
warn!("deleting role '{}'", &op.name);
xact.execute(query.as_str(), &[])?;
@@ -230,17 +232,16 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
for db in &node.spec.cluster.databases {
if db.owner != *role_name {
let mut connstr = node.connstr.clone();
// database name is always the last and the only component of the path
connstr.set_path(&db.name);
let mut conf = Config::from_str(node.connstr.as_str())?;
conf.dbname(&db.name);
let mut client = Client::connect(connstr.as_str(), NoTls)?;
let mut client = conf.connect(NoTls)?;
// This will reassign all dependent objects to the db owner
let reassign_query = format!(
"REASSIGN OWNED BY {} TO {}",
role_name.quote(),
db.owner.quote()
role_name.pg_quote(),
db.owner.pg_quote()
);
info!(
"reassigning objects owned by '{}' in db '{}' to '{}'",
@@ -249,7 +250,7 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()>
client.simple_query(&reassign_query)?;
// This now will only drop privileges of the role
let drop_query = format!("DROP OWNED BY {}", role_name.quote());
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
client.simple_query(&drop_query)?;
}
}
@@ -279,7 +280,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// We do not check either DB exists or not,
// Postgres will take care of it for us
"delete_db" => {
let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.quote());
let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote());
warn!("deleting database '{}'", &op.name);
client.execute(query.as_str(), &[])?;
@@ -291,8 +292,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
if existing_dbs.iter().any(|r| r.name == op.name) {
let query: String = format!(
"ALTER DATABASE {} RENAME TO {}",
op.name.quote(),
new_name.quote()
op.name.pg_quote(),
new_name.pg_quote()
);
warn!("renaming database '{}' to '{}'", op.name, new_name);
@@ -320,7 +321,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// XXX: db owner name is returned as quoted string from Postgres,
// when quoting is needed.
let new_owner = if r.owner.starts_with('"') {
db.owner.quote()
db.owner.pg_quote()
} else {
db.owner.clone()
};
@@ -328,15 +329,15 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
if new_owner != r.owner {
let query: String = format!(
"ALTER DATABASE {} OWNER TO {}",
name.quote(),
db.owner.quote()
name.pg_quote(),
db.owner.pg_quote()
);
info_print!(" -> update");
client.execute(query.as_str(), &[])?;
}
} else {
let mut query: String = format!("CREATE DATABASE {} ", name.quote());
let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
info_print!(" -> create");
query.push_str(&db.to_pg_options());
@@ -366,7 +367,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
.cluster
.roles
.iter()
.map(|r| r.name.quote())
.map(|r| r.name.pg_quote())
.collect::<Vec<_>>();
for db in &spec.cluster.databases {
@@ -374,7 +375,7 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
let query: String = format!(
"GRANT CREATE ON DATABASE {} TO {}",
dbname.quote(),
dbname.pg_quote(),
roles.join(", ")
);
info!("grant query {}", &query);
@@ -385,12 +386,11 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
// Do some per-database access adjustments. We'd better do this at db creation time,
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
// atomically.
let mut db_connstr = node.connstr.clone();
for db in &node.spec.cluster.databases {
// database name is always the last and the only component of the path
db_connstr.set_path(&db.name);
let mut conf = Config::from_str(node.connstr.as_str())?;
conf.dbname(&db.name);
let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
let mut db_client = conf.connect(NoTls)?;
// This will only change ownership on the schema itself, not the objects
// inside it. Without it owner of the `public` schema will be `cloud_admin`
@@ -419,9 +419,36 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
END IF;\n\
END\n\
$$;",
db.owner.quote()
db.owner.pg_quote()
);
db_client.simple_query(&alter_query)?;
// Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
// This is needed because, starting with Postgres 15, this privilege is removed by default.
let grant_query = "DO $$\n\
BEGIN\n\
IF EXISTS(\n\
SELECT nspname\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
) AND\n\
current_setting('server_version_num')::int/10000 >= 15\n\
THEN\n\
IF EXISTS(\n\
SELECT rolname\n\
FROM pg_catalog.pg_roles\n\
WHERE rolname = 'web_access'\n\
)\n\
THEN\n\
GRANT CREATE ON SCHEMA public TO web_access;\n\
END IF;\n\
END IF;\n\
END\n\
$$;"
.to_string();
info!("grant query for db {} : {}", &db.name, &grant_query);
db_client.simple_query(&grant_query)?;
}
Ok(())

View File

@@ -33,9 +33,9 @@ mod pg_helpers_tests {
}
#[test]
fn quote_ident() {
fn ident_pg_quote() {
let ident: PgIdent = PgIdent::from("\"name\";\\n select 1;");
assert_eq!(ident.quote(), "\"\"\"name\"\";\\n select 1;\"");
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
}
}

View File

@@ -4,20 +4,21 @@ version = "0.1.0"
edition = "2021"
[dependencies]
clap = "3.0"
anyhow = "1.0"
clap = "4.0"
comfy-table = "6.1"
git-version = "0.3.5"
tar = "0.4.38"
nix = "0.25"
once_cell = "1.13.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
regex = "1"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
toml = "0.5"
once_cell = "1.13.0"
regex = "1"
anyhow = "1.0"
tar = "0.4.38"
thiserror = "1"
nix = "0.25"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
toml = "0.5"
url = "2.2.2"
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.

View File

@@ -0,0 +1,264 @@
//! Spawns and kills background processes that are needed by Neon CLI.
//! Applies common set-up such as log and pid files (if needed) to every process.
//!
//! Neon CLI does not run in the background, so it needs to store information about
//! the spawned processes, which it does in this module.
//! We do that by storing the pid of the process in the "${process_name}.pid" file.
//! The pid file can be created by the process itself
//! (Neon storage binaries do that and also ensure that a lock is taken on that file)
//! or we create such a file after starting the process
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
//! The pid stored in the file is later used to stop the service.
//!
//! See [`lock_file`] module for more info.
use std::ffi::OsStr;
use std::io::Write;
use std::path::Path;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
use anyhow::{anyhow, bail, Context, Result};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use utils::lock_file;
const RETRIES: u32 = 15;
const RETRY_TIMEOUT_MILLIS: u64 = 500;
/// Argument to `start_process`, indicating whether `start_process` should create the pidfile
/// or the process is expected to create it itself.
pub enum InitialPidFile<'t> {
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
Create(&'t Path),
/// The process will create the pidfile itself, need to wait for that event.
Expect(&'t Path),
}
/// Start a background child process using the parameters given.
pub fn start_process<F, S: AsRef<OsStr>>(
process_name: &str,
datadir: &Path,
command: &Path,
args: &[S],
initial_pid_file: InitialPidFile,
process_status_check: F,
) -> anyhow::Result<Child>
where
F: Fn() -> anyhow::Result<bool>,
{
let log_path = datadir.join(format!("{process_name}.log"));
let process_log_file = fs::OpenOptions::new()
.create(true)
.write(true)
.append(true)
.open(&log_path)
.with_context(|| {
format!("Could not open {process_name} log file {log_path:?} for writing")
})?;
let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
})?;
let mut command = Command::new(command);
let background_command = command
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args);
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
let pid = spawned_process.id();
let pid = Pid::from_raw(
i32::try_from(pid)
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(target_pid_file_path) => {
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
lock_file::LockCreationResult::Created { .. } => {
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
// as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
}
lock_file::LockCreationResult::AlreadyLocked { .. } => {
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
}
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!(
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
)))
}
}
None
}
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
};
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check) {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
}
Ok(false) => {
if retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("{process_name} has not started yet, retrying ({retries})...");
}
thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
}
Err(e) => {
println!("{process_name} failed to start: {e:#}");
if let Err(e) = spawned_process.kill() {
println!("Could not stop {process_name} subprocess: {e:#}")
};
return Err(e);
}
}
}
anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
if !pid_file.exists() {
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
return Ok(());
}
let pid = read_pidfile(pid_file)?;
let sig = if immediate {
print!("Stopping {process_name} with pid {pid} immediately..");
Signal::SIGQUIT
} else {
print!("Stopping {process_name} with pid {pid} gracefully..");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(()) => (),
Err(Errno::ESRCH) => {
println!(
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
);
return Ok(());
}
Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"),
}
// Wait until process is gone
for _ in 0..RETRIES {
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
if let Err(e) = fs::remove_file(pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
return Ok(());
}
Ok(false) => {
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1))
}
Err(e) => {
println!("{process_name} with pid {pid} failed to stop: {e:#}");
return Err(e);
}
}
}
anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
filled_cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
filled_cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}
}
cmd
}
fn process_started<F>(
pid: Pid,
pid_file_to_check: Option<&Path>,
status_check: &F,
) -> anyhow::Result<bool>
where
F: Fn() -> anyhow::Result<bool>,
{
match status_check() {
Ok(true) => match pid_file_to_check {
Some(pid_file_path) => {
if pid_file_path.exists() {
let pid_in_file = read_pidfile(pid_file_path)?;
Ok(pid_in_file == pid)
} else {
Ok(false)
}
}
None => Ok(true),
},
Ok(false) => Ok(false),
Err(e) => anyhow::bail!("process failed to start: {e}"),
}
}
/// Read a PID file
///
/// We expect a file that contains a single integer.
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
if pid < 1 {
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
}
Ok(Pid::from_raw(pid))
}
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
match kill(pid, None) {
// Process exists, keep waiting
Ok(_) => Ok(false),
// Process not found, we're done
Err(Errno::ESRCH) => Ok(true),
Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"),
}
}
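The helpers above are how the CLI spawns and stops its background services: stdout/stderr go to `${process_name}.log` in the data directory, and the pid ends up in a pidfile that later `stop` invocations read. A hypothetical caller sketch follows; the binary path, process name, and status check are made up, and the module is assumed to be exported as `control_plane::background_process`:

use std::path::Path;

use control_plane::background_process::{start_process, stop_process, InitialPidFile};

fn run_dummy_service(datadir: &Path) -> anyhow::Result<()> {
    let pid_file = datadir.join("dummy.pid");

    // The binary does not manage its own pidfile, so we ask start_process to create one.
    let _child = start_process(
        "dummy",
        datadir,
        Path::new("/bin/sleep"),
        &["60"],
        InitialPidFile::Create(&pid_file),
        // Real callers poll a status endpoint here; a stub is enough for the sketch.
        || Ok(true),
    )?;

    // A later `stop` invocation only needs the pidfile; `false` means graceful SIGTERM.
    stop_process(false, "dummy", &pid_file)
}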

View File

@@ -6,11 +6,11 @@
//! rely on `neon_local` to set up the environment for each test.
//!
use anyhow::{anyhow, bail, Context, Result};
use clap::{App, AppSettings, Arg, ArgMatches};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env::{EtcdBroker, LocalEnv};
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage::PageServerNode;
use control_plane::{etcd, local_env};
use pageserver_api::models::TimelineInfo;
use pageserver_api::{
@@ -85,212 +85,7 @@ struct TimelineTreeEl {
// * Providing CLI api to the pageserver
// * TODO: export/import to/from usual postgres
fn main() -> Result<()> {
let branch_name_arg = Arg::new("branch-name")
.long("branch-name")
.takes_value(true)
.help("Name of the branch to be created or used as an alias for other services")
.required(false);
let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
let tenant_id_arg = Arg::new("tenant-id")
.long("tenant-id")
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
.takes_value(true)
.required(false);
let timeline_id_arg = Arg::new("timeline-id")
.long("timeline-id")
.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
.takes_value(true)
.required(false);
let pg_version_arg = Arg::new("pg-version")
.long("pg-version")
.help("Postgres version to use for the initial tenant")
.required(false)
.takes_value(true)
.default_value(DEFAULT_PG_VERSION);
let port_arg = Arg::new("port")
.long("port")
.required(false)
.value_name("port");
let stop_mode_arg = Arg::new("stop-mode")
.short('m')
.takes_value(true)
.possible_values(&["fast", "immediate"])
.help("If 'immediate', don't flush repository data at shutdown")
.required(false)
.value_name("stop-mode");
let pageserver_config_args = Arg::new("pageserver-config-override")
.long("pageserver-config-override")
.takes_value(true)
.number_of_values(1)
.multiple_occurrences(true)
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
.takes_value(true)
.required(false);
let matches = App::new("Neon CLI")
.setting(AppSettings::ArgRequiredElseHelp)
.version(GIT_VERSION)
.subcommand(
App::new("init")
.about("Initialize a new Neon repository")
.arg(pageserver_config_args.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(
Arg::new("config")
.long("config")
.required(false)
.value_name("config"),
)
.arg(pg_version_arg.clone())
)
.subcommand(
App::new("timeline")
.about("Manage timelines")
.subcommand(App::new("list")
.about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone()))
.subcommand(App::new("branch")
.about("Create a new timeline, using another timeline as a base, copying its data")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true)
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true)
.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
.subcommand(App::new("create")
.about("Create a new blank timeline")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(pg_version_arg.clone())
)
.subcommand(App::new("import")
.about("Import timeline from basebackup directory")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone())
.arg(Arg::new("node-name").long("node-name").takes_value(true)
.help("Name to assign to the imported timeline"))
.arg(Arg::new("base-tarfile").long("base-tarfile").takes_value(true)
.help("Basebackup tarfile to import"))
.arg(Arg::new("base-lsn").long("base-lsn").takes_value(true)
.help("Lsn the basebackup starts at"))
.arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true)
.help("Wal to add after base"))
.arg(Arg::new("end-lsn").long("end-lsn").takes_value(true)
.help("Lsn the basebackup ends at"))
.arg(pg_version_arg.clone())
)
).subcommand(
App::new("tenant")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage tenants")
.subcommand(App::new("list"))
.subcommand(App::new("create")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
.arg(pg_version_arg.clone())
)
.subcommand(App::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
)
)
.subcommand(
App::new("pageserver")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage pageserver")
.subcommand(App::new("status"))
.subcommand(App::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(App::new("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
.subcommand(App::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
)
.subcommand(
App::new("safekeeper")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage safekeepers")
.subcommand(App::new("start")
.about("Start local safekeeper")
.arg(safekeeper_id_arg.clone())
)
.subcommand(App::new("stop")
.about("Stop local safekeeper")
.arg(safekeeper_id_arg.clone())
.arg(stop_mode_arg.clone())
)
.subcommand(App::new("restart")
.about("Restart local safekeeper")
.arg(safekeeper_id_arg.clone())
.arg(stop_mode_arg.clone())
)
)
.subcommand(
App::new("pg")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage postgres instances")
.subcommand(App::new("list").arg(tenant_id_arg.clone()))
.subcommand(App::new("create")
.about("Create a postgres compute node")
.arg(pg_node_arg.clone())
.arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone())
.arg(lsn_arg.clone())
.arg(port_arg.clone())
.arg(
Arg::new("config-only")
.help("Don't do basebackup, create compute node with only config files")
.long("config-only")
.required(false))
.arg(pg_version_arg.clone())
)
.subcommand(App::new("start")
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
.arg(pg_node_arg.clone())
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(timeline_id_arg.clone())
.arg(lsn_arg.clone())
.arg(port_arg.clone())
.arg(pg_version_arg.clone())
)
.subcommand(
App::new("stop")
.arg(pg_node_arg.clone())
.arg(tenant_id_arg.clone())
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")
.long("destroy")
.required(false)
)
)
)
.subcommand(
App::new("start")
.about("Start page server and safekeepers")
.arg(pageserver_config_args)
)
.subcommand(
App::new("stop")
.about("Stop page server and safekeepers")
.arg(stop_mode_arg.clone())
)
.get_matches();
let matches = cli().get_matches();
let (sub_name, sub_args) = match matches.subcommand() {
Some(subcommand_data) => subcommand_data,
@@ -475,16 +270,16 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
sub_match
.value_of("tenant-id")
.map(TenantId::from_str)
.get_one::<String>("tenant-id")
.map(|tenant_id| TenantId::from_str(tenant_id))
.transpose()
.context("Failed to parse tenant id from the argument string")
}
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
sub_match
.value_of("timeline-id")
.map(TimelineId::from_str)
.get_one::<String>("timeline-id")
.map(|timeline_id| TimelineId::from_str(timeline_id))
.transpose()
.context("Failed to parse timeline id from the argument string")
}
@@ -493,19 +288,22 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let initial_timeline_id_arg = parse_timeline_id(init_match)?;
// Create config file
let toml_file: String = if let Some(config_path) = init_match.value_of("config") {
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
// load and parse the file
std::fs::read_to_string(std::path::Path::new(config_path))
.with_context(|| format!("Could not read configuration file '{config_path}'"))?
std::fs::read_to_string(config_path).with_context(|| {
format!(
"Could not read configuration file '{}'",
config_path.display()
)
})?
} else {
// Built-in default config
default_conf(&EtcdBroker::locate_etcd()?)
};
let pg_version = init_match
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let mut env =
@@ -541,9 +339,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
init_match
.values_of("pageserver-config-override")
.get_many::<String>("pageserver-config-override")
.into_iter()
.flatten()
.map(|s| s.as_str())
.collect()
}
@@ -558,7 +357,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
Some(("create", create_match)) => {
let initial_tenant_id = parse_tenant_id(create_match)?;
let tenant_conf: HashMap<_, _> = create_match
.values_of("config")
.get_many::<String>("config")
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
.unwrap_or_default();
let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
@@ -567,9 +366,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
// Create an initial timeline for the new tenant
let new_timeline_id = parse_timeline_id(create_match)?;
let pg_version = create_match
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let timeline_info = pageserver.timeline_create(
@@ -595,7 +393,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
Some(("config", create_match)) => {
let tenant_id = get_tenant_id(create_match, env)?;
let tenant_conf: HashMap<_, _> = create_match
.values_of("config")
.get_many::<String>("config")
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
.unwrap_or_default();
@@ -622,13 +420,12 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
Some(("create", create_match)) => {
let tenant_id = get_tenant_id(create_match, env)?;
let new_branch_name = create_match
.value_of("branch-name")
.get_one::<String>("branch-name")
.ok_or_else(|| anyhow!("No branch name provided"))?;
let pg_version = create_match
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let timeline_info =
@@ -647,35 +444,32 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
let tenant_id = get_tenant_id(import_match, env)?;
let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
let name = import_match
.value_of("node-name")
.get_one::<String>("node-name")
.ok_or_else(|| anyhow!("No node name provided"))?;
// Parse base inputs
let base_tarfile = import_match
.value_of("base-tarfile")
.map(|s| PathBuf::from_str(s).unwrap())
.ok_or_else(|| anyhow!("No base-tarfile provided"))?;
.get_one::<PathBuf>("base-tarfile")
.ok_or_else(|| anyhow!("No base-tarfile provided"))?
.to_owned();
let base_lsn = Lsn::from_str(
import_match
.value_of("base-lsn")
.get_one::<String>("base-lsn")
.ok_or_else(|| anyhow!("No base-lsn provided"))?,
)?;
let base = (base_lsn, base_tarfile);
// Parse pg_wal inputs
let wal_tarfile = import_match
.value_of("wal-tarfile")
.map(|s| PathBuf::from_str(s).unwrap());
let wal_tarfile = import_match.get_one::<PathBuf>("wal-tarfile").cloned();
let end_lsn = import_match
.value_of("end-lsn")
.get_one::<String>("end-lsn")
.map(|s| Lsn::from_str(s).unwrap());
// TODO validate both or none are provided
let pg_wal = end_lsn.zip(wal_tarfile);
let pg_version = import_match
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let mut cplane = ComputeControlPlane::load(env.clone())?;
@@ -690,10 +484,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
Some(("branch", branch_match)) => {
let tenant_id = get_tenant_id(branch_match, env)?;
let new_branch_name = branch_match
.value_of("branch-name")
.get_one::<String>("branch-name")
.ok_or_else(|| anyhow!("No branch name provided"))?;
let ancestor_branch_name = branch_match
.value_of("ancestor-branch-name")
.get_one::<String>("ancestor-branch-name")
.map(|s| s.as_str())
.unwrap_or(DEFAULT_BRANCH_NAME);
let ancestor_timeline_id = env
.get_branch_timeline_id(ancestor_branch_name, tenant_id)
@@ -702,8 +497,8 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
})?;
let start_lsn = branch_match
.value_of("ancestor-start-lsn")
.map(Lsn::from_str)
.get_one::<String>("ancestor-start-lsn")
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let timeline_info = pageserver.timeline_create(
@@ -804,45 +599,39 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
}
"create" => {
let branch_name = sub_args
.value_of("branch-name")
.get_one::<String>("branch-name")
.map(|s| s.as_str())
.unwrap_or(DEFAULT_BRANCH_NAME);
let node_name = sub_args
.value_of("node")
.map(ToString::to_string)
.unwrap_or_else(|| format!("{}_node", branch_name));
.get_one::<String>("node")
.map(|node_name| node_name.to_string())
.unwrap_or_else(|| format!("{branch_name}_node"));
let lsn = sub_args
.value_of("lsn")
.map(Lsn::from_str)
.get_one::<String>("lsn")
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse Lsn from the request")?;
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?;
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let pg_version = sub_args
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?;
}
"start" => {
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let node_name = sub_args
.value_of("node")
.get_one::<String>("node")
.ok_or_else(|| anyhow!("No node name was provided to start"))?;
let node = cplane.nodes.get(&(tenant_id, node_name.to_owned()));
let node = cplane.nodes.get(&(tenant_id, node_name.to_string()));
let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) {
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
@@ -853,36 +642,33 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
};
if let Some(node) = node {
println!("Starting existing postgres {}...", node_name);
println!("Starting existing postgres {node_name}...");
node.start(&auth_token)?;
} else {
let branch_name = sub_args
.value_of("branch-name")
.get_one::<String>("branch-name")
.map(|s| s.as_str())
.unwrap_or(DEFAULT_BRANCH_NAME);
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| {
anyhow!("Found no timeline id for branch name '{}'", branch_name)
anyhow!("Found no timeline id for branch name '{branch_name}'")
})?;
let lsn = sub_args
.value_of("lsn")
.map(Lsn::from_str)
.get_one::<String>("lsn")
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse Lsn from the request")?;
let pg_version = sub_args
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.context("Failed to parse postgres version from the argument string")?;
.get_one::<u32>("pg-version")
.copied()
.context("Failed to `pg-version` from the argument string")?;
// when used with custom port this results in non obvious behaviour
// port is remembered from first start command, i e
// start --port X
// stop
// start <-- will also use port X even without explicit port argument
println!(
"Starting new postgres (v{}) {} on timeline {} ...",
pg_version, node_name, timeline_id
);
println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ...");
let node =
cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?;
@@ -891,18 +677,18 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
}
"stop" => {
let node_name = sub_args
.value_of("node")
.get_one::<String>("node")
.ok_or_else(|| anyhow!("No node name was provided to stop"))?;
let destroy = sub_args.is_present("destroy");
let destroy = sub_args.get_flag("destroy");
let node = cplane
.nodes
.get(&(tenant_id, node_name.to_owned()))
.with_context(|| format!("postgres {} is not found", node_name))?;
.get(&(tenant_id, node_name.to_string()))
.with_context(|| format!("postgres {node_name} is not found"))?;
node.stop(destroy)?;
}
_ => bail!("Unexpected pg subcommand '{}'", sub_name),
_ => bail!("Unexpected pg subcommand '{sub_name}'"),
}
Ok(())
@@ -920,7 +706,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
Some(("stop", stop_match)) => {
let immediate = stop_match.value_of("stop-mode") == Some("immediate");
let immediate = stop_match
.get_one::<String>("stop-mode")
.map(|s| s.as_str())
== Some("immediate");
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
@@ -970,7 +759,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
};
// All the commands take an optional safekeeper name argument
let sk_id = if let Some(id_str) = sub_args.value_of("id") {
let sk_id = if let Some(id_str) = sub_args.get_one::<String>("id") {
NodeId(id_str.parse().context("while parsing safekeeper id")?)
} else {
DEFAULT_SAFEKEEPER_ID
@@ -986,7 +775,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
"stop" => {
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
let immediate =
sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper stop failed: {}", e);
@@ -995,7 +785,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
"restart" => {
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
let immediate =
sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper stop failed: {}", e);
@@ -1039,7 +830,8 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
}
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let immediate = sub_match.value_of("stop-mode") == Some("immediate");
let immediate =
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
let pageserver = PageServerNode::from_env(env);
@@ -1072,3 +864,219 @@ fn try_stop_etcd_process(env: &local_env::LocalEnv) {
eprintln!("etcd stop failed: {e}");
}
}
fn cli() -> Command {
let branch_name_arg = Arg::new("branch-name")
.long("branch-name")
.help("Name of the branch to be created or used as an alias for other services")
.required(false);
let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
let tenant_id_arg = Arg::new("tenant-id")
.long("tenant-id")
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
.required(false);
let timeline_id_arg = Arg::new("timeline-id")
.long("timeline-id")
.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
.required(false);
let pg_version_arg = Arg::new("pg-version")
.long("pg-version")
.help("Postgres version to use for the initial tenant")
.required(false)
.value_parser(value_parser!(u32))
.default_value(DEFAULT_PG_VERSION);
let port_arg = Arg::new("port")
.long("port")
.required(false)
.value_parser(value_parser!(u16))
.value_name("port");
let stop_mode_arg = Arg::new("stop-mode")
.short('m')
.value_parser(["fast", "immediate"])
.help("If 'immediate', don't flush repository data at shutdown")
.required(false)
.value_name("stop-mode");
let pageserver_config_args = Arg::new("pageserver-config-override")
.long("pageserver-config-override")
.num_args(1)
.action(ArgAction::Append)
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
.subcommand(
Command::new("init")
.about("Initialize a new Neon repository")
.arg(pageserver_config_args.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(
Arg::new("config")
.long("config")
.required(false)
.value_parser(value_parser!(PathBuf))
.value_name("config"),
)
.arg(pg_version_arg.clone())
)
.subcommand(
Command::new("timeline")
.about("Manage timelines")
.subcommand(Command::new("list")
.about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone()))
.subcommand(Command::new("branch")
.about("Create a new timeline, using another timeline as a base, copying its data")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn")
.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
.subcommand(Command::new("create")
.about("Create a new blank timeline")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(pg_version_arg.clone())
)
.subcommand(Command::new("import")
.about("Import timeline from basebackup directory")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone())
.arg(Arg::new("node-name").long("node-name")
.help("Name to assign to the imported timeline"))
.arg(Arg::new("base-tarfile")
.long("base-tarfile")
.value_parser(value_parser!(PathBuf))
.help("Basebackup tarfile to import")
)
.arg(Arg::new("base-lsn").long("base-lsn")
.help("Lsn the basebackup starts at"))
.arg(Arg::new("wal-tarfile")
.long("wal-tarfile")
.value_parser(value_parser!(PathBuf))
.help("Wal to add after base")
)
.arg(Arg::new("end-lsn").long("end-lsn")
.help("Lsn the basebackup ends at"))
.arg(pg_version_arg.clone())
)
).subcommand(
Command::new("tenant")
.arg_required_else_help(true)
.about("Manage tenants")
.subcommand(Command::new("list"))
.subcommand(Command::new("create")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
.arg(pg_version_arg.clone())
)
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
)
)
.subcommand(
Command::new("pageserver")
.arg_required_else_help(true)
.about("Manage pageserver")
.subcommand(Command::new("status"))
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
.subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
)
.subcommand(
Command::new("safekeeper")
.arg_required_else_help(true)
.about("Manage safekeepers")
.subcommand(Command::new("start")
.about("Start local safekeeper")
.arg(safekeeper_id_arg.clone())
)
.subcommand(Command::new("stop")
.about("Stop local safekeeper")
.arg(safekeeper_id_arg.clone())
.arg(stop_mode_arg.clone())
)
.subcommand(Command::new("restart")
.about("Restart local safekeeper")
.arg(safekeeper_id_arg)
.arg(stop_mode_arg.clone())
)
)
.subcommand(
Command::new("pg")
.arg_required_else_help(true)
.about("Manage postgres instances")
.subcommand(Command::new("list").arg(tenant_id_arg.clone()))
.subcommand(Command::new("create")
.about("Create a postgres compute node")
.arg(pg_node_arg.clone())
.arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone())
.arg(lsn_arg.clone())
.arg(port_arg.clone())
.arg(
Arg::new("config-only")
.help("Don't do basebackup, create compute node with only config files")
.long("config-only")
.required(false))
.arg(pg_version_arg.clone())
)
.subcommand(Command::new("start")
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
.arg(pg_node_arg.clone())
.arg(tenant_id_arg.clone())
.arg(branch_name_arg)
.arg(timeline_id_arg)
.arg(lsn_arg)
.arg(port_arg)
.arg(pg_version_arg)
)
.subcommand(
Command::new("stop")
.arg(pg_node_arg)
.arg(tenant_id_arg)
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")
.long("destroy")
.action(ArgAction::SetTrue)
.required(false)
)
)
)
.subcommand(
Command::new("start")
.about("Start page server and safekeepers")
.arg(pageserver_config_args)
)
.subcommand(
Command::new("stop")
.about("Stop page server and safekeepers")
.arg(stop_mode_arg)
)
}
#[test]
fn verify_cli() {
cli().debug_assert();
}

View File

@@ -12,15 +12,14 @@ use std::time::Duration;
use anyhow::{Context, Result};
use utils::{
connstring::connection_host_port,
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
};
use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;
use crate::storage::PageServerNode;
//
// ComputeControlPlane
@@ -183,18 +182,18 @@ impl PostgresNode {
}
fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir(pg_version).join("postgres");
let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
let mut cmd = Command::new(&pg_path);
cmd.arg("--sync-safekeepers")
.env_clear()
.env(
"LD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version).to_str().unwrap(),
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
)
.env(
"DYLD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version).to_str().unwrap(),
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
)
.env("PGDATA", self.pgdata().to_str().unwrap())
.stdout(Stdio::piped())
@@ -282,9 +281,7 @@ impl PostgresNode {
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
let mut conf = PostgresConf::new();
conf.append("max_wal_senders", "10");
// wal_log_hints is mandatory when running against pageserver (see gh issue#192)
// TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
conf.append("wal_log_hints", "on");
conf.append("wal_log_hints", "off");
conf.append("max_replication_slots", "10");
conf.append("hot_standby", "on");
conf.append("shared_buffers", "1MB");
@@ -302,7 +299,8 @@ impl PostgresNode {
// Configure the node to fetch pages from pageserver
let pageserver_connstr = {
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// Set up authentication
//
@@ -422,7 +420,7 @@ impl PostgresNode {
}
fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl");
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl");
let mut cmd = Command::new(pg_ctl_path);
cmd.args(
[
@@ -440,11 +438,11 @@ impl PostgresNode {
.env_clear()
.env(
"LD_LIBRARY_PATH",
self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
)
.env(
"DYLD_LIBRARY_PATH",
self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
);
if let Some(token) = auth_token {
cmd.env("ZENITH_AUTH_TOKEN", token);

View File

@@ -0,0 +1,57 @@
use url::Url;
#[derive(Debug)]
pub struct PgConnectionConfig {
url: Url,
}
impl PgConnectionConfig {
pub fn host(&self) -> &str {
self.url.host_str().expect("BUG: no host")
}
pub fn port(&self) -> u16 {
self.url.port().expect("BUG: no port")
}
/// Return a `<host>:<port>` string.
pub fn raw_address(&self) -> String {
format!("{}:{}", self.host(), self.port())
}
/// Connect using postgres protocol with TLS disabled.
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
postgres::Client::connect(self.url.as_str(), postgres::NoTls)
}
}
impl std::str::FromStr for PgConnectionConfig {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut url: Url = s.parse()?;
match url.scheme() {
"postgres" | "postgresql" => {}
other => anyhow::bail!("invalid scheme: {other}"),
}
// It's not a valid connection url if host is unavailable.
if url.host().is_none() {
anyhow::bail!(url::ParseError::EmptyHost);
}
// E.g. `postgres:bar`.
if url.cannot_be_a_base() {
anyhow::bail!("URL cannot be a base");
}
// Set the default PG port if it's missing.
if url.port().is_none() {
url.set_port(Some(5432))
.expect("BUG: couldn't set the default port");
}
Ok(Self { url })
}
}

View File

@@ -1,95 +1,75 @@
use std::{
fs,
path::PathBuf,
process::{Command, Stdio},
};
use std::{fs, path::PathBuf};
use anyhow::Context;
use nix::{
sys::signal::{kill, Signal},
unistd::Pid,
};
use crate::{local_env, read_pidfile};
use crate::{background_process, local_env};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
println!(
"Starting etcd broker using {}",
etcd_broker.etcd_binary_path.display()
"Starting etcd broker using {:?}",
etcd_broker.etcd_binary_path
);
let etcd_data_dir = env.base_data_dir.join("etcd");
fs::create_dir_all(&etcd_data_dir).with_context(|| {
format!(
"Failed to create etcd data dir: {}",
etcd_data_dir.display()
)
})?;
fs::create_dir_all(&etcd_data_dir)
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
let etcd_stdout_file =
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
format!(
"Failed to create etcd stout file in directory {}",
etcd_data_dir.display()
)
})?;
let etcd_stderr_file =
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
format!(
"Failed to create etcd stderr file in directory {}",
etcd_data_dir.display()
)
})?;
let client_urls = etcd_broker.comma_separated_endpoints();
let args = [
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
// etcd doesn't compact (vacuum) with default settings,
// enable it to prevent space exhaustion.
"--auto-compaction-mode=revision".to_string(),
"--auto-compaction-retention=1".to_string(),
];
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
.args(&[
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
])
.stdout(Stdio::from(etcd_stdout_file))
.stderr(Stdio::from(etcd_stderr_file))
.spawn()
.context("Failed to spawn etcd subprocess")?;
let pid = etcd_process.id();
let pid_file_path = etcd_pid_file_path(env);
let etcd_pid_file_path = etcd_pid_file_path(env);
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
format!(
"Failed to create etcd pid file at {}",
etcd_pid_file_path.display()
)
})?;
let client = reqwest::blocking::Client::new();
background_process::start_process(
"etcd",
&etcd_data_dir,
&etcd_broker.etcd_binary_path,
&args,
background_process::InitialPidFile::Create(&pid_file_path),
|| {
for broker_endpoint in &etcd_broker.broker_endpoints {
let request = broker_endpoint
.join("health")
.with_context(|| {
format!(
"Failed to append /health path to broker endopint {}",
broker_endpoint
)
})
.and_then(|url| {
client.get(&url.to_string()).build().with_context(|| {
format!("Failed to construct request to etcd endpoint {url}")
})
})?;
if client.execute(request).is_ok() {
return Ok(true);
}
}
Ok(false)
},
)
.context("Failed to spawn etcd subprocess")?;
Ok(())
}
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_path = &env.etcd_broker.etcd_binary_path;
println!("Stopping etcd broker at {}", etcd_path.display());
let etcd_pid_file_path = etcd_pid_file_path(env);
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
format!(
"Failed to read etcd pid file at {}",
etcd_pid_file_path.display()
)
})?);
kill(pid, Signal::SIGTERM).with_context(|| {
format!(
"Failed to stop etcd with pid {pid} at {}",
etcd_pid_file_path.display()
)
})?;
Ok(())
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
}
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {

View File

@@ -6,59 +6,12 @@
// Intended to be used in integration tests and in CLI tools for
// local installations.
//
use anyhow::{anyhow, bail, Context, Result};
use std::fs;
use std::path::Path;
use std::process::Command;
mod background_process;
pub mod compute;
pub mod connection;
pub mod etcd;
pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod storage;
/// Read a PID file
///
/// We expect a file that contains a single integer.
/// We return an i32 for compatibility with libc and nix.
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
if pid < 1 {
bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
}
Ok(pid)
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
cmd.env(var, val);
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}
}
cmd
}

View File

@@ -201,37 +201,37 @@ impl LocalEnv {
self.pg_distrib_dir.clone()
}
pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
match pg_version {
14 => path.join(format!("v{pg_version}")),
15 => path.join(format!("v{pg_version}")),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => self.pg_distrib_dir(pg_version).join("bin"),
15 => self.pg_distrib_dir(pg_version).join("bin"),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => self.pg_distrib_dir(pg_version).join("lib"),
15 => self.pg_distrib_dir(pg_version).join("lib"),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
Ok(self.neon_distrib_dir.join("pageserver"))
pub fn pageserver_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("pageserver")
}
pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
Ok(self.neon_distrib_dir.join("safekeeper"))
pub fn safekeeper_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("safekeeper")
}
pub fn pg_data_dirs_path(&self) -> PathBuf {
@@ -422,10 +422,10 @@ impl LocalEnv {
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
if !self.pg_bin_dir(pg_version).join("postgres").exists() {
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
self.pg_bin_dir(pg_version).display()
self.pg_bin_dir(pg_version)?.display()
);
}
for binary in ["pageserver", "safekeeper"] {

View File

@@ -1,33 +1,27 @@
use std::collections::HashMap;
use std::fs::File;
use std::fs::{self, File};
use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::Duration;
use std::{io, result, thread};
use std::process::Child;
use std::{io, result};
use crate::connection::PgConnectionConfig;
use anyhow::{bail, Context};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use postgres::{Config, NoTls};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{
connstring::connection_address,
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
};
use crate::local_env::LocalEnv;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
use crate::{background_process, local_env::LocalEnv};
#[derive(Error, Debug)]
pub enum PageserverHttpError {
@@ -75,7 +69,7 @@ impl ResponseErrorMessageExt for Response {
//
#[derive(Debug)]
pub struct PageServerNode {
pub pg_connection_config: Config,
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
@@ -101,7 +95,7 @@ impl PageServerNode {
}
/// Construct libpq connection string for connecting to the pageserver.
fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
format!("postgresql://no_user:{password}@{listen_addr}/no_db")
.parse()
.unwrap()
@@ -161,7 +155,15 @@ impl PageServerNode {
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
}
self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
let mut pageserver_process = self
.start_node(&init_config_overrides, &self.env.base_data_dir, true)
.with_context(|| {
format!(
"Failed to start a process for pageserver {}",
self.env.pageserver.id,
)
})?;
let init_result = self
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
.context("Failed to create initial tenant and timeline for pageserver");
@@ -171,7 +173,29 @@ impl PageServerNode {
}
Err(e) => eprintln!("{e:#}"),
}
self.stop(false)?;
match pageserver_process.kill() {
Err(e) => {
eprintln!(
"Failed to stop pageserver {} process with pid {}: {e:#}",
self.env.pageserver.id,
pageserver_process.id(),
)
}
Ok(()) => {
println!(
"Stopped pageserver {} process with pid {}",
self.env.pageserver.id,
pageserver_process.id(),
);
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
let pid_file = self.pid_file();
if let Err(e) = fs::remove_file(&pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
}
}
init_result
}
@@ -196,11 +220,14 @@ impl PageServerNode {
self.env.pageserver_data_dir()
}
pub fn pid_file(&self) -> PathBuf {
/// The pid file is created by the pageserver process, with its pid stored inside.
/// Other pageservers cannot lock the same file and overwrite it for as long as the current
/// pageserver runs. (Unless someone removes the file manually; never do that!)
fn pid_file(&self) -> PathBuf {
self.repo_path().join("pageserver.pid")
}
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, &self.repo_path(), false)
}
@@ -209,10 +236,10 @@ impl PageServerNode {
config_overrides: &[&str],
datadir: &Path,
update_config: bool,
) -> anyhow::Result<()> {
) -> anyhow::Result<Child> {
println!(
"Starting pageserver at '{}' in '{}'",
connection_address(&self.pg_connection_config),
self.pg_connection_config.raw_address(),
datadir.display()
);
io::stdout().flush()?;
@@ -220,10 +247,7 @@ impl PageServerNode {
let mut args = vec![
"-D",
datadir.to_str().with_context(|| {
format!(
"Datadir path '{}' cannot be represented as a unicode string",
datadir.display()
)
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?,
];
@@ -235,48 +259,18 @@ impl PageServerNode {
args.extend(["-c", config_override]);
}
let mut cmd = Command::new(self.env.pageserver_bin()?);
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
filled_cmd = fill_aws_secrets_vars(filled_cmd);
if !filled_cmd.status()?.success() {
bail!(
"Pageserver failed to start. See console output and '{}' for details.",
datadir.join("pageserver.log").display()
);
}
// It takes a while for the page server to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(()) => {
println!("\nPageserver started");
return Ok(());
}
Err(err) => {
match err {
PageserverHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("Pageserver not responding yet, err {err} retrying ({retries})...");
}
}
PageserverHttpError::Response(msg) => {
bail!("pageserver failed to start: {msg} ")
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("pageserver failed to start in {RETRIES} seconds");
background_process::start_process(
"pageserver",
datadir,
&self.env.pageserver_bin(),
&args,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(PageserverHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
}
///
@@ -288,69 +282,18 @@ impl PageServerNode {
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Pageserver is already stopped");
return Ok(());
}
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
let sig = if immediate {
print!("Stopping pageserver immediately..");
Signal::SIGQUIT
} else {
print!("Stopping pageserver gracefully..");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!("Pageserver with pid {pid} does not exist, but a PID file was found");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {pid}: {}",
err.desc()
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop pageserver with pid {pid}");
background_process::stop_process(immediate, "pageserver", &self.pid_file())
}
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
println!("Pageserver query: '{sql}'");
client.simple_query(sql).unwrap()
}
pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
self.pg_connection_config.connect(NoTls)
self.pg_connection_config.connect_no_tls()
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
@@ -549,7 +492,7 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
// Init base reader
let (start_lsn, base_tarfile_path) = base;

View File

@@ -1,23 +1,21 @@
use std::io::Write;
use std::path::PathBuf;
use std::process::Command;
use std::process::Child;
use std::sync::Arc;
use std::time::Duration;
use std::{io, result, thread};
use std::{io, result};
use anyhow::bail;
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::Config;
use anyhow::Context;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
use utils::{http::error::HttpErrorBody, id::NodeId};
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::storage::PageServerNode;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
use crate::connection::PgConnectionConfig;
use crate::pageserver::PageServerNode;
use crate::{
background_process,
local_env::{LocalEnv, SafekeeperConf},
};
#[derive(Error, Debug)]
pub enum SafekeeperHttpError {
@@ -63,7 +61,7 @@ pub struct SafekeeperNode {
pub conf: SafekeeperConf,
pub pg_connection_config: Config,
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
@@ -87,15 +85,15 @@ impl SafekeeperNode {
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(port: u16) -> Config {
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
// TODO safekeeper authentication not implemented yet
format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
format!("postgresql://no_user@127.0.0.1:{port}/no_db")
.parse()
.unwrap()
}
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
env.safekeeper_data_dir(&format!("sk{sk_id}"))
}
pub fn datadir_path(&self) -> PathBuf {
@@ -106,92 +104,78 @@ impl SafekeeperNode {
self.datadir_path().join("safekeeper.pid")
}
pub fn start(&self) -> anyhow::Result<()> {
pub fn start(&self) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
connection_address(&self.pg_connection_config),
self.pg_connection_config.raw_address(),
self.datadir_path().display()
);
io::stdout().flush().unwrap();
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id;
let datadir = self.datadir_path();
let mut cmd = Command::new(self.env.safekeeper_bin()?);
fill_rust_env_vars(
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
.args(&["--id", self.id.to_string().as_ref()])
.args(&["--listen-pg", &listen_pg])
.args(&["--listen-http", &listen_http])
.args(&["--recall", "1 second"])
.arg("--daemonize"),
);
let id_string = id.to_string();
let mut args = vec![
"-D",
datadir.to_str().with_context(|| {
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?,
"--id",
&id_string,
"--listen-pg",
&listen_pg,
"--listen-http",
&listen_http,
];
if !self.conf.sync {
cmd.arg("--no-sync");
args.push("--no-sync");
}
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
if !comma_separated_endpoints.is_empty() {
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
}
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
cmd.args(&["--broker-etcd-prefix", prefix]);
args.extend(["--broker-etcd-prefix", prefix]);
}
let mut backup_threads = String::new();
if let Some(threads) = self.conf.backup_threads {
cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
backup_threads = threads.to_string();
args.extend(["--backup-threads", &backup_threads]);
} else {
drop(backup_threads);
}
if let Some(ref remote_storage) = self.conf.remote_storage {
cmd.args(&["--remote-storage", remote_storage]);
args.extend(["--remote-storage", remote_storage]);
}
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
if self.conf.auth_enabled {
cmd.arg("--auth-validation-public-key-path");
// PathBuf is better be passed as is, not via `String`.
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
args.extend([
"--auth-validation-public-key-path",
key_path.to_str().with_context(|| {
format!("Key path {key_path:?} cannot be represented as a unicode string")
})?,
]);
}
fill_aws_secrets_vars(&mut cmd);
if !cmd.status()?.success() {
bail!(
"Safekeeper failed to start. See '{}' for details.",
self.datadir_path().join("safekeeper.log").display()
);
}
// It takes a while for the safekeeper to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(_) => {
println!("\nSafekeeper started");
return Ok(());
}
Err(err) => {
match err {
SafekeeperHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!(
"Safekeeper not responding yet, err {} retrying ({})...",
err, retries
);
}
}
SafekeeperHttpError::Response(msg) => {
bail!("safekeeper failed to start: {} ", msg)
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("safekeeper failed to start in {} seconds", RETRIES);
background_process::start_process(
&format!("safekeeper {id}"),
&datadir,
&self.env.safekeeper_bin(),
&args,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
}
///
@@ -203,63 +187,11 @@ impl SafekeeperNode {
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Safekeeper {} is already stopped", self.id);
return Ok(());
}
let pid = read_pidfile(&pid_file)?;
let pid = Pid::from_raw(pid);
let sig = if immediate {
print!("Stopping safekeeper {} immediately..", self.id);
Signal::SIGQUIT
} else {
print!("Stopping safekeeper {} gracefully..", self.id);
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Safekeeper with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to safekeeper with pid {}: {}",
pid,
err.desc()
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop safekeeper with pid {}", pid);
background_process::stop_process(
immediate,
&format!("safekeeper {}", self.id),
&self.pid_file(),
)
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {

View File

@@ -0,0 +1,48 @@
#!/bin/bash
set -eux
PG_VERSION=${PG_VERSION:-14}
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
SPEC_FILE=/tmp/spec.json
echo "Waiting pageserver become ready."
while ! nc -z pageserver 6400; do
sleep 1;
done
echo "Page server is ready."
echo "Create a tenant and timeline"
PARAMS=(
-sb
-X POST
-H "Content-Type: application/json"
-d "{}"
http://pageserver:9898/v1/tenant/
)
tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
PARAMS=(
-sb
-X POST
-H "Content-Type: application/json"
-d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .
echo "Overwrite tenant id and timeline id in spec file"
tenant_id=$(echo ${result} | jq -r .tenant_id)
timeline_id=$(echo ${result} | jq -r .timeline_id)
sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
cat ${SPEC_FILE}
echo "Start compute node"
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-C "postgresql://cloud_admin@localhost:55433/postgres" \
-b /usr/local/bin/postgres \
-S ${SPEC_FILE}

View File

@@ -0,0 +1,141 @@
{
"format_version": 1.0,
"timestamp": "2022-10-12T18:00:00.000Z",
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
"cluster": {
"cluster_id": "docker_compose",
"name": "docker_compose_test",
"state": "restarted",
"roles": [
{
"name": "cloud_admin",
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
"options": null
}
],
"databases": [
],
"settings": [
{
"name": "fsync",
"value": "off",
"vartype": "bool"
},
{
"name": "wal_level",
"value": "replica",
"vartype": "enum"
},
{
"name": "hot_standby",
"value": "on",
"vartype": "bool"
},
{
"name": "wal_log_hints",
"value": "on",
"vartype": "bool"
},
{
"name": "log_connections",
"value": "on",
"vartype": "bool"
},
{
"name": "port",
"value": "55433",
"vartype": "integer"
},
{
"name": "shared_buffers",
"value": "1MB",
"vartype": "string"
},
{
"name": "max_connections",
"value": "100",
"vartype": "integer"
},
{
"name": "listen_addresses",
"value": "0.0.0.0",
"vartype": "string"
},
{
"name": "max_wal_senders",
"value": "10",
"vartype": "integer"
},
{
"name": "max_replication_slots",
"value": "10",
"vartype": "integer"
},
{
"name": "wal_sender_timeout",
"value": "5s",
"vartype": "string"
},
{
"name": "wal_keep_size",
"value": "0",
"vartype": "integer"
},
{
"name": "password_encryption",
"value": "md5",
"vartype": "enum"
},
{
"name": "restart_after_crash",
"value": "off",
"vartype": "bool"
},
{
"name": "synchronous_standby_names",
"value": "walproposer",
"vartype": "string"
},
{
"name": "shared_preload_libraries",
"value": "neon",
"vartype": "string"
},
{
"name": "neon.safekeepers",
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
"vartype": "string"
},
{
"name": "neon.timeline_id",
"value": "TIMELINE_ID",
"vartype": "string"
},
{
"name": "neon.tenant_id",
"value": "TENANT_ID",
"vartype": "string"
},
{
"name": "neon.pageserver_connstring",
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",
"vartype": "string"
},
{
"name": "max_replication_flush_lag",
"value": "10GB",
"vartype": "string"
}
]
},
"delta_operations": [
]
}

View File

@@ -0,0 +1,200 @@
version: '3'
services:
etcd:
image: quay.io/coreos/etcd:v3.5.4
ports:
- 2379:2379
- 2380:2380
environment:
# This significantly speeds up etcd and we don't need data persistency there anyway.
ETCD_UNSAFE_NO_FSYNC: "1"
command:
- "etcd"
- "--auto-compaction-mode=revision"
- "--auto-compaction-retention=1"
- "--name=etcd-cluster"
- "--initial-cluster-state=new"
- "--initial-cluster-token=etcd-cluster-1"
- "--initial-cluster=etcd-cluster=http://etcd:2380"
- "--initial-advertise-peer-urls=http://etcd:2380"
- "--advertise-client-urls=http://etcd:2379"
- "--listen-client-urls=http://0.0.0.0:2379"
- "--listen-peer-urls=http://0.0.0.0:2380"
- "--quota-backend-bytes=134217728" # 128 MB
minio:
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
ports:
- 9000:9000
- 9001:9001
environment:
- MINIO_ROOT_USER=minio
- MINIO_ROOT_PASSWORD=password
command: server /data --address :9000 --console-address ":9001"
minio_create_buckets:
image: minio/mc
environment:
- MINIO_ROOT_USER=minio
- MINIO_ROOT_PASSWORD=password
entrypoint:
- "/bin/sh"
- "-c"
command:
- "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
echo 'Waiting to start minio...' && sleep 1;
done;
/usr/bin/mc mb minio/neon --region=eu-north-1;
exit 0;"
depends_on:
- minio
pageserver:
image: neondatabase/neon:${TAG:-latest}
environment:
- BROKER_ENDPOINT='http://etcd:2379'
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 6400:6400 # pg protocol handler
- 9898:9898 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "/usr/local/bin/pageserver -D /data/.neon/
-c \"broker_endpoints=[$$BROKER_ENDPOINT]\"
-c \"listen_pg_addr='0.0.0.0:6400'\"
-c \"listen_http_addr='0.0.0.0:9898'\"
-c \"remote_storage={endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/pageserver/'}\""
depends_on:
- etcd
- minio_create_buckets
safekeeper1:
image: neondatabase/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
- SAFEKEEPER_ID=1
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 5454:5454 # pg protocol handler
- 7676:7676 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- etcd
- minio_create_buckets
safekeeper2:
image: neondatabase/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
- SAFEKEEPER_ID=2
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 5454:5454 # pg protocol handler
- 7677:7676 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- etcd
- minio_create_buckets
safekeeper3:
image: neondatabase/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
- SAFEKEEPER_ID=3
- BROKER_ENDPOINT=http://etcd:2379
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 5454:5454 # pg protocol handler
- 7678:7676 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
--listen-http='0.0.0.0:7676'
--id=$$SAFEKEEPER_ID
--broker-endpoints=$$BROKER_ENDPOINT
-D /data
--remote-storage=\"{endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/safekeeper/'}\""
depends_on:
- etcd
- minio_create_buckets
compute:
build:
context: ./image/compute
args:
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
- http_proxy=$http_proxy
- https_proxy=$https_proxy
environment:
- PG_VERSION=${PG_VERSION:-14}
#- RUST_BACKTRACE=1
volumes:
- ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute/shell/:/shell/
ports:
- 55433:55433 # pg protocol handler
- 3080:3080 # http endpoints
entrypoint:
- "/shell/compute.sh"
depends_on:
- safekeeper1
- safekeeper2
- safekeeper3
- pageserver
compute_is_ready:
image: postgres:latest
entrypoint:
- "/bin/bash"
- "-c"
command:
- "until pg_isready -h compute -p 55433 ; do
echo 'Waiting to start compute...' && sleep 1;
done"
depends_on:
- compute

View File

@@ -0,0 +1,10 @@
ARG COMPUTE_IMAGE=compute-node-v14:latest
FROM neondatabase/${COMPUTE_IMAGE}
USER root
RUN apt-get update && \
apt-get install -y curl \
jq \
netcat
USER postgres

View File

@@ -80,4 +80,6 @@
- [015-storage-messaging](rfcs/015-storage-messaging.md)
- [016-connection-routing](rfcs/016-connection-routing.md)
- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
- [cluster-size-limits](rfcs/cluster-size-limits.md)

View File

@@ -18,3 +18,67 @@ We build all images after a successful `release` tests run and push automaticall
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
2. `neondatabase/neon`
## Docker Compose example
You can see a [docker compose](https://docs.docker.com/compose/) example that creates a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers.
- etcd x 1
- pageserver x 1
- safekeeper x 3
- compute x 1
- MinIO x 1 # This is Amazon S3 compatible object storage
### How to use
1. create containers
You can specify the version of the neon cluster using the following environment variables.
- PG_VERSION: postgres version for compute (default is 14)
- TAG: the tag of the [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
```
$ cd docker-compose
$ docker-compose down # remove the containers if they exist
$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version
Creating network "dockercompose_default" with the default driver
Creating dockercompose_etcd3_1 ...
(...omit...)
```
2. connect to the compute node
```
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
$ psql -h localhost -p 55433 -U cloud_admin
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);
INSERT 0 1
postgres=# select * from t;
key | value
-----+-------
1 | 1
(1 row)
```
3. If you want to see the logs, you can use the `docker-compose logs` command.
```
# check the container name you want to see
$ docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1
(...omit...)
$ docker logs -f dockercompose_compute_1
2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql
2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
(...omit...)
```
4. If you want to see the durable data in MinIO, which is S3-compatible storage:
Access http://localhost:9001 and sign in.
- Username: `minio`
- Password: `password`
You can see durable pages and WAL data in the `neon` bucket.

View File

@@ -0,0 +1,91 @@
# Managing Tenant and Timeline lifecycles
## Summary
The pageserver has a Tenant object in memory for each tenant it manages, and a
Timeline for each timeline. There are a lot of tasks that operate on the tenants
and timelines with references to those objects. We have some mechanisms to track
which tasks are operating on each Tenant and Timeline, and to request them to
shut down when a tenant or timeline is deleted, but those mechanisms do not cover all
uses, and as a result we have many race conditions around tenant/timeline shutdown.
## Motivation
We have a bunch of race conditions that can produce weird errors and can be hard
to track down.
## Non Goals
This RFC only covers the problem of ensuring that no task/thread is still operating
on a Tenant or Timeline when it is removed. It does not cover what states, aside from Active and
non-Active, each Tenant and Timeline should have, or when exactly the transitions
should happen.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Pageserver. Although I wonder if the safekeeper should have a similar mechanism.
## Current situation
Most pageserver tasks are managed by task_mgr.rs:
- LibpqEndpointListener
- HttpEndPointListener
- WalReceiverManager and -Connection
- GarbageCollector and Compaction
- InitialLogicalSizeCalculation
In addition to those tasks, the walreceiver performs some direct tokio::spawn
calls to spawn tasks that are not registered with 'task_mgr'. And all of these
tasks can spawn extra operations with tokio spawn_blocking.
Whenever a tenant or timeline is removed from the system, by pageserver
shutdown, delete_timeline or tenant-detach operation, we rely on the task
registry in 'task_mgr.rs' to wait until there are no tasks operating on the
tenant or timeline, before its Tenant/Timeline object is removed. That relies on
each task to register itself with the tenant/timeline ID in
'task_mgr.rs'. However, there are many gaps in that. For example,
GarbageCollection and Compaction tasks are registered with the tenant, but when
they proceed to operate on a particular timeline of the tenant, they don't
register with timeline ID. Because of that, the timeline can be deleted while GC
or compaction is running on it, causing failures in the GC or compaction (see
https://github.com/neondatabase/neon/issues/2442).
Another problem is that the task registry only works for tokio Tasks. There is
no way to register a piece of code that runs inside spawn_blocking(), for
example.
## Proposed implementation
This "voluntary" registration of tasks is fragile. Let's use Rust language features
to enforce that a tenant/timeline cannot be removed from the system when there is
still some code operating on it.
Let's introduce new Guard objects for Tenant and Timeline, and do all actions through
the Guard object. Something like:
TenantActiveGuard: a guard object over Arc<Tenant>. When you acquire the guard,
the code checks that the tenant is in Active state. If it's not, you get an
error. You can change the state of the tenant to Stopping while there are
TenantActiveGuard objects still on it, to prevent new TenantActiveGuards from
being acquired, but the Tenant cannot be removed until all the guards are gone.
TenantMaintenanceGuard: like TenantActiveGuard, but can be held even when the
tenant is not in Active state. Used for operations like attach/detach. Perhaps
allow only one such guard on a Tenant at a time.
Similarly for Timelines. We don't currently have a "state" on Timeline, but I think
we need at least two states: Active and Stopping. The Stopping state is used at
deletion, to prevent new TimelineActiveGuards from appearing, while you wait for
existing TimelineActiveGuards to die out.
The shutdown-signaling, using shutdown_watcher() and is_shutdown_requested(),
probably also needs changes to deal with the new Guards. The rule is that if you
have a TenantActiveGuard, and the tenant's state changes from Active to
Stopping, the is_shutdown_requested() function should return true, and
shutdown_watcher() future should return.
This signaling doesn't necessarily need to cover all cases. For example, if you
have a block of code in spawn_blocking(), it might be acceptable if
is_shutdown_requested() doesn't return true even though the tenant is in
Stopping state, as long as the code finishes reasonably fast.
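As a rough sketch of the guard idea and the signaling rule above (the types and
field names here are made up for illustration, not the actual pageserver code;
a real version would also need a way to wait for the guard count to reach zero,
e.g. a condition variable or a watch channel):
```rust
use std::sync::{Arc, Mutex};

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum TenantState {
    Active,
    Stopping,
}

struct TenantInner {
    state: TenantState,
    active_guards: usize, // number of TenantActiveGuards currently alive
}

struct Tenant {
    inner: Mutex<TenantInner>,
}

/// Proof that some piece of code is operating on an Active tenant.
/// While any guard exists, the Tenant object must not be removed.
struct TenantActiveGuard {
    tenant: Arc<Tenant>,
}

impl TenantActiveGuard {
    fn acquire(tenant: Arc<Tenant>) -> anyhow::Result<Self> {
        let mut inner = tenant.inner.lock().unwrap();
        if inner.state != TenantState::Active {
            anyhow::bail!("tenant is not active: {:?}", inner.state);
        }
        inner.active_guards += 1;
        drop(inner);
        Ok(TenantActiveGuard { tenant })
    }

    /// The signaling rule: once the tenant moves to Stopping, code holding a
    /// guard should notice it and finish up reasonably fast.
    fn is_shutdown_requested(&self) -> bool {
        self.tenant.inner.lock().unwrap().state == TenantState::Stopping
    }
}

impl Drop for TenantActiveGuard {
    fn drop(&mut self) {
        self.tenant.inner.lock().unwrap().active_guards -= 1;
    }
}

impl Tenant {
    /// Move to Stopping so that no new guards can be acquired. Actual removal
    /// happens elsewhere, only after active_guards has dropped to zero.
    fn set_stopping(&self) {
        self.inner.lock().unwrap().state = TenantState::Stopping;
    }
}
```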

View File

@@ -0,0 +1,246 @@
# Coordinating access of multiple pageservers to the same s3 data
## Motivation
There are some blind spots around coordinating access of multiple pageservers
to the same s3 data. Currently this is applicable only to the tenant relocation
case, but in the future we'll need to solve similar problems for
replica/standby pageservers.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Pageserver
## The problem
### Relocation
During relocation both pageservers can write to s3. This should be ok for all
data except the `index_part.json`. For the index part it causes problems during
compaction/gc, because those jobs remove files from the index/s3.
Imagine this case:
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
participant PS2
PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
PS2->>S3: Attach called, sees L1, L2
PS1->>S3: Compaction comes <br/> Removes L1, adds L3
note over S3: Index now L2, L3
PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
note over S3: Index now L1, L2, L4
```
At this point it is not possible to restore from the index: it contains L1, which
is no longer available in s3, and doesn't contain L3, which was added by the first
pageserver's compaction. So if any of the pageservers restarts, the initial sync will fail
(or, in the on-demand world, it will fail a bit later, during a page request for a
missing layer).
### Standby pageserver
Another related case is a standby pageserver. In this case the second pageserver can
be used as a replica to scale reads and serve as a failover target in case the
first one fails.
In this mode the second pageserver needs to have the same picture of s3 files to
be able to load layers on-demand. To accomplish that, the second pageserver
cannot run gc/compaction jobs. Instead it needs to receive updates to the index
contents. (There is no need to run the walreceiver on the second pageserver then.)
## Observations
- If both pageservers ingest wal then their layer sets diverge, because layer
file generation is not deterministic
- If one of the pageservers does not ingest wal (and just picks up layer
updates) then it lags behind and cannot really answer queries at the same
pace as the primary one
- Can compaction help make layers deterministic? E.g. we do not upload level
zero layers, and construction of higher levels should be deterministic.
This way we can guarantee that layer creation by timeout won't mess things up.
This way one pageserver uploads data and the second one can just ingest it.
But we still need some form of election
## Solutions
### Manual orchestration
One possible solution for the relocation case is to orchestrate background jobs
from outside. The oracle who runs the migration can turn off background jobs on
PS1 before the migration, run the migration, and then enable them on PS2. The problem
comes if the migration fails. In this case, in order to resume background jobs, the
oracle needs to guarantee that PS2 doesn't run background jobs, and if PS2 doesn't
respond then PS1 is stuck, unable to run compaction/gc. This cannot be solved
without a human ensuring that no upload from PS2 can happen. To be able
to resolve this automatically, CAS is required on the S3 side so a pageserver can
avoid overwriting the index part if it is no longer the leading one.
Note that the flag that disables background jobs needs to be persistent, because
otherwise a pageserver restart will clear it.
### Avoid index_part.json
The index part consists of two pieces: the list of layers and the metadata. The list of layers
can be easily obtained with the `ListObjects` S3 API method. But what to do with the
metadata? Create a metadata instance for each checkpoint and add some counter
to the file name?
That puts us back to a potentially long `s3 ls`.
### Coordination based approach
Do it like safekeepers choose the leader for WAL upload: ping each other and decide
based on some heuristic, e.g. the smallest node id. During relocation PS1 sends a
"resign" ping message so others can start an election without waiting for a timeout.
This still leaves the metadata question open, and non-deterministic layers are a
problem as well.
### Avoid metadata file
One way to eliminate the metadata file is to store it in layer files under some
special key. This may resonate with the intention to keep all relation sizes in
some special segment to avoid the initial download during size calculation.
Maybe with that we can even store a pre-calculated value.
As a downside, each checkpoint gets 512 bytes larger.
If we entirely avoid the metadata file, this opens up many approaches.
* * *
During the discussion it seems that we converged on an approach consisting of:
- Index files are stored per pageserver in the same timeline directory. With that,
the index file name starts to look like `<pageserver_node_id>_index_part.json`.
In such a setup there are no concurrent overwrites of the index file by different
pageservers.
- For replica pageservers the solution would be for the primary to broadcast index
changes to any followers, with the ability to check the index files in s3 and
restore the full state. To properly merge changes with index files we can use
a counter that is persisted in the index file, incremented on every change
to it, and passed along with the broadcasted change. This way we can determine
whether we need to apply a change to the index state or not (see the sketch
after this list).
- Responsibility for running background jobs is assigned externally. The pageserver
keeps a locally persistent flag for each tenant that indicates whether this
pageserver is considered the primary one or not. TODO: what happens if we
crash and cannot start for some extended period of time? The control plane can
assign ownership to some other pageserver. The pageserver needs some way to check
if it is still the blessed one, maybe by an explicit request to the control plane on
start.
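A minimal sketch of the counter-based merge rule from the second bullet, with
hypothetical names (the real index format and broadcast mechanism are not
specified here); a follower that notices a gap in the counter would refetch the
full index file from s3 instead of applying the change:
```rust
use std::collections::BTreeSet;

// Hypothetical shapes; the real index_part.json schema is different.
struct IndexState {
    change_counter: u64, // persisted inside the per-pageserver index file
    layers: BTreeSet<String>,
}

struct IndexChange {
    counter: u64, // the primary's counter value after applying this change
    added: Vec<String>,
    removed: Vec<String>,
}

impl IndexState {
    /// Apply a broadcast change only if it is newer than our current state.
    /// Returns true if it was applied; false means it was already applied or
    /// superseded by a later snapshot restored from s3.
    fn apply(&mut self, change: &IndexChange) -> bool {
        if change.counter <= self.change_counter {
            return false;
        }
        for layer in &change.removed {
            self.layers.remove(layer);
        }
        for layer in &change.added {
            self.layers.insert(layer.clone());
        }
        self.change_counter = change.counter;
        true
    }
}
```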
The requirement for deterministic layer generation was considered overly strict
for two reasons:
- It can limit possible optimizations, e.g. when a pageserver wants to reshuffle
some data locally and doesn't want to coordinate this
- The deterministic algorithm itself can change, so during deployments for some
time there will be two different versions running at the same time, which can
cause non-determinism
### External elections
With externally managed leadership, the above case with lost state is
represented like this:
Note that here we keep the object list in the index file.
```mermaid
sequenceDiagram
autonumber
participant PS1
participant CP as Control Plane
participant S3
participant PS2
note over PS1,PS2: PS1 starts up and is still the leader
PS1->>CP: Am I still the leader for Tenant X?
activate CP
CP->>PS1: Yes
deactivate CP
PS1->>S3: Fetch PS1 index.
note over PS1: Continue operations, start background jobs
note over PS1,PS2: PS1 starts up and is no longer the leader
PS1->>CP: Am I still the leader for Tenant X?
CP->>PS1: No
PS1->>PS2: Subscribe to index changes
PS1->>S3: Fetch PS1 and PS2 indexes
note over PS1: Combine index files to include layers <br> from both indexes, to be able <br> to see newer files from the leader (PS2)
note over PS1: Continue operations, do not start background jobs
```
### Internal elections
To manage leadership internally we can use the broker to exchange pings so nodes
can decide on leader roles. If multiple pageservers are active, the leader is
the one with the lowest node id (a small sketch of this rule follows the diagram below).
Operations with internally managed elections:
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
note over PS1: Starts up
note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
PS1->>S3: Fetch indexes from s3
alt there is a leader
note over PS1: do not start background jobs, <br> continue applying index updates
else there is no leader
note over PS1: start background jobs, <br> broadcast index changes
end
note over PS1,S3: Then the picture is similar to external elections; <br> the difference is that a follower can become the leader: <br> if there are no pings for some timeout, a new leader gets elected
```
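A small sketch of the "lowest node id among recently pinging nodes" rule, with
hypothetical names (in reality the pings would be exchanged through the broker
and carry per-tenant state):
```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

// Hypothetical ping bookkeeping kept by each pageserver for a tenant.
struct PingTable {
    last_ping: HashMap<u64, Instant>, // node id -> time we last saw a ping
    ping_timeout: Duration,
}

impl PingTable {
    fn record_ping(&mut self, node_id: u64, now: Instant) {
        self.last_ping.insert(node_id, now);
    }

    /// The leader is the node with the lowest id among the nodes that pinged
    /// recently enough; our own id always participates.
    fn leader(&self, my_id: u64, now: Instant) -> u64 {
        self.last_ping
            .iter()
            .filter(|(_, seen)| now.duration_since(**seen) < self.ping_timeout)
            .map(|(id, _)| *id)
            .chain(std::iter::once(my_id))
            .min()
            .expect("the chain always contains my_id")
    }

    fn is_leader(&self, my_id: u64, now: Instant) -> bool {
        self.leader(my_id, now) == my_id
    }
}
```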
### Eviction
When two pageservers operate on a tenant for an extended period of time, the follower
doesn't perform write operations in s3. When a layer is evicted, the follower relies
on updates from the primary to learn which layers it needs to cover the range of the
evicted layer.
Note that they won't match the evicted layer exactly, so layers will overlap and
the lookup code needs to handle that correctly.
### Relocation flow
Actions become:
- Attach the tenant to the new pageserver
- The new pageserver becomes a follower, since the previous one is still leading
- The new pageserver starts replicating from safekeepers but does not upload layers
- Detach is called on the old one
- The new pageserver becomes the leader after it realizes that the old one has disappeared
### Index File
Using `s3 ls` on startup simplifies things, but we still need the metadata, so we
need to fetch the index files anyway. If they contain the list of files, we can combine
them and avoid a costly `s3 ls`.
### Remaining issues
- More than one remote consistent lsn for safekeepers to know
Anything else?
### Proposed solution
To recap: at the meeting we converged on the approach with external elections, but I
think it will be harder to manage overall and will introduce a dependency on the
control plane for the pageserver. Using separate index files for each pageserver,
consisting of a log of operations and a metadata snapshot, should be enough.
### What we need to get there?
- Change the index file structure to contain a log of changes instead of just the
file list
- Implement pinging/elections for pageservers

View File

@@ -52,6 +52,10 @@ PostgreSQL extension that implements storage manager API and network communicati
PostgreSQL extension that contains functions needed for testing and debugging.
`/pgxn/neon_walredo`:
Library to run Postgres as a "WAL redo process" in the pageserver.
`/safekeeper`:
The neon WAL service that receives WAL from primary compute nodes and streams it to the pageserver.

View File

@@ -7,6 +7,9 @@ edition = "2021"
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
const_format = "0.2.21"
anyhow = { version = "1.0", features = ["backtrace"] }
bytes = "1.0.1"
utils = { path = "../utils" }
postgres_ffi = { path = "../postgres_ffi" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -2,6 +2,7 @@ use const_format::formatcp;
/// Public API types
pub mod models;
pub mod reltag;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");

View File

@@ -7,6 +7,10 @@ use utils::{
lsn::Lsn,
};
use crate::reltag::RelTag;
use anyhow::bail;
use bytes::{Buf, BufMut, Bytes, BytesMut};
/// A state of a tenant in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TenantState {
@@ -19,6 +23,22 @@ pub enum TenantState {
Broken,
}
/// A state of a timeline in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState {
/// Timeline is fully operational, its background jobs are running.
Active,
/// A timeline is recognized by pageserver, but not yet ready to operate.
/// The status indicates that the timeline could eventually go back to Active automatically:
/// for example, if the owning tenant goes back to Active again.
Suspended,
/// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
/// automatically become Active after certain events: only a management call can change this status.
Paused,
/// A timeline is recognized by the pageserver, but no longer used for any operations, as it failed to get activated.
Broken,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
@@ -160,6 +180,8 @@ pub struct TimelineInfo {
pub remote_consistent_lsn: Option<Lsn>,
pub awaits_download: bool,
pub state: TimelineState,
// Some of the above fields are duplicated in 'local' and 'remote', for backwards-
// compatibility with older clients.
pub local: LocalTimelineInfo,
@@ -201,3 +223,160 @@ pub struct FailpointConfig {
pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
}
// Wrapped in libpq CopyData
pub enum PagestreamFeMessage {
Exists(PagestreamExistsRequest),
Nblocks(PagestreamNblocksRequest),
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
}
// Wrapped in libpq CopyData
pub enum PagestreamBeMessage {
Exists(PagestreamExistsResponse),
Nblocks(PagestreamNblocksResponse),
GetPage(PagestreamGetPageResponse),
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
}
#[derive(Debug)]
pub struct PagestreamExistsRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug)]
pub struct PagestreamNblocksRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug)]
pub struct PagestreamGetPageRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
pub blkno: u32,
}
#[derive(Debug)]
pub struct PagestreamDbSizeRequest {
pub latest: bool,
pub lsn: Lsn,
pub dbnode: u32,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub exists: bool,
}
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
pub n_blocks: u32,
}
#[derive(Debug)]
pub struct PagestreamGetPageResponse {
pub page: Bytes,
}
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub message: String,
}
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
pub db_size: i64,
}
impl PagestreamFeMessage {
pub fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.get_u8();
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
blkno: body.get_u32(),
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
dbnode: body.get_u32(),
})),
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
}
}
}
impl PagestreamBeMessage {
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(resp) => {
bytes.put_u8(100); /* tag from pagestore_client.h */
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(101); /* tag from pagestore_client.h */
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(102); /* tag from pagestore_client.h */
bytes.put(&resp.page[..]);
}
Self::Error(resp) => {
bytes.put_u8(103); /* tag from pagestore_client.h */
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(104); /* tag from pagestore_client.h */
bytes.put_i64(resp.db_size);
}
}
bytes.into()
}
}

View File

@@ -13,7 +13,7 @@ crc32c = "0.6.0"
hex = "0.4.3"
once_cell = "1.13.0"
log = "0.4.14"
memoffset = "0.6.2"
memoffset = "0.7"
thiserror = "1.0"
serde = { version = "1.0", features = ["derive"] }
utils = { path = "../utils" }
@@ -26,4 +26,4 @@ wal_craft = { path = "wal_craft" }
[build-dependencies]
anyhow = "1.0"
bindgen = "0.60.1"
bindgen = "0.61"

View File

@@ -7,7 +7,7 @@ edition = "2021"
[dependencies]
anyhow = "1.0"
clap = "3.0"
clap = "4.0"
env_logger = "0.9"
log = "0.4"
once_cell = "1.13.0"

View File

@@ -1,68 +1,19 @@
use anyhow::*;
use clap::{App, Arg, ArgMatches};
use std::str::FromStr;
use clap::{value_parser, Arg, ArgMatches, Command};
use std::{path::PathBuf, str::FromStr};
use wal_craft::*;
fn main() -> Result<()> {
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
.init();
let type_arg = &Arg::new("type")
.takes_value(true)
.help("Type of WAL to craft")
.possible_values([
Simple::NAME,
LastWalRecordXlogSwitch::NAME,
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
WalRecordCrossingSegmentFollowedBySmallOne::NAME,
LastWalRecordCrossingSegment::NAME,
])
.required(true);
let arg_matches = App::new("Postgres WAL crafter")
.about("Crafts Postgres databases with specific WAL properties")
.subcommand(
App::new("print-postgres-config")
.about("Print the configuration required for PostgreSQL server before running this script")
)
.subcommand(
App::new("with-initdb")
.about("Craft WAL in a new data directory first initialized with initdb")
.arg(type_arg)
.arg(
Arg::new("datadir")
.takes_value(true)
.help("Data directory for the Postgres server")
.required(true)
)
.arg(
Arg::new("pg-distrib-dir")
.long("pg-distrib-dir")
.takes_value(true)
.help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
.default_value("/usr/local")
)
.arg(
Arg::new("pg-version")
.long("pg-version")
.help("Postgres version to use for the initial tenant")
.required(true)
.takes_value(true)
)
)
.subcommand(
App::new("in-existing")
.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
.arg(type_arg)
.arg(
Arg::new("connection")
.takes_value(true)
.help("Connection string to the Postgres database to populate")
.required(true)
)
)
.get_matches();
let arg_matches = cli().get_matches();
let wal_craft = |arg_matches: &ArgMatches, client| {
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() {
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
.get_one::<String>("type")
.map(|s| s.as_str())
.context("'type' is required")?
{
Simple::NAME => Simple::craft(client)?,
LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
@@ -72,12 +23,12 @@ fn main() -> Result<()> {
WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
}
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
a => panic!("Unknown --type argument: {}", a),
a => panic!("Unknown --type argument: {a}"),
};
for lsn in intermediate_lsns {
println!("intermediate_lsn = {}", lsn);
println!("intermediate_lsn = {lsn}");
}
println!("end_of_wal = {}", end_of_wal_lsn);
println!("end_of_wal = {end_of_wal_lsn}");
Ok(())
};
@@ -85,20 +36,24 @@ fn main() -> Result<()> {
None => panic!("No subcommand provided"),
Some(("print-postgres-config", _)) => {
for cfg in REQUIRED_POSTGRES_CONFIG.iter() {
println!("{}", cfg);
println!("{cfg}");
}
Ok(())
}
Some(("with-initdb", arg_matches)) => {
let cfg = Conf {
pg_version: arg_matches
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.context("Failed to parse postgres version from the argument string")?,
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
datadir: arg_matches.value_of("datadir").unwrap().into(),
pg_version: *arg_matches
.get_one::<u32>("pg-version")
.context("'pg-version' is required")?,
pg_distrib_dir: arg_matches
.get_one::<PathBuf>("pg-distrib-dir")
.context("'pg-distrib-dir' is required")?
.to_owned(),
datadir: arg_matches
.get_one::<PathBuf>("datadir")
.context("'datadir' is required")?
.to_owned(),
};
cfg.initdb()?;
let srv = cfg.start_server()?;
@@ -108,9 +63,77 @@ fn main() -> Result<()> {
}
Some(("in-existing", arg_matches)) => wal_craft(
arg_matches,
&mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())?
.connect(postgres::NoTls)?,
&mut postgres::Config::from_str(
arg_matches
.get_one::<String>("connection")
.context("'connection' is required")?,
)
.context(
"'connection' argument value could not be parsed as a postgres connection string",
)?
.connect(postgres::NoTls)?,
),
Some(_) => panic!("Unknown subcommand"),
}
}
fn cli() -> Command {
let type_arg = &Arg::new("type")
.help("Type of WAL to craft")
.value_parser([
Simple::NAME,
LastWalRecordXlogSwitch::NAME,
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
WalRecordCrossingSegmentFollowedBySmallOne::NAME,
LastWalRecordCrossingSegment::NAME,
])
.required(true);
Command::new("Postgres WAL crafter")
.about("Crafts Postgres databases with specific WAL properties")
.subcommand(
Command::new("print-postgres-config")
.about("Print the configuration required for PostgreSQL server before running this script")
)
.subcommand(
Command::new("with-initdb")
.about("Craft WAL in a new data directory first initialized with initdb")
.arg(type_arg)
.arg(
Arg::new("datadir")
.help("Data directory for the Postgres server")
.value_parser(value_parser!(PathBuf))
.required(true)
)
.arg(
Arg::new("pg-distrib-dir")
.long("pg-distrib-dir")
.value_parser(value_parser!(PathBuf))
.help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
.default_value("/usr/local")
)
.arg(
Arg::new("pg-version")
.long("pg-version")
.help("Postgres version to use for the initial tenant")
.value_parser(value_parser!(u32))
.required(true)
)
)
.subcommand(
Command::new("in-existing")
.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
.arg(type_arg)
.arg(
Arg::new("connection")
.help("Connection string to the Postgres database to populate")
.required(true)
)
)
}
#[test]
fn verify_cli() {
cli().debug_assert();
}
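The wal_craft migration above follows the standard clap 4 pattern: typed values are registered with `value_parser!` and read back with `get_one::<T>()`. A minimal, self-contained sketch of that pattern (names are illustrative, not from the repo):

use clap::{value_parser, Arg, Command};

fn parsed_pg_version() -> u32 {
    let matches = Command::new("demo")
        .arg(
            Arg::new("pg-version")
                .long("pg-version")
                .value_parser(value_parser!(u32)) // clap parses and validates the integer
                .required(true),
        )
        .get_matches_from(["demo", "--pg-version", "15"]);
    // get_one returns Option<&u32>; the argument is required, so unwrap is safe here
    *matches.get_one::<u32>("pg-version").unwrap()
}

// parsed_pg_version() == 15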


@@ -37,22 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
});
impl Conf {
pub fn pg_distrib_dir(&self) -> PathBuf {
pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
match self.pg_version {
14 => path.join(format!("v{}", self.pg_version)),
15 => path.join(format!("v{}", self.pg_version)),
_ => panic!("Unsupported postgres version: {}", self.pg_version),
14 => Ok(path.join(format!("v{}", self.pg_version))),
15 => Ok(path.join(format!("v{}", self.pg_version))),
_ => bail!("Unsupported postgres version: {}", self.pg_version),
}
}
fn pg_bin_dir(&self) -> PathBuf {
self.pg_distrib_dir().join("bin")
fn pg_bin_dir(&self) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir()?.join("bin"))
}
fn pg_lib_dir(&self) -> PathBuf {
self.pg_distrib_dir().join("lib")
fn pg_lib_dir(&self) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir()?.join("lib"))
}
pub fn wal_dir(&self) -> PathBuf {
@@ -60,12 +60,12 @@ impl Conf {
}
fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
let path = self.pg_bin_dir().join(command);
let path = self.pg_bin_dir()?.join(command);
ensure!(path.exists(), "Command {:?} does not exist", path);
let mut cmd = Command::new(path);
cmd.env_clear()
.env("LD_LIBRARY_PATH", self.pg_lib_dir())
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir());
.env("LD_LIBRARY_PATH", self.pg_lib_dir()?)
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?);
Ok(cmd)
}
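With the accessors above now fallible, an unsupported postgres version surfaces as an error the caller can report rather than a panic. A hedged caller sketch (hypothetical helper, not part of the patch):

fn report_distrib_dir(conf: &Conf) {
    match conf.pg_distrib_dir() {
        Ok(dir) => println!("postgres distribution: {}", dir.display()),
        // "Unsupported postgres version" is now an error instead of a panic
        Err(e) => eprintln!("configuration problem: {e:#}"),
    }
}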


@@ -16,7 +16,7 @@ use tokio::{
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tracing::*;
use utils::crashsafe_dir::path_with_suffix_extension;
use utils::crashsafe::path_with_suffix_extension;
use crate::{Download, DownloadError, RemoteObjectId};


@@ -19,7 +19,7 @@ thiserror = "1.0"
tokio = { version = "1.17", features = ["macros"]}
tokio-rustls = "0.23"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
nix = "0.25"
signal-hook = "0.3.10"
rand = "0.8.3"
@@ -30,6 +30,8 @@ rustls-split = "0.3.0"
git-version = "0.3.5"
serde_with = "2.0"
once_cell = "1.13.0"
strum = "0.24"
strum_macros = "0.24"
metrics = { path = "../metrics" }


@@ -1,52 +0,0 @@
use postgres::Config;
pub fn connection_host_port(config: &Config) -> (String, u16) {
assert_eq!(
config.get_hosts().len(),
1,
"only one pair of host and port is supported in connection string"
);
assert_eq!(
config.get_ports().len(),
1,
"only one pair of host and port is supported in connection string"
);
let host = match &config.get_hosts()[0] {
postgres::config::Host::Tcp(host) => host.as_ref(),
postgres::config::Host::Unix(host) => host.to_str().unwrap(),
};
(host.to_owned(), config.get_ports()[0])
}
pub fn connection_address(config: &Config) -> String {
let (host, port) = connection_host_port(config);
format!("{}:{}", host, port)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_connection_host_port() {
let config: Config = "postgresql://no_user@localhost:64000/no_db"
.parse()
.unwrap();
assert_eq!(
connection_host_port(&config),
("localhost".to_owned(), 64000)
);
}
#[test]
#[should_panic(expected = "only one pair of host and port is supported in connection string")]
fn test_connection_host_port_multiple_ports() {
let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
.parse()
.unwrap();
assert_eq!(
connection_host_port(&config),
("localhost".to_owned(), 64000)
);
}
}


@@ -12,16 +12,8 @@ pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
let path = path.as_ref();
fs::create_dir(path)?;
File::open(path)?.sync_all()?;
if let Some(parent) = path.parent() {
File::open(parent)?.sync_all()
} else {
Err(io::Error::new(
io::ErrorKind::InvalidInput,
"can't find parent",
))
}
fsync_file_and_parent(path)?;
Ok(())
}
/// Similar to [`std::fs::create_dir_all`], except we fsync all
@@ -65,12 +57,12 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
// Fsync the created directories from child to parent.
for &path in dirs_to_create.iter() {
File::open(path)?.sync_all()?;
fsync(path)?;
}
// If we created any new directories, fsync the parent.
if !dirs_to_create.is_empty() {
File::open(path)?.sync_all()?;
fsync(path)?;
}
Ok(())
@@ -92,6 +84,33 @@ pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str)
.with_extension(new_extension.as_ref())
}
pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> {
let parent = file_path.parent().ok_or_else(|| {
io::Error::new(
io::ErrorKind::Other,
format!("File {file_path:?} has no parent"),
)
})?;
fsync(file_path)?;
fsync(parent)?;
Ok(())
}
pub fn fsync(path: &Path) -> io::Result<()> {
File::open(path)
.map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}")))
.and_then(|file| {
file.sync_all().map_err(|e| {
io::Error::new(
e.kind(),
format!("Failed to sync file {path:?} data and metadata: {e}"),
)
})
})
.map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
}
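A hedged usage sketch of the helpers above (hypothetical function, not in the repo): writing a file durably means fsyncing both the file contents and the directory entry that points at it.

use std::{fs, io, path::Path};

// Build on fsync_file_and_parent() from this module.
fn write_durably(path: &Path, data: &[u8]) -> io::Result<()> {
    fs::write(path, data)?;     // create/overwrite the file
    fsync_file_and_parent(path) // flush the file data and its parent directory
}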
#[cfg(test)]
mod tests {
use tempfile::tempdir;


@@ -3,13 +3,6 @@ use std::{fmt, str::FromStr};
use hex::FromHex;
use rand::Rng;
use serde::{Deserialize, Serialize};
use thiserror::Error;
#[derive(Error, Debug)]
pub enum IdError {
#[error("invalid id length {0}")]
VecParseError(usize),
}
/// Neon ID is a 128-bit random ID.
/// Used to represent various identifiers. Provides handy utility methods and impls.
@@ -29,15 +22,6 @@ impl Id {
Id::from(arr)
}
pub fn from_vec(src: &Vec<u8>) -> Result<Id, IdError> {
if src.len() != 16 {
return Err(IdError::VecParseError(src.len()));
}
let mut zid_slice = [0u8; 16];
zid_slice.copy_from_slice(&src);
Ok(zid_slice.into())
}
pub fn as_arr(&self) -> [u8; 16] {
self.0
}
@@ -116,10 +100,6 @@ macro_rules! id_newtype {
$t(Id::get_from_buf(buf))
}
pub fn from_vec(src: &Vec<u8>) -> Result<$t, IdError> {
Ok($t(Id::from_vec(src)?))
}
pub fn as_arr(&self) -> [u8; 16] {
self.0.as_arr()
}


@@ -19,11 +19,8 @@ pub mod postgres_backend;
pub mod postgres_backend_async;
pub mod pq_proto;
// dealing with connstring parsing and handy access to it's parts
pub mod connstring;
// helper functions for creating and fsyncing directories/trees
pub mod crashsafe_dir;
// helper functions for creating and fsyncing
pub mod crashsafe;
// common authentication routines
pub mod auth;
@@ -39,6 +36,8 @@ pub mod sock_split;
// common log initialisation routine
pub mod logging;
pub mod lock_file;
// Misc
pub mod accum;
pub mod shutdown;


@@ -0,0 +1,81 @@
//! A module to create and read lock files. A lock file ensures that only one
//! process is running at a time, in a particular directory.
//!
//! File locking is done using [`fcntl::flock`], which means that holding the
//! lock on the file only prevents acquiring another lock on it; all other
//! operations are still possible on the file. Other processes can still open, read,
//! write, or remove the file, for example.
//! If the file is removed while a process is holding a lock on it,
//! the process that holds the lock does not get any error or notification.
//! Furthermore, you can create a new file with the same name and lock the new file,
//! while the old process is still running.
//! Deleting the lock file while the locking process is still running is a bad idea!
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
use anyhow::Context;
use nix::fcntl;
use crate::crashsafe;
pub enum LockCreationResult {
Created {
new_lock_contents: String,
file: fs::File,
},
AlreadyLocked {
existing_lock_contents: String,
},
CreationFailed(anyhow::Error),
}
/// Creates a lock file in the path given and writes the given contents into the file.
/// Note: The lock is automatically released when the file is closed. You might want to use Box::leak to make sure it lives until the end of the program.
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
let lock_file = match fs::OpenOptions::new()
.create(true) // O_CREAT
.write(true)
.open(lock_file_path)
.context("Failed to open lock file")
{
Ok(file) => file,
Err(e) => return LockCreationResult::CreationFailed(e),
};
match fcntl::flock(
lock_file.as_raw_fd(),
fcntl::FlockArg::LockExclusiveNonblock,
) {
Ok(()) => {
match lock_file
.set_len(0)
.context("Failed to truncate lockfile")
.and_then(|()| {
fs::write(lock_file_path, &contents).with_context(|| {
format!("Failed to write '{contents}' contents into lockfile")
})
})
.and_then(|()| {
crashsafe::fsync_file_and_parent(lock_file_path)
.context("Failed to fsync lockfile")
}) {
Ok(()) => LockCreationResult::Created {
new_lock_contents: contents,
file: lock_file,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(nix::errno::Errno::EAGAIN) => {
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
existing_lock_contents,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(e) => {
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
}
}
}
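A hedged sketch of how a caller might use `create_lock_file` at startup (this mirrors the pageserver change later in this set; the helper and file name here are illustrative):

use std::path::Path;

fn claim_workdir(workdir: &Path) -> anyhow::Result<()> {
    let lock_path = workdir.join("service.pid");
    match create_lock_file(&lock_path, std::process::id().to_string()) {
        LockCreationResult::Created { file, .. } => {
            // The flock is released when `file` is dropped, so leak it to hold
            // the lock for the lifetime of the process.
            let _ = Box::leak(Box::new(file));
            Ok(())
        }
        LockCreationResult::AlreadyLocked { existing_lock_contents } => {
            anyhow::bail!("already running with PID {existing_lock_contents}")
        }
        LockCreationResult::CreationFailed(e) => Err(e),
    }
}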


@@ -1,19 +1,28 @@
use std::{
fs::{File, OpenOptions},
path::Path,
};
use std::str::FromStr;
use anyhow::{Context, Result};
use anyhow::Context;
use strum_macros::{EnumString, EnumVariantNames};
pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
// Don't open the same file for output multiple times;
// the different fds could overwrite each other's output.
let log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_filename)
.with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
#[strum(serialize_all = "snake_case")]
pub enum LogFormat {
Plain,
Json,
}
impl LogFormat {
pub fn from_config(s: &str) -> anyhow::Result<LogFormat> {
use strum::VariantNames;
LogFormat::from_str(s).with_context(|| {
format!(
"Unrecognized log format. Please specify one of: {:?}",
LogFormat::VARIANTS
)
})
}
}
pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
let default_filter_str = "info";
// We fall back to printing all spans at info-level or above if
@@ -23,20 +32,14 @@ pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
let base_logger = tracing_subscriber::fmt()
.with_env_filter(env_filter)
.with_target(false) // don't include event targets
.with_ansi(false); // don't use colors in log file;
.with_target(false)
.with_ansi(false)
.with_writer(std::io::stdout);
// we are cloning and returning the log file in order to allow redirecting daemonized stdout and stderr to it
// if we do not use daemonization (e.g. in docker) it is better to log to stdout directly,
// for example to be in line with the docker log command, which expects logs coming from stdout
if daemonize {
let x = log_file.try_clone().unwrap();
base_logger
.with_writer(move || x.try_clone().unwrap())
.init();
} else {
base_logger.init();
match log_format {
LogFormat::Json => base_logger.json().init(),
LogFormat::Plain => base_logger.init(),
}
Ok(log_file)
Ok(())
}
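A hedged sketch of the call site, assuming this `logging` module: the configured string is parsed into a `LogFormat` and the tracing subscriber is installed once at startup (helper name is illustrative).

fn setup_logging(configured: &str) -> anyhow::Result<()> {
    let format = LogFormat::from_config(configured)?; // accepts "plain" or "json"
    init(format)
}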


@@ -15,7 +15,7 @@ use std::sync::Arc;
use std::task::Poll;
use tracing::{debug, error, trace};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
use tokio_rustls::TlsAcceptor;
#[async_trait::async_trait]
@@ -66,8 +66,8 @@ pub enum ProcessMsgResult {
/// Always-writeable sock_split stream.
/// May not be readable. See [`PostgresBackend::take_stream_in`]
pub enum Stream {
Unencrypted(tokio::net::TcpStream),
Tls(Box<tokio_rustls::server::TlsStream<tokio::net::TcpStream>>),
Unencrypted(BufReader<tokio::net::TcpStream>),
Tls(Box<tokio_rustls::server::TlsStream<BufReader<tokio::net::TcpStream>>>),
Broken,
}
@@ -157,7 +157,7 @@ impl PostgresBackend {
let peer_addr = socket.peer_addr()?;
Ok(Self {
stream: Stream::Unencrypted(socket),
stream: Stream::Unencrypted(BufReader::new(socket)),
buf_out: BytesMut::with_capacity(10 * 1024),
state: ProtoState::Initialization,
md5_salt: [0u8; 4],


@@ -23,8 +23,7 @@ futures = "0.3.13"
hex = "0.4.3"
hyper = "0.14"
itertools = "0.10.3"
clap = "3.0"
daemonize = "0.4.1"
clap = { version = "4.0", features = ["string"] }
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
@@ -56,7 +55,7 @@ fail = "0.5.0"
git-version = "0.3.5"
rstar = "0.9.3"
num-traits = "0.2.15"
amplify_num = "0.4.1"
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
pageserver_api = { path = "../libs/pageserver_api" }
postgres_ffi = { path = "../libs/postgres_ffi" }
@@ -67,7 +66,13 @@ remote_storage = { path = "../libs/remote_storage" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
close_fds = "0.3.2"
walkdir = "2.3.2"
svg_fmt = "0.4.1"
[dev-dependencies]
criterion = "0.4"
hex-literal = "0.3"
tempfile = "3.2"
[[bench]]
name = "bench_layer_map"
harness = false

File diff suppressed because it is too large


@@ -22,8 +22,8 @@ use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use tracing::*;
use crate::reltag::{RelTag, SlruKind};
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};


@@ -0,0 +1,150 @@
//! A tool for visualizing the arrangement of layerfiles within a timeline.
//!
//! It reads filenames from stdin and prints an SVG on stdout. The image is a plot in
//! page-lsn space, where every delta layer is a rectangle and every image layer is a
//! thick line. Legend:
//! - The x axis (left to right) represents page index.
//! - The y axis represents LSN, growing upwards.
//!
//! Coordinates on both axes are compressed for better readability.
//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
//!
//! Example use:
//! ```
//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ firefox out.svg
//! ```
//!
//! This API was chosen so that we can easily work with filenames extracted from ssh,
//! or from pageserver log files.
//!
//! TODO Consider shipping this as a grafana panel plugin:
//! https://grafana.com/tutorials/build-a-panel-plugin/
use anyhow::Result;
use pageserver::repository::Key;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::{
collections::{BTreeMap, BTreeSet},
ops::Range,
};
use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke};
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T, usize> {
let set: BTreeSet<T> = coords.into_iter().collect();
let mut map: BTreeMap<T, usize> = BTreeMap::new();
for (i, e) in set.iter().enumerate() {
map.insert(*e, i);
}
map
}
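A small hedged example of the compression map (illustrative only): duplicates collapse and values keep their relative order, but the coordinates become dense indices.

fn compression_map_example() {
    let map = build_coordinate_compression_map(vec![100u64, 5, 100, 42]);
    assert_eq!(map[&5], 0);   // smallest value gets index 0
    assert_eq!(map[&42], 1);
    assert_eq!(map[&100], 2); // duplicate 100 appears once
}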
fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
let split: Vec<&str> = name.split("__").collect();
let keys: Vec<&str> = split[0].split('-').collect();
let mut lsns: Vec<&str> = split[1].split('-').collect();
if lsns.len() == 1 {
lsns.push(lsns[0]);
}
let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
(keys, lsns)
}
fn main() -> Result<()> {
// Parse layer filenames from stdin
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
let stdin = io::stdin();
for line in stdin.lock().lines() {
let range = parse_filename(&line.unwrap());
ranges.push(range);
}
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for (keyr, lsnr) in &ranges {
keys.push(keyr.start);
keys.push(keyr.end);
lsns.push(lsnr.start);
lsns.push(lsnr.end);
}
// Analyze
let key_map = build_coordinate_compression_map(keys);
let lsn_map = build_coordinate_compression_map(lsns);
// Initialize stats
let mut num_deltas = 0;
let mut num_images = 0;
// Draw
let stretch = 3.0; // Stretch out vertically for better visibility
println!(
"{}",
BeginSvg {
w: key_map.len() as f32,
h: stretch * lsn_map.len() as f32
}
);
for (keyr, lsnr) in &ranges {
let key_start = *key_map.get(&keyr.start).unwrap();
let key_end = *key_map.get(&keyr.end).unwrap();
let key_diff = key_end - key_start;
let lsn_max = lsn_map.len();
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
}
let lsn_start = *lsn_map.get(&lsnr.start).unwrap();
let lsn_end = *lsn_map.get(&lsnr.end).unwrap();
let mut lsn_diff = (lsn_end - lsn_start) as f32;
let mut fill = Fill::None;
let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
let mut lsn_offset = 0.0;
// Fill in and thicken rectangle if it's an
// image layer so that we can see it.
match lsn_start.cmp(&lsn_end) {
Ordering::Less => num_deltas += 1,
Ordering::Equal => {
num_images += 1;
lsn_diff = 0.3;
lsn_offset = -lsn_diff / 2.0;
margin = 0.05;
fill = Fill::Color(rgb(0, 0, 0));
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
}
println!(
" {}",
rectangle(
key_start as f32 + stretch * margin,
stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
key_diff as f32 - stretch * 2.0 * margin,
stretch * (lsn_diff - 2.0 * margin)
)
.fill(fill)
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
.border_radius(0.4)
);
}
println!("{}", EndSvg);
eprintln!("num_images: {}", num_images);
eprintln!("num_deltas: {}", num_deltas);
Ok(())
}


@@ -1,17 +1,14 @@
//! Main entry point for the Page Server executable.
use remote_storage::GenericRemoteStorage;
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use nix::unistd::Pid;
use tracing::*;
use anyhow::{anyhow, bail, Context, Result};
use clap::{App, Arg};
use daemonize::Daemonize;
use fail::FailScenario;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
http, page_cache, page_service, profiling, task_mgr,
@@ -19,20 +16,22 @@ use pageserver::{
task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
},
tenant_mgr, virtual_file, LOG_FILE_NAME,
tenant_mgr, virtual_file,
};
use remote_storage::GenericRemoteStorage;
use utils::{
auth::JwtAuth,
logging,
lock_file, logging,
postgres_backend::AuthType,
project_git_version,
shutdown::exit_now,
signals::{self, Signal},
tcp_listener,
};
project_git_version!(GIT_VERSION);
const PID_FILE_NAME: &str = "pageserver.pid";
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
@@ -51,60 +50,21 @@ fn version() -> String {
}
fn main() -> anyhow::Result<()> {
let arg_matches = App::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(&*version())
.arg(
let arg_matches = cli().get_matches();
Arg::new("daemonize")
.short('d')
.long("daemonize")
.takes_value(false)
.help("Run in the background"),
)
.arg(
Arg::new("init")
.long("init")
.takes_value(false)
.help("Initialize pageserver with all given config overrides"),
)
.arg(
Arg::new("workdir")
.short('D')
.long("workdir")
.takes_value(true)
.help("Working directory for the pageserver"),
)
// See `settings.md` for more details on the extra configuration parameters pageserver can process
.arg(
Arg::new("config-override")
.short('c')
.takes_value(true)
.number_of_values(1)
.multiple_occurrences(true)
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
)
.arg(Arg::new("update-config").long("update-config").takes_value(false).help(
"Update the config file when started",
))
.arg(
Arg::new("enabled-features")
.long("enabled-features")
.takes_value(false)
.help("Show enabled compile time features"),
)
.get_matches();
if arg_matches.is_present("enabled-features") {
if arg_matches.get_flag("enabled-features") {
println!("{{\"features\": {FEATURES:?} }}");
return Ok(());
}
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".neon"));
let workdir = arg_matches
.get_one::<String>("workdir")
.map(Path::new)
.unwrap_or_else(|| Path::new(".neon"));
let workdir = workdir
.canonicalize()
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
let cfg_file_path = workdir.join("pageserver.toml");
// Set CWD to workdir for non-daemon modes
@@ -115,8 +75,6 @@ fn main() -> anyhow::Result<()> {
)
})?;
let daemonize = arg_matches.is_present("daemonize");
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
ControlFlow::Continue(conf) => conf,
ControlFlow::Break(()) => {
@@ -127,7 +85,7 @@ fn main() -> anyhow::Result<()> {
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| {
utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
format!(
"Failed to create tenants root dir at '{}'",
tenants_path.display()
@@ -142,7 +100,7 @@ fn main() -> anyhow::Result<()> {
virtual_file::init(conf.max_file_descriptors);
page_cache::init(conf.page_cache_size);
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
start_pageserver(conf).context("Failed to start pageserver")?;
scenario.teardown();
Ok(())
@@ -153,8 +111,8 @@ fn initialize_config(
arg_matches: clap::ArgMatches,
workdir: &Path,
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
let init = arg_matches.is_present("init");
let update_config = init || arg_matches.is_present("update-config");
let init = arg_matches.get_flag("init");
let update_config = init || arg_matches.get_flag("update-config");
let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
if init {
@@ -196,13 +154,10 @@ fn initialize_config(
)
};
if let Some(values) = arg_matches.values_of("config-override") {
if let Some(values) = arg_matches.get_many::<String>("config-override") {
for option_line in values {
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
format!(
"Option '{}' could not be parsed as a toml document",
option_line
)
format!("Option '{option_line}' could not be parsed as a toml document")
})?;
for (key, item) in doc.iter() {
@@ -240,11 +195,33 @@ fn initialize_config(
})
}
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
// Initialize logger
let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
logging::init(conf.log_format)?;
info!("version: {}", version());
info!("version: {GIT_VERSION}");
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
lock_file::LockCreationResult::Created {
new_lock_contents,
file,
} => {
info!("Created lock file at {lock_file_path:?} with contents {new_lock_contents}");
file
}
lock_file::LockCreationResult::AlreadyLocked {
existing_lock_contents,
} => anyhow::bail!(
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
conf.workdir,
existing_lock_contents
),
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
}
};
// ensure that the lock file is held even if the main thread of the process panics
// we need to release the lock file only when the current process is gone
let _ = Box::leak(Box::new(lock_file));
// TODO: Check that it looks like a valid repository before going further
@@ -261,33 +238,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
);
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
// NB: Don't spawn any threads before daemonizing!
if daemonize {
info!("daemonizing...");
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
let stdout = log_file
.try_clone()
.with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
let stderr = log_file;
let daemonize = Daemonize::new()
.pid_file("pageserver.pid")
.working_directory(".")
.stdout(stdout)
.stderr(stderr);
// XXX: The parent process should exit abruptly right after
// it has spawned a child to prevent coverage machinery from
// dumping stats into a `profraw` file now owned by the child.
// Otherwise, the coverage data will be damaged.
match daemonize.exit_action(|| exit_now(0)).start() {
Ok(_) => info!("Success, daemonized"),
Err(err) => bail!("{err}. could not daemonize. bailing."),
}
}
let signals = signals::install_shutdown_handlers()?;
// start profiler (if enabled)
@@ -385,3 +335,47 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
}
})
}
fn cli() -> Command {
Command::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(version())
.arg(
Arg::new("init")
.long("init")
.action(ArgAction::SetTrue)
.help("Initialize pageserver with all given config overrides"),
)
.arg(
Arg::new("workdir")
.short('D')
.long("workdir")
.help("Working directory for the pageserver"),
)
// See `settings.md` for more details on the extra configuration parameters pageserver can process
.arg(
Arg::new("config-override")
.short('c')
.num_args(1)
.action(ArgAction::Append)
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
)
.arg(
Arg::new("update-config")
.long("update-config")
.action(ArgAction::SetTrue)
.help("Update the config file when started"),
)
.arg(
Arg::new("enabled-features")
.long("enabled-features")
.action(ArgAction::SetTrue)
.help("Show enabled compile time features"),
)
}
#[test]
fn verify_cli() {
cli().debug_assert();
}


@@ -9,7 +9,7 @@ use std::{
};
use anyhow::Context;
use clap::{App, Arg};
use clap::{value_parser, Arg, Command};
use pageserver::{
page_cache,
@@ -24,40 +24,14 @@ project_git_version!(GIT_VERSION);
const METADATA_SUBCOMMAND: &str = "metadata";
fn main() -> anyhow::Result<()> {
let arg_matches = App::new("Neon Pageserver binutils")
.about("Reads pageserver (and related) binary files management utility")
.version(GIT_VERSION)
.arg(Arg::new("path").help("Input file path").required(false))
.subcommand(
App::new(METADATA_SUBCOMMAND)
.about("Read and update pageserver metadata file")
.arg(
Arg::new("metadata_path")
.help("Input metadata file path")
.required(false),
)
.arg(
Arg::new("disk_consistent_lsn")
.long("disk_consistent_lsn")
.takes_value(true)
.help("Replace disk consistent Lsn"),
)
.arg(
Arg::new("prev_record_lsn")
.long("prev_record_lsn")
.takes_value(true)
.help("Replace previous record Lsn"),
),
)
.get_matches();
let arg_matches = cli().get_matches();
match arg_matches.subcommand() {
Some((subcommand_name, subcommand_matches)) => {
let path = PathBuf::from(
subcommand_matches
.value_of("metadata_path")
.context("'metadata_path' argument is missing")?,
);
let path = subcommand_matches
.get_one::<PathBuf>("metadata_path")
.context("'metadata_path' argument is missing")?
.to_path_buf();
anyhow::ensure!(
subcommand_name == METADATA_SUBCOMMAND,
"Unknown subcommand {subcommand_name}"
@@ -65,11 +39,10 @@ fn main() -> anyhow::Result<()> {
handle_metadata(&path, subcommand_matches)?;
}
None => {
let path = PathBuf::from(
arg_matches
.value_of("path")
.context("'path' argument is missing")?,
);
let path = arg_matches
.get_one::<PathBuf>("path")
.context("'path' argument is missing")?
.to_path_buf();
println!(
"No subcommand specified, attempting to guess the format for file {}",
path.display()
@@ -110,7 +83,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
if let Some(disk_consistent_lsn) = arg_matches.value_of("disk_consistent_lsn") {
if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
meta = TimelineMetadata::new(
Lsn::from_str(disk_consistent_lsn)?,
meta.prev_record_lsn(),
@@ -122,7 +95,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
);
update_meta = true;
}
if let Some(prev_record_lsn) = arg_matches.value_of("prev_record_lsn") {
if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
meta = TimelineMetadata::new(
meta.disk_consistent_lsn(),
Some(Lsn::from_str(prev_record_lsn)?),
@@ -142,3 +115,40 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
Ok(())
}
fn cli() -> Command {
Command::new("Neon Pageserver binutils")
.about("Reads pageserver (and related) binary files management utility")
.version(GIT_VERSION)
.arg(
Arg::new("path")
.help("Input file path")
.value_parser(value_parser!(PathBuf))
.required(false),
)
.subcommand(
Command::new(METADATA_SUBCOMMAND)
.about("Read and update pageserver metadata file")
.arg(
Arg::new("metadata_path")
.help("Input metadata file path")
.value_parser(value_parser!(PathBuf))
.required(false),
)
.arg(
Arg::new("disk_consistent_lsn")
.long("disk_consistent_lsn")
.help("Replace disk consistent Lsn"),
)
.arg(
Arg::new("prev_record_lsn")
.long("prev_record_lsn")
.help("Replace previous record Lsn"),
),
)
}
#[test]
fn verify_cli() {
cli().debug_assert();
}


@@ -7,6 +7,7 @@
use anyhow::{anyhow, bail, ensure, Context, Result};
use remote_storage::RemoteStorageConfig;
use std::env;
use utils::crashsafe::path_with_suffix_extension;
use std::path::{Path, PathBuf};
use std::str::FromStr;
@@ -16,6 +17,7 @@ use toml_edit::{Document, Item};
use url::Url;
use utils::{
id::{NodeId, TenantId, TimelineId},
logging::LogFormat,
postgres_backend::AuthType,
};
@@ -24,6 +26,7 @@ use crate::tenant_config::{TenantConf, TenantConfOpt};
/// The name of the metadata file pageserver creates per timeline.
pub const METADATA_FILE_NAME: &str = "metadata";
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
const TENANT_CONFIG_NAME: &str = "config";
pub mod defaults {
@@ -43,6 +46,8 @@ pub mod defaults {
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
pub const DEFAULT_LOG_FORMAT: &str = "plain";
///
/// Default built-in configuration file.
///
@@ -61,6 +66,7 @@ pub mod defaults {
# initial superuser role name to use when creating a new tenant
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
#log_format = '{DEFAULT_LOG_FORMAT}'
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -124,6 +130,8 @@ pub struct PageServerConf {
/// Etcd broker endpoints to connect to.
pub broker_endpoints: Vec<Url>,
pub log_format: LogFormat,
}
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -190,6 +198,8 @@ struct PageServerConfigBuilder {
profiling: BuilderValue<ProfilingConfig>,
broker_etcd_prefix: BuilderValue<String>,
broker_endpoints: BuilderValue<Vec<Url>>,
log_format: BuilderValue<LogFormat>,
}
impl Default for PageServerConfigBuilder {
@@ -217,6 +227,7 @@ impl Default for PageServerConfigBuilder {
profiling: Set(ProfilingConfig::Disabled),
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
broker_endpoints: Set(Vec::new()),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
}
}
}
@@ -289,6 +300,10 @@ impl PageServerConfigBuilder {
self.profiling = BuilderValue::Set(profiling)
}
pub fn log_format(&mut self, log_format: LogFormat) {
self.log_format = BuilderValue::Set(log_format)
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let broker_endpoints = self
.broker_endpoints
@@ -333,6 +348,7 @@ impl PageServerConfigBuilder {
broker_etcd_prefix: self
.broker_etcd_prefix
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
})
}
}
@@ -364,6 +380,17 @@ impl PageServerConf {
self.timelines_path(tenant_id).join(timeline_id.to_string())
}
pub fn timeline_uninit_mark_file_path(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> PathBuf {
path_with_suffix_extension(
self.timeline_path(&timeline_id, &tenant_id),
TIMELINE_UNINIT_MARK_SUFFIX,
)
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
@@ -374,28 +401,28 @@ impl PageServerConf {
//
// Postgres distribution paths
//
pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
let path = self.pg_distrib_dir.clone();
match pg_version {
14 => path.join(format!("v{pg_version}")),
15 => path.join(format!("v{pg_version}")),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => self.pg_distrib_dir(pg_version).join("bin"),
15 => self.pg_distrib_dir(pg_version).join("bin"),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
match pg_version {
14 => self.pg_distrib_dir(pg_version).join("lib"),
15 => self.pg_distrib_dir(pg_version).join("lib"),
_ => panic!("Unsupported postgres version: {}", pg_version),
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
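The same panic-to-error conversion as in wal_craft's `Conf` applies here; a hedged caller sketch (hypothetical helper, not part of the patch):

fn postgres_bin_path(conf: &PageServerConf, pg_version: u32, binary: &str) -> anyhow::Result<std::path::PathBuf> {
    // An unsupported pg_version now propagates as an error the caller can report.
    Ok(conf.pg_bin_dir(pg_version)?.join(binary))
}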
@@ -446,6 +473,9 @@ impl PageServerConf {
})
.collect::<anyhow::Result<_>>()?,
),
"log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)?
),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -558,6 +588,7 @@ impl PageServerConf {
default_tenant_conf: TenantConf::dummy_conf(),
broker_endpoints: Vec::new(),
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
}
}
}
@@ -652,6 +683,8 @@ max_file_descriptors = 333
initial_superuser_name = 'zzzz'
id = 10
log_format = 'json'
"#;
#[test]
@@ -691,6 +724,7 @@ id = 10
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -735,6 +769,7 @@ id = 10
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::Json,
},
"Should be able to parse all basic config values correctly"
);


@@ -618,6 +618,7 @@ components:
- last_record_lsn
- disk_consistent_lsn
- awaits_download
- state
properties:
timeline_id:
type: string
@@ -660,6 +661,8 @@ components:
type: integer
awaits_download:
type: boolean
state:
type: string
# These 'local' and 'remote' fields just duplicate some of the fields
# above. They are kept for backwards-compatibility. They can be removed,


@@ -129,6 +129,7 @@ async fn build_timeline_info(
}
};
let current_physical_size = Some(timeline.get_physical_size());
let state = timeline.current_state();
let info = TimelineInfo {
tenant_id: timeline.tenant_id,
@@ -158,6 +159,7 @@ async fn build_timeline_info(
remote_consistent_lsn,
awaits_download,
state,
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatibility
// with the control plane.
@@ -225,13 +227,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
let state = get_state(&request);
let timelines = tokio::task::spawn_blocking(move || {
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
Ok(tenant.list_timelines())
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
})?;
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
@@ -294,7 +293,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
let timeline_info = async {
let timeline = tokio::task::spawn_blocking(move || {
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id, false)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
@@ -331,14 +330,13 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let timeline = tenant_mgr::get_tenant(tenant_id, true)
.and_then(|tenant| tenant.get_timeline(timeline_id))
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
.map_err(ApiError::NotFound)?;
let result = match timeline
.find_lsn_for_timestamp(timestamp_pg)
.map_err(ApiError::InternalServerError)?
{
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
@@ -386,7 +384,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
}
return json_response(StatusCode::ACCEPTED, ());
}
// no tenant in the index, release the lock to make the potentially lengthy download opetation
// no tenant in the index, release the lock to make the potentially lengthy download operation
drop(index_accessor);
// download index parts for every tenant timeline
@@ -522,9 +520,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
check_permission(&request, Some(tenant_id))?;
// if tenant is in progress of downloading it can be absent in global tenant map
let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
let tenant = tenant_mgr::get_tenant(tenant_id, false);
let state = get_state(&request);
let remote_index = &state.remote_index;
@@ -781,11 +777,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
}
// Run GC immediately on given timeline.
// FIXME: This is just for tests. See test_runner/regress/test_gc.py.
// This probably should require special authentication or a global flag to
// enable, I don't think we want to or need to allow regular clients to invoke
// GC.
// @hllinnaka in commits ec44f4b29, 3aca717f3
#[cfg(feature = "testing")]
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
@@ -793,16 +784,16 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
check_permission(&request, Some(tenant_id))?;
// FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX
let repo = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let _span_guard =
info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon());
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = repo.get_pitr_interval();
let result = repo
let pitr = tenant.get_pitr_interval();
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
@@ -811,19 +802,15 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
}
// Run compaction immediately on given timeline.
// FIXME This is just for tests. Don't expect this to be exposed to
// the users or the api.
// @dhammika in commit a0781f229
#[cfg(feature = "testing")]
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = repo
.get_timeline(timeline_id)
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
timeline.compact().map_err(ApiError::InternalServerError)?;
@@ -837,10 +824,9 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = repo
.get_timeline(timeline_id)
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
timeline
.checkpoint(CheckpointConfig::Forced)


@@ -12,10 +12,10 @@ use tracing::*;
use walkdir::WalkDir;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::DecodedWALRecord;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::waldecoder::WalStreamDecoder;
@@ -43,19 +43,19 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
/// The code that deals with the checkpoint would not work right if the
/// cluster was not shut down cleanly.
pub fn import_timeline_from_postgres_datadir(
path: &Path,
tline: &Timeline,
lsn: Lsn,
pgdata_path: &Path,
pgdata_lsn: Lsn,
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
// TODO this should be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
// Then fishing out pg_control would be unnecessary
let mut modification = tline.begin_modification(lsn);
let mut modification = tline.begin_modification(pgdata_lsn);
modification.init_empty()?;
// Import all but pg_wal
let all_but_wal = WalkDir::new(path)
let all_but_wal = WalkDir::new(pgdata_path)
.into_iter()
.filter_entry(|entry| !entry.path().ends_with("pg_wal"));
for entry in all_but_wal {
@@ -63,7 +63,7 @@ pub fn import_timeline_from_postgres_datadir(
let metadata = entry.metadata().expect("error getting dir entry metadata");
if metadata.is_file() {
let absolute_path = entry.path();
let relative_path = absolute_path.strip_prefix(path)?;
let relative_path = absolute_path.strip_prefix(pgdata_path)?;
let file = File::open(absolute_path)?;
let len = metadata.len() as usize;
@@ -84,7 +84,7 @@ pub fn import_timeline_from_postgres_datadir(
"Postgres cluster was not shut down cleanly"
);
ensure!(
pg_control.checkPointCopy.redo == lsn.0,
pg_control.checkPointCopy.redo == pgdata_lsn.0,
"unexpected checkpoint REDO pointer"
);
@@ -92,10 +92,10 @@ pub fn import_timeline_from_postgres_datadir(
// this reads the checkpoint record itself, advancing the tip of the timeline to
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
import_wal(
&path.join("pg_wal"),
&pgdata_path.join("pg_wal"),
tline,
Lsn(pg_control.checkPointCopy.redo),
lsn,
pgdata_lsn,
)?;
Ok(())


@@ -8,7 +8,6 @@ pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod profiling;
pub mod reltag;
pub mod repository;
pub mod storage_sync;
pub mod task_mgr;
@@ -44,7 +43,7 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
pub const LOG_FILE_NAME: &str = "pageserver.log";
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
/// Config for the Repository checkpointer
#[derive(Debug, Clone, Copy)]
@@ -80,7 +79,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
// There should be nothing left, but let's be sure
task_mgr::shutdown_tasks(None, None, None).await;
info!("Shut down successfully completed");
std::process::exit(exit_code);
}


@@ -107,18 +107,20 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_created_persistent_files_total",
"Number of files created that are meant to be uploaded to cloud storage",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_written_persistent_bytes_total",
"Total bytes written that are meant to be uploaded to cloud storage",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -275,11 +277,15 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
/// smallest redo processing times. These buckets allow us to measure down
/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
///
/// Values up to 1s are recorded because metrics show that we have redo
/// durations and lock times larger than 0.250s.
macro_rules! redo_histogram_time_buckets {
() => {
vec![
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
1.000_000,
]
};
}
@@ -294,6 +300,17 @@ macro_rules! redo_histogram_count_buckets {
};
}
macro_rules! redo_bytes_histogram_count_buckets {
() => {
// powers of (2^.5), from 2^4.5 to 2^15 (22 buckets)
// rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too.
vec![
24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0,
2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0,
]
};
}
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
@@ -321,6 +338,15 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_bytes_histogram",
"Histogram of number of records replayed per redo",
redo_bytes_histogram_count_buckets!(),
)
.expect("failed to define a metric")
});
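A hedged usage sketch (not part of the patch), assuming the registered `Histogram` exposes the usual prometheus `observe` method: record the payload size of each batch handed to the walredo process.

fn record_redo_bytes(nbytes: usize) {
    WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
}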
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_replayed_wal_records_total",
@@ -386,8 +412,12 @@ impl TimelineMetrics {
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED.clone();
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN.clone();
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
TimelineMetrics {
tenant_id,
@@ -419,6 +449,8 @@ impl Drop for TimelineMetrics {
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
for op in STORAGE_TIME_OPERATIONS {
let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]);


@@ -10,8 +10,14 @@
//
use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use bytes::Bytes;
use futures::{Stream, StreamExt};
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamNblocksRequest, PagestreamNblocksResponse,
};
use std::io;
use std::net::TcpListener;
use std::str;
@@ -32,10 +38,9 @@ use utils::{
use crate::basebackup;
use crate::config::{PageServerConf, ProfilingConfig};
use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::profiling::profpoint_start;
use crate::reltag::RelTag;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::Timeline;
@@ -45,163 +50,6 @@ use crate::CheckpointConfig;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
// Wrapped in libpq CopyData
enum PagestreamFeMessage {
Exists(PagestreamExistsRequest),
Nblocks(PagestreamNblocksRequest),
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
}
// Wrapped in libpq CopyData
enum PagestreamBeMessage {
Exists(PagestreamExistsResponse),
Nblocks(PagestreamNblocksResponse),
GetPage(PagestreamGetPageResponse),
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
}
#[derive(Debug)]
struct PagestreamExistsRequest {
latest: bool,
lsn: Lsn,
rel: RelTag,
}
#[derive(Debug)]
struct PagestreamNblocksRequest {
latest: bool,
lsn: Lsn,
rel: RelTag,
}
#[derive(Debug)]
struct PagestreamGetPageRequest {
latest: bool,
lsn: Lsn,
rel: RelTag,
blkno: u32,
}
#[derive(Debug)]
struct PagestreamDbSizeRequest {
latest: bool,
lsn: Lsn,
dbnode: u32,
}
#[derive(Debug)]
struct PagestreamExistsResponse {
exists: bool,
}
#[derive(Debug)]
struct PagestreamNblocksResponse {
n_blocks: u32,
}
#[derive(Debug)]
struct PagestreamGetPageResponse {
page: Bytes,
}
#[derive(Debug)]
struct PagestreamErrorResponse {
message: String,
}
#[derive(Debug)]
struct PagestreamDbSizeResponse {
db_size: i64,
}
impl PagestreamFeMessage {
fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.get_u8();
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
},
blkno: body.get_u32(),
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
dbnode: body.get_u32(),
})),
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
}
}
}
impl PagestreamBeMessage {
fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(resp) => {
bytes.put_u8(100); /* tag from pagestore_client.h */
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(101); /* tag from pagestore_client.h */
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(102); /* tag from pagestore_client.h */
bytes.put(&resp.page[..]);
}
Self::Error(resp) => {
bytes.put_u8(103); /* tag from pagestore_client.h */
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(104); /* tag from pagestore_client.h */
bytes.put_i64(resp.db_size);
}
}
bytes.into()
}
}
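The `parse`/`serialize` pair above pins down the smgr wire format byte by byte. As a hedged illustration only (the helper name and all field values are invented, not part of the codebase), this is how a client-side encoder would have to lay out a GetPage request so that `PagestreamFeMessage::parse` accepts it:
use bytes::{BufMut, BytesMut};
// Hypothetical helper: encodes a GetPage request (tag 2) in the exact field
// order that PagestreamFeMessage::parse reads it back.
fn encode_get_page_request() -> BytesMut {
    let mut buf = BytesMut::new();
    buf.put_u8(2); // message tag: GetPage
    buf.put_u8(1); // latest: true
    buf.put_u64(0x0169_60E8); // lsn (bytes::BufMut writes big-endian, matching get_u64)
    buf.put_u32(1663); // RelTag.spcnode (example value)
    buf.put_u32(13010); // RelTag.dbnode (example value)
    buf.put_u32(16384); // RelTag.relnode (example value)
    buf.put_u8(0); // RelTag.forknum: main fork
    buf.put_u32(0); // blkno
    buf
}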
fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
async_stream::try_stream! {
loop {
@@ -500,11 +348,8 @@ impl PageServerHandler {
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Create empty timeline
info!("creating new timeline");
let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline(
timeline_id,
base_lsn,
pg_version,
)?;
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
@@ -527,7 +372,8 @@ impl PageServerHandler {
// - use block_in_place()
let mut copyin_stream = Box::pin(copyin_stream(pgb));
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| import_basebackup_from_tar(&timeline, reader, base_lsn))?;
tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
timeline.initialize()?;
// Drain the rest of the Copy data
let mut bytes_after_tar = 0;
@@ -544,12 +390,6 @@ impl PageServerHandler {
// It wouldn't work if base came from vanilla postgres though,
// since we discard some log files.
// Flush data to disk, then upload to s3
info!("flushing layers");
timeline.checkpoint(CheckpointConfig::Flush)?;
timeline.launch_wal_receiver()?;
info!("done");
Ok(())
}
@@ -1068,7 +908,8 @@ impl postgres_backend_async::Handler for PageServerHandler {
}
fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result<Arc<Timeline>> {
tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id))
tenant_mgr::get_tenant(tenant_id, true)
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
}
///


@@ -7,12 +7,12 @@
//! Clarify that)
//!
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::reltag::{RelTag, SlruKind};
use crate::repository::*;
use crate::tenant::Timeline;
use crate::walrecord::NeonWalRecord;
use anyhow::{bail, ensure, Result};
use bytes::{Buf, Bytes};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, TimestampTz, TransactionId};
@@ -1373,6 +1373,17 @@ fn is_rel_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
@@ -1403,7 +1414,9 @@ pub fn create_test_timeline(
timeline_id: utils::id::TimelineId,
pg_version: u32,
) -> Result<std::sync::Arc<Timeline>> {
let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?;
let tline = tenant
.create_empty_timeline(timeline_id, Lsn(8), pg_version)?
.initialize()?;
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;


@@ -171,7 +171,7 @@ use self::{
use crate::{
config::PageServerConf,
exponential_backoff,
storage_sync::index::RemoteIndex,
storage_sync::index::{LayerFileMetadata, RemoteIndex},
task_mgr,
task_mgr::TaskKind,
task_mgr::BACKGROUND_RUNTIME,
@@ -193,7 +193,7 @@ static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
/// A timeline status to share with pageserver's sync counterpart,
/// after comparing local and remote timeline state.
#[derive(Clone)]
#[derive(Clone, PartialEq, Eq)]
pub enum LocalTimelineInitStatus {
/// The timeline has every remote layer present locally.
/// There could be some layers requiring uploading,
@@ -316,7 +316,7 @@ impl SyncQueue {
/// A task to run in the async download/upload loop.
/// Limited by the number of retries; after a certain threshold the failing task gets evicted and the timeline disabled.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq)]
enum SyncTask {
/// A checkpoint outcome with possible local file updates that need actualization in the remote storage.
/// Not necessarily fresher than the one already uploaded.
@@ -427,7 +427,7 @@ impl SyncTaskBatch {
.extend(new_delete.data.deleted_layers.iter().cloned());
}
if let Some(batch_upload) = &mut self.upload {
let not_deleted = |layer: &PathBuf| {
let not_deleted = |layer: &PathBuf, _: &mut LayerFileMetadata| {
!new_delete.data.layers_to_delete.contains(layer)
&& !new_delete.data.deleted_layers.contains(layer)
};
@@ -455,21 +455,35 @@ impl SyncTaskBatch {
#[derive(Debug, Clone, PartialEq, Eq)]
struct LayersUpload {
/// Layer file paths in the pageserver workdir that were added for the corresponding checkpoint.
layers_to_upload: HashSet<PathBuf>,
layers_to_upload: HashMap<PathBuf, LayerFileMetadata>,
/// Already uploaded layers. Used to store the data about the uploads between task retries
/// and to record the data into the remote index after the task got completed or evicted.
uploaded_layers: HashSet<PathBuf>,
uploaded_layers: HashMap<PathBuf, LayerFileMetadata>,
metadata: Option<TimelineMetadata>,
}
/// A timeline download task.
/// Does not contain the file list to download, to allow other
/// parts of the pageserver code to schedule the task
/// without using the remote index or any other ways to list the remote timleine files.
/// without using the remote index or any other ways to list the remote timeline files.
/// Skips the files that are already downloaded.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LayersDownload {
layers_to_skip: HashSet<PathBuf>,
/// Paths which have been downloaded, and had their metadata verified or generated.
///
/// Metadata generation happens when upgrading from a past version of `IndexPart`.
gathered_metadata: HashMap<PathBuf, LayerFileMetadata>,
}
impl LayersDownload {
fn from_skipped_layers(layers_to_skip: HashSet<PathBuf>) -> Self {
LayersDownload {
layers_to_skip,
gathered_metadata: HashMap::default(),
}
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -491,7 +505,7 @@ struct LayersDeletion {
pub fn schedule_layer_upload(
tenant_id: TenantId,
timeline_id: TimelineId,
layers_to_upload: HashSet<PathBuf>,
layers_to_upload: HashMap<PathBuf, LayerFileMetadata>,
metadata: Option<TimelineMetadata>,
) {
let sync_queue = match SYNC_QUEUE.get() {
@@ -508,7 +522,7 @@ pub fn schedule_layer_upload(
},
SyncTask::upload(LayersUpload {
layers_to_upload,
uploaded_layers: HashSet::new(),
uploaded_layers: HashMap::new(),
metadata,
}),
);
@@ -566,21 +580,44 @@ pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) {
tenant_id,
timeline_id,
},
SyncTask::download(LayersDownload {
layers_to_skip: HashSet::new(),
}),
SyncTask::download(LayersDownload::from_skipped_layers(HashSet::new())),
);
debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent")
}
/// Local existing timeline files
///
/// Values of this type have different meanings in different contexts. On startup, collected
/// timelines carry the full file information; the type is also used to signal readiness to attach
/// after a completed download. After the download the file information is no longer carried, because
/// it has already been merged into [`RemoteTimeline`].
#[derive(Debug)]
pub struct TimelineLocalFiles(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>);
impl TimelineLocalFiles {
pub fn metadata(&self) -> &TimelineMetadata {
&self.0
}
/// Called during startup, for all of the local files with full metadata.
pub(crate) fn collected(
metadata: TimelineMetadata,
timeline_files: HashMap<PathBuf, LayerFileMetadata>,
) -> TimelineLocalFiles {
TimelineLocalFiles(metadata, timeline_files)
}
/// Called near the end of tenant initialization, to signal readiness to attach tenants.
pub(crate) fn ready(metadata: TimelineMetadata) -> Self {
TimelineLocalFiles(metadata, HashMap::new())
}
}
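As a small usage sketch (crate-internal; the metadata value is passed in because constructing a `TimelineMetadata` is out of scope here, and the file name and size are invented), the two constructors correspond to the two situations described in the doc comment:
// Hypothetical helper showing both construction paths of TimelineLocalFiles.
fn timeline_local_files_examples(metadata: TimelineMetadata) {
    use std::collections::HashMap;
    use std::path::PathBuf;

    // Startup: the full local listing with per-file metadata is carried along.
    let collected = TimelineLocalFiles::collected(
        metadata.clone(),
        HashMap::from([(PathBuf::from("layer_1"), LayerFileMetadata::new(123))]),
    );

    // After a completed download: only the metadata travels, the file list has
    // already been merged into the RemoteTimeline.
    let ready = TimelineLocalFiles::ready(metadata);
    let _ = (collected, ready);
}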
/// Launch a thread to perform remote storage sync tasks.
/// See module docs for loop step description.
pub fn spawn_storage_sync_task(
conf: &'static PageServerConf,
local_timeline_files: HashMap<
TenantId,
HashMap<TimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
>,
local_timeline_files: HashMap<TenantId, HashMap<TimelineId, TimelineLocalFiles>>,
storage: GenericRemoteStorage,
max_concurrent_timelines_sync: NonZeroUsize,
max_sync_errors: NonZeroU32,
@@ -738,7 +775,7 @@ async fn storage_sync_loop(
tenant_entry
.iter()
.map(|(&id, entry)| {
(id, (entry.metadata.clone(), HashSet::new()))
(id, TimelineLocalFiles::ready(entry.metadata.clone()))
})
.collect(),
),
@@ -983,15 +1020,27 @@ async fn download_timeline_data(
}
DownloadedTimeline::Successful(mut download_data) => {
match update_local_metadata(conf, sync_id, current_remote_timeline).await {
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
Ok(()) => {
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
return DownloadStatus::Downloaded;
}
Err(e) => {
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
}
},
Ok(()) => {
let mut g = index.write().await;
match g.set_awaits_download(&sync_id, false) {
Ok(()) => {
let timeline = g
.timeline_entry_mut(&sync_id)
.expect("set_awaits_download verified existence");
timeline.merge_metadata_from_downloaded(
&download_data.data.gathered_metadata,
);
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
return DownloadStatus::Downloaded;
}
Err(e) => {
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
}
};
}
Err(e) => {
error!("Failed to update local timeline metadata: {e:?}");
download_data.retries += 1;
@@ -1194,11 +1243,18 @@ async fn update_remote_data(
}
if upload_failed {
existing_entry.add_upload_failures(
uploaded_data.layers_to_upload.iter().cloned(),
uploaded_data
.layers_to_upload
.iter()
.map(|(k, v)| (k.to_owned(), v.to_owned())),
);
} else {
existing_entry
.add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned());
existing_entry.add_timeline_layers(
uploaded_data
.uploaded_layers
.iter()
.map(|(k, v)| (k.to_owned(), v.to_owned())),
);
}
}
RemoteDataUpdate::Delete(layers_to_remove) => {
@@ -1218,11 +1274,19 @@ async fn update_remote_data(
};
let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone());
if upload_failed {
new_remote_timeline
.add_upload_failures(uploaded_data.layers_to_upload.iter().cloned());
new_remote_timeline.add_upload_failures(
uploaded_data
.layers_to_upload
.iter()
.map(|(k, v)| (k.to_owned(), v.to_owned())),
);
} else {
new_remote_timeline
.add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned());
new_remote_timeline.add_timeline_layers(
uploaded_data
.uploaded_layers
.iter()
.map(|(k, v)| (k.to_owned(), v.to_owned())),
);
}
index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone());
@@ -1270,13 +1334,14 @@ async fn validate_task_retries(
fn schedule_first_sync_tasks(
index: &mut RemoteTimelineIndex,
sync_queue: &SyncQueue,
local_timeline_files: HashMap<TenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
local_timeline_files: HashMap<TenantTimelineId, TimelineLocalFiles>,
) -> TenantTimelineValues<LocalTimelineInitStatus> {
let mut local_timeline_init_statuses = TenantTimelineValues::new();
let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len());
for (sync_id, (local_metadata, local_files)) in local_timeline_files {
for (sync_id, local_timeline) in local_timeline_files {
let TimelineLocalFiles(local_metadata, local_files) = local_timeline;
match index.timeline_entry_mut(&sync_id) {
Some(remote_timeline) => {
let (timeline_status, awaits_download) = compare_local_and_remote_timeline(
@@ -1320,7 +1385,7 @@ fn schedule_first_sync_tasks(
sync_id,
SyncTask::upload(LayersUpload {
layers_to_upload: local_files,
uploaded_layers: HashSet::new(),
uploaded_layers: HashMap::new(),
metadata: Some(local_metadata.clone()),
}),
));
@@ -1347,20 +1412,46 @@ fn compare_local_and_remote_timeline(
new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>,
sync_id: TenantTimelineId,
local_metadata: TimelineMetadata,
local_files: HashSet<PathBuf>,
local_files: HashMap<PathBuf, LayerFileMetadata>,
remote_entry: &RemoteTimeline,
) -> (LocalTimelineInitStatus, bool) {
let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered();
let remote_files = remote_entry.stored_files();
let needed_to_download_files = remote_entry
.stored_files()
.iter()
.filter_map(|(layer_file, remote_metadata)| {
if let Some(local_metadata) = local_files.get(layer_file) {
match (remote_metadata.file_size(), local_metadata.file_size()) {
(Some(x), Some(y)) if x == y => { None },
(None, Some(_)) => {
// upgrading from an earlier IndexPart without metadata
None
},
_ => {
// having to deal with other than (Some(x), Some(y)) where x != y here is a
// bummer, but see #2582 and #2610 for attempts and discussion.
warn!("Redownloading locally existing {layer_file:?} due to size mismatch, size on index: {:?}, on disk: {:?}", remote_metadata.file_size(), local_metadata.file_size());
Some(layer_file)
},
}
} else {
// doesn't exist locally
Some(layer_file)
}
})
.collect::<HashSet<_>>();
let number_of_layers_to_download = remote_files.difference(&local_files).count();
let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 {
let (initial_timeline_status, awaits_download) = if !needed_to_download_files.is_empty() {
new_sync_tasks.push_back((
sync_id,
SyncTask::download(LayersDownload {
layers_to_skip: local_files.clone(),
}),
SyncTask::download(LayersDownload::from_skipped_layers(
local_files
.keys()
.filter(|path| !needed_to_download_files.contains(path))
.cloned()
.collect(),
)),
));
info!("NeedsSync");
(LocalTimelineInitStatus::NeedsSync, true)
@@ -1375,15 +1466,22 @@ fn compare_local_and_remote_timeline(
};
let layers_to_upload = local_files
.difference(remote_files)
.cloned()
.collect::<HashSet<_>>();
.iter()
.filter_map(|(local_file, metadata)| {
if !remote_entry.stored_files().contains_key(local_file) {
Some((local_file.to_owned(), metadata.to_owned()))
} else {
None
}
})
.collect::<HashMap<_, _>>();
if !layers_to_upload.is_empty() {
new_sync_tasks.push_back((
sync_id,
SyncTask::upload(LayersUpload {
layers_to_upload,
uploaded_layers: HashSet::new(),
uploaded_layers: HashMap::new(),
metadata: Some(local_metadata),
}),
));
@@ -1439,11 +1537,12 @@ mod test_utils {
let timeline_path = harness.timeline_path(&timeline_id);
fs::create_dir_all(&timeline_path).await?;
let mut layers_to_upload = HashSet::with_capacity(filenames.len());
let mut layers_to_upload = HashMap::with_capacity(filenames.len());
for &file in filenames {
let file_path = timeline_path.join(file);
fs::write(&file_path, dummy_contents(file).into_bytes()).await?;
layers_to_upload.insert(file_path);
let metadata = LayerFileMetadata::new(file_path.metadata()?.len());
layers_to_upload.insert(file_path, metadata);
}
fs::write(
@@ -1454,7 +1553,7 @@ mod test_utils {
Ok(LayersUpload {
layers_to_upload,
uploaded_layers: HashSet::new(),
uploaded_layers: HashMap::new(),
metadata: Some(metadata),
})
}
@@ -1509,12 +1608,13 @@ mod tests {
assert!(sync_id_2 != sync_id_3);
assert!(sync_id_3 != TEST_SYNC_ID);
let download_task = SyncTask::download(LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk")]),
});
let download_task =
SyncTask::download(LayersDownload::from_skipped_layers(HashSet::from([
PathBuf::from("sk"),
])));
let upload_task = SyncTask::upload(LayersUpload {
layers_to_upload: HashSet::from([PathBuf::from("up")]),
uploaded_layers: HashSet::from([PathBuf::from("upl")]),
layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]),
uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]),
metadata: Some(dummy_metadata(Lsn(2))),
});
let delete_task = SyncTask::delete(LayersDeletion {
@@ -1558,12 +1658,10 @@ mod tests {
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
assert_eq!(sync_queue.len(), 0);
let download = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk")]),
};
let download = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk")]));
let upload = LayersUpload {
layers_to_upload: HashSet::from([PathBuf::from("up")]),
uploaded_layers: HashSet::from([PathBuf::from("upl")]),
layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]),
uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]),
metadata: Some(dummy_metadata(Lsn(2))),
};
let delete = LayersDeletion {
@@ -1611,18 +1709,10 @@ mod tests {
#[tokio::test]
async fn same_task_id_same_tasks_batch() {
let sync_queue = SyncQueue::new(NonZeroUsize::new(1).unwrap());
let download_1 = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk1")]),
};
let download_2 = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk2")]),
};
let download_3 = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk3")]),
};
let download_4 = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk4")]),
};
let download_1 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk1")]));
let download_2 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk2")]));
let download_3 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk3")]));
let download_4 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk4")]));
let sync_id_2 = TenantTimelineId {
tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")),
@@ -1646,15 +1736,15 @@ mod tests {
Some(SyncTaskBatch {
download: Some(SyncData {
retries: 0,
data: LayersDownload {
layers_to_skip: {
data: LayersDownload::from_skipped_layers(
{
let mut set = HashSet::new();
set.extend(download_1.layers_to_skip.into_iter());
set.extend(download_2.layers_to_skip.into_iter());
set.extend(download_4.layers_to_skip.into_iter());
set
},
}
)
}),
upload: None,
delete: None,
@@ -1670,4 +1760,148 @@ mod tests {
"Should have one task left out of the batch"
);
}
mod local_and_remote_comparisons {
use super::*;
#[test]
fn ready() {
let mut new_sync_tasks = VecDeque::default();
let sync_id = TenantTimelineId::generate();
let local_metadata = dummy_metadata(0x02.into());
let local_files =
HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
remote_entry
.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let (status, sync_needed) = compare_local_and_remote_timeline(
&mut new_sync_tasks,
sync_id,
local_metadata.clone(),
local_files,
&remote_entry,
);
assert_eq!(
status,
LocalTimelineInitStatus::LocallyComplete(local_metadata)
);
assert!(!sync_needed);
assert!(new_sync_tasks.is_empty(), "{:?}", new_sync_tasks);
}
#[test]
fn needs_download() {
let mut new_sync_tasks = VecDeque::default();
let sync_id = TenantTimelineId::generate();
let local_metadata = dummy_metadata(0x02.into());
let local_files = HashMap::default();
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
remote_entry
.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let (status, sync_needed) = compare_local_and_remote_timeline(
&mut new_sync_tasks,
sync_id,
local_metadata,
local_files.clone(),
&remote_entry,
);
assert_eq!(status, LocalTimelineInitStatus::NeedsSync);
assert!(sync_needed);
let new_sync_tasks = new_sync_tasks.into_iter().collect::<Vec<_>>();
assert_eq!(
&new_sync_tasks,
&[(
sync_id,
SyncTask::download(LayersDownload::from_skipped_layers(
local_files.keys().cloned().collect()
))
)]
);
}
#[test]
fn redownload_is_not_needed_on_upgrade() {
// originally the implementation missed the `(None, Some(_))` case in the match, and
// proceeded to always redownload if the remote metadata was not available.
let mut new_sync_tasks = VecDeque::default();
let sync_id = TenantTimelineId::generate();
let local_metadata = dummy_metadata(0x02.into());
// type system would in general allow that LayerFileMetadata would be created with
// file_size: None, however `LayerFileMetadata::default` is only allowed from tests,
// and so everywhere within the system valid LayerFileMetadata is being created, it is
// created through `::new`.
let local_files =
HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
// RemoteTimeline is constructed out of an older version IndexPart, which didn't carry
// any metadata.
remote_entry
.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::default())]);
let (status, sync_needed) = compare_local_and_remote_timeline(
&mut new_sync_tasks,
sync_id,
local_metadata.clone(),
local_files,
&remote_entry,
);
assert_eq!(
status,
LocalTimelineInitStatus::LocallyComplete(local_metadata)
);
assert!(!sync_needed);
}
#[test]
fn needs_upload() {
let mut new_sync_tasks = VecDeque::default();
let sync_id = TenantTimelineId::generate();
let local_metadata = dummy_metadata(0x02.into());
let local_files =
HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
remote_entry.add_timeline_layers([]);
let (status, sync_needed) = compare_local_and_remote_timeline(
&mut new_sync_tasks,
sync_id,
local_metadata.clone(),
local_files.clone(),
&remote_entry,
);
assert_eq!(
status,
LocalTimelineInitStatus::LocallyComplete(local_metadata.clone())
);
assert!(!sync_needed);
let new_sync_tasks = new_sync_tasks.into_iter().collect::<Vec<_>>();
assert_eq!(
&new_sync_tasks,
&[(
sync_id,
SyncTask::upload(LayersUpload {
layers_to_upload: local_files,
uploaded_layers: HashMap::default(),
metadata: Some(local_metadata),
})
)]
);
}
}
}


@@ -171,7 +171,7 @@ mod tests {
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
let timeline_upload =
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
for local_path in timeline_upload.layers_to_upload {
for (local_path, _metadata) in timeline_upload.layers_to_upload {
let remote_path =
local_storage.resolve_in_storage(&local_storage.remote_object_id(&local_path)?)?;
let remote_parent_dir = remote_path.parent().unwrap();


@@ -16,9 +16,13 @@ use tokio::{
};
use tracing::{debug, error, info, warn};
use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX};
use crate::{
config::PageServerConf,
storage_sync::{index::LayerFileMetadata, SyncTask},
TEMP_FILE_SUFFIX,
};
use utils::{
crashsafe_dir::path_with_suffix_extension,
crashsafe::path_with_suffix_extension,
id::{TenantId, TenantTimelineId, TimelineId},
};
@@ -219,8 +223,14 @@ pub(super) async fn download_timeline_layers<'a>(
let layers_to_download = remote_timeline
.stored_files()
.difference(&download.layers_to_skip)
.cloned()
.iter()
.filter_map(|(layer_path, metadata)| {
if !download.layers_to_skip.contains(layer_path) {
Some((layer_path.to_owned(), metadata.to_owned()))
} else {
None
}
})
.collect::<Vec<_>>();
debug!("Layers to download: {layers_to_download:?}");
@@ -233,89 +243,129 @@ pub(super) async fn download_timeline_layers<'a>(
let mut download_tasks = layers_to_download
.into_iter()
.map(|layer_destination_path| async move {
if layer_destination_path.exists() {
debug!(
"Layer already exists locally, skipping download: {}",
layer_destination_path.display()
);
} else {
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
// write(tmp)
// fsync(tmp)
// rename(tmp, new)
// fsync(new)
// fsync(parent)
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path =
path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);
.map(|(layer_destination_path, metadata)| async move {
let mut destination_file =
fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"Failed to create a destination file for layer '{}'",
temp_file_path.display()
)
})?;
match layer_destination_path.metadata() {
Ok(m) if m.is_file() => {
// the file exists from an earlier round when we failed after renaming it as
// layer_destination_path
let verified = if let Some(expected) = metadata.file_size() {
m.len() == expected
} else {
// behaviour before recording metadata was to accept any existing file
true
};
let mut layer_download = storage.download_storage_object(None, &layer_destination_path)
.await
.with_context(|| {
format!(
"Failed to initiate the download the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
io::copy(&mut layer_download.download_stream, &mut destination_file)
.await
.with_context(|| {
format!(
"Failed to download the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
// we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations.
// But for additional safety let's check/wait for any pending operations.
destination_file.flush().await.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})?;
// not using sync_data because it can lose file size update
destination_file.sync_all().await.with_context(|| {
format!(
"failed to fsync source file at {}",
temp_file_path.display()
)
})?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
});
fs::rename(&temp_file_path, &layer_destination_path).await?;
fsync_path(&layer_destination_path).await.with_context(|| {
format!(
"Cannot fsync layer destination path {}",
layer_destination_path.display(),
)
})?;
if verified {
debug!(
"Layer already exists locally, skipping download: {}",
layer_destination_path.display()
);
return Ok((layer_destination_path, LayerFileMetadata::new(m.len())))
} else {
// no need to remove it, it will be overwritten by fs::rename
// after successful download
warn!("Downloaded layer exists already but layer file metadata mismatches: {}, metadata {:?}", layer_destination_path.display(), metadata);
}
}
Ok(m) => {
return Err(anyhow::anyhow!("Downloaded layer destination exists but is not a file: {m:?}, target needs to be removed/archived manually: {layer_destination_path:?}"));
}
Err(_) => {
// behave as the file didn't exist
}
}
Ok::<_, anyhow::Error>(layer_destination_path)
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
// write(tmp)
// fsync(tmp)
// rename(tmp, new)
// fsync(new)
// fsync(parent)
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path =
path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file =
fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"Failed to create a destination file for layer '{}'",
temp_file_path.display()
)
})?;
let mut layer_download = storage.download_storage_object(None, &layer_destination_path)
.await
.with_context(|| {
format!(
"Failed to initiate the download the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
let bytes_amount = io::copy(&mut layer_download.download_stream, &mut destination_file)
.await
.with_context(|| {
format!(
"Failed to download the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
// we assume that `destination_file` is fully written, i.e. there are no pending .write(...).await operations.
// But for additional safety let's check/wait for any pending operations.
destination_file.flush().await.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})?;
match metadata.file_size() {
Some(expected) if expected != bytes_amount => {
anyhow::bail!(
"According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
temp_file_path.display()
);
},
Some(_) | None => {
// matches, or upgrading from an earlier IndexPart version
}
}
// not using sync_data because it can lose file size update
destination_file.sync_all().await.with_context(|| {
format!(
"failed to fsync source file at {}",
temp_file_path.display()
)
})?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
});
fs::rename(&temp_file_path, &layer_destination_path).await?;
fsync_path(&layer_destination_path).await.with_context(|| {
format!(
"Cannot fsync layer destination path {}",
layer_destination_path.display(),
)
})?;
Ok::<_, anyhow::Error>((layer_destination_path, LayerFileMetadata::new(bytes_amount)))
})
.collect::<FuturesUnordered<_>>();
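To make the durable_rename sequence described in the comments above easier to follow in isolation, here is a minimal, self-contained sketch of the same write(tmp), fsync(tmp), rename(tmp, new), fsync(new), fsync(parent) dance with plain tokio::fs. The function name and the ".temp" suffix are illustrative only, and opening a directory to fsync it is Unix-specific behaviour; the pageserver code above additionally records sizes and failpoints.
use std::path::Path;
use tokio::io::AsyncWriteExt;

async fn durable_write(final_path: &Path, bytes: &[u8]) -> std::io::Result<()> {
    let tmp_path = final_path.with_extension("temp");

    // write(tmp) + fsync(tmp)
    let mut tmp = tokio::fs::File::create(&tmp_path).await?;
    tmp.write_all(bytes).await?;
    tmp.flush().await?;
    tmp.sync_all().await?;
    drop(tmp);

    // rename(tmp, new) + fsync(new)
    tokio::fs::rename(&tmp_path, final_path).await?;
    tokio::fs::File::open(final_path).await?.sync_all().await?;

    // fsync(parent) so the rename itself survives a crash
    if let Some(parent) = final_path.parent() {
        tokio::fs::File::open(parent).await?.sync_all().await?;
    }
    Ok(())
}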
@@ -324,9 +374,12 @@ pub(super) async fn download_timeline_layers<'a>(
let mut undo = HashSet::new();
while let Some(download_result) = download_tasks.next().await {
match download_result {
Ok(downloaded_path) => {
Ok((downloaded_path, metadata)) => {
undo.insert(downloaded_path.clone());
download.layers_to_skip.insert(downloaded_path);
download.layers_to_skip.insert(downloaded_path.clone());
// what if the key existed already? ignore, because then we would have
// downloaded a partial file and had to retry
download.gathered_metadata.insert(downloaded_path, metadata);
}
Err(e) => {
errors_happened = true;
@@ -349,6 +402,8 @@ pub(super) async fn download_timeline_layers<'a>(
);
for item in undo {
download.layers_to_skip.remove(&item);
// intentionally don't clear the gathered_metadata, because this undo path exists for an
// fsync_path failure on the parent directory
}
errors_happened = true;
}
@@ -453,9 +508,9 @@ mod tests {
let timeline_upload =
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
for local_path in timeline_upload.layers_to_upload {
for local_path in timeline_upload.layers_to_upload.keys() {
let remote_path =
local_storage.resolve_in_storage(&storage.remote_object_id(&local_path)?)?;
local_storage.resolve_in_storage(&storage.remote_object_id(local_path)?)?;
let remote_parent_dir = remote_path.parent().unwrap();
if !remote_parent_dir.exists() {
fs::create_dir_all(&remote_parent_dir).await?;
@@ -473,11 +528,19 @@ mod tests {
let mut remote_timeline = RemoteTimeline::new(metadata.clone());
remote_timeline.awaits_download = true;
remote_timeline.add_timeline_layers(
layer_files
.iter()
.map(|layer| local_timeline_path.join(layer)),
);
remote_timeline.add_timeline_layers(layer_files.iter().map(|layer| {
let layer_path = local_timeline_path.join(layer);
// this could also have been LayerFileMetadata::default(), but since this test
// doesn't do the merge operation done by storage_sync::download_timeline_data, it would
// not be merged back into the timeline.
let metadata_from_upload = timeline_upload
.layers_to_upload
.get(&layer_path)
.expect("layer must exist in previously uploaded paths")
.to_owned();
(layer_path, metadata_from_upload)
}));
let download_data = match download_timeline_layers(
harness.conf,
@@ -487,9 +550,9 @@ mod tests {
sync_id,
SyncData::new(
current_retries,
LayersDownload {
layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]),
},
LayersDownload::from_skipped_layers(HashSet::from([
local_timeline_path.join("layer_to_skip")
])),
),
)
.await
@@ -552,12 +615,7 @@ mod tests {
&sync_queue,
None,
sync_id,
SyncData::new(
0,
LayersDownload {
layers_to_skip: HashSet::new(),
},
),
SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
)
.await;
assert!(
@@ -576,12 +634,7 @@ mod tests {
&sync_queue,
Some(&not_expecting_download_remote_timeline),
sync_id,
SyncData::new(
0,
LayersDownload {
layers_to_skip: HashSet::new(),
},
),
SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
)
.await;
assert!(


@@ -212,8 +212,8 @@ impl RemoteTimelineIndex {
/// Restored index part data about the timeline, stored in the remote index.
#[derive(Debug, Clone)]
pub struct RemoteTimeline {
timeline_layers: HashSet<PathBuf>,
missing_layers: HashSet<PathBuf>,
timeline_layers: HashMap<PathBuf, LayerFileMetadata>,
missing_layers: HashMap<PathBuf, LayerFileMetadata>,
pub metadata: TimelineMetadata,
pub awaits_download: bool,
@@ -222,62 +222,161 @@ pub struct RemoteTimeline {
impl RemoteTimeline {
pub fn new(metadata: TimelineMetadata) -> Self {
Self {
timeline_layers: HashSet::new(),
missing_layers: HashSet::new(),
timeline_layers: HashMap::default(),
missing_layers: HashMap::default(),
metadata,
awaits_download: false,
}
}
pub fn add_timeline_layers(&mut self, new_layers: impl IntoIterator<Item = PathBuf>) {
self.timeline_layers.extend(new_layers.into_iter());
pub fn add_timeline_layers(
&mut self,
new_layers: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
) {
self.timeline_layers.extend(new_layers);
}
pub fn add_upload_failures(&mut self, upload_failures: impl IntoIterator<Item = PathBuf>) {
self.missing_layers.extend(upload_failures.into_iter());
pub fn add_upload_failures(
&mut self,
upload_failures: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
) {
self.missing_layers.extend(upload_failures);
}
pub fn remove_layers(&mut self, layers_to_remove: &HashSet<PathBuf>) {
self.timeline_layers
.retain(|layer| !layers_to_remove.contains(layer));
.retain(|layer, _| !layers_to_remove.contains(layer));
self.missing_layers
.retain(|layer| !layers_to_remove.contains(layer));
.retain(|layer, _| !layers_to_remove.contains(layer));
}
/// Lists all layer files in the given remote timeline. Omits the metadata file.
pub fn stored_files(&self) -> &HashSet<PathBuf> {
pub fn stored_files(&self) -> &HashMap<PathBuf, LayerFileMetadata> {
&self.timeline_layers
}
/// Combines metadata gathered or verified during downloading needed layer files to metadata on
/// the [`RemoteIndex`], so it can be uploaded later.
pub fn merge_metadata_from_downloaded(
&mut self,
downloaded: &HashMap<PathBuf, LayerFileMetadata>,
) {
downloaded.iter().for_each(|(path, metadata)| {
if let Some(upgraded) = self.timeline_layers.get_mut(path) {
upgraded.merge(metadata);
}
});
}
pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result<Self> {
let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?;
let default_metadata = &IndexLayerMetadata::default();
let find_metadata = |key: &RelativePath| -> LayerFileMetadata {
index_part
.layer_metadata
.get(key)
.unwrap_or(default_metadata)
.into()
};
Ok(Self {
timeline_layers: to_local_paths(timeline_path, index_part.timeline_layers),
missing_layers: to_local_paths(timeline_path, index_part.missing_layers),
timeline_layers: index_part
.timeline_layers
.iter()
.map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
.collect(),
missing_layers: index_part
.missing_layers
.iter()
.map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
.collect(),
metadata,
awaits_download: false,
})
}
}
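A short sketch of what `merge_metadata_from_downloaded` does and does not do (crate-internal; the metadata argument is passed in rather than constructed here, the paths and sizes are invented, and `LayerFileMetadata::default()` is the test-only constructor with no file size):
// Hypothetical illustration: only layers already known to the remote index get
// their metadata upgraded; unknown paths from a download are ignored.
#[cfg(test)]
fn merge_metadata_example(metadata: TimelineMetadata) {
    use std::collections::HashMap;
    use std::path::PathBuf;

    let mut remote = RemoteTimeline::new(metadata);
    remote.add_timeline_layers([(PathBuf::from("layer_1"), LayerFileMetadata::default())]);

    let gathered = HashMap::from([
        (PathBuf::from("layer_1"), LayerFileMetadata::new(123)),
        (PathBuf::from("unrelated"), LayerFileMetadata::new(456)),
    ]);
    remote.merge_metadata_from_downloaded(&gathered);

    assert_eq!(
        remote.stored_files()[&PathBuf::from("layer_1")].file_size(),
        Some(123)
    );
    assert!(!remote.stored_files().contains_key(&PathBuf::from("unrelated")));
}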
/// Metadata gathered for each of the layer files.
///
/// Fields have to be `Option`s because a remote [`IndexPart`] can come from a different version, which
/// might carry less or more metadata depending on whether we are upgrading or rolling back an upgrade.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[cfg_attr(test, derive(Default))]
pub struct LayerFileMetadata {
file_size: Option<u64>,
}
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
fn from(other: &IndexLayerMetadata) -> Self {
LayerFileMetadata {
file_size: other.file_size,
}
}
}
impl LayerFileMetadata {
pub fn new(file_size: u64) -> Self {
LayerFileMetadata {
file_size: Some(file_size),
}
}
pub fn file_size(&self) -> Option<u64> {
self.file_size
}
/// Metadata has holes due to version upgrades. This method is called to upgrade self with the
/// other value.
///
/// This is called on the possibly outdated version.
pub fn merge(&mut self, other: &Self) {
self.file_size = other.file_size.or(self.file_size);
}
}
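The precedence in `merge` matters when rolling versions forwards and backwards; here is a tiny test-style sketch of both directions (it assumes it lives in this module, so it can use the test-only `Default`):
// Hedged sketch: the freshly gathered value wins, while an already-known size
// survives a merge with a metadata "hole" coming from an older IndexPart.
#[test]
fn merge_fills_and_keeps_file_size() {
    let mut from_old_index = LayerFileMetadata::default(); // file_size: None
    from_old_index.merge(&LayerFileMetadata::new(123));
    assert_eq!(from_old_index.file_size(), Some(123));

    let mut known = LayerFileMetadata::new(456);
    known.merge(&LayerFileMetadata::default());
    assert_eq!(known.file_size(), Some(456));
}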
/// Part of the remote index, corresponding to a certain timeline.
/// Contains the data about all files in the timeline, present remotely and its metadata.
///
/// This type needs to be backwards and forwards compatible. When changing the fields,
/// remember to add a test case for the changed version.
#[serde_as]
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexPart {
/// Debugging aid describing the version of this type.
#[serde(default)]
version: usize,
/// Each of the layers present on remote storage.
///
/// Additional metadata might exist in `layer_metadata`.
timeline_layers: HashSet<RelativePath>,
/// Currently not really used by the pageserver;
/// present to manually keep track of the layer files that the pageserver might never retrieve.
///
/// Such "holes" might appear if any upload task was evicted on an error threshold:
/// this layer will only be rescheduled for upload on pageserver restart.
missing_layers: HashSet<RelativePath>,
/// Per layer file metadata, which can be present for a present or missing layer file.
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
/// that latest version stores.
#[serde(default)]
layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
#[serde_as(as = "DisplayFromStr")]
disk_consistent_lsn: Lsn,
metadata_bytes: Vec<u8>,
}
impl IndexPart {
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
/// used to understand later versions.
///
/// Version is currently informative only.
const LATEST_VERSION: usize = 1;
pub const FILE_NAME: &'static str = "index_part.json";
#[cfg(test)]
@@ -288,8 +387,10 @@ impl IndexPart {
metadata_bytes: Vec<u8>,
) -> Self {
Self {
version: Self::LATEST_VERSION,
timeline_layers,
missing_layers,
layer_metadata: HashMap::default(),
disk_consistent_lsn,
metadata_bytes,
}
@@ -304,35 +405,68 @@ impl IndexPart {
remote_timeline: RemoteTimeline,
) -> anyhow::Result<Self> {
let metadata_bytes = remote_timeline.metadata.to_bytes()?;
let mut layer_metadata = HashMap::new();
let mut missing_layers = HashSet::new();
separate_paths_and_metadata(
timeline_path,
&remote_timeline.missing_layers,
&mut missing_layers,
&mut layer_metadata,
)
.context("Failed to convert missing layers' paths to relative ones")?;
let mut timeline_layers = HashSet::new();
separate_paths_and_metadata(
timeline_path,
&remote_timeline.timeline_layers,
&mut timeline_layers,
&mut layer_metadata,
)
.context("Failed to convert timeline layers' paths to relative ones")?;
Ok(Self {
timeline_layers: to_relative_paths(timeline_path, remote_timeline.timeline_layers)
.context("Failed to convert timeline layers' paths to relative ones")?,
missing_layers: to_relative_paths(timeline_path, remote_timeline.missing_layers)
.context("Failed to convert missing layers' paths to relative ones")?,
version: Self::LATEST_VERSION,
timeline_layers,
missing_layers,
layer_metadata,
disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(),
metadata_bytes,
})
}
}
fn to_local_paths(
timeline_path: &Path,
paths: impl IntoIterator<Item = RelativePath>,
) -> HashSet<PathBuf> {
paths
.into_iter()
.map(|path| path.as_path(timeline_path))
.collect()
/// Serialized form of [`LayerFileMetadata`].
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
pub struct IndexLayerMetadata {
file_size: Option<u64>,
}
fn to_relative_paths(
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &'_ LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
}
}
}
fn separate_paths_and_metadata(
timeline_path: &Path,
paths: impl IntoIterator<Item = PathBuf>,
) -> anyhow::Result<HashSet<RelativePath>> {
paths
.into_iter()
.map(|path| RelativePath::new(timeline_path, path))
.collect()
input: &HashMap<PathBuf, LayerFileMetadata>,
output: &mut HashSet<RelativePath>,
layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
) -> anyhow::Result<()> {
for (path, metadata) in input {
let rel_path = RelativePath::new(timeline_path, path)?;
let metadata = IndexLayerMetadata::from(metadata);
layer_metadata.insert(rel_path.clone(), metadata);
output.insert(rel_path);
}
Ok(())
}
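For a concrete picture of the split performed above, a test-style sketch (the timeline path and file name are invented) of one absolute layer path going in and a relative path plus its metadata entry coming out:
// Hedged sketch of separate_paths_and_metadata: one layer path under the
// timeline directory becomes one RelativePath plus one metadata entry.
#[test]
fn paths_and_metadata_are_split() -> anyhow::Result<()> {
    use std::collections::{HashMap, HashSet};
    use std::path::PathBuf;

    let timeline_path = PathBuf::from("/tmp/some/timeline/dir");
    let input = HashMap::from([(timeline_path.join("layer_1"), LayerFileMetadata::new(1))]);

    let mut paths = HashSet::new();
    let mut layer_metadata = HashMap::new();
    separate_paths_and_metadata(&timeline_path, &input, &mut paths, &mut layer_metadata)?;

    assert_eq!(paths.len(), 1);
    assert_eq!(layer_metadata.len(), 1);
    Ok(())
}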
#[cfg(test)]
@@ -357,13 +491,13 @@ mod tests {
DEFAULT_PG_VERSION,
);
let remote_timeline = RemoteTimeline {
timeline_layers: HashSet::from([
timeline_path.join("layer_1"),
timeline_path.join("layer_2"),
timeline_layers: HashMap::from([
(timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
]),
missing_layers: HashSet::from([
timeline_path.join("missing_1"),
timeline_path.join("missing_2"),
missing_layers: HashMap::from([
(timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
]),
metadata: metadata.clone(),
awaits_download: false,
@@ -485,13 +619,13 @@ mod tests {
let conversion_result = IndexPart::from_remote_timeline(
&timeline_path,
RemoteTimeline {
timeline_layers: HashSet::from([
PathBuf::from("bad_path"),
timeline_path.join("layer_2"),
timeline_layers: HashMap::from([
(PathBuf::from("bad_path"), LayerFileMetadata::new(1)),
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
]),
missing_layers: HashSet::from([
timeline_path.join("missing_1"),
timeline_path.join("missing_2"),
missing_layers: HashMap::from([
(timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
]),
metadata: metadata.clone(),
awaits_download: false,
@@ -502,13 +636,13 @@ mod tests {
let conversion_result = IndexPart::from_remote_timeline(
&timeline_path,
RemoteTimeline {
timeline_layers: HashSet::from([
timeline_path.join("layer_1"),
timeline_path.join("layer_2"),
timeline_layers: HashMap::from([
(timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
]),
missing_layers: HashSet::from([
PathBuf::from("bad_path"),
timeline_path.join("missing_2"),
missing_layers: HashMap::from([
(PathBuf::from("bad_path"), LayerFileMetadata::new(3)),
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
]),
metadata,
awaits_download: false,
@@ -516,4 +650,63 @@ mod tests {
);
assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory");
}
#[test]
fn v0_indexpart_is_parsed() {
let example = r#"{
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["not_a_real_layer_but_adding_coverage"],
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
version: 0,
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
layer_metadata: HashMap::default(),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v1_indexpart_is_parsed() {
let example = r#"{
"version":1,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["not_a_real_layer_but_adding_coverage"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
// note: this is not verified and could be anything, but it exists to help humans debugging; could be the git version instead?
version: 1,
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
layer_metadata: HashMap::from([
(RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata {
file_size: Some(25600000),
}),
(RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata {
// serde_json should always parse this, but it might become a double with jq, for
// example.
file_size: Some(9007199254741001),
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
}
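The two tests above hinge on one serde property worth spelling out: any field added later must carry `#[serde(default)]` so that older `index_part.json` files still deserialize. A standalone sketch with a made-up struct (not the real `IndexPart`; assumes the serde and serde_json crates) shows the rule in isolation:
use serde::Deserialize;
use std::collections::HashMap;

// Hypothetical stand-in for a versioned index file; only the defaulted fields
// may be absent in older JSON.
#[derive(Debug, Deserialize)]
struct VersionedPart {
    #[serde(default)]
    version: usize,
    #[serde(default)]
    layer_metadata: HashMap<String, u64>,
    disk_consistent_lsn: String,
}

fn main() {
    // "v0" JSON, written before `version` and `layer_metadata` existed.
    let v0 = r#"{"disk_consistent_lsn":"0/16960E8"}"#;
    let part: VersionedPart = serde_json::from_str(v0).unwrap();
    assert_eq!(part.version, 0);
    assert!(part.layer_metadata.is_empty());
}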


@@ -69,14 +69,25 @@ pub(super) async fn upload_timeline_layers<'a>(
.map(|meta| meta.disk_consistent_lsn());
let already_uploaded_layers = remote_timeline
.map(|timeline| timeline.stored_files())
.cloned()
.map(|timeline| {
timeline
.stored_files()
.keys()
.cloned()
.collect::<std::collections::HashSet<_>>()
})
.unwrap_or_default();
let layers_to_upload = upload
.layers_to_upload
.difference(&already_uploaded_layers)
.cloned()
.iter()
.filter_map(|(k, v)| {
if !already_uploaded_layers.contains(k) {
Some((k.to_owned(), v.to_owned()))
} else {
None
}
})
.collect::<Vec<_>>();
if layers_to_upload.is_empty() {
@@ -98,7 +109,7 @@ pub(super) async fn upload_timeline_layers<'a>(
let mut upload_tasks = layers_to_upload
.into_iter()
.map(|source_path| async move {
.map(|(source_path, known_metadata)| async move {
let source_file = match fs::File::open(&source_path).await.with_context(|| {
format!(
"Failed to upen a source file for layer '{}'",
@@ -109,7 +120,7 @@ pub(super) async fn upload_timeline_layers<'a>(
Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)),
};
let source_size = source_file
let fs_size = source_file
.metadata()
.await
.with_context(|| {
@@ -119,10 +130,24 @@ pub(super) async fn upload_timeline_layers<'a>(
)
})
.map_err(UploadError::Other)?
.len() as usize;
.len();
// FIXME: this looks bad
if let Some(metadata_size) = known_metadata.file_size() {
if metadata_size != fs_size {
return Err(UploadError::Other(anyhow::anyhow!(
"File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"
)));
}
} else {
// this is a silly state we would like to avoid
}
let fs_size = usize::try_from(fs_size).with_context(|| format!("File {source_path:?} size {fs_size} could not be converted to usize"))
.map_err(UploadError::Other)?;
match storage
.upload_storage_object(Box::new(source_file), source_size, &source_path)
.upload_storage_object(Box::new(source_file), fs_size, &source_path)
.await
.with_context(|| format!("Failed to upload layer file for {sync_id}"))
{
@@ -136,8 +161,11 @@ pub(super) async fn upload_timeline_layers<'a>(
while let Some(upload_result) = upload_tasks.next().await {
match upload_result {
Ok(uploaded_path) => {
upload.layers_to_upload.remove(&uploaded_path);
upload.uploaded_layers.insert(uploaded_path);
let metadata = upload
.layers_to_upload
.remove(&uploaded_path)
.expect("metadata should always exist, assuming no double uploads");
upload.uploaded_layers.insert(uploaded_path, metadata);
}
Err(e) => match e {
UploadError::Other(e) => {
@@ -262,7 +290,7 @@ mod tests {
assert_eq!(
upload
.uploaded_layers
.iter()
.keys()
.cloned()
.collect::<BTreeSet<_>>(),
layer_files
@@ -357,7 +385,7 @@ mod tests {
assert_eq!(
upload
.uploaded_layers
.iter()
.keys()
.cloned()
.collect::<BTreeSet<_>>(),
layer_files

File diff suppressed because it is too large


@@ -610,9 +610,9 @@ impl DeltaLayer {
///
/// 3. Call `finish`.
///
pub struct DeltaLayerWriter {
struct DeltaLayerWriterInner {
conf: &'static PageServerConf,
path: PathBuf,
pub path: PathBuf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -624,17 +624,17 @@ pub struct DeltaLayerWriter {
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
}
impl DeltaLayerWriter {
impl DeltaLayerWriterInner {
///
/// Start building a new delta layer.
///
pub fn new(
fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> Result<DeltaLayerWriter> {
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename. We don't know
// the end key yet, so we cannot form the final filename yet. We will
// rename it when we're done.
@@ -653,7 +653,7 @@ impl DeltaLayerWriter {
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
Ok(DeltaLayerWriter {
Ok(Self {
conf,
path,
timeline_id,
@@ -670,17 +670,17 @@ impl DeltaLayerWriter {
///
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
}
pub fn put_value_bytes(
fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
val: &[u8],
will_init: bool,
) -> Result<()> {
) -> anyhow::Result<()> {
assert!(self.lsn_range.start <= lsn);
let off = self.blob_writer.write_blob(val)?;
@@ -693,14 +693,14 @@ impl DeltaLayerWriter {
Ok(())
}
pub fn size(&self) -> u64 {
fn size(&self) -> u64 {
self.blob_writer.size() + self.tree.borrow_writer().size()
}
///
/// Finish writing the delta layer.
///
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -768,6 +768,102 @@ impl DeltaLayerWriter {
}
}
/// A builder object for constructing a new delta layer.
///
/// Usage:
///
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_value` for every page
/// version to store in the layer.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to be dropped before `finish` is actually called,
/// which would leave stray temporary files in the directory and could
/// eventually exhaust the file system. This structure wraps
/// `DeltaLayerWriterInner` and adds a `Drop` implementation that cleans up
/// the temporary file on failure. This cannot be done directly in
/// `DeltaLayerWriterInner`, since `finish` moves some fields out of it,
/// making it impossible to implement `Drop` there.
///
#[must_use]
pub struct DeltaLayerWriter {
inner: Option<DeltaLayerWriterInner>,
}
impl DeltaLayerWriter {
///
/// Start building a new delta layer.
///
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> anyhow::Result<Self> {
Ok(Self {
inner: Some(DeltaLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_start,
lsn_range,
)?),
})
}
///
/// Append a key-value pair to the file.
///
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val)
}
pub fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
val: &[u8],
will_init: bool,
) -> anyhow::Result<()> {
self.inner
.as_mut()
.unwrap()
.put_value_bytes(key, lsn, val, will_init)
}
pub fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
}
///
/// Finish writing the delta layer.
///
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
self.inner.take().unwrap().finish(key_end)
}
}
impl Drop for DeltaLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
match inner.blob_writer.into_inner().into_inner() {
Ok(vfile) => vfile.remove(),
Err(err) => warn!(
"error while flushing buffer of image layer temporary file: {}",
err
),
}
}
}
}
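To make the new structure easier to follow, here is a stripped-down, hypothetical sketch of the wrapper-plus-Drop pattern the diff introduces (names like Inner and Writer are illustrative, not the actual pageserver types): the public writer holds Option<Inner>, finish takes the inner value out, and Drop therefore only cleans up when finish was never reached.

use std::fs;
use std::path::{Path, PathBuf};

// Hypothetical inner writer: owns a temporary file until `finish` renames it.
struct Inner {
    temp_path: PathBuf,
}

impl Inner {
    fn finish(self, final_path: &Path) -> std::io::Result<()> {
        // `finish` moves `temp_path` out of `self`, which is exactly why
        // `Inner` itself cannot also implement `Drop`.
        fs::rename(self.temp_path, final_path)
    }
}

// Public wrapper: the `Option` lets both `finish` and `Drop` take ownership.
#[must_use]
pub struct Writer {
    inner: Option<Inner>,
}

impl Writer {
    pub fn finish(mut self, final_path: &Path) -> std::io::Result<()> {
        // After `take`, the `Drop` impl below sees `None` and does nothing.
        self.inner.take().unwrap().finish(final_path)
    }
}

impl Drop for Writer {
    fn drop(&mut self) {
        // Runs only if the writer was dropped without calling `finish`
        // (error path, panic, ...), removing the stale temporary file.
        if let Some(inner) = self.inner.take() {
            let _ = fs::remove_file(&inner.temp_path);
        }
    }
}

On an error path, simply letting the writer fall out of scope is enough to delete the temporary file, which is the behaviour the #[must_use] wrapper above is after.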
///
/// Iterator over all key-value pairs stored in a delta layer
///


@@ -411,7 +411,7 @@ impl ImageLayer {
///
/// 3. Call `finish`.
///
pub struct ImageLayerWriter {
struct ImageLayerWriterInner {
conf: &'static PageServerConf,
path: PathBuf,
timeline_id: TimelineId,
@@ -423,14 +423,17 @@ pub struct ImageLayerWriter {
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
impl ImageLayerWriter {
pub fn new(
impl ImageLayerWriterInner {
///
/// Start building a new image layer.
///
fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename.
// We'll atomically rename it to the final name when we're done.
let path = ImageLayer::temp_path_for(
@@ -455,7 +458,7 @@ impl ImageLayerWriter {
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
let writer = ImageLayerWriter {
let writer = Self {
conf,
path,
timeline_id,
@@ -474,7 +477,7 @@ impl ImageLayerWriter {
///
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let off = self.blob_writer.write_blob(img)?;
@@ -485,7 +488,10 @@ impl ImageLayerWriter {
Ok(())
}
pub fn finish(self) -> anyhow::Result<ImageLayer> {
///
/// Finish writing the image layer.
///
fn finish(self) -> anyhow::Result<ImageLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -552,3 +558,76 @@ impl ImageLayerWriter {
Ok(layer)
}
}
/// A builder object for constructing a new image layer.
///
/// Usage:
///
/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_image` for every key-value
/// pair in the key range.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to be dropped before `finish` is actually called,
/// which would leave stray temporary files in the directory and could
/// eventually exhaust the file system. This structure wraps
/// `ImageLayerWriterInner` and adds a `Drop` implementation that cleans up
/// the temporary file on failure. This cannot be done directly in
/// `ImageLayerWriterInner`, since `finish` moves some fields out of it,
/// making it impossible to implement `Drop` there.
///
#[must_use]
pub struct ImageLayerWriter {
inner: Option<ImageLayerWriterInner>,
}
impl ImageLayerWriter {
///
/// Start building a new image layer.
///
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(ImageLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_range,
lsn,
)?),
})
}
///
/// Write next value to the file.
///
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img)
}
///
/// Finish writing the image layer.
///
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish()
}
}
impl Drop for ImageLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.blob_writer.into_inner().remove();
}
}
}
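One difference between the two Drop implementations: the delta-layer writer goes through a BufWriter, and BufWriter::into_inner flushes the remaining buffer and returns a Result, which is why that Drop matches on it, while the image-layer writer appears to write to its file directly and has no flush result to inspect. A tiny standard-library-only illustration of that into_inner behaviour (the file name is made up):

use std::fs::{self, File};
use std::io::{BufWriter, Write};

fn main() -> std::io::Result<()> {
    let path = "example.tmp"; // hypothetical temporary file
    let mut writer = BufWriter::new(File::create(path)?);
    writer.write_all(b"some buffered bytes")?;

    // `into_inner` flushes the buffer first; a flush failure surfaces here,
    // which is why a Drop impl may want to log it rather than ignore it.
    match writer.into_inner() {
        Ok(file) => {
            drop(file); // close the handle before removing the file
            fs::remove_file(path)?;
        }
        Err(err) => eprintln!("flush of temporary file failed: {err}"),
    }
    Ok(())
}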


@@ -62,6 +62,8 @@ pub struct LayerMap {
struct LayerRTreeObject {
layer: Arc<dyn Layer>,
envelope: AABB<[IntKey; 2]>,
}
// Representation of Key as numeric type.
@@ -197,9 +199,16 @@ impl PartialEq for LayerRTreeObject {
impl RTreeObject for LayerRTreeObject {
type Envelope = AABB<[IntKey; 2]>;
fn envelope(&self) -> Self::Envelope {
let key_range = self.layer.get_key_range();
let lsn_range = self.layer.get_lsn_range();
AABB::from_corners(
self.envelope
}
}
impl LayerRTreeObject {
fn new(layer: Arc<dyn Layer>) -> Self {
let key_range = layer.get_key_range();
let lsn_range = layer.get_lsn_range();
let envelope = AABB::from_corners(
[
IntKey::from(key_range.start.to_i128()),
IntKey::from(lsn_range.start.0 as i128),
@@ -208,7 +217,8 @@ impl RTreeObject for LayerRTreeObject {
IntKey::from(key_range.end.to_i128() - 1),
IntKey::from(lsn_range.end.0 as i128 - 1),
], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive
)
);
LayerRTreeObject { layer, envelope }
}
}
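The change above computes the bounding box once in LayerRTreeObject::new instead of rebuilding it on every envelope() call; the only subtle bit is the off-by-one conversion noted in the comment, since AABB corners are inclusive while the key and LSN ranges are exclusive at the upper end. A small illustrative example of that same conversion with rstar (the Rect type and i64 coordinates are stand-ins, not pageserver types):

use rstar::{RTreeObject, AABB};
use std::ops::Range;

// Stand-in for a layer: exclusive [start, end) ranges on two axes.
struct Rect {
    key: Range<i64>,
    lsn: Range<i64>,
}

impl RTreeObject for Rect {
    type Envelope = AABB<[i64; 2]>;

    fn envelope(&self) -> Self::Envelope {
        // AABB corners are inclusive, while `Range::end` is exclusive,
        // hence the `- 1` on both upper coordinates.
        AABB::from_corners(
            [self.key.start, self.lsn.start],
            [self.key.end - 1, self.lsn.end - 1],
        )
    }
}

In the actual change the AABB is computed once in new() and envelope() just returns the cached value, presumably so repeated R-tree queries do not keep re-deriving it.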
@@ -338,7 +348,7 @@ impl LayerMap {
if layer.get_key_range() == (Key::MIN..Key::MAX) {
self.l0_delta_layers.push(layer.clone());
}
self.historic_layers.insert(LayerRTreeObject { layer });
self.historic_layers.insert(LayerRTreeObject::new(layer));
NUM_ONDISK_LAYERS.inc();
}
@@ -362,7 +372,7 @@ impl LayerMap {
}
assert!(self
.historic_layers
.remove(&LayerRTreeObject { layer })
.remove(&LayerRTreeObject::new(layer))
.is_some());
NUM_ONDISK_LAYERS.dec();
}

File diff suppressed because it is too large


@@ -1,7 +1,7 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use std::collections::{hash_map, HashMap, HashSet};
use std::collections::{hash_map, HashMap};
use std::ffi::OsStr;
use std::fs;
use std::path::{Path, PathBuf};
@@ -12,10 +12,10 @@ use tracing::*;
use remote_storage::GenericRemoteStorage;
use crate::config::{PageServerConf, METADATA_FILE_NAME};
use crate::config::{PageServerConf, METADATA_FILE_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
use crate::http::models::TenantInfo;
use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::{
ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState,
@@ -24,7 +24,7 @@ use crate::tenant_config::TenantConfOpt;
use crate::walredo::PostgresRedoManager;
use crate::TEMP_FILE_SUFFIX;
use utils::crashsafe_dir::{self, path_with_suffix_extension};
use utils::crashsafe::{self, path_with_suffix_extension};
use utils::id::{TenantId, TimelineId};
mod tenants_state {
@@ -104,7 +104,7 @@ pub fn init_tenant_mgr(
if let TenantAttachData::Ready(t) = new_timeline_values {
for (timeline_id, old_value) in old_values {
if let LocalTimelineInitStatus::LocallyComplete(metadata) = old_value {
t.insert(timeline_id, (metadata, HashSet::new()));
t.insert(timeline_id, TimelineLocalFiles::ready(metadata));
}
}
}
@@ -189,7 +189,7 @@ pub fn attach_local_tenants(
let has_timelines = !timelines.is_empty();
let timelines_to_attach = timelines
.iter()
.map(|(&k, (v, _))| (k, v.clone()))
.map(|(&k, v)| (k, v.metadata().to_owned()))
.collect();
match tenant.init_attach_timelines(timelines_to_attach) {
Ok(()) => {
@@ -265,58 +265,98 @@ fn create_tenant_files(
temporary_tenant_dir.display()
);
let temporary_tenant_timelines_dir = rebase_directory(
&conf.timelines_path(&tenant_id),
&target_tenant_directory,
&temporary_tenant_dir,
)?;
let temporary_tenant_config_path = rebase_directory(
&conf.tenant_config_path(tenant_id),
&target_tenant_directory,
&temporary_tenant_dir,
)?;
// top-level dir may exist if we are creating it through CLI
crashsafe_dir::create_dir_all(&temporary_tenant_dir).with_context(|| {
crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
format!(
"could not create temporary tenant directory {}",
temporary_tenant_dir.display()
)
})?;
// first, create a config in the top-level temp directory, fsync the file
Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true)?;
// then, create a subdirectory in the top-level temp directory, fsynced
crashsafe_dir::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
let creation_result = try_create_target_tenant_dir(
conf,
tenant_conf,
tenant_id,
&temporary_tenant_dir,
&target_tenant_directory,
);
if creation_result.is_err() {
error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
} else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) {
error!(
"Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
)
}
}
creation_result
}
fn try_create_target_tenant_dir(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
temporary_tenant_dir: &Path,
target_tenant_directory: &Path,
) -> Result<(), anyhow::Error> {
let temporary_tenant_timelines_dir = rebase_directory(
&conf.timelines_path(&tenant_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?;
let temporary_tenant_config_path = rebase_directory(
&conf.tenant_config_path(tenant_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?;
Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context(
|| {
format!(
"Failed to write tenant {} config to {}",
tenant_id,
temporary_tenant_config_path.display()
)
},
)?;
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
"could not create temporary tenant timelines directory {}",
"could not create tenant {} temporary timelines directory {}",
tenant_id,
temporary_tenant_timelines_dir.display()
)
})?;
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
});
// move-rename tmp directory with all files synced into a permanent directory, fsync its parent
fs::rename(&temporary_tenant_dir, &target_tenant_directory).with_context(|| {
fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| {
format!(
"failed to move temporary tenant directory {} into the permanent one {}",
"failed to move tenant {} temporary directory {} into the permanent one {}",
tenant_id,
temporary_tenant_dir.display(),
target_tenant_directory.display()
)
})?;
let target_dir_parent = target_tenant_directory.parent().with_context(|| {
format!(
"Failed to get tenant dir parent for {}",
"Failed to get tenant {} dir parent for {}",
tenant_id,
target_tenant_directory.display()
)
})?;
fs::File::open(target_dir_parent)?.sync_all()?;
info!(
"created tenant directory structure in {}",
target_tenant_directory.display()
);
crashsafe::fsync(target_dir_parent).with_context(|| {
format!(
"Failed to fsync renamed directory's parent {} for tenant {}",
target_dir_parent.display(),
tenant_id,
)
})?;
Ok(())
}
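The two functions above follow the usual crash-safe creation sequence: build everything under a temporary name, fsync it, rename into place, then fsync the parent directory so the rename itself is durable; on failure, the temporary directory is removed and that removal fsynced. A stripped-down sketch of the happy path using only std (the helper name and temp suffix are made up, and unlike create_tenant_files it omits the error-path cleanup):

use std::fs::{self, File};
use std::io;
use std::path::Path;

fn create_dir_atomically(
    target: &Path,
    populate: impl FnOnce(&Path) -> io::Result<()>,
) -> io::Result<()> {
    let tmp = target.with_extension("___temp"); // hypothetical temporary suffix

    // 1. Build the full contents under the temporary name first.
    fs::create_dir_all(&tmp)?;
    populate(&tmp)?;
    File::open(&tmp)?.sync_all()?; // fsync the directory itself (works on Unix)

    // 2. Atomically move the fully built directory into place.
    fs::rename(&tmp, target)?;

    // 3. fsync the parent so the rename survives a crash.
    if let Some(parent) = target.parent() {
        File::open(parent)?.sync_all()?;
    }
    Ok(())
}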
@@ -483,7 +523,7 @@ pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
#[derive(Debug)]
pub enum TenantAttachData {
Ready(HashMap<TimelineId, (TimelineMetadata, HashSet<PathBuf>)>),
Ready(HashMap<TimelineId, TimelineLocalFiles>),
Broken(anyhow::Error),
}
/// Attempts to collect information about all tenant and timelines, existing on the local FS.
@@ -602,7 +642,15 @@ fn is_temporary(path: &Path) -> bool {
}
}
#[allow(clippy::type_complexity)]
fn is_uninit_mark(path: &Path) -> bool {
match path.file_name() {
Some(name) => name
.to_string_lossy()
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
None => false,
}
}
fn collect_timelines_for_tenant(
config: &'static PageServerConf,
tenant_path: &Path,
@@ -645,25 +693,74 @@ fn collect_timelines_for_tenant(
e
);
}
} else if is_uninit_mark(&timeline_dir) {
let timeline_uninit_mark_file = &timeline_dir;
info!(
"Found an uninit mark file {}, removing the timeline and its uninit mark",
timeline_uninit_mark_file.display()
);
let timeline_id = timeline_uninit_mark_file
.file_stem()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
timeline_uninit_mark_file.display()
)
})?;
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
if let Err(e) =
remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
{
error!("Failed to clean up uninit marked timeline: {e:?}");
}
} else {
match collect_timeline_files(&timeline_dir) {
Ok((timeline_id, metadata, timeline_files)) => {
tenant_timelines.insert(timeline_id, (metadata, timeline_files));
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline dir name {}",
timeline_dir.display()
)
})?;
let timeline_uninit_mark_file =
config.timeline_uninit_mark_file_path(tenant_id, timeline_id);
if timeline_uninit_mark_file.exists() {
info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark");
if let Err(e) = remove_timeline_and_uninit_mark(
&timeline_dir,
&timeline_uninit_mark_file,
) {
error!("Failed to clean up uninit marked timeline: {e:?}");
}
Err(e) => {
error!(
"Failed to process timeline dir contents at '{}', reason: {:?}",
timeline_dir.display(),
e
);
match remove_if_empty(&timeline_dir) {
Ok(true) => info!(
"Removed empty timeline directory {}",
timeline_dir.display()
),
Ok(false) => (),
Err(e) => {
error!("Failed to remove empty timeline directory: {e:?}")
} else {
match collect_timeline_files(&timeline_dir) {
Ok((metadata, timeline_files)) => {
tenant_timelines.insert(
timeline_id,
TimelineLocalFiles::collected(metadata, timeline_files),
);
}
Err(e) => {
error!(
"Failed to process timeline dir contents at '{}', reason: {:?}",
timeline_dir.display(),
e
);
match remove_if_empty(&timeline_dir) {
Ok(true) => info!(
"Removed empty timeline directory {}",
timeline_dir.display()
),
Ok(false) => (),
Err(e) => {
error!("Failed to remove empty timeline directory: {e:?}")
}
}
}
}
@@ -686,25 +783,48 @@ fn collect_timelines_for_tenant(
Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
}
fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
fs::remove_dir_all(&timeline_dir)
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// we can leave the uninit mark without a timeline dir,
// just remove the mark then
Ok(())
} else {
Err(e)
}
})
.with_context(|| {
format!(
"Failed to remove unit marked timeline directory {}",
timeline_dir.display()
)
})?;
fs::remove_file(&uninit_mark).with_context(|| {
format!(
"Failed to remove timeline uninit mark file {}",
uninit_mark.display()
)
})?;
Ok(())
}
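remove_timeline_and_uninit_mark deliberately tolerates a missing timeline directory, so a stray uninit mark can still be cleaned up on its own. The "treat NotFound as success" idiom it uses, as a tiny standalone sketch (helper names are hypothetical):

use std::io;
use std::path::Path;

// Treat "already gone" as success; propagate every other error.
fn ignore_not_found(res: io::Result<()>) -> io::Result<()> {
    match res {
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
        other => other,
    }
}

fn remove_dir_if_exists(path: &Path) -> io::Result<()> {
    ignore_not_found(std::fs::remove_dir_all(path))
}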
// discover timeline files and extract timeline metadata
// NOTE: ephemeral files are excluded from the list
fn collect_timeline_files(
timeline_dir: &Path,
) -> anyhow::Result<(TimelineId, TimelineMetadata, HashSet<PathBuf>)> {
let mut timeline_files = HashSet::new();
) -> anyhow::Result<(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>)> {
let mut timeline_files = HashMap::new();
let mut timeline_metadata_path = None;
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.context("Could not parse timeline id out of the timeline dir name")?;
let timeline_dir_entries =
fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
for entry in timeline_dir_entries {
let entry_path = entry.context("Failed to list timeline dir entry")?.path();
if entry_path.is_file() {
let metadata = entry_path.metadata()?;
if metadata.is_file() {
if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) {
timeline_metadata_path = Some(entry_path);
} else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
@@ -719,7 +839,8 @@ fn collect_timeline_files(
)
})?;
} else {
timeline_files.insert(entry_path);
let layer_metadata = LayerFileMetadata::new(metadata.len());
timeline_files.insert(entry_path, layer_metadata);
}
}
}
@@ -745,5 +866,5 @@ fn collect_timeline_files(
"Timeline has no ancestor and no layer files"
);
Ok((timeline_id, metadata, timeline_files))
Ok((metadata, timeline_files))
}

Some files were not shown because too many files have changed in this diff