Compare commits


84 Commits

Author SHA1 Message Date
Dmitry Ivanov
140f28bf1b [proxy] Add more metrics 2022-10-18 16:07:12 +03:00
Sergey Melnikov
546e9bdbec Deploy storage into new account and migrate to management API v2 (#2619)
Deploy storage into new account
Migrate safekeeper and pageserver initialisation to management api v2
2022-10-18 15:52:15 +03:00
Heikki Linnakangas
59bc7e67e0 Use an optimized version of amplify_num.
Speeds up layer_map::search somewhat. I also opened a PR in the upstream
rust-amplify repository with these changes,
see https://github.com/rust-amplify/rust-amplify/pull/148. We can switch
back to upstream version when that's merged.
2022-10-18 15:00:10 +03:00
Heikki Linnakangas
2418e72649 Speed up layer_map::search, by remembering the "envelope" for each layer.
Lookups in the R-tree call the "envelope" function for every comparison,
and our envelope function isn't very cheap, so that overhead adds up.
Create the envelope once, when the layer is inserted into the tree, and
store it along with the layer. That uses some more memory per layer, but
that's not very significant.

Speeds up the search operation 2x
2022-10-18 15:00:10 +03:00
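The pattern described above is easy to picture with a small sketch. This is not the pageserver's actual layer map code; it assumes an rstar-style R-tree API and made-up key/LSN ranges, but it shows the idea of computing the envelope once at insert time and returning a cached copy on every later call:

```rust
use rstar::{RTree, RTreeObject, AABB};

// Illustrative stand-in for a layer's key/LSN range, not the real pageserver type.
struct Layer {
    key_range: (i64, i64),
    lsn_range: (i64, i64),
}

// Wrapper that precomputes the (relatively expensive) envelope when the layer
// is inserted, so the R-tree's frequent envelope() calls become a cheap copy.
struct CachedEnvelopeLayer {
    layer: Layer,
    envelope: AABB<[i64; 2]>,
}

impl CachedEnvelopeLayer {
    fn new(layer: Layer) -> Self {
        // Envelope computed exactly once, at insertion time.
        let envelope = AABB::from_corners(
            [layer.key_range.0, layer.lsn_range.0],
            [layer.key_range.1, layer.lsn_range.1],
        );
        Self { layer, envelope }
    }
}

impl RTreeObject for CachedEnvelopeLayer {
    type Envelope = AABB<[i64; 2]>;

    fn envelope(&self) -> Self::Envelope {
        // Called on every comparison during a search; just return the cached value.
        self.envelope.clone()
    }
}

fn main() {
    let mut tree = RTree::new();
    tree.insert(CachedEnvelopeLayer::new(Layer {
        key_range: (0, 1000),
        lsn_range: (100, 200),
    }));
    println!("layers in tree: {}", tree.size());
}
```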
Heikki Linnakangas
80746b1c7a Add micro-benchmark for layer map search function
The test data was extracted from our pgbench benchmark project on the
captest environment, the one we use for the 'neon-captest-reuse' test.
2022-10-18 15:00:10 +03:00
Dmitry Rodionov
129f7c82b7 remove redundant expect_tenant_to_download_timeline 2022-10-18 11:21:48 +03:00
Anastasia Lubennikova
0ec5ddea0b GRANT CREATE ON SCHEMA public TO web_access 2022-10-17 22:42:51 +03:00
Kirill Bulatov
c4ee62d427 Bump clap and other minor dependencies (#2623) 2022-10-17 12:58:40 +03:00
Joonas Koivunen
c709354579 Add layer sizes to index_part.json (#2582)
This is the first step in verifying layer files. Next up on the road is
hashing the files and verifying the hashes.

The metadata additions do not require any migration. The idea is that
the change is backward and forward-compatible with regard to
`index_part.json` due to the softness of JSON schema and the
deserialization options in use.

New types added:

- LayerFileMetadata for tracking the file metadata
    - starting with only the file size
    - in future hopefully a sha256 as well
- IndexLayerMetadata, the serialized counterpart of LayerFileMetadata

Having every field of LayerFileMetadata be an Option is a problem, but
handling that is not possible without conflicting a lot more with other
ongoing work.

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
2022-10-17 12:21:04 +03:00
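As a rough illustration of why the `index_part.json` change can stay backward- and forward-compatible, here is a small serde sketch. The field and type names loosely follow the commit description and are not the actual on-disk schema; the point is that missing keys deserialize to None/defaults and unknown keys are simply ignored:

```rust
use std::collections::HashMap;

use serde::{Deserialize, Serialize};

// Serialized counterpart of the per-layer metadata. Everything is optional, so
// an index_part.json written before this change (no sizes at all) still parses,
// and an old reader simply ignores the extra keys a new writer emits.
#[derive(Serialize, Deserialize, Debug, Default)]
struct IndexLayerMetadata {
    #[serde(skip_serializing_if = "Option::is_none")]
    file_size: Option<u64>,
    // A content hash could be added later the same way, without a migration.
    #[serde(skip_serializing_if = "Option::is_none")]
    sha256: Option<String>,
}

#[derive(Serialize, Deserialize, Debug)]
struct IndexPart {
    timeline_layers: Vec<String>,
    // Absent in old files: defaults to an empty map instead of failing.
    #[serde(default)]
    layer_metadata: HashMap<String, IndexLayerMetadata>,
}

fn main() -> serde_json::Result<()> {
    // An "old" index_part.json without any layer metadata still deserializes.
    let old = r#"{ "timeline_layers": ["000000016B59D8-000000016B5A51"] }"#;
    let part: IndexPart = serde_json::from_str(old)?;
    println!("{:?}", part.layer_metadata);
    Ok(())
}
```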
Lassi Pölönen
5d6553d41d Fix pageserver configuration generation bug (#2584)
* We had an issue with the `lineinfile` usage for the pageserver configuration
file: if the S3-bucket-related values changed, it would have produced
duplicate keys, resulting in invalid TOML.

So to fix the issue, keep the configuration in a structured format
(YAML in this case) so that we can always generate syntactically
correct TOML.

Inventories are converted to YAML just so that the configuration is easier
to maintain there. An alternative would have been separate variable files.

* Keep the ansible collections dir, but locally installed collections
should not be tracked.
2022-10-16 11:37:10 +00:00
Kirill Bulatov
f03b7c3458 Bump regular dependencies (#2618)
* etcd-client is not updated, since we plan to replace it with another client and the new version fails with a missing prost library error
* clap has released another major update that requires changing every CLI declaration again; that deserves a separate PR
2022-10-15 01:55:31 +03:00
Heikki Linnakangas
9c24de254f Add description and license fields to OpenAPI spec.
These were added earlier to the control plane's copy of this file.
This is the master version of this file, so let's keep it in sync.
2022-10-14 18:37:58 +03:00
Heikki Linnakangas
538876650a Merge 'local' and 'remote' parts of TimelineInfo into one struct.
The 'local' part was always filled in, so that was easy to merge into
the TimelineInfo itself. 'remote' only contained two fields,
'remote_consistent_lsn' and 'awaits_download'. I made
'remote_consistent_lsn' an optional field, and 'awaits_download' is now
false if the timeline is not present remotely.

However, I kept stub versions of the 'local' and 'remote' structs for
backwards-compatibility, with a few fields that are actively used by
the control plane. They just duplicate the fields from TimelineInfo
now. They can be removed later, once the control plane has been
updated to use the new fields.
2022-10-14 18:37:14 +03:00
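A minimal sketch of the resulting response shape, using made-up field types and values rather than the real pageserver API structs: the merged info carries an optional remote_consistent_lsn, and the old 'local'/'remote' sub-objects survive only as duplicating stubs for the control plane:

```rust
use serde::Serialize;

#[derive(Serialize, Clone)]
struct LocalTimelineInfoStub {
    last_record_lsn: String,
}

#[derive(Serialize, Clone)]
struct RemoteTimelineInfoStub {
    remote_consistent_lsn: Option<String>,
    awaits_download: bool,
}

#[derive(Serialize)]
struct TimelineInfo {
    timeline_id: String,
    last_record_lsn: String,
    // None if the timeline has never been uploaded to remote storage.
    remote_consistent_lsn: Option<String>,
    // false if the timeline is not present remotely.
    awaits_download: bool,
    // Deprecated duplicates, kept only until the control plane is updated.
    local: LocalTimelineInfoStub,
    remote: RemoteTimelineInfoStub,
}

fn main() {
    let info = TimelineInfo {
        timeline_id: "de200bd42b49cc1814412c7e592dd6e9".to_string(),
        last_record_lsn: "0/169C3C8".to_string(),
        remote_consistent_lsn: None,
        awaits_download: false,
        local: LocalTimelineInfoStub { last_record_lsn: "0/169C3C8".to_string() },
        remote: RemoteTimelineInfoStub { remote_consistent_lsn: None, awaits_download: false },
    };
    println!("{}", serde_json::to_string_pretty(&info).unwrap());
}
```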
Heikki Linnakangas
500239176c Make TimelineInfo.local field mandatory.
It was only None when you queried the status of a timeline with the
'timeline_detail' mgmt API call while it was still being downloaded. You
can check for that status with the 'tenant_status' API call instead,
by checking the has_in_progress_downloads field.

Another case was when an error happened while trying to get the current
logical size in a 'timeline_detail' request. It might make sense to
tolerate such errors and leave the fields we cannot fill in as empty,
None, 0 or similar, but it doesn't make sense to me to leave the whole
'local' struct empty in that case.
2022-10-14 18:37:14 +03:00
Anastasia Lubennikova
ee64a6b80b Fix CI: push versioned compute images to production ECR 2022-10-14 18:12:50 +03:00
Anastasia Lubennikova
a13b486943 Bump vendor/postgres-v15. Rebase to 15.0 2022-10-14 18:12:50 +03:00
Arseny Sher
9fe4548e13 Reimplement explicit timeline creation on safekeepers.
With the ability to pass commit_lsn. This allows performing project WAL recovery
through a different (from the original) set of safekeepers (or under a different
ttid) by
1) moving WAL files to s3 under the proper ttid;
2) explicitly creating the timeline on safekeepers, setting commit_lsn to the
latest point;
3) putting the latest .partial file into the timeline directory on safekeepers, if
desired.

Extend test_s3_wal_replay to exercise this behaviour.

Also extends timeline_status endpoint to return postgres information.
2022-10-13 21:43:10 +04:00
Heikki Linnakangas
14c623b254 Make it possible to build with old cargo version.
I'm using the Rust compiler and cargo versions from Debian packages,
but the latest available cargo Debian package is quite old, version
1.57. The 'named-profiles' feature was not stabilized at that
version yet, so ever since commit a463749f5, I've had to manually add
this line to the Cargo.toml file to compile. I've been wishing that
someone would update the cargo Debian package, but it doesn't seem to
be happening any time soon.

This doesn't seem to bother anyone else but me, but it shouldn't hurt
anyone else either. If there was a good reason, I could install a
newer cargo version with 'rustup', but if all we need is this one line
in Cargo.toml, I'd prefer to continue using the Debian packages.
2022-10-13 15:17:00 +03:00
Alexander Bayandin
ebf54b0de0 Nightly Benchmarks: Add 50 GB projects (#2612) 2022-10-13 10:00:29 +01:00
Andrés
09dda35dac Return broken tenants due to non existing timelines dir (#2552) (#2575)
Co-authored-by: andres <andres.rodriguez@outlook.es>
2022-10-12 22:28:39 +03:00
Dmitry Ivanov
6ace79345d [proxy] Add more context to console requests logging (#2583) 2022-10-12 21:00:44 +03:00
danieltprice
771e61425e Update release-pr.md (#2600)
Update the Release Notes PR example that is referenced from the checklist. The Release Notes file structure changed recently.
2022-10-12 08:38:28 -03:00
Alexander Bayandin
93775f6ca7 GitHub Actions: replace deprecated set-output with GITHUB_OUTPUT (#2608) 2022-10-12 10:22:24 +01:00
Arseny Sher
6d0dacc4ce Recreate timeline on pageserver in s3_wal_replay test.
That's closer to real usage than switching to a brand new pageserver.
2022-10-12 11:46:21 +04:00
Heikki Linnakangas
e5e40a31f4 Clean up terms "delete timeline" and "detach tenant".
You cannot attach/detach an individual timeline; attach/detach always
applies to the whole tenant. However, you can *delete* a single timeline
from a tenant. Fix some comments and error messages that confused these
two operations.
2022-10-11 17:47:41 +03:00
Heikki Linnakangas
676c63c329 Improve comments. 2022-10-11 17:47:41 +03:00
Heikki Linnakangas
47366522a8 Make the return type of 'list_timelines' simpler.
It's enough to return just the Timeline references. You can get the
timeline's ID easily from Timeline.
2022-10-11 17:47:41 +03:00
Heikki Linnakangas
db26bc49cc Remove obsolete FIXME comment.
Commit c634cb1d36 removed the trait and changed the function to return
a &TimelineWriter, as the FIXME said we should do, but forgot to remove
the FIXME.
2022-10-11 17:47:41 +03:00
Lassi Pölönen
e520293090 Add build info metric to pageserver, safekeeper and proxy (#2596)
* Test that we emit a build info metric for pageserver, safekeeper and proxy with a non-zero-length revision label

* Emit libmetrics_build_info on startup of pageserver, safekeeper and
proxy with the label "revision", which carries the git revision.
2022-10-11 09:54:32 +03:00
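For illustration, a build-info style metric is typically a gauge pinned at 1 whose only job is to carry the revision as a label. The sketch below uses the prometheus crate; the metric name follows the commit message, while the way the revision reaches the binary (here an optional GIT_VERSION value at compile time) is an assumption, not the actual libmetrics wiring:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_gauge_vec, IntGaugeVec};

// A gauge fixed at 1 whose only purpose is to carry the git revision label.
static BUILD_INFO: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "libmetrics_build_info",
        "Build and version information",
        &["revision"]
    )
    .expect("failed to register build info metric")
});

fn main() {
    // Assume the revision is baked in at compile time, e.g. via the
    // --build-arg GIT_VERSION seen in the Docker build commands below.
    let revision = option_env!("GIT_VERSION").unwrap_or("unknown");
    BUILD_INFO.with_label_values(&[revision]).set(1);

    // A scrape of the default registry now includes the revision label.
    for family in prometheus::gather() {
        println!("{}", family.get_name());
    }
}
```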
Sergey Melnikov
241e549757 Switch neon-stress etcd to dedicated instance (#2602) 2022-10-10 22:07:19 +00:00
Sergey Melnikov
34bea270f0 Fix POSTGRES_DISTRIB_DIR for benchmarks on ec2 runner (#2594) 2022-10-10 09:12:50 +00:00
Kirill Bulatov
13f0e7a5b4 Deploy pageserver_binutils to the envs 2022-10-09 08:21:11 +03:00
Kirill Bulatov
3e35f10adc Add a script to reformat the project 2022-10-09 08:21:11 +03:00
Kirill Bulatov
3be3bb7730 Be more verbose with initdb for pageserver timeline creation 2022-10-09 08:21:11 +03:00
Kirill Bulatov
01d2c52c82 Tidy up feature reporting 2022-10-09 08:21:11 +03:00
Kirill Bulatov
9f79e7edea Merge pageserver helper binaries and provide it for deployment (#2590) 2022-10-08 12:42:17 +00:00
Heikki Linnakangas
a22165d41e Add tests for comparing root and child branch performance.
Author: Thang Pham <thang@neon.tech>
2022-10-08 10:07:33 +03:00
Arseny Sher
725be60bb7 Storage messaging rfc 2. 2022-10-07 21:22:17 +04:00
Dmitry Ivanov
e516c376d6 [proxy] Improve logging (#2554)
* [proxy] Use `tracing::*` instead of `println!` for logging

* Fix a minor misnomer

* Log more stuff
2022-10-07 14:34:57 +03:00
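For context, the `println!` to `tracing` switch in a nutshell (a generic example, not the proxy's code): a formatting subscriber is installed once, and log lines become structured events with fields instead of ad-hoc formatted strings:

```rust
fn main() {
    // Install a stdout subscriber once at startup.
    tracing_subscriber::fmt().init();

    // Hypothetical values, purely for illustration.
    let session_id = "8b2a1f";
    tracing::info!(session_id, "accepted client connection");
    tracing::warn!(session_id, error = "timeout", "console request failed");
}
```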
Kirill Bulatov
8e51c27e1a Restore artifact versions (#2578)
Context: https://github.com/neondatabase/neon/pull/2128/files#r989489965

Co-authored-by: Rory de Zoete <rory@neon.tech>
2022-10-07 10:58:31 +00:00
Heikki Linnakangas
9e1eb69d55 Increase default compaction_period setting to 20 s.
The previous default of 1 s caused excessive CPU usage when there were
a lot of projects. Polling every timeline once a second was too aggressive
so let's reduce it.

Fixes https://github.com/neondatabase/neon/issues/2542, but we
probably also want to do something so that we don't poll timelines
that have received no new WAL or layers since the last check.
2022-10-07 13:55:19 +03:00
Arthur Petukhovsky
687ba81366 Display sync safekeepers output in compute_ctl (#2571)
Pipe postgres output to compute_ctl stdout and create a test to check that compute_ctl works and prints postgres logs.
2022-10-06 13:53:52 +00:00
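A rough illustration of the piping part (paths and arguments are placeholders, and this is not compute_ctl's actual code): the child's stderr is captured and each line is forwarded to the parent's stdout, so postgres logs end up in compute_ctl's output:

```rust
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};

fn main() -> std::io::Result<()> {
    // Hypothetical postgres invocation; the data directory is a placeholder.
    let mut child = Command::new("postgres")
        .args(["-D", "/var/lib/postgresql/data"])
        .stderr(Stdio::piped())
        .spawn()?;

    // Forward every postgres log line to our own stdout.
    if let Some(stderr) = child.stderr.take() {
        for line in BufReader::new(stderr).lines() {
            println!("postgres: {}", line?);
        }
    }

    let status = child.wait()?;
    println!("postgres exited with {status}");
    Ok(())
}
```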
Andrés
47bae68a2e Make get_lsn_by_timestamp available in mgmt API (#2536) (#2560)
Co-authored-by: andres <andres.rodriguez@outlook.es>
2022-10-06 12:42:50 +03:00
Joonas Koivunen
e8b195acb7 fix: apply notify workaround on m1 mac docker (#2564)
workaround as discussed in the notify repository.
2022-10-06 11:13:40 +03:00
Anastasia Lubennikova
254cb7dc4f Update CI script to push compute-node-v15 to dockerhub 2022-10-06 10:50:08 +03:00
Anastasia Lubennikova
ed85d97f17 bump vendor/postgres-v15. Rebase it to Stamp 15rc2 2022-10-06 10:50:08 +03:00
Anastasia Lubennikova
4a216c5f7f Use PostGIS 3.3.1 that is compatible with pg 15 2022-10-06 10:50:08 +03:00
Anastasia Lubennikova
c5a428a61a Update Dockerfile.compute-node-v15 to match v14 version.
Fix build script to promote the image for v15 to neon dockerhub
2022-10-06 10:50:08 +03:00
Konstantin Knizhnik
ff8c481777 Normalize last_record LSN in wal receiver (#2529)
* Add test for branching on page boundary

* Normalize start recovery point

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>

Co-authored-by:  Thang Pham <thang@neon.tech>
2022-10-06 09:01:56 +03:00
Arthur Petukhovsky
f25dd75be9 Fix deadlock in safekeeper metrics (#2566)
We had a problem where almost all of the threads were waiting on a futex syscall. More specifically:
- `/metrics` handler was inside `TimelineCollector::collect()`, waiting on a mutex for a single Timeline
- This exact timeline was inside `control_file::FileStorage::persist()`, waiting on a mutex for Lazy initialization of `PERSIST_CONTROL_FILE_SECONDS`
- `PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram>` was blocked on `prometheus::register`
- `prometheus::register` calls `DEFAULT_REGISTRY.write().register()` to take a write lock on Registry and add a new metric
- `DEFAULT_REGISTRY` lock was already taken inside `DEFAULT_REGISTRY.gather()`, which was called by `/metrics` handler to collect all metrics

This commit creates another Registry with a separate lock, to avoid deadlock in a case where `TimelineCollector` triggers registration of new metrics inside default registry.
2022-10-06 01:07:02 +03:00
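A minimal sketch of the fix's shape (not the safekeeper's actual code): metrics that can be registered lazily while a scrape is running go into their own prometheus::Registry, so their first-use registration never contends for the default registry's lock that gather() holds. The metric name here is illustrative:

```rust
use once_cell::sync::Lazy;
use prometheus::{Histogram, HistogramOpts, Registry};

// Separate registry with its own lock for lazily-registered timeline metrics.
static TIMELINE_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);

static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    let histogram = Histogram::with_opts(HistogramOpts::new(
        "persist_control_file_seconds",
        "Time spent persisting the control file",
    ))
    .expect("failed to create histogram");
    // Registration takes TIMELINE_REGISTRY's lock, not the default registry's,
    // so it cannot deadlock against a concurrent default-registry gather().
    TIMELINE_REGISTRY
        .register(Box::new(histogram.clone()))
        .expect("failed to register histogram");
    histogram
});

fn main() {
    // First access registers the metric against the separate registry.
    PERSIST_CONTROL_FILE_SECONDS.observe(0.01);

    // The /metrics handler would gather from both this registry and the
    // default one and concatenate the results.
    let families = TIMELINE_REGISTRY.gather();
    println!("{} metric families", families.len());
}
```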
Sergey Melnikov
b99bed510d Move proxies to neon-proxy namespace (#2555) 2022-10-05 16:14:09 +03:00
sharnoff
580584c8fc Remove control_plane deps on pageserver/safekeeper (#2513)
Creates new `pageserver_api` and `safekeeper_api` crates to serve as the
shared dependencies. Should reduce both recompile times and cold compile
times.

Decreases the size of the optimized `neon_local` binary: 380M -> 179M.
No significant changes for anything else (mostly as expected).
2022-10-04 11:14:45 -07:00
Kirill Bulatov
d823e84ed5 Allow attaching tenants with zero timelines 2022-10-04 18:13:51 +03:00
Kirill Bulatov
231dfbaed6 Do not remove empty timelines/ directory for tenants 2022-10-04 18:13:51 +03:00
Dmitry Rodionov
5cf53786f9 Improve pytest ergonomics
1. Disable perf tests by default
2. Add instructions for running tests in parallel
2022-10-04 14:53:01 +03:00
Heikki Linnakangas
9b9bbad462 Use 'notify' crate to wait for PostgreSQL startup.
Compute node startup time is very important. After launching
PostgreSQL, use 'notify' to be notified immediately when it has
updated the PID file, instead of polling. The polling loop had a 100 ms
interval, so this shaves up to 100 ms off the startup time.
2022-10-04 13:00:15 +03:00
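As a rough sketch of the approach (not compute_ctl's actual code), the notify crate can watch the data directory and wake the waiter as soon as postmaster.pid changes; the path and the readiness condition below are simplified placeholders:

```rust
use std::path::Path;
use std::sync::mpsc;
use std::time::Duration;

use notify::{RecursiveMode, Watcher};

fn wait_for_postmaster_pid(pgdata: &Path) -> notify::Result<()> {
    let (tx, rx) = mpsc::channel();
    // Forward every filesystem event to the channel; errors are ignored here.
    let mut watcher = notify::recommended_watcher(move |res: notify::Result<notify::Event>| {
        if let Ok(event) = res {
            let _ = tx.send(event);
        }
    })?;
    watcher.watch(pgdata, RecursiveMode::NonRecursive)?;

    let pid_file = pgdata.join("postmaster.pid");
    while !pid_file.exists() {
        // Block until something in the data directory changes (with a timeout
        // as a safety net), then re-check the PID file.
        let _ = rx.recv_timeout(Duration::from_secs(1));
    }
    Ok(())
}

fn main() -> notify::Result<()> {
    // Placeholder data directory.
    wait_for_postmaster_pid(Path::new("/var/lib/postgresql/data"))
}
```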
Heikki Linnakangas
537b2c1ae6 Remove unnecessary check for open PostgreSQL TCP port.
The loop checked if the TCP port is open for connections, by trying to
connect to it. That seems unnecessary. By the time the postmaster.pid
file says that it's ready, the port should be open. Remove that check.
2022-10-04 12:09:13 +03:00
Joonas Koivunen
31123d1fa8 Silence clippies, minor doc fix (#2543)
* doc: remove stray backtick

* chore: clippy::let_unit_value

* chore: silence useless_transmute, duplicate_mod

* chore: remove allowing deref_nullptr

not needed since bindgen 0.60.0.

* chore: remove repeated allowed lints

they are already allowed from the crate root.
2022-10-03 17:44:17 +03:00
Kirill Bulatov
4f2ac51bdd Bump rustc to 1.61 2022-10-03 16:36:03 +03:00
Kirill Bulatov
7b2f9dc908 Reuse existing tenants during attach (#2540) 2022-10-03 13:33:55 +03:00
Arthur Petukhovsky
dabb6d2675 Fix log level for sk startup logs (#2526) 2022-09-27 10:36:17 +00:00
Arthur Petukhovsky
fc7087b16f Add metric for loaded safekeeper timelines (#2509) 2022-09-27 11:57:59 +03:00
Vadim Kharitonov
2233ca2a39 seqwait.rs unit tests don't check return value 2022-09-27 11:47:59 +03:00
Dmitry Rodionov
fb68d01449 Preserve task result in TaskHandle by keeping join handle around (#2521)
* Preserve task result in TaskHandle by keeping join handle around

The solution is not great, but it should help to debug the staging issue.
I tried to do it in the least destructive way. TaskHandle is used in only
one place, so it is ok to use something less generic unless we want
to extend its usage across the codebase. In its current form,
for its single usage place, it looks too abstract.

Some problems around this code:
1. Task can drop the event sender and continue running
2. Task cannot be joined several times (probably not needed,
    but still, can be surprising)
3. Had to split the task event into two types because anyhow::Error
    does not implement Clone. So TaskContinueEvent derives Clone
    but the usual task event does not. The Clone requirement appears
    because we clone the current value in next_task_event.
    Taking it by reference is complicated.
4. The split between Init and Started is artificial and comes from
    watch::channel's requirement to have some initial value.

    To summarize 3 and 4: it may be a better idea to use
    an RwLock or a bounded channel instead.
2022-09-26 20:57:02 +00:00
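To make the discussion above more concrete, here is a small hypothetical sketch of a TaskHandle that keeps the tokio JoinHandle (so the task's final result can still be observed) and publishes coarse progress through a watch channel, including the artificial Init state that watch::channel's required initial value forces. This is not the walreceiver's real type, just the shape being described:

```rust
use tokio::sync::watch;
use tokio::task::JoinHandle;

// Coarse task states; Init exists only because watch::channel needs an
// initial value before the task has actually started.
#[derive(Clone, Debug)]
enum TaskEvent {
    Init,
    Started,
}

struct TaskHandle {
    events: watch::Receiver<TaskEvent>,
    join_handle: JoinHandle<anyhow::Result<()>>,
}

impl TaskHandle {
    fn spawn<Fut>(task: impl FnOnce(watch::Sender<TaskEvent>) -> Fut) -> Self
    where
        Fut: std::future::Future<Output = anyhow::Result<()>> + Send + 'static,
    {
        let (sender, events) = watch::channel(TaskEvent::Init);
        let join_handle = tokio::spawn(task(sender));
        TaskHandle { events, join_handle }
    }

    // Waits for the next published state and returns a clone of it (cloned
    // because handing it out by reference is inconvenient, as noted above).
    async fn next_task_event(&mut self) -> TaskEvent {
        let _ = self.events.changed().await;
        self.events.borrow().clone()
    }

    // Consumes the handle and yields the task's result, which would otherwise
    // be lost when the task finishes.
    async fn join(self) -> anyhow::Result<()> {
        self.join_handle.await?
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut handle = TaskHandle::spawn(|events| async move {
        let _ = events.send(TaskEvent::Started);
        Ok(())
    });
    println!("state: {:?}", handle.next_task_event().await);
    handle.join().await
}
```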
Arthur Petukhovsky
d15116f2cc Update pg_version for old timelines 2022-09-26 19:57:03 +03:00
Stas Kelvich
df45c0d0e5 Disable plv8 again 2022-09-26 12:22:46 +03:00
Stas Kelvich
367cc01290 Fix deploy paths 2022-09-26 10:08:01 +03:00
Anastasia Lubennikova
1165686201 fix deploy lib paths for postgres 2022-09-23 22:23:43 +03:00
Anastasia Lubennikova
093264a695 Fix deploy bin and lib paths for postgres 2022-09-23 22:23:43 +03:00
sharnoff
805bb198c2 Miscellaneous small fixups (#2503)
Changes are:

 * Correct typo "firts" -> "first"
 * Change <empty panic with comment explaining> to <panic with message
   taken from the comment>
 * Fix weird indentation that rustfmt was failing to handle
 * Use existing `anyhow::{anyhow,bail}!` as `{anyhow,bail}!` if it's
   already in scope
 * Spell `Result<T, anyhow::Error>` as `anyhow::Result<T>`
   * In general, closer to matching the rest of the codebase
 * Change usages of `hash_map::Entry` to `Entry` when it's already in
   scope
   * A quick search shows our style on this one varies across the files
     it's used in
2022-09-23 11:49:28 -07:00
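Two of the conventions listed above, shown in a tiny standalone example (a hypothetical function, not code from the PR): the `anyhow::Result<T>` spelling and using an imported `Entry` directly:

```rust
use std::collections::hash_map::Entry;
use std::collections::HashMap;

// Increment a counter for `key`, inserting it on first use.
fn upsert(map: &mut HashMap<String, u32>, key: &str) -> anyhow::Result<u32> {
    let value = match map.entry(key.to_string()) {
        Entry::Occupied(mut e) => {
            *e.get_mut() += 1;
            *e.get()
        }
        Entry::Vacant(e) => *e.insert(1),
    };
    Ok(value)
}

fn main() -> anyhow::Result<()> {
    let mut map = HashMap::new();
    println!("{}", upsert(&mut map, "layer")?);
    Ok(())
}
```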
Stas Kelvich
5ccd54c699 Add support for h3-pg and re-enable plv8 2022-09-23 19:06:57 +03:00
Dmitry Ivanov
1dffba9de6 Write more tests for the proxy... (#1918)
And change a few more things in the process.
2022-09-23 18:30:44 +03:00
Alexander Bayandin
ebab89ebd2 test_runner: pass password to pgbench via PGPASSWORD (#2468) 2022-09-23 12:51:33 +00:00
MMeent
bc3ba23e0a Fix extreme metrics bloat in storage sync (#2506)
* Fix extreme metrics bloat in storage sync

From 78 metrics per (timeline, tenant) pair down to (max) 10 metrics per
(timeline, tenant) pair, plus another 117 metrics in a global histogram that
replaces the previous per-timeline histogram.

* Drop image sync operation metric series when dropping TimelineMetrics.
2022-09-23 14:35:36 +02:00
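The second bullet is what keeps the series count bounded over time. A hypothetical sketch of the pattern with the prometheus crate (names are illustrative, not the pageserver's real metrics): the per-(tenant, timeline) label values are removed from the shared metric vector when the metrics object is dropped:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

static IMAGE_SYNC_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "image_sync_count",
        "Number of image sync operations",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to register metric")
});

struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
}

impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        // Remove this timeline's series so stale label combinations
        // don't accumulate in the exported metrics.
        let _ = IMAGE_SYNC_COUNT
            .remove_label_values(&[self.tenant_id.as_str(), self.timeline_id.as_str()]);
    }
}

fn main() {
    let metrics = TimelineMetrics {
        tenant_id: "tenant-a".to_string(),
        timeline_id: "timeline-1".to_string(),
    };
    IMAGE_SYNC_COUNT
        .with_label_values(&[metrics.tenant_id.as_str(), metrics.timeline_id.as_str()])
        .inc();
    drop(metrics); // the series is removed together with the metrics object
}
```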
Alexander Bayandin
3e65209a06 Nightly Benchmarks: use Postgres binaries from artifacts (#2501) 2022-09-23 12:50:36 +01:00
Dmitry Rodionov
eb0c6bcf1a reenable storage deployments 2022-09-23 14:12:39 +03:00
Rory de Zoete
52819898e4 Extend image push step with production ECR (#2465)
* Extend image push step with production ECR

* Put copy step before auth change

* Use correct name

* Only push on main

* Fix typo
2022-09-23 11:25:29 +02:00
Sergey Melnikov
b0377f750a Add staging-test region to normal staging rollouts (#2500) 2022-09-23 11:25:26 +03:00
Dmitry Rodionov
43560506c0 remove duplicate walreceiver connection span 2022-09-23 00:27:24 +03:00
Anastasia Lubennikova
c81ede8644 Hotfix for safekeeper timelines with unknown pg_version.
Assume DEFAULT_PG_VERSION = 14
2022-09-22 21:17:36 +03:00
Anastasia Lubennikova
eb9200abc8 Use version-specific path in pytest CI script 2022-09-22 18:12:41 +03:00
Anastasia Lubennikova
7c1695e87d fix psql path in export_import_between_pageservers script 2022-09-22 18:12:41 +03:00
Anastasia Lubennikova
8b42c184e7 Update LD_LIBRARY_PATH in deploy scripts 2022-09-22 18:12:41 +03:00
Anastasia Lubennikova
7138db9279 Fix paths to postgres binaries in the deploy script 2022-09-22 18:12:41 +03:00
151 changed files with 11301 additions and 2717 deletions

View File

@@ -10,7 +10,7 @@
<!-- List everything that should be done **before** release, any issues / setting changes / etc -->
### Checklist after release
- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/120/files))
- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files))
- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)

View File

@@ -47,7 +47,7 @@ runs:
else
key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-")
fi
echo "::set-output name=KEY::${key}"
echo "KEY=${key}" >> $GITHUB_OUTPUT
- uses: actions/setup-java@v3
if: ${{ inputs.action == 'generate' }}
@@ -186,7 +186,7 @@ runs:
aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
echo "::set-output name=report-url::${REPORT_URL}"
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Release Allure lock
if: ${{ inputs.action == 'generate' && always() }}

View File

@@ -12,6 +12,9 @@ inputs:
description: "Allow to skip if file doesn't exist, fail otherwise"
default: false
required: false
prefix:
description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
required: false
runs:
using: "composite"
@@ -23,23 +26,23 @@ runs:
TARGET: ${{ inputs.path }}
ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
run: |
BUCKET=neon-github-public-dev
PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=$(basename $ARCHIVE)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
echo '::set-output name=SKIPPED::true'
echo 'SKIPPED=true' >> $GITHUB_OUTPUT
exit 0
else
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist"
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
fi
echo '::set-output name=SKIPPED::false'
echo 'SKIPPED=false' >> $GITHUB_OUTPUT
mkdir -p $(dirname $ARCHIVE)
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE}

View File

@@ -41,8 +41,8 @@ runs:
;;
esac
echo "::set-output name=api_host::${API_HOST}"
echo "::set-output name=region_id::${REGION_ID}"
echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
echo "region_id=${REGION_ID}" >> $GITHUB_OUTPUT
env:
ENVIRONMENT: ${{ inputs.environment }}
REGION_ID: ${{ inputs.region_id }}
@@ -72,10 +72,10 @@ runs:
dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main
echo "::add-mask::${dsn}"
echo "::set-output name=dsn::${dsn}"
echo "dsn=${dsn}" >> $GITHUB_OUTPUT
project_id=$(echo $project | jq --raw-output '.id')
echo "::set-output name=project_id::${project_id}"
echo "project_id=${project_id}" >> $GITHUB_OUTPUT
env:
API_KEY: ${{ inputs.api_key }}
API_HOST: ${{ steps.parse-input.outputs.api_host }}

View File

@@ -32,7 +32,7 @@ runs:
;;
esac
echo "::set-output name=api_host::${API_HOST}"
echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
env:
ENVIRONMENT: ${{ inputs.environment }}

View File

@@ -127,7 +127,7 @@ runs:
# Wake up the cluster if we use remote neon instance
if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
${POSTGRES_DISTRIB_DIR}/v14/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
fi
# Run the tests.

View File

@@ -7,6 +7,9 @@ inputs:
path:
description: "A directory or file to upload"
required: true
prefix:
description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
required: false
runs:
using: "composite"
@@ -42,14 +45,14 @@ runs:
env:
SOURCE: ${{ inputs.path }}
ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
run: |
BUCKET=neon-github-public-dev
PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=$(basename $ARCHIVE)
FILESIZE=$(du -sh ${ARCHIVE} | cut -f1)
time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}
time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${FILENAME}
# Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary
echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY}
echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY}

View File

@@ -2,3 +2,6 @@ zenith_install.tar.gz
.zenith_current_version
neon_install.tar.gz
.neon_current_version
collections/*
!collections/.keep

View File

@@ -3,6 +3,7 @@
localhost_warning = False
host_key_checking = False
timeout = 30
collections_paths = ./collections
[ssh_connection]
ssh_args = -F ./ansible.ssh.cfg

.github/ansible/collections/.keep vendored Normal file
View File

View File

@@ -1,7 +1,7 @@
- name: Upload Neon binaries
hosts: storage
gather_facts: False
remote_user: admin
remote_user: "{{ remote_user }}"
tasks:
@@ -14,7 +14,8 @@
- safekeeper
- name: inform about versions
debug: msg="Version to deploy - {{ current_version }}"
debug:
msg: "Version to deploy - {{ current_version }}"
tags:
- pageserver
- safekeeper
@@ -35,7 +36,7 @@
- name: Deploy pageserver
hosts: pageservers
gather_facts: False
remote_user: admin
remote_user: "{{ remote_user }}"
tasks:
@@ -58,23 +59,37 @@
creates: "/storage/pageserver/data/tenants"
environment:
NEON_REPO_DIR: "/storage/pageserver/data"
LD_LIBRARY_PATH: "/usr/local/lib"
LD_LIBRARY_PATH: "/usr/local/v14/lib"
become: true
tags:
- pageserver
# - name: update remote storage (s3) config
# lineinfile:
# path: /storage/pageserver/data/pageserver.toml
# line: "{{ item }}"
# loop:
# - "[remote_storage]"
# - "bucket_name = '{{ bucket_name }}'"
# - "bucket_region = '{{ bucket_region }}'"
# - "prefix_in_bucket = '{{ inventory_hostname }}'"
# become: true
# tags:
# - pageserver
- name: read the existing remote pageserver config
ansible.builtin.slurp:
src: /storage/pageserver/data/pageserver.toml
register: _remote_ps_config
tags:
- pageserver
- name: parse the existing pageserver configuration
ansible.builtin.set_fact:
_existing_ps_config: "{{ _remote_ps_config['content'] | b64decode | sivel.toiletwater.from_toml }}"
tags:
- pageserver
- name: construct the final pageserver configuration dict
ansible.builtin.set_fact:
pageserver_config: "{{ pageserver_config_stub | combine({'id': _existing_ps_config.id }) }}"
tags:
- pageserver
- name: template the pageserver config
template:
src: templates/pageserver.toml.j2
dest: /storage/pageserver/data/pageserver.toml
become: true
tags:
- pageserver
- name: upload systemd service definition
ansible.builtin.template:
@@ -87,15 +102,15 @@
tags:
- pageserver
# - name: start systemd service
# ansible.builtin.systemd:
# daemon_reload: yes
# name: pageserver
# enabled: yes
# state: restarted
# become: true
# tags:
# - pageserver
- name: start systemd service
ansible.builtin.systemd:
daemon_reload: yes
name: pageserver
enabled: yes
state: restarted
become: true
tags:
- pageserver
- name: post version to console
when: console_mgmt_base_url is defined
@@ -109,7 +124,7 @@
- name: Deploy safekeeper
hosts: safekeepers
gather_facts: False
remote_user: admin
remote_user: "{{ remote_user }}"
tasks:
@@ -132,7 +147,7 @@
creates: "/storage/safekeeper/data/safekeeper.id"
environment:
NEON_REPO_DIR: "/storage/safekeeper/data"
LD_LIBRARY_PATH: "/usr/local/lib"
LD_LIBRARY_PATH: "/usr/local/v14/lib"
become: true
tags:
- safekeeper

View File

@@ -21,10 +21,15 @@ docker pull --quiet neondatabase/neon:${DOCKER_TAG}
ID=$(docker create neondatabase/neon:${DOCKER_TAG})
docker cp ${ID}:/data/postgres_install.tar.gz .
tar -xzf postgres_install.tar.gz -C neon_install
mkdir neon_install/bin/
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
docker cp ${ID}:/usr/local/bin/pageserver_binutils neon_install/bin/
docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/
docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/
docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/
docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/
docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/
docker rm -vf ${ID}
# store version to file (for ansible playbooks) and create binaries tarball

View File

@@ -1,20 +0,0 @@
[pageservers]
neon-stress-ps-1 console_region_id=1
neon-stress-ps-2 console_region_id=1
[safekeepers]
neon-stress-sk-1 console_region_id=1
neon-stress-sk-2 console_region_id=1
neon-stress-sk-3 console_region_id=1
[storage:children]
pageservers
safekeepers
[storage:vars]
env_name = neon-stress
console_mgmt_base_url = http://neon-stress-console.local
bucket_name = neon-storage-ireland
bucket_region = eu-west-1
etcd_endpoints = etcd-stress.local:2379
safekeeper_enable_s3_offload = false

.github/ansible/neon-stress.hosts.yaml vendored Normal file
View File

@@ -0,0 +1,31 @@
storage:
vars:
bucket_name: neon-storage-ireland
bucket_region: eu-west-1
console_mgmt_base_url: http://neon-stress-console.local
env_name: neon-stress
etcd_endpoints: neon-stress-etcd.local:2379
safekeeper_enable_s3_offload: 'false'
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
hostname_suffix: ".local"
remote_user: admin
children:
pageservers:
hosts:
neon-stress-ps-1:
console_region_id: aws-eu-west-1
neon-stress-ps-2:
console_region_id: aws-eu-west-1
safekeepers:
hosts:
neon-stress-sk-1:
console_region_id: aws-eu-west-1
neon-stress-sk-2:
console_region_id: aws-eu-west-1
neon-stress-sk-3:
console_region_id: aws-eu-west-1

View File

@@ -1,20 +0,0 @@
[pageservers]
#zenith-1-ps-1 console_region_id=1
zenith-1-ps-2 console_region_id=1
zenith-1-ps-3 console_region_id=1
[safekeepers]
zenith-1-sk-1 console_region_id=1
zenith-1-sk-2 console_region_id=1
zenith-1-sk-3 console_region_id=1
[storage:children]
pageservers
safekeepers
[storage:vars]
env_name = prod-1
console_mgmt_base_url = http://console-release.local
bucket_name = zenith-storage-oregon
bucket_region = us-west-2
etcd_endpoints = zenith-1-etcd.local:2379

.github/ansible/production.hosts.yaml vendored Normal file
View File

@@ -0,0 +1,33 @@
---
storage:
vars:
env_name: prod-1
console_mgmt_base_url: http://console-release.local
bucket_name: zenith-storage-oregon
bucket_region: us-west-2
etcd_endpoints: zenith-1-etcd.local:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
hostname_suffix: ".local"
remote_user: admin
children:
pageservers:
hosts:
zenith-1-ps-2:
console_region_id: aws-us-west-2
zenith-1-ps-3:
console_region_id: aws-us-west-2
safekeepers:
hosts:
zenith-1-sk-1:
console_region_id: aws-us-west-2
zenith-1-sk-2:
console_region_id: aws-us-west-2
zenith-1-sk-3:
console_region_id: aws-us-west-2

View File

@@ -12,18 +12,19 @@ cat <<EOF | tee /tmp/payload
"version": 1,
"host": "${HOST}",
"port": 6400,
"region_id": {{ console_region_id }},
"region_id": "{{ console_region_id }}",
"instance_id": "${INSTANCE_ID}",
"http_host": "${HOST}",
"http_port": 9898
"http_port": 9898,
"active": false
}
EOF
# check if pageserver already registered or not
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/pageservers/${INSTANCE_ID} -o /dev/null; then
if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/${INSTANCE_ID} -o /dev/null; then
# not registered, so register it now
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/pageservers -d@/tmp/payload | jq -r '.ID')
ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id')
# init pageserver
sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data

View File

@@ -14,18 +14,18 @@ cat <<EOF | tee /tmp/payload
"host": "${HOST}",
"port": 6500,
"http_port": 7676,
"region_id": {{ console_region_id }},
"region_id": "{{ console_region_id }}",
"instance_id": "${INSTANCE_ID}",
"availability_zone_id": "${AZ_ID}"
"availability_zone_id": "${AZ_ID}",
"active": false
}
EOF
# check if safekeeper already registered or not
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/safekeepers/${INSTANCE_ID} -o /dev/null; then
if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/${INSTANCE_ID} -o /dev/null; then
# not registered, so register it now
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers -d@/tmp/payload | jq -r '.ID')
ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id')
# init safekeeper
sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data
fi

.github/ansible/ssm_config vendored Normal file
View File

@@ -0,0 +1,3 @@
ansible_connection: aws_ssm
ansible_aws_ssm_bucket_name: neon-dev-bucket
ansible_python_interpreter: /usr/bin/python3

View File

@@ -1,21 +0,0 @@
[pageservers]
#zenith-us-stage-ps-1 console_region_id=27
zenith-us-stage-ps-2 console_region_id=27
zenith-us-stage-ps-3 console_region_id=27
zenith-us-stage-ps-4 console_region_id=27
[safekeepers]
zenith-us-stage-sk-4 console_region_id=27
zenith-us-stage-sk-5 console_region_id=27
zenith-us-stage-sk-6 console_region_id=27
[storage:children]
pageservers
safekeepers
[storage:vars]
env_name = us-stage
console_mgmt_base_url = http://console-staging.local
bucket_name = zenith-staging-storage-us-east-1
bucket_region = us-east-1
etcd_endpoints = zenith-us-stage-etcd.local:2379

.github/ansible/staging.hosts.yaml vendored Normal file
View File

@@ -0,0 +1,34 @@
storage:
vars:
bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local
env_name: us-stage
etcd_endpoints: zenith-us-stage-etcd.local:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "{{ inventory_hostname }}"
hostname_suffix: ".local"
remote_user: admin
children:
pageservers:
hosts:
zenith-us-stage-ps-2:
console_region_id: aws-us-east-1
zenith-us-stage-ps-3:
console_region_id: aws-us-east-1
zenith-us-stage-ps-4:
console_region_id: aws-us-east-1
safekeepers:
hosts:
zenith-us-stage-sk-4:
console_region_id: aws-us-east-1
zenith-us-stage-sk-5:
console_region_id: aws-us-east-1
zenith-us-stage-sk-6:
console_region_id: aws-us-east-1

View File

@@ -0,0 +1,32 @@
storage:
vars:
bucket_name: neon-staging-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-staging.local
env_name: us-stage
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
console_region_id: aws-us-east-2
children:
pageservers:
hosts:
pageserver-0.us-east-2.aws.neon.build:
ansible_host: i-0c3e70929edb5d691
safekeepers:
hosts:
safekeeper-0.us-east-2.aws.neon.build:
ansible_host: i-027662bd552bf5db0
safekeeper-1.us-east-2.aws.neon.build:
ansible_host: i-0171efc3604a7b907
safekeeper-2.us-east-2.aws.neon.build:
ansible_host: i-0de0b03a51676a6ce

View File

@@ -1,11 +1,11 @@
[Unit]
Description=Zenith pageserver
Description=Neon pageserver
After=network.target auditd.service
[Service]
Type=simple
User=pageserver
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed

View File

@@ -1,12 +1,12 @@
[Unit]
Description=Zenith safekeeper
Description=Neon safekeeper
After=network.target auditd.service
[Service]
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT

View File

@@ -0,0 +1 @@
{{ pageserver_config | sivel.toiletwater.to_toml }}

View File

@@ -46,7 +46,8 @@ jobs:
runs-on: [self-hosted, zenith-benchmarker]
env:
POSTGRES_DISTRIB_DIR: "/usr/pgsql-14"
POSTGRES_DISTRIB_DIR: /usr/pgsql
DEFAULT_PG_VERSION: 14
steps:
- name: Checkout zenith repo
@@ -71,7 +72,7 @@ jobs:
echo Poetry
poetry --version
echo Pgbench
$POSTGRES_DISTRIB_DIR/bin/pgbench --version
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
- name: Create Neon Project
id: create-neon-project
@@ -137,21 +138,31 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-compare:
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
TEST_PG_BENCH_SCALES_MATRIX: "10gb"
POSTGRES_DISTRIB_DIR: /tmp/pg_install
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
strategy:
fail-fast: false
matrix:
# neon-captest-new: Run pgbench in a freshly created project
# neon-captest-reuse: Same, but reusing existing project
# neon-captest-prefetch: Same, with prefetching enabled (new project)
platform: [ neon-captest-reuse ]
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ]
db_size: [ 10gb ]
include:
- platform: neon-captest-new
db_size: 50gb
- platform: neon-captest-prefetch
db_size: 50gb
- platform: rds-aurora
db_size: 50gb
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: dev
container:
@@ -163,13 +174,20 @@ jobs:
steps:
- uses: actions/checkout@v3
- name: Install Deps
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
sudo apt -y update
sudo apt install -y postgresql-14
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Create Neon Project
if: matrix.platform != 'neon-captest-reuse'
if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
@@ -195,18 +213,9 @@ jobs:
;;
esac
echo "::set-output name=connstr::${CONNSTR}"
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
psql ${CONNSTR} -c "SELECT version();"
env:
PLATFORM: ${{ matrix.platform }}
- name: Hack psql path
run: |
mkdir /tmp/pg_install
ln -s /usr/ /tmp/pg_install/v14
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
@@ -225,7 +234,6 @@ jobs:
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
env:
PLATFORM: ${{ matrix.platform }}
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -239,7 +247,6 @@ jobs:
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
env:
PLATFORM: ${{ matrix.platform }}
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -253,7 +260,6 @@ jobs:
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
env:
PLATFORM: ${{ matrix.platform }}
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -266,7 +272,7 @@ jobs:
build_type: ${{ env.BUILD_TYPE }}
- name: Delete Neon Project
if: ${{ matrix.platform != 'neon-captest-reuse' && always() }}
if: ${{ steps.create-neon-project.outputs.project_id && always() }}
uses: ./.github/actions/neon-project-delete
with:
environment: dev

View File

@@ -35,12 +35,12 @@ jobs:
echo ref:$GITHUB_REF_NAME
echo rev:$(git rev-list --count HEAD)
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::$(git rev-list --count HEAD)"
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
echo "::set-output name=tag::$GITHUB_RUN_ID"
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
fi
shell: bash
id: build-tag
@@ -78,12 +78,12 @@ jobs:
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14)
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15)
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}
# Set some environment variables used by all the steps.
@@ -127,8 +127,8 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
- name: Cache postgres v14 build
id: cache_pg_14
@@ -268,6 +268,32 @@ jobs:
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
upload-latest-artifacts:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests ]
if: github.ref_name == 'main'
steps:
- name: Copy Neon artifact to the latest directory
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/${{ github.run_id }}
run: |
for build_type in debug release; do
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
done
benchmarks:
runs-on: dev
container:
@@ -335,9 +361,6 @@ jobs:
curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
./scripts/pysync
# Workaround for https://github.com/neondatabase/cloud/issues/2188
psql "$TEST_RESULT_CONNSTR" -c "SELECT 1;" || sleep 10
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
coverage-report:
@@ -366,7 +389,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -471,7 +494,7 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build neon
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
compute-tools-image:
runs-on: dev
@@ -485,7 +508,7 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute tools
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
compute-node-image:
runs-on: dev
@@ -504,7 +527,7 @@ jobs:
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
- name: Kaniko build compute node with extensions v14 (compatibility)
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
compute-node-image-v14:
runs-on: dev
@@ -520,7 +543,7 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions v14
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
compute-node-image-v15:
@@ -537,11 +560,11 @@ jobs:
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node with extensions v15
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
promote-images:
runs-on: dev
needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-tools-image ]
needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
@@ -550,7 +573,7 @@ jobs:
# compute-node uses postgres 14, which is default now
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
name: [ neon, compute-node, compute-node-v14, compute-tools ]
name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
steps:
- name: Promote image to latest
@@ -585,10 +608,24 @@ jobs:
- name: Pull compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14
- name: Pull compute node v15 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest compute-node-v15
- name: Pull rust image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
- name: Configure docker login
- name: Push images to production ECR
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
- name: Configure Docker Hub login
run: |
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
echo "" > /github/home/.docker/config.json
@@ -606,10 +643,13 @@ jobs:
- name: Push compute node v14 image to Docker Hub
run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
- name: Push compute node v15 image to Docker Hub
run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
- name: Push rust image to Docker Hub
run: crane push rust neondatabase/rust:pinned
- name: Add latest tag to images
- name: Add latest tag to images in Docker Hub
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
@@ -618,6 +658,7 @@ jobs:
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
calculate-deploy-targets:
runs-on: [ self-hosted, Linux, k8s-runner ]
@@ -630,12 +671,12 @@ jobs:
- id: set-matrix
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
echo "::set-output name=include::[$STAGING, $NEON_STRESS]"
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
echo "::set-output name=include::[$PRODUCTION]"
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
@@ -671,7 +712,7 @@ jobs:
- name: Setup ansible
run: |
export PATH="/root/.local/bin:$PATH"
pip install --progress-bar off --user ansible boto3
pip install --progress-bar off --user ansible boto3 toml
- name: Redeploy
run: |
@@ -693,8 +734,47 @@ jobs:
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }}
rm -f neon_install.tar.gz .neon_current_version
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts
deploy-new:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
@@ -736,5 +816,5 @@ jobs:
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

View File

@@ -56,12 +56,12 @@ jobs:
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14)
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15)
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}
- name: Cache postgres v14 build
@@ -106,7 +106,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
- name: Run cargo clippy
run: ./run_clippy.sh

Cargo.lock (generated), 960 lines changed
File diff suppressed because it is too large

View File

@@ -1,3 +1,14 @@
# 'named-profiles' feature was stabilized in cargo 1.57. This line makes the
# build work with older cargo versions.
#
# We have this because as of this writing, the latest cargo Debian package
# that's available is 1.56. (Confusingly, the Debian package version number
# is 0.57, whereas 'cargo --version' says 1.56.)
#
# See https://tracker.debian.org/pkg/cargo for the current status of the
# package. When that gets updated, we can remove this.
cargo-features = ["named-profiles"]
[workspace]
members = [
"compute_tools",

View File

@@ -19,9 +19,8 @@ COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
ENV BUILD_TYPE release
RUN set -e \
&& mold -run make -j $(nproc) -s neon-pg-ext \
&& rm -rf pg_install/v14/build \
&& rm -rf pg_install/v15/build \
&& tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz .
&& rm -rf pg_install/build \
&& tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .
# Build neon binaries
FROM $REPOSITORY/$IMAGE:$TAG AS build
@@ -45,7 +44,7 @@ COPY . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& mold -run cargo build --bin pageserver --bin safekeeper --bin proxy --locked --release \
&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \
&& cachepot -s
# Build final image
@@ -64,9 +63,10 @@ RUN set -e \
&& useradd -d /data neon \
&& chown -R neon:neon /data
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
@@ -86,4 +86,3 @@ VOLUME ["/data"]
USER neon
EXPOSE 6400
EXPOSE 9898
CMD ["/bin/bash"]

View File

@@ -8,9 +8,12 @@ ARG TAG=pinned
# Layer "build-deps"
#
FROM debian:bullseye-slim AS build-deps
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
#
# Layer "pg-build"
@@ -37,7 +40,7 @@ RUN cd postgres && \
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
tar xvzf postgis-3.3.0.tar.gz && \
@@ -59,15 +62,13 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
# Build plv8
#
FROM build-deps AS plv8-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
RUN apt update && \
apt install -y --no-install-recommends -t testing binutils
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
@@ -79,12 +80,46 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
#
# Layer "h3-pg-build"
# Build h3_pg
#
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# packaged cmake is too old
RUN apt update && \
apt install -y --no-install-recommends -t testing cmake
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
cd h3-4.0.1 && \
mkdir build && \
cd build && \
cmake .. -DCMAKE_BUILD_TYPE=Release && \
make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/h3 make install && \
cp -R /h3/usr / && \
rm -rf build
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
tar xvzf h3-pg.tgz && \
cd h3-pg-4.0.1 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
# plv8 still sometimes crashes during the creation
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -132,8 +167,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
chmod 0750 /var/db/postgres/compute && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
# TODO: Check if we can make the extension setup more modular versus a linear build
# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc#
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

View File

@@ -5,7 +5,7 @@
ARG TAG=pinned
# apparently, ARGs don't get replaced in RUN commands in kaniko
# ARG POSTGIS_VERSION=3.3.0
# ARG POSTGIS_VERSION=3.3.1
# ARG PLV8_VERSION=3.1.4
# ARG PG_VERSION=v15
@@ -13,9 +13,12 @@ ARG TAG=pinned
# Layer "build-deps"
#
FROM debian:bullseye-slim AS build-deps
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev
#
# Layer "pg-build"
@@ -42,11 +45,11 @@ RUN cd postgres && \
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
tar xvzf postgis-3.3.0.tar.gz && \
cd postgis-3.3.0 && \
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
tar xvzf postgis-3.3.1.tar.gz && \
cd postgis-3.3.1 && \
./autogen.sh && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
./configure && \
@@ -64,15 +67,13 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
# Build plv8
#
FROM build-deps AS plv8-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5
# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
RUN apt update && \
apt install -y --no-install-recommends -t testing binutils
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
@@ -84,12 +85,46 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
#
# Layer "h3-pg-build"
# Build h3_pg
#
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# packaged cmake is too old
RUN apt update && \
apt install -y --no-install-recommends -t testing cmake
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
cd h3-4.0.1 && \
mkdir build && \
cd build && \
cmake .. -DCMAKE_BUILD_TYPE=Release && \
make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/h3 make install && \
cp -R /h3/usr / && \
rm -rf build
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
tar xvzf h3-pg.tgz && \
cd h3-pg-4.0.1 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control
#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
# plv8 still sometimes crashes during the creation
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -137,8 +172,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
chmod 0750 /var/db/postgres/compute && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
# TODO: Check if we can make the extension setup more modular versus a linear build
# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc#
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

View File

@@ -6,10 +6,12 @@ edition = "2021"
[dependencies]
anyhow = "1.0"
chrono = "0.4"
clap = "3.0"
clap = "4.0"
env_logger = "0.9"
futures = "0.3.13"
hyper = { version = "0.14", features = ["full"] }
log = { version = "0.4", features = ["std", "serde"] }
notify = "5.0.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
regex = "1"
serde = { version = "1.0", features = ["derive"] }

View File

@@ -51,53 +51,19 @@ fn main() -> Result<()> {
// TODO: re-use `utils::logging` later
init_logger(DEFAULT_LOG_LEVEL)?;
// Env variable is set by `cargo`
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
let matches = clap::App::new("compute_ctl")
.version(version.unwrap_or("unknown"))
.arg(
Arg::new("connstr")
.short('C')
.long("connstr")
.value_name("DATABASE_URL")
.required(true),
)
.arg(
Arg::new("pgdata")
.short('D')
.long("pgdata")
.value_name("DATADIR")
.required(true),
)
.arg(
Arg::new("pgbin")
.short('b')
.long("pgbin")
.value_name("POSTGRES_PATH"),
)
.arg(
Arg::new("spec")
.short('s')
.long("spec")
.value_name("SPEC_JSON"),
)
.arg(
Arg::new("spec-path")
.short('S')
.long("spec-path")
.value_name("SPEC_PATH"),
)
.get_matches();
let matches = cli().get_matches();
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.value_of("connstr")
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec = matches.value_of("spec");
let spec_path = matches.value_of("spec-path");
let spec = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
// Try to use just 'postgres' if no path is provided
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap();
let spec: ComputeSpec = match spec {
// First, try to get cluster spec from the cli argument
@@ -173,3 +139,48 @@ fn main() -> Result<()> {
}
}
}
fn cli() -> clap::Command {
// Env variable is set by `cargo`
let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
clap::Command::new("compute_ctl")
.version(version)
.arg(
Arg::new("connstr")
.short('C')
.long("connstr")
.value_name("DATABASE_URL")
.required(true),
)
.arg(
Arg::new("pgdata")
.short('D')
.long("pgdata")
.value_name("DATADIR")
.required(true),
)
.arg(
Arg::new("pgbin")
.short('b')
.long("pgbin")
.default_value("postgres")
.value_name("POSTGRES_PATH"),
)
.arg(
Arg::new("spec")
.short('s')
.long("spec")
.value_name("SPEC_JSON"),
)
.arg(
Arg::new("spec-path")
.short('S')
.long("spec-path")
.value_name("SPEC_PATH"),
)
}
#[test]
fn verify_cli() {
cli().debug_assert()
}
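The hunk above migrates compute_ctl from clap's old builder/`value_of` API to clap 4's `Command` builder with typed `get_one` accessors. Below is a minimal, self-contained sketch of the same pattern; the flags here are hypothetical and assume clap 4.x with default features, so this is an illustration rather than the project's actual CLI:

```rust
use clap::{Arg, ArgAction, Command};

fn cli() -> Command {
    Command::new("example")
        .arg(
            Arg::new("pgdata")
                .short('D')
                .long("pgdata")
                .required(true),
        )
        .arg(
            Arg::new("verbose")
                .long("verbose")
                .action(ArgAction::SetTrue),
        )
}

fn main() {
    // clap 4: `value_of` is gone; values are fetched with the typed `get_one`.
    let matches = cli().get_matches();
    let pgdata: &String = matches
        .get_one::<String>("pgdata")
        .expect("required argument");
    // Boolean flags use `ArgAction::SetTrue` plus `get_flag` instead of `is_present`.
    let verbose = matches.get_flag("verbose");
    println!("pgdata = {pgdata}, verbose = {verbose}");
}

#[test]
fn verify_cli() {
    // Catches misconfigured argument definitions at test time, as in the diff above.
    cli().debug_assert();
}
```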

View File

@@ -178,7 +178,6 @@ impl ComputeNode {
.args(&["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.expect("postgres --sync-safekeepers failed to start");
@@ -191,10 +190,10 @@ impl ComputeNode {
if !sync_output.status.success() {
anyhow::bail!(
"postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}, stderr: {}",
"postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}",
sync_output.status,
String::from_utf8(sync_output.stdout).expect("postgres --sync-safekeepers exited, and stdout is not utf-8"),
String::from_utf8(sync_output.stderr).expect("postgres --sync-safekeepers exited, and stderr is not utf-8"),
String::from_utf8(sync_output.stdout)
.expect("postgres --sync-safekeepers exited, and stdout is not utf-8"),
);
}
@@ -258,14 +257,7 @@ impl ComputeNode {
.spawn()
.expect("cannot start postgres process");
// Try default Postgres port if it is not provided
let port = self
.spec
.cluster
.settings
.find("port")
.unwrap_or_else(|| "5432".to_string());
wait_for_postgres(&mut pg, &port, pgdata_path)?;
wait_for_postgres(&mut pg, pgdata_path)?;
// If connection fails,
// it may be the old node with `zenith_admin` superuser.

View File

@@ -1,18 +1,19 @@
use std::fmt::Write;
use std::fs;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::net::{SocketAddr, TcpStream};
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::Child;
use std::str::FromStr;
use std::{fs, thread, time};
use std::time::{Duration, Instant};
use anyhow::{bail, Result};
use postgres::{Client, Transaction};
use serde::Deserialize;
const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds
use notify::{RecursiveMode, Watcher};
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
/// Rust representation of Postgres role info with only those fields
/// that matter for us.
@@ -230,52 +231,112 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
Ok(postgres_dbs)
}
/// Wait for Postgres to become ready to accept connections:
/// - state should be `ready` in the `pgdata/postmaster.pid`
/// - and we should be able to connect to 127.0.0.1:5432
pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> {
/// Wait for Postgres to become ready to accept connections. It's ready to
/// accept connections when the state-field in `pgdata/postmaster.pid` says
/// 'ready'.
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
let pid_path = pgdata.join("postmaster.pid");
let mut slept: u64 = 0; // ms
let pause = time::Duration::from_millis(100);
let timeout = time::Duration::from_millis(10);
let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap();
// PostgreSQL writes line "ready" to the postmaster.pid file, when it has
// completed initialization and is ready to accept connections. We want to
// react quickly and perform the rest of our initialization as soon as
// PostgreSQL starts accepting connections. Use 'notify' to be notified
// whenever the PID file is changed, and whenever it changes, read it to
// check if it's now "ready".
//
// You cannot actually watch a file before it exists, so we first watch the
// data directory, and once the postmaster.pid file appears, we switch to
// watch the file instead. We also wake up every 100 ms to poll, just in
// case we miss some events for some reason. Not strictly necessary, but
// better safe than sorry.
let (tx, rx) = std::sync::mpsc::channel();
let (mut watcher, rx): (Box<dyn Watcher>, _) = match notify::recommended_watcher(move |res| {
let _ = tx.send(res);
}) {
Ok(watcher) => (Box::new(watcher), rx),
Err(e) => {
match e.kind {
notify::ErrorKind::Io(os) if os.raw_os_error() == Some(38) => {
// docker on m1 macs does not support recommended_watcher
// but return "Function not implemented (os error 38)"
// see https://github.com/notify-rs/notify/issues/423
let (tx, rx) = std::sync::mpsc::channel();
loop {
// Sleep POSTGRES_WAIT_TIMEOUT at max (a bit longer actually if consider a TCP timeout,
// but postgres starts listening almost immediately, even if it is not really
// ready to accept connections).
if slept >= POSTGRES_WAIT_TIMEOUT {
bail!("timed out while waiting for Postgres to start");
// let's poll it faster than what we check the results for (100ms)
let config =
notify::Config::default().with_poll_interval(Duration::from_millis(50));
let watcher = notify::PollWatcher::new(
move |res| {
let _ = tx.send(res);
},
config,
)?;
(Box::new(watcher), rx)
}
_ => return Err(e.into()),
}
}
};
watcher.watch(pgdata, RecursiveMode::NonRecursive)?;
let started_at = Instant::now();
let mut postmaster_pid_seen = false;
loop {
if let Ok(Some(status)) = pg.try_wait() {
// Postgres exited, that is not what we expected, bail out earlier.
let code = status.code().unwrap_or(-1);
bail!("Postgres exited unexpectedly with code {}", code);
}
let res = rx.recv_timeout(Duration::from_millis(100));
log::debug!("woken up by notify: {res:?}");
// If there are multiple events in the channel already, we only need to be
// check once. Swallow the extra events before we go ahead to check the
// pid file.
while let Ok(res) = rx.try_recv() {
log::debug!("swallowing extra event: {res:?}");
}
// Check that we can open pid file first.
if let Ok(file) = File::open(&pid_path) {
if !postmaster_pid_seen {
log::debug!("postmaster.pid appeared");
watcher
.unwatch(pgdata)
.expect("Failed to remove pgdata dir watch");
watcher
.watch(&pid_path, RecursiveMode::NonRecursive)
.expect("Failed to add postmaster.pid file watch");
postmaster_pid_seen = true;
}
let file = BufReader::new(file);
let last_line = file.lines().last();
// Pid file could be there and we could read it, but it could be empty, for example.
if let Some(Ok(line)) = last_line {
let status = line.trim();
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
log::debug!("last line of postmaster.pid: {status:?}");
// Now Postgres is ready to accept connections
if status == "ready" && can_connect {
if status == "ready" {
break;
}
}
}
thread::sleep(pause);
slept += 100;
// Give up after POSTGRES_WAIT_TIMEOUT.
let duration = started_at.elapsed();
if duration >= POSTGRES_WAIT_TIMEOUT {
bail!("timed out while waiting for Postgres to start");
}
}
log::info!("PostgreSQL is now running, continuing to configure it");
Ok(())
}
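The rewritten `wait_for_postgres` above waits on filesystem events via the `notify` crate. The following is a stripped-down, standalone sketch of that watch-then-poll pattern, assuming notify 5.x; the watched path and loop bound are placeholders, not values from the code above:

```rust
use std::path::Path;
use std::sync::mpsc;
use std::time::Duration;

use notify::{RecursiveMode, Watcher};

fn main() -> notify::Result<()> {
    let (tx, rx) = mpsc::channel();

    // The callback runs on notify's own thread; forward events to the channel.
    let mut watcher = notify::recommended_watcher(move |res| {
        let _ = tx.send(res);
    })?;

    // Watch a directory non-recursively, similar to watching pgdata for postmaster.pid.
    // "/tmp" is just a placeholder path for the sketch.
    watcher.watch(Path::new("/tmp"), RecursiveMode::NonRecursive)?;

    // Wake up periodically even without events, mirroring the belt-and-braces
    // polling in the function above.
    for _ in 0..3 {
        match rx.recv_timeout(Duration::from_millis(100)) {
            Ok(event) => println!("fs event: {event:?}"),
            Err(mpsc::RecvTimeoutError::Timeout) => println!("no event, polling anyway"),
            Err(mpsc::RecvTimeoutError::Disconnected) => break,
        }
    }
    Ok(())
}
```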

View File

@@ -380,6 +380,10 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
info!("grant query {}", &query);
client.execute(query.as_str(), &[])?;
// Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
// This is needed since postgres 15, where this privilege is removed by default.
client.execute("GRANT CREATE ON SCHEMA public TO web_access", &[])?;
}
// Do some per-database access adjustments. We'd better do this at db creation time,

View File

@@ -4,22 +4,24 @@ version = "0.1.0"
edition = "2021"
[dependencies]
clap = "3.0"
comfy-table = "5.0.1"
clap = "4.0"
comfy-table = "6.1"
git-version = "0.3.5"
tar = "0.4.38"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
serde = { version = "1.0", features = ["derive"] }
serde_with = "1.12.0"
serde_with = "2.0"
toml = "0.5"
once_cell = "1.13.0"
regex = "1"
anyhow = "1.0"
thiserror = "1"
nix = "0.23"
nix = "0.25"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
pageserver = { path = "../pageserver" }
safekeeper = { path = "../safekeeper" }
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.
pageserver_api = { path = "../libs/pageserver_api" }
safekeeper_api = { path = "../libs/safekeeper_api" }
utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -6,18 +6,18 @@
//! rely on `neon_local` to set up the environment for each test.
//!
use anyhow::{anyhow, bail, Context, Result};
use clap::{App, AppSettings, Arg, ArgMatches};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env::{EtcdBroker, LocalEnv};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage::PageServerNode;
use control_plane::{etcd, local_env};
use pageserver::config::defaults::{
use pageserver_api::models::TimelineInfo;
use pageserver_api::{
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
};
use pageserver::http::models::TimelineInfo;
use safekeeper::defaults::{
use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
};
@@ -85,212 +85,7 @@ struct TimelineTreeEl {
// * Providing CLI api to the pageserver
// * TODO: export/import to/from usual postgres
fn main() -> Result<()> {
let branch_name_arg = Arg::new("branch-name")
.long("branch-name")
.takes_value(true)
.help("Name of the branch to be created or used as an alias for other services")
.required(false);
let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
let tenant_id_arg = Arg::new("tenant-id")
.long("tenant-id")
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
.takes_value(true)
.required(false);
let timeline_id_arg = Arg::new("timeline-id")
.long("timeline-id")
.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
.takes_value(true)
.required(false);
let pg_version_arg = Arg::new("pg-version")
.long("pg-version")
.help("Postgres version to use for the initial tenant")
.required(false)
.takes_value(true)
.default_value(DEFAULT_PG_VERSION);
let port_arg = Arg::new("port")
.long("port")
.required(false)
.value_name("port");
let stop_mode_arg = Arg::new("stop-mode")
.short('m')
.takes_value(true)
.possible_values(&["fast", "immediate"])
.help("If 'immediate', don't flush repository data at shutdown")
.required(false)
.value_name("stop-mode");
let pageserver_config_args = Arg::new("pageserver-config-override")
.long("pageserver-config-override")
.takes_value(true)
.number_of_values(1)
.multiple_occurrences(true)
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
.takes_value(true)
.required(false);
let matches = App::new("Neon CLI")
.setting(AppSettings::ArgRequiredElseHelp)
.version(GIT_VERSION)
.subcommand(
App::new("init")
.about("Initialize a new Neon repository")
.arg(pageserver_config_args.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(
Arg::new("config")
.long("config")
.required(false)
.value_name("config"),
)
.arg(pg_version_arg.clone())
)
.subcommand(
App::new("timeline")
.about("Manage timelines")
.subcommand(App::new("list")
.about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone()))
.subcommand(App::new("branch")
.about("Create a new timeline, using another timeline as a base, copying its data")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true)
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true)
.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
.subcommand(App::new("create")
.about("Create a new blank timeline")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(pg_version_arg.clone())
)
.subcommand(App::new("import")
.about("Import timeline from basebackup directory")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone())
.arg(Arg::new("node-name").long("node-name").takes_value(true)
.help("Name to assign to the imported timeline"))
.arg(Arg::new("base-tarfile").long("base-tarfile").takes_value(true)
.help("Basebackup tarfile to import"))
.arg(Arg::new("base-lsn").long("base-lsn").takes_value(true)
.help("Lsn the basebackup starts at"))
.arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true)
.help("Wal to add after base"))
.arg(Arg::new("end-lsn").long("end-lsn").takes_value(true)
.help("Lsn the basebackup ends at"))
.arg(pg_version_arg.clone())
)
).subcommand(
App::new("tenant")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage tenants")
.subcommand(App::new("list"))
.subcommand(App::new("create")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
.arg(pg_version_arg.clone())
)
.subcommand(App::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
)
)
.subcommand(
App::new("pageserver")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage pageserver")
.subcommand(App::new("status"))
.subcommand(App::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(App::new("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
.subcommand(App::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
)
.subcommand(
App::new("safekeeper")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage safekeepers")
.subcommand(App::new("start")
.about("Start local safekeeper")
.arg(safekeeper_id_arg.clone())
)
.subcommand(App::new("stop")
.about("Stop local safekeeper")
.arg(safekeeper_id_arg.clone())
.arg(stop_mode_arg.clone())
)
.subcommand(App::new("restart")
.about("Restart local safekeeper")
.arg(safekeeper_id_arg.clone())
.arg(stop_mode_arg.clone())
)
)
.subcommand(
App::new("pg")
.setting(AppSettings::ArgRequiredElseHelp)
.about("Manage postgres instances")
.subcommand(App::new("list").arg(tenant_id_arg.clone()))
.subcommand(App::new("create")
.about("Create a postgres compute node")
.arg(pg_node_arg.clone())
.arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone())
.arg(lsn_arg.clone())
.arg(port_arg.clone())
.arg(
Arg::new("config-only")
.help("Don't do basebackup, create compute node with only config files")
.long("config-only")
.required(false))
.arg(pg_version_arg.clone())
)
.subcommand(App::new("start")
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
.arg(pg_node_arg.clone())
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(timeline_id_arg.clone())
.arg(lsn_arg.clone())
.arg(port_arg.clone())
.arg(pg_version_arg.clone())
)
.subcommand(
App::new("stop")
.arg(pg_node_arg.clone())
.arg(tenant_id_arg.clone())
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")
.long("destroy")
.required(false)
)
)
)
.subcommand(
App::new("start")
.about("Start page server and safekeepers")
.arg(pageserver_config_args)
)
.subcommand(
App::new("stop")
.about("Stop page server and safekeepers")
.arg(stop_mode_arg.clone())
)
.get_matches();
let matches = cli().get_matches();
let (sub_name, sub_args) = match matches.subcommand() {
Some(subcommand_data) => subcommand_data,
@@ -358,9 +153,7 @@ fn print_timelines_tree(
// Memorize all direct children of each timeline.
for timeline in timelines.iter() {
if let Some(ancestor_timeline_id) =
timeline.local.as_ref().and_then(|l| l.ancestor_timeline_id)
{
if let Some(ancestor_timeline_id) = timeline.ancestor_timeline_id {
timelines_hash
.get_mut(&ancestor_timeline_id)
.context("missing timeline info in the HashMap")?
@@ -371,13 +164,7 @@ fn print_timelines_tree(
for timeline in timelines_hash.values() {
// Start with root local timelines (no ancestors) first.
if timeline
.info
.local
.as_ref()
.and_then(|l| l.ancestor_timeline_id)
.is_none()
{
if timeline.info.ancestor_timeline_id.is_none() {
print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?;
}
}
@@ -394,17 +181,8 @@ fn print_timeline(
timeline: &TimelineTreeEl,
timelines: &HashMap<TimelineId, TimelineTreeEl>,
) -> Result<()> {
let local_remote = match (timeline.info.local.as_ref(), timeline.info.remote.as_ref()) {
(None, None) => unreachable!("in this case no info for a timeline is found"),
(None, Some(_)) => "(R)",
(Some(_), None) => "(L)",
(Some(_), Some(_)) => "(L+R)",
};
// Draw main padding
print!("{} ", local_remote);
if nesting_level > 0 {
let ancestor_lsn = match timeline.info.local.as_ref().and_then(|i| i.ancestor_lsn) {
let ancestor_lsn = match timeline.info.ancestor_lsn {
Some(lsn) => lsn.to_string(),
None => "Unknown Lsn".to_string(),
};
@@ -492,16 +270,16 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
sub_match
.value_of("tenant-id")
.map(TenantId::from_str)
.get_one::<String>("tenant-id")
.map(|tenant_id| TenantId::from_str(tenant_id))
.transpose()
.context("Failed to parse tenant id from the argument string")
}
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
sub_match
.value_of("timeline-id")
.map(TimelineId::from_str)
.get_one::<String>("timeline-id")
.map(|timeline_id| TimelineId::from_str(timeline_id))
.transpose()
.context("Failed to parse timeline id from the argument string")
}
@@ -510,19 +288,22 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let initial_timeline_id_arg = parse_timeline_id(init_match)?;
// Create config file
let toml_file: String = if let Some(config_path) = init_match.value_of("config") {
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
// load and parse the file
std::fs::read_to_string(std::path::Path::new(config_path))
.with_context(|| format!("Could not read configuration file '{config_path}'"))?
std::fs::read_to_string(config_path).with_context(|| {
format!(
"Could not read configuration file '{}'",
config_path.display()
)
})?
} else {
// Built-in default config
default_conf(&EtcdBroker::locate_etcd()?)
};
let pg_version = init_match
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let mut env =
@@ -558,9 +339,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
init_match
.values_of("pageserver-config-override")
.get_many::<String>("pageserver-config-override")
.into_iter()
.flatten()
.map(|s| s.as_str())
.collect()
}
@@ -575,7 +357,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
Some(("create", create_match)) => {
let initial_tenant_id = parse_tenant_id(create_match)?;
let tenant_conf: HashMap<_, _> = create_match
.values_of("config")
.get_many::<String>("config")
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
.unwrap_or_default();
let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
@@ -584,9 +366,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
// Create an initial timeline for the new tenant
let new_timeline_id = parse_timeline_id(create_match)?;
let pg_version = create_match
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let timeline_info = pageserver.timeline_create(
@@ -597,10 +378,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
Some(pg_version),
)?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info
.local
.context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))?
.last_record_lsn;
let last_record_lsn = timeline_info.last_record_lsn;
env.register_branch_mapping(
DEFAULT_BRANCH_NAME.to_string(),
@@ -615,7 +393,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
Some(("config", create_match)) => {
let tenant_id = get_tenant_id(create_match, env)?;
let tenant_conf: HashMap<_, _> = create_match
.values_of("config")
.get_many::<String>("config")
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
.unwrap_or_default();
@@ -642,23 +420,19 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
Some(("create", create_match)) => {
let tenant_id = get_tenant_id(create_match, env)?;
let new_branch_name = create_match
.value_of("branch-name")
.get_one::<String>("branch-name")
.ok_or_else(|| anyhow!("No branch name provided"))?;
let pg_version = create_match
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let timeline_info =
pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info
.local
.expect("no local timeline info")
.last_record_lsn;
let last_record_lsn = timeline_info.last_record_lsn;
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
println!(
@@ -670,35 +444,32 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
let tenant_id = get_tenant_id(import_match, env)?;
let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
let name = import_match
.value_of("node-name")
.get_one::<String>("node-name")
.ok_or_else(|| anyhow!("No node name provided"))?;
// Parse base inputs
let base_tarfile = import_match
.value_of("base-tarfile")
.map(|s| PathBuf::from_str(s).unwrap())
.ok_or_else(|| anyhow!("No base-tarfile provided"))?;
.get_one::<PathBuf>("base-tarfile")
.ok_or_else(|| anyhow!("No base-tarfile provided"))?
.to_owned();
let base_lsn = Lsn::from_str(
import_match
.value_of("base-lsn")
.get_one::<String>("base-lsn")
.ok_or_else(|| anyhow!("No base-lsn provided"))?,
)?;
let base = (base_lsn, base_tarfile);
// Parse pg_wal inputs
let wal_tarfile = import_match
.value_of("wal-tarfile")
.map(|s| PathBuf::from_str(s).unwrap());
let wal_tarfile = import_match.get_one::<PathBuf>("wal-tarfile").cloned();
let end_lsn = import_match
.value_of("end-lsn")
.get_one::<String>("end-lsn")
.map(|s| Lsn::from_str(s).unwrap());
// TODO validate both or none are provided
let pg_wal = end_lsn.zip(wal_tarfile);
let pg_version = import_match
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let mut cplane = ComputeControlPlane::load(env.clone())?;
@@ -713,10 +484,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
Some(("branch", branch_match)) => {
let tenant_id = get_tenant_id(branch_match, env)?;
let new_branch_name = branch_match
.value_of("branch-name")
.get_one::<String>("branch-name")
.ok_or_else(|| anyhow!("No branch name provided"))?;
let ancestor_branch_name = branch_match
.value_of("ancestor-branch-name")
.get_one::<String>("ancestor-branch-name")
.map(|s| s.as_str())
.unwrap_or(DEFAULT_BRANCH_NAME);
let ancestor_timeline_id = env
.get_branch_timeline_id(ancestor_branch_name, tenant_id)
@@ -725,8 +497,8 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
})?;
let start_lsn = branch_match
.value_of("ancestor-start-lsn")
.map(Lsn::from_str)
.get_one::<String>("ancestor-start-lsn")
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let timeline_info = pageserver.timeline_create(
@@ -738,10 +510,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
)?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info
.local
.expect("no local timeline info")
.last_record_lsn;
let last_record_lsn = timeline_info.last_record_lsn;
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
@@ -801,7 +570,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
// Use the LSN at the end of the timeline.
timeline_infos
.get(&node.timeline_id)
.and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string()))
.map(|bi| bi.last_record_lsn.to_string())
.unwrap_or_else(|| "?".to_string())
}
Some(lsn) => {
@@ -830,45 +599,39 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
}
"create" => {
let branch_name = sub_args
.value_of("branch-name")
.get_one::<String>("branch-name")
.map(|s| s.as_str())
.unwrap_or(DEFAULT_BRANCH_NAME);
let node_name = sub_args
.value_of("node")
.map(ToString::to_string)
.unwrap_or_else(|| format!("{}_node", branch_name));
.get_one::<String>("node")
.map(|node_name| node_name.to_string())
.unwrap_or_else(|| format!("{branch_name}_node"));
let lsn = sub_args
.value_of("lsn")
.map(Lsn::from_str)
.get_one::<String>("lsn")
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse Lsn from the request")?;
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?;
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let pg_version = sub_args
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?;
}
"start" => {
let port: Option<u16> = match sub_args.value_of("port") {
Some(p) => Some(p.parse()?),
None => None,
};
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let node_name = sub_args
.value_of("node")
.get_one::<String>("node")
.ok_or_else(|| anyhow!("No node name was provided to start"))?;
let node = cplane.nodes.get(&(tenant_id, node_name.to_owned()));
let node = cplane.nodes.get(&(tenant_id, node_name.to_string()));
let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) {
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
@@ -879,36 +642,33 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
};
if let Some(node) = node {
println!("Starting existing postgres {}...", node_name);
println!("Starting existing postgres {node_name}...");
node.start(&auth_token)?;
} else {
let branch_name = sub_args
.value_of("branch-name")
.get_one::<String>("branch-name")
.map(|s| s.as_str())
.unwrap_or(DEFAULT_BRANCH_NAME);
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| {
anyhow!("Found no timeline id for branch name '{}'", branch_name)
anyhow!("Found no timeline id for branch name '{branch_name}'")
})?;
let lsn = sub_args
.value_of("lsn")
.map(Lsn::from_str)
.get_one::<String>("lsn")
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse Lsn from the request")?;
let pg_version = sub_args
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.context("Failed to parse postgres version from the argument string")?;
.get_one::<u32>("pg-version")
.copied()
.context("Failed to `pg-version` from the argument string")?;
// when used with custom port this results in non obvious behaviour
// port is remembered from first start command, i e
// start --port X
// stop
// start <-- will also use port X even without explicit port argument
println!(
"Starting new postgres (v{}) {} on timeline {} ...",
pg_version, node_name, timeline_id
);
println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ...");
let node =
cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?;
@@ -917,18 +677,18 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
}
"stop" => {
let node_name = sub_args
.value_of("node")
.get_one::<String>("node")
.ok_or_else(|| anyhow!("No node name was provided to stop"))?;
let destroy = sub_args.is_present("destroy");
let destroy = sub_args.get_flag("destroy");
let node = cplane
.nodes
.get(&(tenant_id, node_name.to_owned()))
.with_context(|| format!("postgres {} is not found", node_name))?;
.get(&(tenant_id, node_name.to_string()))
.with_context(|| format!("postgres {node_name} is not found"))?;
node.stop(destroy)?;
}
_ => bail!("Unexpected pg subcommand '{}'", sub_name),
_ => bail!("Unexpected pg subcommand '{sub_name}'"),
}
Ok(())
@@ -946,7 +706,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
Some(("stop", stop_match)) => {
let immediate = stop_match.value_of("stop-mode") == Some("immediate");
let immediate = stop_match
.get_one::<String>("stop-mode")
.map(|s| s.as_str())
== Some("immediate");
if let Err(e) = pageserver.stop(immediate) {
eprintln!("pageserver stop failed: {}", e);
@@ -996,7 +759,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
};
// All the commands take an optional safekeeper name argument
let sk_id = if let Some(id_str) = sub_args.value_of("id") {
let sk_id = if let Some(id_str) = sub_args.get_one::<String>("id") {
NodeId(id_str.parse().context("while parsing safekeeper id")?)
} else {
DEFAULT_SAFEKEEPER_ID
@@ -1012,7 +775,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
"stop" => {
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
let immediate =
sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper stop failed: {}", e);
@@ -1021,7 +785,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
"restart" => {
let immediate = sub_args.value_of("stop-mode") == Some("immediate");
let immediate =
sub_args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
if let Err(e) = safekeeper.stop(immediate) {
eprintln!("safekeeper stop failed: {}", e);
@@ -1065,7 +830,8 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
}
fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let immediate = sub_match.value_of("stop-mode") == Some("immediate");
let immediate =
sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
let pageserver = PageServerNode::from_env(env);
@@ -1098,3 +864,219 @@ fn try_stop_etcd_process(env: &local_env::LocalEnv) {
eprintln!("etcd stop failed: {e}");
}
}
fn cli() -> Command {
let branch_name_arg = Arg::new("branch-name")
.long("branch-name")
.help("Name of the branch to be created or used as an alias for other services")
.required(false);
let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
let tenant_id_arg = Arg::new("tenant-id")
.long("tenant-id")
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
.required(false);
let timeline_id_arg = Arg::new("timeline-id")
.long("timeline-id")
.help("Timeline id. Represented as a hexadecimal string 32 symbols length")
.required(false);
let pg_version_arg = Arg::new("pg-version")
.long("pg-version")
.help("Postgres version to use for the initial tenant")
.required(false)
.value_parser(value_parser!(u32))
.default_value(DEFAULT_PG_VERSION);
let port_arg = Arg::new("port")
.long("port")
.required(false)
.value_parser(value_parser!(u16))
.value_name("port");
let stop_mode_arg = Arg::new("stop-mode")
.short('m')
.value_parser(["fast", "immediate"])
.help("If 'immediate', don't flush repository data at shutdown")
.required(false)
.value_name("stop-mode");
let pageserver_config_args = Arg::new("pageserver-config-override")
.long("pageserver-config-override")
.num_args(1)
.action(ArgAction::Append)
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
.subcommand(
Command::new("init")
.about("Initialize a new Neon repository")
.arg(pageserver_config_args.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(
Arg::new("config")
.long("config")
.required(false)
.value_parser(value_parser!(PathBuf))
.value_name("config"),
)
.arg(pg_version_arg.clone())
)
.subcommand(
Command::new("timeline")
.about("Manage timelines")
.subcommand(Command::new("list")
.about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone()))
.subcommand(Command::new("branch")
.about("Create a new timeline, using another timeline as a base, copying its data")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
.arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn")
.help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false)))
.subcommand(Command::new("create")
.about("Create a new blank timeline")
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(pg_version_arg.clone())
)
.subcommand(Command::new("import")
.about("Import timeline from basebackup directory")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone())
.arg(Arg::new("node-name").long("node-name")
.help("Name to assign to the imported timeline"))
.arg(Arg::new("base-tarfile")
.long("base-tarfile")
.value_parser(value_parser!(PathBuf))
.help("Basebackup tarfile to import")
)
.arg(Arg::new("base-lsn").long("base-lsn")
.help("Lsn the basebackup starts at"))
.arg(Arg::new("wal-tarfile")
.long("wal-tarfile")
.value_parser(value_parser!(PathBuf))
.help("Wal to add after base")
)
.arg(Arg::new("end-lsn").long("end-lsn")
.help("Lsn the basebackup ends at"))
.arg(pg_version_arg.clone())
)
).subcommand(
Command::new("tenant")
.arg_required_else_help(true)
.about("Manage tenants")
.subcommand(Command::new("list"))
.subcommand(Command::new("create")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
.arg(pg_version_arg.clone())
)
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
)
)
.subcommand(
Command::new("pageserver")
.arg_required_else_help(true)
.about("Manage pageserver")
.subcommand(Command::new("status"))
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
.subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
)
.subcommand(
Command::new("safekeeper")
.arg_required_else_help(true)
.about("Manage safekeepers")
.subcommand(Command::new("start")
.about("Start local safekeeper")
.arg(safekeeper_id_arg.clone())
)
.subcommand(Command::new("stop")
.about("Stop local safekeeper")
.arg(safekeeper_id_arg.clone())
.arg(stop_mode_arg.clone())
)
.subcommand(Command::new("restart")
.about("Restart local safekeeper")
.arg(safekeeper_id_arg)
.arg(stop_mode_arg.clone())
)
)
.subcommand(
Command::new("pg")
.arg_required_else_help(true)
.about("Manage postgres instances")
.subcommand(Command::new("list").arg(tenant_id_arg.clone()))
.subcommand(Command::new("create")
.about("Create a postgres compute node")
.arg(pg_node_arg.clone())
.arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone())
.arg(lsn_arg.clone())
.arg(port_arg.clone())
.arg(
Arg::new("config-only")
.help("Don't do basebackup, create compute node with only config files")
.long("config-only")
.required(false))
.arg(pg_version_arg.clone())
)
.subcommand(Command::new("start")
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
.arg(pg_node_arg.clone())
.arg(tenant_id_arg.clone())
.arg(branch_name_arg)
.arg(timeline_id_arg)
.arg(lsn_arg)
.arg(port_arg)
.arg(pg_version_arg)
)
.subcommand(
Command::new("stop")
.arg(pg_node_arg)
.arg(tenant_id_arg)
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")
.long("destroy")
.action(ArgAction::SetTrue)
.required(false)
)
)
)
.subcommand(
Command::new("start")
.about("Start page server and safekeepers")
.arg(pageserver_config_args)
)
.subcommand(
Command::new("stop")
.about("Stop page server and safekeepers")
.arg(stop_mode_arg)
)
}
#[test]
fn verify_cli() {
cli().debug_assert();
}

View File

@@ -12,13 +12,8 @@ use nix::unistd::Pid;
use postgres::Config;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use safekeeper::http::models::TimelineCreateRequest;
use thiserror::Error;
use utils::{
connstring::connection_address,
http::error::HttpErrorBody,
id::{NodeId, TenantId, TimelineId},
};
use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::storage::PageServerNode;
@@ -281,24 +276,4 @@ impl SafekeeperNode {
.error_from_body()?;
Ok(())
}
pub fn timeline_create(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
peer_ids: Vec<NodeId>,
) -> Result<()> {
Ok(self
.http_request(
Method::POST,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)
.json(&TimelineCreateRequest {
timeline_id,
peer_ids,
})
.send()?
.error_from_body()?
.json()?)
}
}

View File

@@ -11,7 +11,7 @@ use anyhow::{bail, Context};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver::http::models::{
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use postgres::{Config, NoTls};

View File

@@ -0,0 +1,163 @@
# Storage messaging
Safekeepers need to communicate with each other to:
* Trim WAL on safekeepers;
* Decide which SK should push WAL to S3;
* Decide when to shut down the SK<->pageserver connection;
* Understand each other's state to perform peer recovery.
Pageservers need to communicate with safekeepers to decide which SK should provide
WAL to the pageserver.
This is an iteration on [015-storage-messaging](https://github.com/neondatabase/neon/blob/main/docs/rfcs/015-storage-messaging.md) describing the current situation,
a potential performance issue, and ways to address it.
## Background
What we have currently is very close to the etcd variant described in
015-storage-messaging. Basically, we have a single `SkTimelineInfo` message
periodically sent by all safekeepers to etcd for each timeline.
* Safekeepers subscribe to it to learn the status of their peers (currently they subscribe to
'everything', but they can and should fetch data only for the timelines they hold).
* The pageserver subscribes to it (a separate watch per timeline) to learn the safekeepers'
positions; based on that, it decides which safekeepers to pull WAL from.
Also, safekeepers use the etcd elections API to make sure only a single safekeeper
offloads WAL.
It works, and callmemaybe is gone. However, this has a performance
hazard. The currently deployed etcd can do about 6k puts per second (using its own
`benchmark` tool); on my 6-core laptop, while running on tmpfs, this gets to
35k. With a benchmark closer to our usage, [etcd watch bench](https://github.com/arssher/etcd-client/blob/watch-bench/examples/watch_bench.rs),
I get ~10k received messages per second with various numbers of publishers and subscribers
(laptop, tmpfs). Dividing this by 12 (3 sks generate a message each, 1 ps + 3 sks consume them), we
get about 800 active timelines, if a message is sent each second. Not an extremely
low limit, but quite reachable.
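Spelling out the arithmetic behind that estimate (the 12 is the per-timeline message count per second used above):

$$
\frac{10\,000\ \text{msg/s}}{12\ \text{msg per active timeline per second}} \approx 830\ \text{active timelines}
$$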
A lot of idle watches seem to be ok though, which is good, as the pageserver
subscribes to all its timelines regardless of their activity.
Also, running etcd with fsyncs disabled is messy: the data dir must be wiped on
each restart, or there is a risk of corruption errors.
The reason is that etcd does much more than what we need; it is a fault-tolerant
store with strong consistency, but I claim all we need here is the simplest pub-sub
with best-effort delivery, because
* We already have a centralized source of truth for long-running data, like which
  timelines are on which nodes -- the console.
* Momentary data (safekeeper/pageserver progress) doesn't make sense to persist.
  Instead of putting each change into the broker and expecting it to deliver it reliably,
  it is better to just have a constant flow of data for active timelines: 1) it
  serves as natural heartbeats -- if a node can't send, we shouldn't pull WAL from
  it; 2) it is simpler -- no need to track delivery to/from the broker.
  Moreover, latency here is important: the faster we obtain fresh data, the
  faster we can switch to a proper safekeeper after a failure.
* As for WAL offloading leader election, it is trivial to achieve through these
  heartbeats -- just take a suitable node through a deterministic rule (min node
  id); a sketch of such a rule is given after this list. Once the network is
  stable, this is a converging process (well, except for complicated failure
  topologies, but even then making it converge is not hard). Such elections bear
  some risk of several offloaders running concurrently for a short period of
  time, but that's harmless.
  Generally, if one needs strong consistency, electing a leader per se is not
  enough; it must be accompanied by a number (a logical clock timestamp), checked at
  every action to track causality. S3 doesn't provide CAS, so it can't
  differentiate an old leader from a new one; this must be solved differently.
  We could use etcd CAS (its most powerful/useful primitive, actually) to issue
  these leader numbers (and e.g. prefix files in S3), but currently I don't see a
  need for that.
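A minimal sketch of such a deterministic rule, assuming each safekeeper remembers when it last saw a heartbeat from every peer; the names and types here are illustrative, not the actual implementation:

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

type NodeId = u64;

/// Choose the WAL offloader for a timeline: the smallest node id among the
/// peers whose heartbeat was seen recently (ourselves included). The rule is
/// purely local and deterministic, so once heartbeats flow all live nodes
/// converge on the same answer; a brief overlap of two offloaders is harmless.
fn elect_offloader(
    me: NodeId,
    last_heartbeat: &HashMap<NodeId, Instant>,
    heartbeat_ttl: Duration,
    now: Instant,
) -> NodeId {
    let mut winner = me;
    for (&id, &seen_at) in last_heartbeat {
        let alive = now.saturating_duration_since(seen_at) < heartbeat_ttl;
        if alive && id < winner {
            winner = id;
        }
    }
    winner
}
```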
Obviously, a best-effort pub-sub is much simpler and more performant; the one proposed is described below.
## gRPC broker
I took tonic and [prototyped](https://github.com/neondatabase/neon/blob/asher/neon-broker/broker/src/broker.rs) the replacement of the functionality we currently use
with gRPC streams and tokio mpsc channels. The implementation description is in the file header.
It is just 500 lines of code and the core functionality is complete. 1-1 pub-sub
gives about 120k received messages per second; having multiple subscribers in
different connections quickly scales to 1 million received messages per second.
I had concerns about many concurrent streams in a single connection, but 2^20
subscribers still work (though they eat memory: with 10 publishers, 20GB are consumed;
in this implementation each publisher holds a full copy of all subscribers). There
is `bench.rs` nearby which I used for testing.
`SkTimelineInfo` is wired in here, but another message type can be added (e.g. if
pageservers want to communicate with each other) with templating; a rough sketch of
the fan-out core follows.
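The prototype itself is not reproduced here, but the fan-out pattern it relies on -- each publisher pushing a message into the mpsc channel of every current subscriber of that timeline -- can be sketched roughly as follows. This is plain tokio with the gRPC/tonic plumbing omitted, and all names (including the `SkTimelineInfo` stand-in) are illustrative:

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use tokio::sync::mpsc;

type TimelineId = u128;

/// Stand-in for the real `SkTimelineInfo` message.
#[derive(Clone, Debug)]
struct SkTimelineInfo {
    timeline_id: TimelineId,
    commit_lsn: u64,
}

/// Per-timeline registry of subscriber channels, shared by all publishers.
#[derive(Clone, Default)]
struct Broker {
    subscribers: Arc<Mutex<HashMap<TimelineId, Vec<mpsc::Sender<SkTimelineInfo>>>>>,
}

impl Broker {
    /// Register a subscriber; the returned receiver would back one gRPC response stream.
    fn subscribe(&self, timeline_id: TimelineId) -> mpsc::Receiver<SkTimelineInfo> {
        let (tx, rx) = mpsc::channel(128);
        self.subscribers
            .lock()
            .unwrap()
            .entry(timeline_id)
            .or_default()
            .push(tx);
        rx
    }

    /// Best-effort publish: messages to full or closed subscriber channels are simply dropped.
    fn publish(&self, msg: SkTimelineInfo) {
        let txs = {
            let map = self.subscribers.lock().unwrap();
            map.get(&msg.timeline_id).cloned().unwrap_or_default()
        };
        for tx in txs {
            let _ = tx.try_send(msg.clone());
        }
    }
}
```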
### Fault tolerance
Since such a broker is stateless, we can run it under k8s, or add proxying to
other members; with best-effort delivery this is simple.
### Security implications
Communication happens in a private network that is not exposed to users;
additionally, we can add auth to the broker.
## Alternative: get existing pub-sub
We could take some existing pub-sub solution, e.g. RabbitMQ or Redis. But in this
case, IMV, the simplicity of our own outweighs the external dependency costs (RabbitMQ is
much more complicated and needs a VM; Redis Rust client maintenance is not
ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC
as well.
## Alternative: direct communication
Apart from being a transport, the broker solves one more task: discovery, i.e. letting
safekeepers and pageservers find each other. We can let safekeepers know, for
each timeline, both the other safekeepers for this timeline and the pageservers serving
it. In this case direct communication is possible:
- each safekeeper pushes to every other safekeeper the status of timelines residing
  on both of them, letting them remove WAL, decide who offloads, and decide on peer
  recovery;
- each safekeeper pushes to each pageserver the status of timelines residing on
  both of them, letting the pageserver choose which sk to pull WAL from.
It was mostly described in [014-safekeeper-gossip](https://github.com/neondatabase/neon/blob/main/docs/rfcs/014-safekeepers-gossip.md), but I want to recap it here.
The main pro is one less dependency: fewer moving parts, easier to run Neon
locally/manually, fewer places to monitor. The broker's fault tolerance concern disappears --
no k8s or anything similar. To me this is a big thing.
Also (though not a big thing) idle watches for inactive timelines disappear:
naturally, safekeepers learn about the compute connection first and start pushing
status to the pageserver(s), notifying them that they should pull.
Importantly, I think that eventually knowing and persisting peers and
pageservers on safekeepers is inevitable:
- Knowing the peer safekeepers for the timeline is required for correct
  automatic membership change -- the new member set must be hardened on the old
  majority before proceeding. It is required to get rid of sync-safekeepers
  as well (peer recovery up to flush_lsn).
- Knowing the pageservers where the timeline is attached is needed to
  1. Understand when to shut down activity on the timeline (i.e. pushing data to
     the broker). We can have a lot of timelines sleeping quietly which
     shouldn't occupy resources.
  2. Preserve WAL for them (currently we offload to S3 and take it from there,
     but serving locally is better, and we get one less condition on which WAL
     can be removed from S3).
I suppose this membership data should be passed to safekeepers directly from the
console because
1. The console is the original source of this data; conceptually this is the
   simplest way (rather than passing it through compute or something).
2. We already have similar code for deleting a timeline on safekeepers
   (and attaching/detaching a timeline on the pageserver); this is a typical
   action -- queue an operation against a storage node and execute it until it
   completes (or the timeline is dropped).
Cons of direct communication are
- It is more complicated: each safekeeper should maintain the set of peers it talks
  to, and the set of timelines for each such peer -- they ought to be multiplexed
  into a single connection.
- In total, we have O(n^2) connections instead of O(n) with the broker schema
  (still O(n) on each node). However, these are relatively stable, async, and
  thus not very expensive; I don't think this is a big problem. Up to 10k
  storage nodes, I doubt the connection overhead would be noticeable.
I'd use gRPC for direct communication, and in this sense a gRPC-based broker is a
step towards it.

View File

@@ -96,7 +96,7 @@ A single virtual environment with all dependencies is described in the single `P
sudo apt install python3.9
```
- Install `poetry`
- Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`.
- Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation).
- Install dependencies via `./scripts/pysync`.
- Note that CI uses a specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)),
  so if you have a different version, some linting tools can yield different results locally vs in the CI.

View File

@@ -8,7 +8,7 @@
regex = "1.4.5"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "1.12.0"
serde_with = "2.0"
once_cell = "1.13.0"
utils = { path = "../utils" }

View File

@@ -3,7 +3,7 @@
//! Otherwise, we might not see all metrics registered via
//! a default registry.
use once_cell::sync::Lazy;
use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec};
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::{core, default_registry, proto};
@@ -17,6 +17,7 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
pub use prometheus::{register_int_gauge, IntGauge};
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder};
use prometheus::{Registry, Result};
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
@@ -32,13 +33,27 @@ macro_rules! register_uint_gauge_vec {
}};
}
/// Special internal registry, to collect metrics independently from the default registry.
/// Was introduced to fix deadlock with lazy registration of metrics in the default registry.
static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock.
pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
INTERNAL_REGISTRY.register(c)
}
/// Gathers all Prometheus metrics and records the I/O stats just before that.
///
/// Metrics gathering is a relatively simple and standalone operation, so
/// it might be fine to do it this way to keep things simple.
pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
update_rusage_metrics();
prometheus::gather()
let mut mfs = prometheus::gather();
let mut internal_mfs = INTERNAL_REGISTRY.gather();
mfs.append(&mut internal_mfs);
mfs
}
static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
@@ -62,6 +77,16 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
];
pub fn set_build_info_metric(revision: &str) {
let metric = register_int_gauge_vec!(
"libmetrics_build_info",
"Build/version information",
&["revision"]
)
.expect("Failed to register build info metric");
metric.with_label_values(&[revision]).set(1);
}
// Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
// An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned.
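As a usage note for the internal registry introduced above: a collector must be registered via `register_internal` before the first `gather()` call. A minimal hedged sketch, assuming both this `metrics` crate and `prometheus` are in scope; the gauge name is made up:

```rust
use metrics::{gather, register_internal};
use prometheus::{IntGauge, Opts};

fn init_example_metrics() -> prometheus::Result<()> {
    // Hypothetical gauge; register it in the internal registry *before*
    // the first gather() call, per the doc comment on register_internal.
    let gauge = IntGauge::with_opts(Opts::new(
        "example_internal_gauge",
        "Illustrative gauge living in the internal registry",
    ))?;
    register_internal(Box::new(gauge.clone()))?;
    gauge.set(1);

    // Later, gather() returns metric families from both registries.
    let _families = gather();
    Ok(())
}
```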

View File

@@ -0,0 +1,12 @@
[package]
name = "pageserver_api"
version = "0.1.0"
edition = "2021"
[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
const_format = "0.2.21"
utils = { path = "../utils" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,9 @@
use const_format::formatcp;
/// Public API types
pub mod models;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");

View File

@@ -7,7 +7,17 @@ use utils::{
lsn::Lsn,
};
use crate::tenant::TenantState;
/// A state of a tenant in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TenantState {
/// Tenant is fully operational, its background jobs might be running or not.
Active { background_jobs_running: bool },
/// A tenant is recognized by pageserver, but not yet ready to operate:
/// e.g. not present locally and being downloaded or being read into memory from the file system.
Paused,
/// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
Broken,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
@@ -113,9 +123,15 @@ pub struct TenantInfo {
pub has_in_progress_downloads: Option<bool>,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct LocalTimelineInfo {
pub struct TimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
@@ -139,28 +155,33 @@ pub struct LocalTimelineInfo {
/// the timestamp (in microseconds) of the last received message
pub last_received_msg_ts: Option<u128>,
pub pg_version: u32,
#[serde_as(as = "Option<DisplayFromStr>")]
pub remote_consistent_lsn: Option<Lsn>,
pub awaits_download: bool,
// Some of the above fields are duplicated in 'local' and 'remote', for backwards-
// compatility with older clients.
pub local: LocalTimelineInfo,
pub remote: RemoteTimelineInfo,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct LocalTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_lsn: Option<Lsn>,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RemoteTimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
pub awaits_download: bool,
}
///
/// This represents the output of the "timeline_detail" API call.
///
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
pub local: Option<LocalTimelineInfo>,
pub remote: Option<RemoteTimelineInfo>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub remote_consistent_lsn: Option<Lsn>,
}
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;

View File

@@ -13,7 +13,7 @@ crc32c = "0.6.0"
hex = "0.4.3"
once_cell = "1.13.0"
log = "0.4.14"
memoffset = "0.6.2"
memoffset = "0.7"
thiserror = "1.0"
serde = { version = "1.0", features = ["derive"] }
utils = { path = "../utils" }
@@ -26,4 +26,4 @@ wal_craft = { path = "wal_craft" }
[build-dependencies]
anyhow = "1.0"
bindgen = "0.60.1"
bindgen = "0.61"

View File

@@ -3,9 +3,11 @@
#![allow(non_snake_case)]
// bindgen creates some unsafe code with no doc comments.
#![allow(clippy::missing_safety_doc)]
// suppress warnings on rust 1.53 due to bindgen unit tests.
// https://github.com/rust-lang/rust-bindgen/issues/1651
#![allow(deref_nullptr)]
// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
#![allow(clippy::useless_transmute)]
// modules included with the postgres_ffi macro depend on the types of the specific version's
// types, and trigger a too eager lint.
#![allow(clippy::duplicate_mod)]
use bytes::Bytes;
use utils::bin_ser::SerializeError;

View File

@@ -57,12 +57,10 @@ pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
/// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG.
const XID_CHECKPOINT_INTERVAL: u32 = 1024;
#[allow(non_snake_case)]
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
}
#[allow(non_snake_case)]
pub fn XLogSegNoOffsetToRecPtr(
segno: XLogSegNo,
offset: u32,
@@ -71,7 +69,6 @@ pub fn XLogSegNoOffsetToRecPtr(
segno * (wal_segsz_bytes as u64) + (offset as u64)
}
#[allow(non_snake_case)]
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
format!(
"{:>08X}{:>08X}{:>08X}",
@@ -81,7 +78,6 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize
)
}
#[allow(non_snake_case)]
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
@@ -89,12 +85,10 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
}
#[allow(non_snake_case)]
pub fn IsXLogFileName(fname: &str) -> bool {
return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
}
#[allow(non_snake_case)]
pub fn IsPartialXLogFileName(fname: &str) -> bool {
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
}
@@ -170,7 +164,7 @@ pub fn find_end_of_wal(
let mut curr_lsn = start_lsn;
let mut buf = [0u8; XLOG_BLCKSZ];
let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
info!("find_end_of_wal PG_VERSION: {}", pg_version);
debug!("find_end_of_wal PG_VERSION: {}", pg_version);
let mut decoder = WalStreamDecoder::new(start_lsn, pg_version);
@@ -182,7 +176,7 @@ pub fn find_end_of_wal(
match open_wal_segment(&seg_file_path)? {
None => {
// no more segments
info!(
debug!(
"find_end_of_wal reached end at {:?}, segment {:?} doesn't exist",
result, seg_file_path
);
@@ -205,7 +199,7 @@ pub fn find_end_of_wal(
match decoder.poll_decode() {
Ok(Some(record)) => result = record.0,
Err(e) => {
info!(
debug!(
"find_end_of_wal reached end at {:?}, decode error: {:?}",
result, e
);

View File

@@ -7,7 +7,7 @@ edition = "2021"
[dependencies]
anyhow = "1.0"
clap = "3.0"
clap = "4.0"
env_logger = "0.9"
log = "0.4"
once_cell = "1.13.0"

View File

@@ -1,68 +1,19 @@
use anyhow::*;
use clap::{App, Arg, ArgMatches};
use std::str::FromStr;
use clap::{value_parser, Arg, ArgMatches, Command};
use std::{path::PathBuf, str::FromStr};
use wal_craft::*;
fn main() -> Result<()> {
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
.init();
let type_arg = &Arg::new("type")
.takes_value(true)
.help("Type of WAL to craft")
.possible_values([
Simple::NAME,
LastWalRecordXlogSwitch::NAME,
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
WalRecordCrossingSegmentFollowedBySmallOne::NAME,
LastWalRecordCrossingSegment::NAME,
])
.required(true);
let arg_matches = App::new("Postgres WAL crafter")
.about("Crafts Postgres databases with specific WAL properties")
.subcommand(
App::new("print-postgres-config")
.about("Print the configuration required for PostgreSQL server before running this script")
)
.subcommand(
App::new("with-initdb")
.about("Craft WAL in a new data directory first initialized with initdb")
.arg(type_arg)
.arg(
Arg::new("datadir")
.takes_value(true)
.help("Data directory for the Postgres server")
.required(true)
)
.arg(
Arg::new("pg-distrib-dir")
.long("pg-distrib-dir")
.takes_value(true)
.help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
.default_value("/usr/local")
)
.arg(
Arg::new("pg-version")
.long("pg-version")
.help("Postgres version to use for the initial tenant")
.required(true)
.takes_value(true)
)
)
.subcommand(
App::new("in-existing")
.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
.arg(type_arg)
.arg(
Arg::new("connection")
.takes_value(true)
.help("Connection string to the Postgres database to populate")
.required(true)
)
)
.get_matches();
let arg_matches = cli().get_matches();
let wal_craft = |arg_matches: &ArgMatches, client| {
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() {
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
.get_one::<String>("type")
.map(|s| s.as_str())
.context("'type' is required")?
{
Simple::NAME => Simple::craft(client)?,
LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
@@ -72,12 +23,12 @@ fn main() -> Result<()> {
WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
}
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
a => panic!("Unknown --type argument: {}", a),
a => panic!("Unknown --type argument: {a}"),
};
for lsn in intermediate_lsns {
println!("intermediate_lsn = {}", lsn);
println!("intermediate_lsn = {lsn}");
}
println!("end_of_wal = {}", end_of_wal_lsn);
println!("end_of_wal = {end_of_wal_lsn}");
Ok(())
};
@@ -85,20 +36,24 @@ fn main() -> Result<()> {
None => panic!("No subcommand provided"),
Some(("print-postgres-config", _)) => {
for cfg in REQUIRED_POSTGRES_CONFIG.iter() {
println!("{}", cfg);
println!("{cfg}");
}
Ok(())
}
Some(("with-initdb", arg_matches)) => {
let cfg = Conf {
pg_version: arg_matches
.value_of("pg-version")
.unwrap()
.parse::<u32>()
.context("Failed to parse postgres version from the argument string")?,
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
datadir: arg_matches.value_of("datadir").unwrap().into(),
pg_version: *arg_matches
.get_one::<u32>("pg-version")
.context("'pg-version' is required")?,
pg_distrib_dir: arg_matches
.get_one::<PathBuf>("pg-distrib-dir")
.context("'pg-distrib-dir' is required")?
.to_owned(),
datadir: arg_matches
.get_one::<PathBuf>("datadir")
.context("'datadir' is required")?
.to_owned(),
};
cfg.initdb()?;
let srv = cfg.start_server()?;
@@ -108,9 +63,77 @@ fn main() -> Result<()> {
}
Some(("in-existing", arg_matches)) => wal_craft(
arg_matches,
&mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())?
.connect(postgres::NoTls)?,
&mut postgres::Config::from_str(
arg_matches
.get_one::<String>("connection")
.context("'connection' is required")?,
)
.context(
"'connection' argument value could not be parsed as a postgres connection string",
)?
.connect(postgres::NoTls)?,
),
Some(_) => panic!("Unknown subcommand"),
}
}
fn cli() -> Command {
let type_arg = &Arg::new("type")
.help("Type of WAL to craft")
.value_parser([
Simple::NAME,
LastWalRecordXlogSwitch::NAME,
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
WalRecordCrossingSegmentFollowedBySmallOne::NAME,
LastWalRecordCrossingSegment::NAME,
])
.required(true);
Command::new("Postgres WAL crafter")
.about("Crafts Postgres databases with specific WAL properties")
.subcommand(
Command::new("print-postgres-config")
.about("Print the configuration required for PostgreSQL server before running this script")
)
.subcommand(
Command::new("with-initdb")
.about("Craft WAL in a new data directory first initialized with initdb")
.arg(type_arg)
.arg(
Arg::new("datadir")
.help("Data directory for the Postgres server")
.value_parser(value_parser!(PathBuf))
.required(true)
)
.arg(
Arg::new("pg-distrib-dir")
.long("pg-distrib-dir")
.value_parser(value_parser!(PathBuf))
.help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
.default_value("/usr/local")
)
.arg(
Arg::new("pg-version")
.long("pg-version")
.help("Postgres version to use for the initial tenant")
.value_parser(value_parser!(u32))
.required(true)
)
)
.subcommand(
Command::new("in-existing")
.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
.arg(type_arg)
.arg(
Arg::new("connection")
.help("Connection string to the Postgres database to populate")
.required(true)
)
)
}
#[test]
fn verify_cli() {
cli().debug_assert();
}

View File

@@ -15,7 +15,7 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
tokio-util = { version = "0.7", features = ["io"] }
toml_edit = { version = "0.13", features = ["easy"] }
toml_edit = { version = "0.14", features = ["easy"] }
tracing = "0.1.27"
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,12 @@
[package]
name = "safekeeper_api"
version = "0.1.0"
edition = "2021"
[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
const_format = "0.2.21"
utils = { path = "../utils" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,10 @@
use const_format::formatcp;
/// Public API types
pub mod models;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");

View File

@@ -0,0 +1,24 @@
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::{
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
};
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
pub peer_ids: Option<Vec<NodeId>>,
pub pg_version: u32,
pub system_id: Option<u64>,
pub wal_seg_size: Option<u32>,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn,
// If not passed, it is assigned to the beginning of commit_lsn segment.
pub local_start_lsn: Option<Lsn>,
}

View File

@@ -20,7 +20,7 @@ tokio = { version = "1.17", features = ["macros"]}
tokio-rustls = "0.23"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
nix = "0.23.0"
nix = "0.25"
signal-hook = "0.3.10"
rand = "0.8.3"
jsonwebtoken = "8"
@@ -28,7 +28,7 @@ hex = { version = "0.4.3", features = ["serde"] }
rustls = "0.20.2"
rustls-split = "0.3.0"
git-version = "0.3.5"
serde_with = "1.12.0"
serde_with = "2.0"
once_cell = "1.13.0"
@@ -40,7 +40,7 @@ byteorder = "1.4.3"
bytes = "1.0.1"
hex-literal = "0.3"
tempfile = "3.2"
criterion = "0.3"
criterion = "0.4"
rustls-pemfile = "1"
[[bench]]

View File

@@ -9,6 +9,7 @@ use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::RequestInfo;
use routerify::{Middleware, Router, RouterBuilder, RouterService};
use tokio::task::JoinError;
use tracing::info;
use std::future::Future;
@@ -35,7 +36,13 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
let mut buffer = vec![];
let encoder = TextEncoder::new();
let metrics = metrics::gather();
let metrics = tokio::task::spawn_blocking(move || {
// Currently we take a lot of mutexes while collecting metrics, so it's
// better to spawn a blocking task to avoid blocking the event loop.
metrics::gather()
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
encoder.encode(&metrics, &mut buffer).unwrap();
let response = Response::builder()

View File

@@ -66,6 +66,11 @@ impl Lsn {
(self.0 % seg_sz as u64) as usize
}
/// Compute LSN of the segment start.
pub fn segment_lsn(self, seg_sz: usize) -> Lsn {
Lsn(self.0 - (self.0 % seg_sz as u64))
}
/// Compute the segment number
pub fn segment_number(self, seg_sz: usize) -> u64 {
self.0 / seg_sz as u64

View File

@@ -10,6 +10,7 @@ use serde::{Deserialize, Serialize};
use std::{
borrow::Cow,
collections::HashMap,
fmt,
future::Future,
io::{self, Cursor},
str,
@@ -124,6 +125,19 @@ pub struct CancelKeyData {
pub cancel_key: i32,
}
impl fmt::Display for CancelKeyData {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let hi = (self.backend_pid as u64) << 32;
let lo = self.cancel_key as u64;
let id = hi | lo;
// This format is more compact and might work better for logs.
f.debug_tuple("CancelKeyData")
.field(&format_args!("{:x}", id))
.finish()
}
}
use rand::distributions::{Distribution, Standard};
impl Distribution<CancelKeyData> for Standard {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> CancelKeyData {

View File

@@ -240,7 +240,6 @@ where
mod tests {
use super::*;
use std::sync::Arc;
use std::thread::sleep;
use std::time::Duration;
impl MonotonicCounter<i32> for i32 {
@@ -258,17 +257,19 @@ mod tests {
let seq = Arc::new(SeqWait::new(0));
let seq2 = Arc::clone(&seq);
let seq3 = Arc::clone(&seq);
tokio::task::spawn(async move {
let jh1 = tokio::task::spawn(async move {
seq2.wait_for(42).await.expect("wait_for 42");
let old = seq2.advance(100);
assert_eq!(old, 99);
seq2.wait_for(999).await.expect_err("no 999");
seq2.wait_for_timeout(999, Duration::from_millis(100))
.await
.expect_err("no 999");
});
tokio::task::spawn(async move {
let jh2 = tokio::task::spawn(async move {
seq3.wait_for(42).await.expect("wait_for 42");
seq3.wait_for(0).await.expect("wait_for 0");
});
sleep(Duration::from_secs(1));
tokio::time::sleep(Duration::from_millis(200)).await;
let old = seq.advance(99);
assert_eq!(old, 0);
seq.wait_for(100).await.expect("wait_for 100");
@@ -277,6 +278,9 @@ mod tests {
assert_eq!(seq.advance(98), 100);
assert_eq!(seq.load(), 100);
jh1.await.unwrap();
jh2.await.unwrap();
seq.shutdown();
}
@@ -284,15 +288,18 @@ mod tests {
async fn seqwait_timeout() {
let seq = Arc::new(SeqWait::new(0));
let seq2 = Arc::clone(&seq);
tokio::task::spawn(async move {
let jh = tokio::task::spawn(async move {
let timeout = Duration::from_millis(1);
let res = seq2.wait_for_timeout(42, timeout).await;
assert_eq!(res, Err(SeqWaitError::Timeout));
});
tokio::time::sleep(Duration::from_secs(1)).await;
tokio::time::sleep(Duration::from_millis(200)).await;
// This will attempt to wake, but nothing will happen
// because the waiter already dropped its Receiver.
let old = seq.advance(99);
assert_eq!(old, 0)
assert_eq!(old, 0);
jh.await.unwrap();
seq.shutdown();
}
}

View File

@@ -23,7 +23,7 @@ futures = "0.3.13"
hex = "0.4.3"
hyper = "0.14"
itertools = "0.10.3"
clap = "3.0"
clap = { version = "4.0", features = ["string"] }
daemonize = "0.4.1"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
@@ -38,26 +38,27 @@ tar = "0.4.33"
humantime = "2.1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "1.12.0"
serde_with = "2.0"
humantime-serde = "1.1.1"
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
toml_edit = { version = "0.13", features = ["easy"] }
toml_edit = { version = "0.14", features = ["easy"] }
scopeguard = "1.1.0"
const_format = "0.2.21"
tracing = "0.1.36"
signal-hook = "0.3.10"
url = "2"
nix = "0.23"
nix = "0.25"
once_cell = "1.13.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
git-version = "0.3.5"
rstar = "0.9.3"
num-traits = "0.2.15"
amplify_num = "0.4.1"
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
pageserver_api = { path = "../libs/pageserver_api" }
postgres_ffi = { path = "../libs/postgres_ffi" }
etcd_broker = { path = "../libs/etcd_broker" }
metrics = { path = "../libs/metrics" }
@@ -68,5 +69,10 @@ close_fds = "0.3.2"
walkdir = "2.3.2"
[dev-dependencies]
criterion = "0.4"
hex-literal = "0.3"
tempfile = "3.2"
[[bench]]
name = "bench_layer_map"
harness = false

File diff suppressed because it is too large

View File

@@ -1,35 +0,0 @@
//! Main entry point for the dump_layerfile executable
//!
//! A handy tool for debugging, that's all.
use anyhow::Result;
use clap::{App, Arg};
use pageserver::page_cache;
use pageserver::tenant::dump_layerfile_from_path;
use pageserver::virtual_file;
use std::path::PathBuf;
use utils::project_git_version;
project_git_version!(GIT_VERSION);
fn main() -> Result<()> {
let arg_matches = App::new("Neon dump_layerfile utility")
.about("Dump contents of one layer file, for debugging")
.version(GIT_VERSION)
.arg(
Arg::new("path")
.help("Path to file to dump")
.required(true)
.index(1),
)
.get_matches();
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
// Basic initialization of things that don't change after startup
virtual_file::init(10);
page_cache::init(100);
dump_layerfile_from_path(&path, true)?;
Ok(())
}

View File

@@ -6,10 +6,12 @@ use tracing::*;
use anyhow::{anyhow, bail, Context, Result};
use clap::{App, Arg};
use clap::{Arg, ArgAction, Command};
use daemonize::Daemonize;
use fail::FailScenario;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
http, page_cache, page_service, profiling, task_mgr,
@@ -31,72 +33,35 @@ use utils::{
project_git_version!(GIT_VERSION);
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
#[cfg(feature = "fail/failpoints")]
"fail/failpoints",
#[cfg(feature = "profiling")]
"profiling",
];
fn version() -> String {
format!(
"{GIT_VERSION} profiling:{} failpoints:{}",
cfg!(feature = "profiling"),
fail::has_failpoints()
"{GIT_VERSION} failpoints: {}, features: {:?}",
fail::has_failpoints(),
FEATURES,
)
}
fn main() -> anyhow::Result<()> {
let arg_matches = App::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(&*version())
.arg(
let arg_matches = cli().get_matches();
Arg::new("daemonize")
.short('d')
.long("daemonize")
.takes_value(false)
.help("Run in the background"),
)
.arg(
Arg::new("init")
.long("init")
.takes_value(false)
.help("Initialize pageserver with all given config overrides"),
)
.arg(
Arg::new("workdir")
.short('D')
.long("workdir")
.takes_value(true)
.help("Working directory for the pageserver"),
)
// See `settings.md` for more details on the extra configuration parameters pageserver can process
.arg(
Arg::new("config-override")
.short('c')
.takes_value(true)
.number_of_values(1)
.multiple_occurrences(true)
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
)
.arg(Arg::new("update-config").long("update-config").takes_value(false).help(
"Update the config file when started",
))
.arg(
Arg::new("enabled-features")
.long("enabled-features")
.takes_value(false)
.help("Show enabled compile time features"),
)
.get_matches();
if arg_matches.is_present("enabled-features") {
let features: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
#[cfg(feature = "profiling")]
"profiling",
];
println!("{{\"features\": {features:?} }}");
if arg_matches.get_flag("enabled-features") {
println!("{{\"features\": {FEATURES:?} }}");
return Ok(());
}
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".neon"));
let workdir = arg_matches
.get_one::<String>("workdir")
.map(Path::new)
.unwrap_or_else(|| Path::new(".neon"));
let workdir = workdir
.canonicalize()
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
@@ -110,7 +75,7 @@ fn main() -> anyhow::Result<()> {
)
})?;
let daemonize = arg_matches.is_present("daemonize");
let daemonize = arg_matches.get_flag("daemonize");
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
ControlFlow::Continue(conf) => conf,
@@ -148,8 +113,8 @@ fn initialize_config(
arg_matches: clap::ArgMatches,
workdir: &Path,
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
let init = arg_matches.is_present("init");
let update_config = init || arg_matches.is_present("update-config");
let init = arg_matches.get_flag("init");
let update_config = init || arg_matches.get_flag("update-config");
let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
if init {
@@ -191,13 +156,10 @@ fn initialize_config(
)
};
if let Some(values) = arg_matches.values_of("config-override") {
if let Some(values) = arg_matches.get_many::<String>("config-override") {
for option_line in values {
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
format!(
"Option '{}' could not be parsed as a toml document",
option_line
)
format!("Option '{option_line}' could not be parsed as a toml document")
})?;
for (key, item) in doc.iter() {
@@ -239,7 +201,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
// Initialize logger
let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
info!("version: {GIT_VERSION}");
info!("version: {}", version());
// TODO: Check that it looks like a valid repository before going further
@@ -356,6 +318,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
},
);
set_build_info_metric(GIT_VERSION);
// All started up! Now just sit and wait for shutdown signal.
signals.handle(|signal| match signal {
Signal::Quit => {
@@ -378,3 +342,55 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
}
})
}
fn cli() -> Command {
Command::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(version())
.arg(
Arg::new("daemonize")
.short('d')
.long("daemonize")
.action(ArgAction::SetTrue)
.help("Run in the background"),
)
.arg(
Arg::new("init")
.long("init")
.action(ArgAction::SetTrue)
.help("Initialize pageserver with all given config overrides"),
)
.arg(
Arg::new("workdir")
.short('D')
.long("workdir")
.help("Working directory for the pageserver"),
)
// See `settings.md` for more details on the extra configuration parameters pageserver can process
.arg(
Arg::new("config-override")
.short('c')
.num_args(1)
.action(ArgAction::Append)
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
)
.arg(
Arg::new("update-config")
.long("update-config")
.action(ArgAction::SetTrue)
.help("Update the config file when started"),
)
.arg(
Arg::new("enabled-features")
.long("enabled-features")
.action(ArgAction::SetTrue)
.help("Show enabled compile time features"),
)
}
#[test]
fn verify_cli() {
cli().debug_assert();
}

View File

@@ -0,0 +1,154 @@
//! A helper tool to manage pageserver binary files.
//! Accepts a file as an argument, attempts to parse it with all ways possible
//! and prints its interpreted context.
//!
//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
use std::{
path::{Path, PathBuf},
str::FromStr,
};
use anyhow::Context;
use clap::{value_parser, Arg, Command};
use pageserver::{
page_cache,
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file,
};
use postgres_ffi::ControlFileData;
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
const METADATA_SUBCOMMAND: &str = "metadata";
fn main() -> anyhow::Result<()> {
let arg_matches = cli().get_matches();
match arg_matches.subcommand() {
Some((subcommand_name, subcommand_matches)) => {
let path = subcommand_matches
.get_one::<PathBuf>("metadata_path")
.context("'metadata_path' argument is missing")?
.to_path_buf();
anyhow::ensure!(
subcommand_name == METADATA_SUBCOMMAND,
"Unknown subcommand {subcommand_name}"
);
handle_metadata(&path, subcommand_matches)?;
}
None => {
let path = arg_matches
.get_one::<PathBuf>("path")
.context("'path' argument is missing")?
.to_path_buf();
println!(
"No subcommand specified, attempting to guess the format for file {}",
path.display()
);
if let Err(e) = read_pg_control_file(&path) {
println!(
"Failed to read input file as a pg control one: {e:#}\n\
Attempting to read it as layer file"
);
print_layerfile(&path)?;
}
}
};
Ok(())
}
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?;
println!("{control_file:?}");
let control_file_initdb = Lsn(control_file.checkPoint);
println!(
"pg_initdb_lsn: {}, aligned: {}",
control_file_initdb,
control_file_initdb.align()
);
Ok(())
}
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10);
page_cache::init(100);
dump_layerfile_from_path(path, true)
}
fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
let metadata_bytes = std::fs::read(&path)?;
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
meta = TimelineMetadata::new(
Lsn::from_str(disk_consistent_lsn)?,
meta.prev_record_lsn(),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
);
update_meta = true;
}
if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
meta = TimelineMetadata::new(
meta.disk_consistent_lsn(),
Some(Lsn::from_str(prev_record_lsn)?),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
);
update_meta = true;
}
if update_meta {
let metadata_bytes = meta.to_bytes()?;
std::fs::write(&path, &metadata_bytes)?;
}
Ok(())
}
fn cli() -> Command {
Command::new("Neon Pageserver binutils")
.about("Reads pageserver (and related) binary files management utility")
.version(GIT_VERSION)
.arg(
Arg::new("path")
.help("Input file path")
.value_parser(value_parser!(PathBuf))
.required(false),
)
.subcommand(
Command::new(METADATA_SUBCOMMAND)
.about("Read and update pageserver metadata file")
.arg(
Arg::new("metadata_path")
.help("Input metadata file path")
.value_parser(value_parser!(PathBuf))
.required(false),
)
.arg(
Arg::new("disk_consistent_lsn")
.long("disk_consistent_lsn")
.help("Replace disk consistent Lsn"),
)
.arg(
Arg::new("prev_record_lsn")
.long("prev_record_lsn")
.help("Replace previous record Lsn"),
),
)
}
#[test]
fn verify_cli() {
cli().debug_assert();
}

View File

@@ -1,75 +0,0 @@
//! Main entry point for the edit_metadata executable
//!
//! A handy tool for debugging, that's all.
use anyhow::Result;
use clap::{App, Arg};
use pageserver::tenant::metadata::TimelineMetadata;
use std::path::PathBuf;
use std::str::FromStr;
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
fn main() -> Result<()> {
let arg_matches = App::new("Neon update metadata utility")
.about("Dump or update metadata file")
.version(GIT_VERSION)
.arg(
Arg::new("path")
.help("Path to metadata file")
.required(true),
)
.arg(
Arg::new("disk_lsn")
.short('d')
.long("disk_lsn")
.takes_value(true)
.help("Replace disk constistent lsn"),
)
.arg(
Arg::new("prev_lsn")
.short('p')
.long("prev_lsn")
.takes_value(true)
.help("Previous record LSN"),
)
.get_matches();
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
let metadata_bytes = std::fs::read(&path)?;
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{:?}", &meta);
let mut update_meta = false;
if let Some(disk_lsn) = arg_matches.value_of("disk_lsn") {
meta = TimelineMetadata::new(
Lsn::from_str(disk_lsn)?,
meta.prev_record_lsn(),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
);
update_meta = true;
}
if let Some(prev_lsn) = arg_matches.value_of("prev_lsn") {
meta = TimelineMetadata::new(
meta.disk_consistent_lsn(),
Some(Lsn::from_str(prev_lsn)?),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
);
update_meta = true;
}
if update_meta {
let metadata_bytes = meta.to_bytes()?;
std::fs::write(&path, &metadata_bytes)?;
}
Ok(())
}

View File

@@ -30,10 +30,10 @@ pub mod defaults {
use crate::tenant_config::defaults::*;
use const_format::formatcp;
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
pub use pageserver_api::{
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_PG_LISTEN_PORT,
};
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

View File

@@ -1,3 +1,4 @@
pub mod models;
pub mod routes;
pub use routes::make_router;
pub use pageserver_api::models;

View File

@@ -1,7 +1,11 @@
openapi: "3.0.2"
info:
title: Page Server API
description: Neon Pageserver API
version: "1.0"
license:
name: "Apache"
url: https://github.com/neondatabase/neon/blob/main/LICENSE
servers:
- url: ""
paths:
@@ -207,6 +211,61 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
get:
description: Get LSN by a timestamp
parameters:
- name: timestamp
in: query
required: true
schema:
type: string
format: date-time
description: A timestamp to get the LSN
responses:
"200":
description: OK
content:
application/json:
schema:
type: string
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/attach:
parameters:
- name: tenant_id
@@ -556,6 +615,9 @@ components:
required:
- timeline_id
- tenant_id
- last_record_lsn
- disk_consistent_lsn
- awaits_download
properties:
timeline_id:
type: string
@@ -563,33 +625,15 @@ components:
tenant_id:
type: string
format: hex
local:
$ref: "#/components/schemas/LocalTimelineInfo"
remote:
$ref: "#/components/schemas/RemoteTimelineInfo"
RemoteTimelineInfo:
type: object
required:
- awaits_download
- remote_consistent_lsn
properties:
awaits_download:
type: boolean
remote_consistent_lsn:
type: string
format: hex
LocalTimelineInfo:
type: object
required:
- last_record_lsn
- disk_consistent_lsn
properties:
last_record_lsn:
type: string
format: hex
disk_consistent_lsn:
type: string
format: hex
remote_consistent_lsn:
type: string
format: hex
ancestor_timeline_id:
type: string
format: hex
@@ -614,7 +658,39 @@ components:
format: hex
last_received_msg_ts:
type: integer
awaits_download:
type: boolean
# These 'local' and 'remote' fields just duplicate some of the fields
# above. They are kept for backwards-compatibility. They can be removed,
# when the control plane has been updated to look at the above fields
# directly.
local:
$ref: "#/components/schemas/LocalTimelineInfo"
remote:
$ref: "#/components/schemas/RemoteTimelineInfo"
LocalTimelineInfo:
type: object
properties:
ancestor_timeline_id:
type: string
format: hex
ancestor_lsn:
type: string
format: hex
current_logical_size:
type: integer
current_physical_size:
type: integer
RemoteTimelineInfo:
type: object
required:
- remote_consistent_lsn
properties:
remote_consistent_lsn:
type: string
format: hex
Error:
type: object
required:

View File

@@ -12,6 +12,7 @@ use super::models::{
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest,
};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::storage_sync;
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
use crate::tenant::{TenantState, Timeline};
@@ -78,13 +79,13 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
get_state(request).conf
}
// Helper functions to construct a LocalTimelineInfo struct for a timeline
fn local_timeline_info_from_timeline(
// Helper function to construct a TimelineInfo struct for a timeline
async fn build_timeline_info(
state: &State,
timeline: &Arc<Timeline>,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> anyhow::Result<LocalTimelineInfo> {
) -> anyhow::Result<TimelineInfo> {
let last_record_lsn = timeline.get_last_record_lsn();
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
let guard = timeline.last_received_wal.lock().unwrap();
@@ -99,24 +100,47 @@ fn local_timeline_info_from_timeline(
}
};
let info = LocalTimelineInfo {
ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
ancestor_lsn: {
match timeline.get_ancestor_lsn() {
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
}
},
let (remote_consistent_lsn, awaits_download) = if let Some(remote_entry) = state
.remote_index
.read()
.await
.timeline_entry(&TenantTimelineId {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
}) {
(
Some(remote_entry.metadata.disk_consistent_lsn()),
remote_entry.awaits_download,
)
} else {
(None, false)
};
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
let ancestor_lsn = match timeline.get_ancestor_lsn() {
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
};
let current_logical_size = match timeline.get_current_logical_size() {
Ok(size) => Some(size),
Err(err) => {
error!("Timeline info creation failed to get current logical size: {err:?}");
None
}
};
let current_physical_size = Some(timeline.get_physical_size());
let info = TimelineInfo {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
current_logical_size: Some(
timeline
.get_current_logical_size()
.context("Timeline info creation failed to get current logical size")?,
),
current_physical_size: Some(timeline.get_physical_size()),
current_logical_size,
current_physical_size,
current_logical_size_non_incremental: if include_non_incremental_logical_size {
Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?)
} else {
@@ -131,32 +155,25 @@ fn local_timeline_info_from_timeline(
last_received_msg_lsn,
last_received_msg_ts,
pg_version: timeline.pg_version,
remote_consistent_lsn,
awaits_download,
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
// with the control plane.
local: LocalTimelineInfo {
ancestor_timeline_id,
ancestor_lsn,
current_logical_size,
current_physical_size,
},
remote: RemoteTimelineInfo {
remote_consistent_lsn,
},
};
Ok(info)
}
fn list_local_timelines(
tenant_id: TenantId,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> Result<Vec<(TimelineId, LocalTimelineInfo)>> {
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
let timelines = tenant.list_timelines();
let mut local_timeline_info = Vec::with_capacity(timelines.len());
for (timeline_id, repository_timeline) in timelines {
local_timeline_info.push((
timeline_id,
local_timeline_info_from_timeline(
&repository_timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)?,
))
}
Ok(local_timeline_info)
}
// healthcheck handler
async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let config = get_config(&request);
@@ -168,6 +185,8 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let new_timeline_info = async {
match tenant.create_timeline(
@@ -178,14 +197,10 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
).await {
Ok(Some(new_timeline)) => {
// Created. Construct a TimelineInfo for it.
let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)
let timeline_info = build_timeline_info(state, &new_timeline, false, false)
.await
.map_err(ApiError::InternalServerError)?;
Ok(Some(TimelineInfo {
tenant_id,
timeline_id: new_timeline.timeline_id,
local: Some(local_info),
remote: None,
}))
Ok(Some(timeline_info))
}
Ok(None) => Ok(None), // timeline already exists
Err(err) => Err(ApiError::InternalServerError(err)),
@@ -208,6 +223,8 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
query_param_present(&request, "include-non-incremental-physical-size");
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
let timelines = tokio::task::spawn_blocking(move || {
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
@@ -217,36 +234,18 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
let mut response_data = Vec::with_capacity(timelines.len());
for (timeline_id, timeline) in timelines {
let local = match local_timeline_info_from_timeline(
for timeline in timelines {
let timeline_info = build_timeline_info(
state,
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
) {
Ok(local) => Some(local),
Err(e) => {
error!("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}");
None
}
};
)
.await
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
.map_err(ApiError::InternalServerError)?;
response_data.push(TimelineInfo {
tenant_id,
timeline_id,
local,
remote: get_state(&request)
.remote_index
.read()
.await
.timeline_entry(&TenantTimelineId {
tenant_id,
timeline_id,
})
.map(|remote_entry| RemoteTimelineInfo {
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
awaits_download: remote_entry.awaits_download,
}),
})
response_data.push(timeline_info);
}
json_response(StatusCode::OK, response_data)
@@ -265,6 +264,23 @@ fn query_param_present(request: &Request<Body>, param: &str) -> bool {
.unwrap_or(false)
}
fn get_query_param(request: &Request<Body>, param_name: &str) -> Result<String, ApiError> {
request.uri().query().map_or(
Err(ApiError::BadRequest(anyhow!("empty query in request"))),
|v| {
url::form_urlencoded::parse(v.as_bytes())
.into_owned()
.find(|(k, _)| k == param_name)
.map_or(
Err(ApiError::BadRequest(anyhow!(
"no {param_name} specified in query parameters"
))),
|(_, v)| Ok(v),
)
},
)
}
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -274,59 +290,60 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
query_param_present(&request, "include-non-incremental-physical-size");
check_permission(&request, Some(tenant_id))?;
let (local_timeline_info, remote_timeline_info) = async {
let state = get_state(&request);
let timeline_info = async {
let timeline = tokio::task::spawn_blocking(move || {
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
let local_timeline_info = match timeline.and_then(|timeline| {
local_timeline_info_from_timeline(
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
}) {
Ok(local_info) => Some(local_info),
Err(e) => {
error!("Failed to get local timeline info: {e:#}");
None
}
};
let timeline = timeline.map_err(ApiError::NotFound)?;
let remote_timeline_info = {
let remote_index_read = get_state(&request).remote_index.read().await;
remote_index_read
.timeline_entry(&TenantTimelineId {
tenant_id,
timeline_id,
})
.map(|remote_entry| RemoteTimelineInfo {
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
awaits_download: remote_entry.awaits_download,
})
};
Ok::<_, ApiError>((local_timeline_info, remote_timeline_info))
let timeline_info = build_timeline_info(
state,
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
.await
.context("Failed to get local timeline info: {e:#}")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(timeline_info)
}
.instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
.await?;
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
Err(ApiError::NotFound(anyhow!(
"Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely"
)))
} else {
json_response(
StatusCode::OK,
TimelineInfo {
tenant_id,
timeline_id,
local: local_timeline_info,
remote: remote_timeline_info,
},
)
}
json_response(StatusCode::OK, timeline_info)
}
async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let timestamp_raw = get_query_param(&request, "timestamp")?;
let timestamp = humantime::parse_rfc3339(timestamp_raw.as_str())
.with_context(|| format!("Invalid time: {:?}", timestamp_raw))
.map_err(ApiError::BadRequest)?;
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let timeline = tenant_mgr::get_tenant(tenant_id, true)
.and_then(|tenant| tenant.get_timeline(timeline_id))
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
let result = match timeline
.find_lsn_for_timestamp(timestamp_pg)
.map_err(ApiError::InternalServerError)?
{
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
};
json_response(StatusCode::OK, result)
}
// TODO makes sense to provide tenant config right away the same way as it handled in tenant_create
@@ -337,9 +354,16 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
info!("Handling tenant attach {tenant_id}");
tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) {
Ok(_) => Err(ApiError::Conflict(
"Tenant is already present locally".to_owned(),
)),
Ok(tenant) => {
if tenant.list_timelines().is_empty() {
info!("Attaching to tenant {tenant_id} with zero timelines");
Ok(())
} else {
Err(ApiError::Conflict(
"Tenant is already present locally".to_owned(),
))
}
}
Err(_) => Ok(()),
})
.await
@@ -362,7 +386,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
}
return json_response(StatusCode::ACCEPTED, ());
}
// no tenant in the index, release the lock to make the potentially lengthy download opetation
// no tenant in the index, release the lock to make the potentially lengthy download operation
drop(index_accessor);
// download index parts for every tenant timeline
@@ -514,36 +538,27 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
false
});
let tenant_state = match tenant {
Ok(tenant) => tenant.current_state(),
let (tenant_state, current_physical_size) = match tenant {
Ok(tenant) => {
let timelines = tenant.list_timelines();
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
for timeline in timelines {
current_physical_size += timeline.get_physical_size();
}
(tenant.current_state(), Some(current_physical_size))
}
Err(e) => {
error!("Failed to get local tenant state: {e:#}");
if has_in_progress_downloads {
TenantState::Paused
(TenantState::Paused, None)
} else {
TenantState::Broken
(TenantState::Broken, None)
}
}
};
let current_physical_size =
match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false))
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
{
Err(err) => {
// Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded).
// In that case, put a warning message into log and operate normally.
warn!("Failed to get local timelines for tenant {tenant_id}: {err}");
None
}
Ok(local_timeline_infos) => Some(
local_timeline_infos
.into_iter()
.fold(0, |acc, x| acc + x.1.current_physical_size.unwrap()),
),
};
json_response(
StatusCode::OK,
TenantInfo {
@@ -732,7 +747,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
json_response(StatusCode::OK, ())
}
#[cfg(any(feature = "testing", feature = "failpoints"))]
#[cfg(feature = "testing")]
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow!(
@@ -901,6 +916,10 @@ pub fn make_router(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_detail_handler,
)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
get_lsn_by_timestamp_handler,
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
testing_api!("run timeline GC", timeline_gc_handler),

View File

@@ -119,32 +119,6 @@ impl<T> TenantTimelineValues<T> {
fn new() -> Self {
Self(HashMap::new())
}
fn with_capacity(capacity: usize) -> Self {
Self(HashMap::with_capacity(capacity))
}
/// A convenience method to map certain values and omit some of them, if needed.
/// Tenants that won't have any timeline entries due to the filtering, will still be preserved
/// in the structure.
fn filter_map<F, NewT>(self, map: F) -> TenantTimelineValues<NewT>
where
F: Fn(T) -> Option<NewT>,
{
let capacity = self.0.len();
self.0.into_iter().fold(
TenantTimelineValues::<NewT>::with_capacity(capacity),
|mut new_values, (tenant_id, old_values)| {
let new_timeline_values = new_values.0.entry(tenant_id).or_default();
for (timeline_id, old_value) in old_values {
if let Some(new_value) = map(old_value) {
new_timeline_values.insert(timeline_id, new_value);
}
}
new_values
},
)
}
}
/// A suffix to be used during file sync from the remote storage,
@@ -181,35 +155,3 @@ mod backoff_defaults_tests {
);
}
}
#[cfg(test)]
mod tests {
use crate::tenant::harness::TIMELINE_ID;
use super::*;
#[test]
fn tenant_timeline_value_mapping() {
let first_tenant = TenantId::generate();
let second_tenant = TenantId::generate();
assert_ne!(first_tenant, second_tenant);
let mut initial = TenantTimelineValues::new();
initial
.0
.entry(first_tenant)
.or_default()
.insert(TIMELINE_ID, "test_value");
let _ = initial.0.entry(second_tenant).or_default();
assert_eq!(initial.0.len(), 2, "Should have entries for both tenants");
let filtered = initial.filter_map(|_| None::<&str>).0;
assert_eq!(
filtered.len(),
2,
"Should have entries for both tenants even after filtering away all entries"
);
assert!(filtered.contains_key(&first_tenant));
assert!(filtered.contains_key(&second_tenant));
}
}

View File

@@ -1,8 +1,9 @@
use metrics::core::{AtomicU64, GenericCounter};
use metrics::{
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec,
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge,
UIntGaugeVec,
};
use once_cell::sync::Lazy;
use utils::id::{TenantId, TimelineId};
@@ -204,12 +205,34 @@ pub static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
.expect("failed to register pageserver remote storage remaining sync items int gauge")
});
pub static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
pub static IMAGE_SYNC_TIME: Lazy<GaugeVec> = Lazy::new(|| {
register_gauge_vec!(
"pageserver_remote_storage_image_sync_duration",
"Time spent to synchronize (up/download) a whole pageserver image",
&["tenant_id", "timeline_id"],
)
.expect("failed to register per-timeline pageserver image sync time vec")
});
pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"];
pub static IMAGE_SYNC_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_storage_image_sync_count",
"Number of synchronization operations executed for pageserver images. \
Grouped by tenant, timeline, operation_kind and status",
&["tenant_id", "timeline_id", "operation_kind", "status"]
)
.expect("failed to register pageserver image sync count vec")
});
pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_storage_image_sync_seconds",
"Time taken to synchronize (download or upload) a whole pageserver image. \
Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
&["tenant_id", "timeline_id", "operation_kind", "status"],
Grouped by operation_kind and status",
&["operation_kind", "status"],
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
)
.expect("failed to register pageserver image sync time histogram vec")
@@ -256,7 +279,7 @@ macro_rules! redo_histogram_time_buckets {
() => {
vec![
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
]
};
}
@@ -411,6 +434,14 @@ impl Drop for TimelineMetrics {
for op in SMGR_QUERY_TIME_OPERATIONS {
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in IMAGE_SYNC_OPERATION_KINDS {
for status in IMAGE_SYNC_STATUS {
let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]);
}
}
let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]);
}
}
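// A minimal sketch of how the new sync metrics fit together (label values here are
// placeholders; the real call site is `register_sync_status` in storage_sync.rs): the
// histogram only carries operation_kind/status labels to keep cardinality down, while the
// per-timeline gauge accumulates wall-clock seconds and the counter tracks outcomes.
fn example_record_sync(tenant_id: &str, timeline_id: &str, secs_elapsed: f64) {
    IMAGE_SYNC_TIME_HISTOGRAM
        .with_label_values(&["upload", "success"])
        .observe(secs_elapsed);
    IMAGE_SYNC_TIME
        .with_label_values(&[tenant_id, timeline_id])
        .add(secs_elapsed);
    IMAGE_SYNC_COUNT
        .with_label_values(&[tenant_id, timeline_id, "upload", "success"])
        .inc();
}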

View File

@@ -12,7 +12,6 @@
use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use futures::{Stream, StreamExt};
use regex::Regex;
use std::io;
use std::net::TcpListener;
use std::str;
@@ -35,7 +34,6 @@ use crate::basebackup;
use crate::config::{PageServerConf, ProfilingConfig};
use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::profiling::profpoint_start;
use crate::reltag::RelTag;
use crate::task_mgr;
@@ -45,7 +43,6 @@ use crate::tenant_mgr;
use crate::CheckpointConfig;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::to_pg_timestamp;
use postgres_ffi::BLCKSZ;
// Wrapped in libpq CopyData
@@ -1062,33 +1059,6 @@ impl postgres_backend_async::Handler for PageServerHandler {
Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("get_lsn_by_timestamp ") {
// Locate LSN of last transaction with timestamp less or equal than sppecified
// TODO lazy static
let re = Regex::new(r"^get_lsn_by_timestamp ([[:xdigit:]]+) ([[:xdigit:]]+) '(.*)'$")
.unwrap();
let caps = re
.captures(query_string)
.with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?;
let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?;
let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?;
let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?;
let timestamp_pg = to_pg_timestamp(timestamp);
self.check_permission(Some(tenant_id))?;
let timeline = get_local_timeline(tenant_id, timeline_id)?;
pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col(
b"lsn",
)]))?;
let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? {
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
};
pgb.write_message(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
bail!("unknown command");
}

View File

@@ -169,15 +169,21 @@ use self::{
upload::{upload_index_part, upload_timeline_layers, UploadedTimeline},
};
use crate::{
config::PageServerConf, exponential_backoff, storage_sync::index::RemoteIndex, task_mgr,
task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata,
tenant_mgr::attach_local_tenants,
config::PageServerConf,
exponential_backoff,
storage_sync::index::{LayerFileMetadata, RemoteIndex},
task_mgr,
task_mgr::TaskKind,
task_mgr::BACKGROUND_RUNTIME,
tenant::metadata::TimelineMetadata,
tenant_mgr::{attach_local_tenants, TenantAttachData},
};
use crate::{
metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD},
TenantTimelineValues,
};
use crate::metrics::{IMAGE_SYNC_COUNT, IMAGE_SYNC_TIME_HISTOGRAM};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use self::download::download_index_parts;
@@ -187,7 +193,7 @@ static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
/// A timeline status to share with pageserver's sync counterpart,
/// after comparing local and remote timeline state.
#[derive(Clone)]
#[derive(Clone, PartialEq, Eq)]
pub enum LocalTimelineInitStatus {
/// The timeline has every remote layer present locally.
/// There could be some layers requiring uploading,
@@ -310,7 +316,7 @@ impl SyncQueue {
/// A task to run in the async download/upload loop.
/// Limited by the number of retries; after a certain threshold the failing task gets evicted and the timeline disabled.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq)]
enum SyncTask {
/// A checkpoint outcome with possible local file updates that need actualization in the remote storage.
/// Not necessarily fresher than the one already uploaded.
@@ -421,7 +427,7 @@ impl SyncTaskBatch {
.extend(new_delete.data.deleted_layers.iter().cloned());
}
if let Some(batch_upload) = &mut self.upload {
let not_deleted = |layer: &PathBuf| {
let not_deleted = |layer: &PathBuf, _: &mut LayerFileMetadata| {
!new_delete.data.layers_to_delete.contains(layer)
&& !new_delete.data.deleted_layers.contains(layer)
};
@@ -449,21 +455,35 @@ impl SyncTaskBatch {
#[derive(Debug, Clone, PartialEq, Eq)]
struct LayersUpload {
/// Layer file paths in the pageserver workdir that were added for the corresponding checkpoint.
layers_to_upload: HashSet<PathBuf>,
layers_to_upload: HashMap<PathBuf, LayerFileMetadata>,
/// Already uploaded layers. Used to store the data about the uploads between task retries
/// and to record the data into the remote index after the task got completed or evicted.
uploaded_layers: HashSet<PathBuf>,
uploaded_layers: HashMap<PathBuf, LayerFileMetadata>,
metadata: Option<TimelineMetadata>,
}
/// A timeline download task.
/// Does not contain the file list to download, to allow other
/// parts of the pageserver code to schedule the task
/// without using the remote index or any other ways to list the remote timleine files.
/// without using the remote index or any other ways to list the remote timeline files.
/// Skips the files that are already downloaded.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LayersDownload {
layers_to_skip: HashSet<PathBuf>,
/// Paths which have been downloaded, and had their metadata verified or generated.
///
/// Metadata generation happens when upgrading from a past version of `IndexPart`.
gathered_metadata: HashMap<PathBuf, LayerFileMetadata>,
}
impl LayersDownload {
fn from_skipped_layers(layers_to_skip: HashSet<PathBuf>) -> Self {
LayersDownload {
layers_to_skip,
gathered_metadata: HashMap::default(),
}
}
}
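// A minimal sketch (assuming it lives in this module, so the private fields are in scope;
// the path and size are made up) of how metadata gathered during a download is meant to
// flow back into the remote index, mirroring `download_timeline_data` below.
fn example_merge_gathered_metadata(remote_timeline: &mut RemoteTimeline) {
    let mut download = LayersDownload::from_skipped_layers(HashSet::new());
    // Pretend one layer file was just downloaded and its size recorded:
    download.gathered_metadata.insert(
        PathBuf::from("example_layer_file"),
        LayerFileMetadata::new(8192),
    );
    // Entries restored from a pre-metadata `index_part.json` have no file size yet;
    // merging fills those holes with the sizes observed during the download.
    remote_timeline.merge_metadata_from_downloaded(&download.gathered_metadata);
}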
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -485,7 +505,7 @@ struct LayersDeletion {
pub fn schedule_layer_upload(
tenant_id: TenantId,
timeline_id: TimelineId,
layers_to_upload: HashSet<PathBuf>,
layers_to_upload: HashMap<PathBuf, LayerFileMetadata>,
metadata: Option<TimelineMetadata>,
) {
let sync_queue = match SYNC_QUEUE.get() {
@@ -502,7 +522,7 @@ pub fn schedule_layer_upload(
},
SyncTask::upload(LayersUpload {
layers_to_upload,
uploaded_layers: HashSet::new(),
uploaded_layers: HashMap::new(),
metadata,
}),
);
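// A caller-side sketch for the new signature (the path is illustrative, not part of this
// change): each freshly written layer file is paired with its on-disk size before being
// queued for upload.
fn example_schedule_upload(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    layer_path: PathBuf,
) -> anyhow::Result<()> {
    let file_size = std::fs::metadata(&layer_path)?.len();
    let mut layers_to_upload = HashMap::new();
    layers_to_upload.insert(layer_path, LayerFileMetadata::new(file_size));
    schedule_layer_upload(tenant_id, timeline_id, layers_to_upload, None);
    Ok(())
}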
@@ -560,18 +580,44 @@ pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) {
tenant_id,
timeline_id,
},
SyncTask::download(LayersDownload {
layers_to_skip: HashSet::new(),
}),
SyncTask::download(LayersDownload::from_skipped_layers(HashSet::new())),
);
debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent")
}
/// Local existing timeline files
///
/// Values of this type serve different purposes in different contexts. On startup, collected
/// timelines carry the full collected file information; when signalling readiness to attach
/// after a completed download, the file information is no longer carried, because
/// it has already been merged into [`RemoteTimeline`].
#[derive(Debug)]
pub struct TimelineLocalFiles(TimelineMetadata, HashMap<PathBuf, LayerFileMetadata>);
impl TimelineLocalFiles {
pub fn metadata(&self) -> &TimelineMetadata {
&self.0
}
/// Called during startup, for all of the local files with full metadata.
pub(crate) fn collected(
metadata: TimelineMetadata,
timeline_files: HashMap<PathBuf, LayerFileMetadata>,
) -> TimelineLocalFiles {
TimelineLocalFiles(metadata, timeline_files)
}
/// Called near the end of tenant initialization, to signal readiness to attach tenants.
pub(crate) fn ready(metadata: TimelineMetadata) -> Self {
TimelineLocalFiles(metadata, HashMap::new())
}
}
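// A minimal sketch of the two construction paths described above (argument names are
// placeholders): `collected` is used when scanning the local timeline directory on
// startup, `ready` once the downloaded file data has been merged into the remote index.
fn example_timeline_local_files(
    metadata: TimelineMetadata,
    files: HashMap<PathBuf, LayerFileMetadata>,
) -> (TimelineLocalFiles, TimelineLocalFiles) {
    let at_startup = TimelineLocalFiles::collected(metadata.clone(), files);
    let after_download = TimelineLocalFiles::ready(metadata);
    (at_startup, after_download)
}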
/// Launch a thread to perform remote storage sync tasks.
/// See module docs for loop step description.
pub fn spawn_storage_sync_task(
conf: &'static PageServerConf,
local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet<PathBuf>)>,
local_timeline_files: HashMap<TenantId, HashMap<TimelineId, TimelineLocalFiles>>,
storage: GenericRemoteStorage,
max_concurrent_timelines_sync: NonZeroUsize,
max_sync_errors: NonZeroU32,
@@ -594,7 +640,7 @@ pub fn spawn_storage_sync_task(
let mut keys_for_index_part_downloads = HashSet::new();
let mut timelines_to_sync = HashMap::new();
for (tenant_id, timeline_data) in local_timeline_files.0 {
for (tenant_id, timeline_data) in local_timeline_files {
if timeline_data.is_empty() {
info!("got empty tenant {}", tenant_id);
let _ = empty_tenants.0.entry(tenant_id).or_default();
@@ -638,6 +684,7 @@ pub fn spawn_storage_sync_task(
(storage, remote_index_clone, sync_queue),
max_sync_errors,
)
.instrument(info_span!("storage_sync_loop"))
.await;
Ok(())
},
@@ -696,7 +743,7 @@ async fn storage_sync_loop(
"Sync loop step completed, {} new tenant state update(s)",
updated_tenants.len()
);
let mut timelines_to_attach = TenantTimelineValues::new();
let mut timelines_to_attach = HashMap::new();
let index_accessor = index.read().await;
for tenant_id in updated_tenants {
let tenant_entry = match index_accessor.tenant_entry(&tenant_id) {
@@ -722,12 +769,16 @@ async fn storage_sync_loop(
// and register them all at once in a tenant for download
// to be submitted in a single operation to tenant
// so it can apply them at once to internal timeline map.
timelines_to_attach.0.insert(
timelines_to_attach.insert(
tenant_id,
tenant_entry
.iter()
.map(|(&id, entry)| (id, entry.metadata.clone()))
.collect(),
TenantAttachData::Ready(
tenant_entry
.iter()
.map(|(&id, entry)| {
(id, TimelineLocalFiles::ready(entry.metadata.clone()))
})
.collect(),
),
);
}
}
@@ -835,7 +886,6 @@ async fn process_sync_task_batch(
sync_id,
upload_data,
sync_start,
"upload",
)
.await
}
@@ -879,7 +929,6 @@ async fn process_sync_task_batch(
sync_id,
download_data,
sync_start,
"download",
)
.await;
}
@@ -911,7 +960,6 @@ async fn process_sync_task_batch(
sync_id,
delete_data,
sync_start,
"delete",
)
.instrument(info_span!("delete_timeline_data"))
.await;
@@ -948,8 +996,9 @@ async fn download_timeline_data(
sync_id: TenantTimelineId,
new_download_data: SyncData<LayersDownload>,
sync_start: Instant,
task_name: &str,
) -> DownloadStatus {
static TASK_NAME: &str = "download";
match download_timeline_layers(
conf,
storage,
@@ -961,30 +1010,42 @@ async fn download_timeline_data(
.await
{
DownloadedTimeline::Abort => {
register_sync_status(sync_id, sync_start, task_name, None);
register_sync_status(sync_id, sync_start, TASK_NAME, None);
if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) {
error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}");
}
}
DownloadedTimeline::FailedAndRescheduled => {
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
}
DownloadedTimeline::Successful(mut download_data) => {
match update_local_metadata(conf, sync_id, current_remote_timeline).await {
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
Ok(()) => {
register_sync_status(sync_id, sync_start, task_name, Some(true));
return DownloadStatus::Downloaded;
}
Err(e) => {
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
}
},
Ok(()) => {
let mut g = index.write().await;
match g.set_awaits_download(&sync_id, false) {
Ok(()) => {
let timeline = g
.timeline_entry_mut(&sync_id)
.expect("set_awaits_download verified existence");
timeline.merge_metadata_from_downloaded(
&download_data.data.gathered_metadata,
);
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
return DownloadStatus::Downloaded;
}
Err(e) => {
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
}
};
}
Err(e) => {
error!("Failed to update local timeline metadata: {e:?}");
download_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Download(download_data));
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
}
}
}
@@ -1060,8 +1121,9 @@ async fn delete_timeline_data(
sync_id: TenantTimelineId,
mut new_delete_data: SyncData<LayersDeletion>,
sync_start: Instant,
task_name: &str,
) {
static TASK_NAME: &str = "delete";
let timeline_delete = &mut new_delete_data.data;
if !timeline_delete.deletion_registered {
@@ -1077,14 +1139,14 @@ async fn delete_timeline_data(
error!("Failed to update remote timeline {sync_id}: {e:?}");
new_delete_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Delete(new_delete_data));
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
return;
}
}
timeline_delete.deletion_registered = true;
let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await;
register_sync_status(sync_id, sync_start, task_name, Some(sync_status));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(sync_status));
}
async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMetadata> {
@@ -1103,8 +1165,8 @@ async fn upload_timeline_data(
sync_id: TenantTimelineId,
new_upload_data: SyncData<LayersUpload>,
sync_start: Instant,
task_name: &str,
) -> UploadStatus {
static TASK_NAME: &str = "upload";
let mut uploaded_data = match upload_timeline_layers(
storage,
sync_queue,
@@ -1115,7 +1177,7 @@ async fn upload_timeline_data(
.await
{
UploadedTimeline::FailedAndRescheduled(e) => {
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
return UploadStatus::Failed(e);
}
UploadedTimeline::Successful(upload_data) => upload_data,
@@ -1134,14 +1196,14 @@ async fn upload_timeline_data(
.await
{
Ok(()) => {
register_sync_status(sync_id, sync_start, task_name, Some(true));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
UploadStatus::Uploaded
}
Err(e) => {
error!("Failed to update remote timeline {sync_id}: {e:?}");
uploaded_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
register_sync_status(sync_id, sync_start, task_name, Some(false));
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
UploadStatus::Failed(e)
}
}
@@ -1181,11 +1243,18 @@ async fn update_remote_data(
}
if upload_failed {
existing_entry.add_upload_failures(
uploaded_data.layers_to_upload.iter().cloned(),
uploaded_data
.layers_to_upload
.iter()
.map(|(k, v)| (k.to_owned(), v.to_owned())),
);
} else {
existing_entry
.add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned());
existing_entry.add_timeline_layers(
uploaded_data
.uploaded_layers
.iter()
.map(|(k, v)| (k.to_owned(), v.to_owned())),
);
}
}
RemoteDataUpdate::Delete(layers_to_remove) => {
@@ -1205,11 +1274,19 @@ async fn update_remote_data(
};
let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone());
if upload_failed {
new_remote_timeline
.add_upload_failures(uploaded_data.layers_to_upload.iter().cloned());
new_remote_timeline.add_upload_failures(
uploaded_data
.layers_to_upload
.iter()
.map(|(k, v)| (k.to_owned(), v.to_owned())),
);
} else {
new_remote_timeline
.add_timeline_layers(uploaded_data.uploaded_layers.iter().cloned());
new_remote_timeline.add_timeline_layers(
uploaded_data
.uploaded_layers
.iter()
.map(|(k, v)| (k.to_owned(), v.to_owned())),
);
}
index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone());
@@ -1257,13 +1334,14 @@ async fn validate_task_retries(
fn schedule_first_sync_tasks(
index: &mut RemoteTimelineIndex,
sync_queue: &SyncQueue,
local_timeline_files: HashMap<TenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
local_timeline_files: HashMap<TenantTimelineId, TimelineLocalFiles>,
) -> TenantTimelineValues<LocalTimelineInitStatus> {
let mut local_timeline_init_statuses = TenantTimelineValues::new();
let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len());
for (sync_id, (local_metadata, local_files)) in local_timeline_files {
for (sync_id, local_timeline) in local_timeline_files {
let TimelineLocalFiles(local_metadata, local_files) = local_timeline;
match index.timeline_entry_mut(&sync_id) {
Some(remote_timeline) => {
let (timeline_status, awaits_download) = compare_local_and_remote_timeline(
@@ -1307,7 +1385,7 @@ fn schedule_first_sync_tasks(
sync_id,
SyncTask::upload(LayersUpload {
layers_to_upload: local_files,
uploaded_layers: HashSet::new(),
uploaded_layers: HashMap::new(),
metadata: Some(local_metadata.clone()),
}),
));
@@ -1334,20 +1412,46 @@ fn compare_local_and_remote_timeline(
new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>,
sync_id: TenantTimelineId,
local_metadata: TimelineMetadata,
local_files: HashSet<PathBuf>,
local_files: HashMap<PathBuf, LayerFileMetadata>,
remote_entry: &RemoteTimeline,
) -> (LocalTimelineInitStatus, bool) {
let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered();
let remote_files = remote_entry.stored_files();
let needed_to_download_files = remote_entry
.stored_files()
.iter()
.filter_map(|(layer_file, remote_metadata)| {
if let Some(local_metadata) = local_files.get(layer_file) {
match (remote_metadata.file_size(), local_metadata.file_size()) {
(Some(x), Some(y)) if x == y => { None },
(None, Some(_)) => {
// upgrading from an earlier IndexPart without metadata
None
},
_ => {
// having to deal with cases other than (Some(x), Some(y)) where x != y here is a
// bummer, but see #2582 and #2610 for attempts and discussion.
warn!("Redownloading locally existing {layer_file:?} due to size mismatch, size on index: {:?}, on disk: {:?}", remote_metadata.file_size(), local_metadata.file_size());
Some(layer_file)
},
}
} else {
// doesn't exist locally
Some(layer_file)
}
})
.collect::<HashSet<_>>();
let number_of_layers_to_download = remote_files.difference(&local_files).count();
let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 {
let (initial_timeline_status, awaits_download) = if !needed_to_download_files.is_empty() {
new_sync_tasks.push_back((
sync_id,
SyncTask::download(LayersDownload {
layers_to_skip: local_files.clone(),
}),
SyncTask::download(LayersDownload::from_skipped_layers(
local_files
.keys()
.filter(|path| !needed_to_download_files.contains(path))
.cloned()
.collect(),
)),
));
info!("NeedsSync");
(LocalTimelineInitStatus::NeedsSync, true)
@@ -1362,15 +1466,22 @@ fn compare_local_and_remote_timeline(
};
let layers_to_upload = local_files
.difference(remote_files)
.cloned()
.collect::<HashSet<_>>();
.iter()
.filter_map(|(local_file, metadata)| {
if !remote_entry.stored_files().contains_key(local_file) {
Some((local_file.to_owned(), metadata.to_owned()))
} else {
None
}
})
.collect::<HashMap<_, _>>();
if !layers_to_upload.is_empty() {
new_sync_tasks.push_back((
sync_id,
SyncTask::upload(LayersUpload {
layers_to_upload,
uploaded_layers: HashSet::new(),
uploaded_layers: HashMap::new(),
metadata: Some(local_metadata),
}),
));
@@ -1391,16 +1502,22 @@ fn register_sync_status(
let tenant_id = sync_id.tenant_id.to_string();
let timeline_id = sync_id.timeline_id.to_string();
match sync_status {
Some(true) => {
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"])
}
Some(false) => {
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"])
}
None => return,
}
.observe(secs_elapsed)
let sync_status = match sync_status {
Some(true) => "success",
Some(false) => "failure",
None => "abort",
};
IMAGE_SYNC_TIME_HISTOGRAM
.with_label_values(&[sync_name, sync_status])
.observe(secs_elapsed);
IMAGE_SYNC_TIME
.with_label_values(&[&tenant_id, &timeline_id])
.add(secs_elapsed);
IMAGE_SYNC_COUNT
.with_label_values(&[&tenant_id, &timeline_id, sync_name, sync_status])
.inc();
}
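// For reference, a sketch of what a single successful download records under the new
// scheme (label and bucket values are illustrative; Prometheus derives the histogram's
// `_bucket`/`_sum`/`_count` series automatically):
//
//   pageserver_remote_storage_image_sync_duration{tenant_id="...",timeline_id="..."} += secs_elapsed
//   pageserver_remote_storage_image_sync_seconds_bucket{operation_kind="download",status="success",le="0.25"} += 1
//   pageserver_remote_storage_image_sync_count{tenant_id="...",timeline_id="...",operation_kind="download",status="success"} += 1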
#[cfg(test)]
@@ -1420,11 +1537,12 @@ mod test_utils {
let timeline_path = harness.timeline_path(&timeline_id);
fs::create_dir_all(&timeline_path).await?;
let mut layers_to_upload = HashSet::with_capacity(filenames.len());
let mut layers_to_upload = HashMap::with_capacity(filenames.len());
for &file in filenames {
let file_path = timeline_path.join(file);
fs::write(&file_path, dummy_contents(file).into_bytes()).await?;
layers_to_upload.insert(file_path);
let metadata = LayerFileMetadata::new(file_path.metadata()?.len());
layers_to_upload.insert(file_path, metadata);
}
fs::write(
@@ -1435,7 +1553,7 @@ mod test_utils {
Ok(LayersUpload {
layers_to_upload,
uploaded_layers: HashSet::new(),
uploaded_layers: HashMap::new(),
metadata: Some(metadata),
})
}
@@ -1490,12 +1608,13 @@ mod tests {
assert!(sync_id_2 != sync_id_3);
assert!(sync_id_3 != TEST_SYNC_ID);
let download_task = SyncTask::download(LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk")]),
});
let download_task =
SyncTask::download(LayersDownload::from_skipped_layers(HashSet::from([
PathBuf::from("sk"),
])));
let upload_task = SyncTask::upload(LayersUpload {
layers_to_upload: HashSet::from([PathBuf::from("up")]),
uploaded_layers: HashSet::from([PathBuf::from("upl")]),
layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]),
uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]),
metadata: Some(dummy_metadata(Lsn(2))),
});
let delete_task = SyncTask::delete(LayersDeletion {
@@ -1539,12 +1658,10 @@ mod tests {
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
assert_eq!(sync_queue.len(), 0);
let download = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk")]),
};
let download = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk")]));
let upload = LayersUpload {
layers_to_upload: HashSet::from([PathBuf::from("up")]),
uploaded_layers: HashSet::from([PathBuf::from("upl")]),
layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]),
uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]),
metadata: Some(dummy_metadata(Lsn(2))),
};
let delete = LayersDeletion {
@@ -1592,18 +1709,10 @@ mod tests {
#[tokio::test]
async fn same_task_id_same_tasks_batch() {
let sync_queue = SyncQueue::new(NonZeroUsize::new(1).unwrap());
let download_1 = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk1")]),
};
let download_2 = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk2")]),
};
let download_3 = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk3")]),
};
let download_4 = LayersDownload {
layers_to_skip: HashSet::from([PathBuf::from("sk4")]),
};
let download_1 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk1")]));
let download_2 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk2")]));
let download_3 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk3")]));
let download_4 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk4")]));
let sync_id_2 = TenantTimelineId {
tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")),
@@ -1627,15 +1736,15 @@ mod tests {
Some(SyncTaskBatch {
download: Some(SyncData {
retries: 0,
data: LayersDownload {
layers_to_skip: {
data: LayersDownload::from_skipped_layers(
{
let mut set = HashSet::new();
set.extend(download_1.layers_to_skip.into_iter());
set.extend(download_2.layers_to_skip.into_iter());
set.extend(download_4.layers_to_skip.into_iter());
set
},
}
)
}),
upload: None,
delete: None,
@@ -1651,4 +1760,148 @@ mod tests {
"Should have one task left out of the batch"
);
}
mod local_and_remote_comparisons {
use super::*;
#[test]
fn ready() {
let mut new_sync_tasks = VecDeque::default();
let sync_id = TenantTimelineId::generate();
let local_metadata = dummy_metadata(0x02.into());
let local_files =
HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
remote_entry
.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let (status, sync_needed) = compare_local_and_remote_timeline(
&mut new_sync_tasks,
sync_id,
local_metadata.clone(),
local_files,
&remote_entry,
);
assert_eq!(
status,
LocalTimelineInitStatus::LocallyComplete(local_metadata)
);
assert!(!sync_needed);
assert!(new_sync_tasks.is_empty(), "{:?}", new_sync_tasks);
}
#[test]
fn needs_download() {
let mut new_sync_tasks = VecDeque::default();
let sync_id = TenantTimelineId::generate();
let local_metadata = dummy_metadata(0x02.into());
let local_files = HashMap::default();
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
remote_entry
.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let (status, sync_needed) = compare_local_and_remote_timeline(
&mut new_sync_tasks,
sync_id,
local_metadata,
local_files.clone(),
&remote_entry,
);
assert_eq!(status, LocalTimelineInitStatus::NeedsSync);
assert!(sync_needed);
let new_sync_tasks = new_sync_tasks.into_iter().collect::<Vec<_>>();
assert_eq!(
&new_sync_tasks,
&[(
sync_id,
SyncTask::download(LayersDownload::from_skipped_layers(
local_files.keys().cloned().collect()
))
)]
);
}
#[test]
fn redownload_is_not_needed_on_upgrade() {
// originally the implementation missed the `(None, Some(_))` case in the match, and
// proceeded to always redownload if the remote metadata was not available.
let mut new_sync_tasks = VecDeque::default();
let sync_id = TenantTimelineId::generate();
let local_metadata = dummy_metadata(0x02.into());
// the type system would in general allow LayerFileMetadata to be created with
// file_size: None; however, `LayerFileMetadata::default` is only allowed from tests,
// so everywhere in the system a valid LayerFileMetadata is created through `::new`.
let local_files =
HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
// RemoteTimeline is constructed out of an older version IndexPart, which didn't carry
// any metadata.
remote_entry
.add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::default())]);
let (status, sync_needed) = compare_local_and_remote_timeline(
&mut new_sync_tasks,
sync_id,
local_metadata.clone(),
local_files,
&remote_entry,
);
assert_eq!(
status,
LocalTimelineInitStatus::LocallyComplete(local_metadata)
);
assert!(!sync_needed);
}
#[test]
fn needs_upload() {
let mut new_sync_tasks = VecDeque::default();
let sync_id = TenantTimelineId::generate();
let local_metadata = dummy_metadata(0x02.into());
let local_files =
HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]);
let mut remote_entry = RemoteTimeline::new(local_metadata.clone());
remote_entry.add_timeline_layers([]);
let (status, sync_needed) = compare_local_and_remote_timeline(
&mut new_sync_tasks,
sync_id,
local_metadata.clone(),
local_files.clone(),
&remote_entry,
);
assert_eq!(
status,
LocalTimelineInitStatus::LocallyComplete(local_metadata.clone())
);
assert!(!sync_needed);
let new_sync_tasks = new_sync_tasks.into_iter().collect::<Vec<_>>();
assert_eq!(
&new_sync_tasks,
&[(
sync_id,
SyncTask::upload(LayersUpload {
layers_to_upload: local_files,
uploaded_layers: HashMap::default(),
metadata: Some(local_metadata),
})
)]
);
}
}
}

View File

@@ -171,7 +171,7 @@ mod tests {
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
let timeline_upload =
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
for local_path in timeline_upload.layers_to_upload {
for (local_path, _metadata) in timeline_upload.layers_to_upload {
let remote_path =
local_storage.resolve_in_storage(&local_storage.remote_object_id(&local_path)?)?;
let remote_parent_dir = remote_path.parent().unwrap();

View File

@@ -16,7 +16,11 @@ use tokio::{
};
use tracing::{debug, error, info, warn};
use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX};
use crate::{
config::PageServerConf,
storage_sync::{index::LayerFileMetadata, SyncTask},
TEMP_FILE_SUFFIX,
};
use utils::{
crashsafe_dir::path_with_suffix_extension,
id::{TenantId, TenantTimelineId, TimelineId},
@@ -219,8 +223,14 @@ pub(super) async fn download_timeline_layers<'a>(
let layers_to_download = remote_timeline
.stored_files()
.difference(&download.layers_to_skip)
.cloned()
.iter()
.filter_map(|(layer_path, metadata)| {
if !download.layers_to_skip.contains(layer_path) {
Some((layer_path.to_owned(), metadata.to_owned()))
} else {
None
}
})
.collect::<Vec<_>>();
debug!("Layers to download: {layers_to_download:?}");
@@ -233,89 +243,129 @@ pub(super) async fn download_timeline_layers<'a>(
let mut download_tasks = layers_to_download
.into_iter()
.map(|layer_destination_path| async move {
if layer_destination_path.exists() {
debug!(
"Layer already exists locally, skipping download: {}",
layer_destination_path.display()
);
} else {
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
// write(tmp)
// fsync(tmp)
// rename(tmp, new)
// fsync(new)
// fsync(parent)
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path =
path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);
.map(|(layer_destination_path, metadata)| async move {
let mut destination_file =
fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"Failed to create a destination file for layer '{}'",
temp_file_path.display()
)
})?;
match layer_destination_path.metadata() {
Ok(m) if m.is_file() => {
// the file exists from an earlier round in which we failed after renaming it to
// layer_destination_path
let verified = if let Some(expected) = metadata.file_size() {
m.len() == expected
} else {
// behaviour before recording metadata was to accept any existing
true
};
let mut layer_download = storage.download_storage_object(None, &layer_destination_path)
.await
.with_context(|| {
format!(
"Failed to initiate the download the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
io::copy(&mut layer_download.download_stream, &mut destination_file)
.await
.with_context(|| {
format!(
"Failed to download the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
// we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations.
// But for additional safety let's check/wait for any pending operations.
destination_file.flush().await.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})?;
// not using sync_data because it can lose file size update
destination_file.sync_all().await.with_context(|| {
format!(
"failed to fsync source file at {}",
temp_file_path.display()
)
})?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
});
fs::rename(&temp_file_path, &layer_destination_path).await?;
fsync_path(&layer_destination_path).await.with_context(|| {
format!(
"Cannot fsync layer destination path {}",
layer_destination_path.display(),
)
})?;
if verified {
debug!(
"Layer already exists locally, skipping download: {}",
layer_destination_path.display()
);
return Ok((layer_destination_path, LayerFileMetadata::new(m.len())))
} else {
// no need to remove it, it will be overwritten by fs::rename
// after successful download
warn!("Downloaded layer exists already but layer file metadata mismatches: {}, metadata {:?}", layer_destination_path.display(), metadata);
}
}
Ok(m) => {
return Err(anyhow::anyhow!("Downloaded layer destination exists but is not a file: {m:?}, target needs to be removed/archived manually: {layer_destination_path:?}"));
}
Err(_) => {
// behave as the file didn't exist
}
}
Ok::<_, anyhow::Error>(layer_destination_path)
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
// write(tmp)
// fsync(tmp)
// rename(tmp, new)
// fsync(new)
// fsync(parent)
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path =
path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX);
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file =
fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"Failed to create a destination file for layer '{}'",
temp_file_path.display()
)
})?;
let mut layer_download = storage.download_storage_object(None, &layer_destination_path)
.await
.with_context(|| {
format!(
"Failed to initiate the download of the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
let bytes_amount = io::copy(&mut layer_download.download_stream, &mut destination_file)
.await
.with_context(|| {
format!(
"Failed to download the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
// we assume that `destination_file` is fully written, i.e. there are no pending .write(...).await operations.
// But for additional safety let's check/wait for any pending operations.
destination_file.flush().await.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})?;
match metadata.file_size() {
Some(expected) if expected != bytes_amount => {
anyhow::bail!(
"According to the layer file metadata we should have downloaded {expected} bytes, but downloaded {bytes_amount} bytes into file '{}'",
temp_file_path.display()
);
},
Some(_) | None => {
// matches, or upgrading from an earlier IndexPart version
}
}
// not using sync_data because it can lose file size update
destination_file.sync_all().await.with_context(|| {
format!(
"failed to fsync source file at {}",
temp_file_path.display()
)
})?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
});
fs::rename(&temp_file_path, &layer_destination_path).await?;
fsync_path(&layer_destination_path).await.with_context(|| {
format!(
"Cannot fsync layer destination path {}",
layer_destination_path.display(),
)
})?;
Ok::<_, anyhow::Error>((layer_destination_path, LayerFileMetadata::new(bytes_amount)))
})
.collect::<FuturesUnordered<_>>();
@@ -324,9 +374,12 @@ pub(super) async fn download_timeline_layers<'a>(
let mut undo = HashSet::new();
while let Some(download_result) = download_tasks.next().await {
match download_result {
Ok(downloaded_path) => {
Ok((downloaded_path, metadata)) => {
undo.insert(downloaded_path.clone());
download.layers_to_skip.insert(downloaded_path);
download.layers_to_skip.insert(downloaded_path.clone());
// what if the key existed already? ignore, because then we would have
// downloaded a partial file, and had to retry
download.gathered_metadata.insert(downloaded_path, metadata);
}
Err(e) => {
errors_happened = true;
@@ -349,6 +402,8 @@ pub(super) async fn download_timeline_layers<'a>(
);
for item in undo {
download.layers_to_skip.remove(&item);
// intentionally don't clear the gathered_metadata: this undo path only happens on an fsync_path
// failure on the parent directory, so the downloaded files and their metadata remain valid
}
errors_happened = true;
}
@@ -453,9 +508,9 @@ mod tests {
let timeline_upload =
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
for local_path in timeline_upload.layers_to_upload {
for local_path in timeline_upload.layers_to_upload.keys() {
let remote_path =
local_storage.resolve_in_storage(&storage.remote_object_id(&local_path)?)?;
local_storage.resolve_in_storage(&storage.remote_object_id(local_path)?)?;
let remote_parent_dir = remote_path.parent().unwrap();
if !remote_parent_dir.exists() {
fs::create_dir_all(&remote_parent_dir).await?;
@@ -473,11 +528,19 @@ mod tests {
let mut remote_timeline = RemoteTimeline::new(metadata.clone());
remote_timeline.awaits_download = true;
remote_timeline.add_timeline_layers(
layer_files
.iter()
.map(|layer| local_timeline_path.join(layer)),
);
remote_timeline.add_timeline_layers(layer_files.iter().map(|layer| {
let layer_path = local_timeline_path.join(layer);
// this could also have been LayerFileMetadata::default(), but since in this test we
// don't do the merge operation done by storage_sync::download_timeline_data, it would
// not be merged back into the timeline.
let metadata_from_upload = timeline_upload
.layers_to_upload
.get(&layer_path)
.expect("layer must exist in previously uploaded paths")
.to_owned();
(layer_path, metadata_from_upload)
}));
let download_data = match download_timeline_layers(
harness.conf,
@@ -487,9 +550,9 @@ mod tests {
sync_id,
SyncData::new(
current_retries,
LayersDownload {
layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]),
},
LayersDownload::from_skipped_layers(HashSet::from([
local_timeline_path.join("layer_to_skip")
])),
),
)
.await
@@ -552,12 +615,7 @@ mod tests {
&sync_queue,
None,
sync_id,
SyncData::new(
0,
LayersDownload {
layers_to_skip: HashSet::new(),
},
),
SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
)
.await;
assert!(
@@ -576,12 +634,7 @@ mod tests {
&sync_queue,
Some(&not_expecting_download_remote_timeline),
sync_id,
SyncData::new(
0,
LayersDownload {
layers_to_skip: HashSet::new(),
},
),
SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())),
)
.await;
assert!(

View File

@@ -212,8 +212,8 @@ impl RemoteTimelineIndex {
/// Restored index part data about the timeline, stored in the remote index.
#[derive(Debug, Clone)]
pub struct RemoteTimeline {
timeline_layers: HashSet<PathBuf>,
missing_layers: HashSet<PathBuf>,
timeline_layers: HashMap<PathBuf, LayerFileMetadata>,
missing_layers: HashMap<PathBuf, LayerFileMetadata>,
pub metadata: TimelineMetadata,
pub awaits_download: bool,
@@ -222,62 +222,161 @@ pub struct RemoteTimeline {
impl RemoteTimeline {
pub fn new(metadata: TimelineMetadata) -> Self {
Self {
timeline_layers: HashSet::new(),
missing_layers: HashSet::new(),
timeline_layers: HashMap::default(),
missing_layers: HashMap::default(),
metadata,
awaits_download: false,
}
}
pub fn add_timeline_layers(&mut self, new_layers: impl IntoIterator<Item = PathBuf>) {
self.timeline_layers.extend(new_layers.into_iter());
pub fn add_timeline_layers(
&mut self,
new_layers: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
) {
self.timeline_layers.extend(new_layers);
}
pub fn add_upload_failures(&mut self, upload_failures: impl IntoIterator<Item = PathBuf>) {
self.missing_layers.extend(upload_failures.into_iter());
pub fn add_upload_failures(
&mut self,
upload_failures: impl IntoIterator<Item = (PathBuf, LayerFileMetadata)>,
) {
self.missing_layers.extend(upload_failures);
}
pub fn remove_layers(&mut self, layers_to_remove: &HashSet<PathBuf>) {
self.timeline_layers
.retain(|layer| !layers_to_remove.contains(layer));
.retain(|layer, _| !layers_to_remove.contains(layer));
self.missing_layers
.retain(|layer| !layers_to_remove.contains(layer));
.retain(|layer, _| !layers_to_remove.contains(layer));
}
/// Lists all layer files in the given remote timeline. Omits the metadata file.
pub fn stored_files(&self) -> &HashSet<PathBuf> {
pub fn stored_files(&self) -> &HashMap<PathBuf, LayerFileMetadata> {
&self.timeline_layers
}
/// Merges metadata gathered or verified while downloading the needed layer files into the metadata on
/// the [`RemoteIndex`], so it can be uploaded later.
pub fn merge_metadata_from_downloaded(
&mut self,
downloaded: &HashMap<PathBuf, LayerFileMetadata>,
) {
downloaded.iter().for_each(|(path, metadata)| {
if let Some(upgraded) = self.timeline_layers.get_mut(path) {
upgraded.merge(metadata);
}
});
}
pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result<Self> {
let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?;
let default_metadata = &IndexLayerMetadata::default();
let find_metadata = |key: &RelativePath| -> LayerFileMetadata {
index_part
.layer_metadata
.get(key)
.unwrap_or(default_metadata)
.into()
};
Ok(Self {
timeline_layers: to_local_paths(timeline_path, index_part.timeline_layers),
missing_layers: to_local_paths(timeline_path, index_part.missing_layers),
timeline_layers: index_part
.timeline_layers
.iter()
.map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
.collect(),
missing_layers: index_part
.missing_layers
.iter()
.map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path)))
.collect(),
metadata,
awaits_download: false,
})
}
}
/// Metadata gathered for each of the layer files.
///
/// Fields have to be `Option`s because a remote [`IndexPart`] can be from a different version, which
/// might have less or more metadata, depending on whether we are upgrading or rolling back an upgrade.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[cfg_attr(test, derive(Default))]
pub struct LayerFileMetadata {
file_size: Option<u64>,
}
impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
fn from(other: &IndexLayerMetadata) -> Self {
LayerFileMetadata {
file_size: other.file_size,
}
}
}
impl LayerFileMetadata {
pub fn new(file_size: u64) -> Self {
LayerFileMetadata {
file_size: Some(file_size),
}
}
pub fn file_size(&self) -> Option<u64> {
self.file_size
}
/// Metadata has holes due to version upgrades. This method is called to upgrade self with the
/// other value.
///
/// This is called on the possibly outdated version.
pub fn merge(&mut self, other: &Self) {
self.file_size = other.file_size.or(self.file_size);
}
}
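// A test-style sketch of the merge semantics above: a metadata entry restored from a v0
// `index_part.json` has no file size and gets upgraded with the value gathered during
// download (the size below is illustrative).
#[cfg(test)]
#[test]
fn layer_file_metadata_merge_fills_missing_file_size() {
    // `LayerFileMetadata::default()` (file_size: None) is only derivable in tests.
    let mut restored_from_v0 = LayerFileMetadata::default();
    restored_from_v0.merge(&LayerFileMetadata::new(1024));
    assert_eq!(restored_from_v0.file_size(), Some(1024));
}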
/// Part of the remote index, corresponding to a certain timeline.
/// Contains the data about all of the timeline's files present remotely, and its metadata.
///
/// This type needs to be backwards and forwards compatible. When changing the fields,
/// remember to add a test case for the changed version.
#[serde_as]
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexPart {
/// Debugging aid describing the version of this type.
#[serde(default)]
version: usize,
/// Each of the layers present on remote storage.
///
/// Additional metadata might exist in `layer_metadata`.
timeline_layers: HashSet<RelativePath>,
/// Currently not really used by the pageserver;
/// present to manually keep track of the layer files that the pageserver might never retrieve.
///
/// Such "holes" might appear if any upload task was evicted on an error threshold:
/// such a layer will only be rescheduled for upload on pageserver restart.
missing_layers: HashSet<RelativePath>,
/// Per layer file metadata, which can be present for a present or missing layer file.
///
/// Older versions of `IndexPart` will not have this property, or will have only part of the metadata
/// that the latest version stores.
#[serde(default)]
layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
#[serde_as(as = "DisplayFromStr")]
disk_consistent_lsn: Lsn,
metadata_bytes: Vec<u8>,
}
impl IndexPart {
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
/// used to understand later versions.
///
/// Version is currently informative only.
const LATEST_VERSION: usize = 1;
pub const FILE_NAME: &'static str = "index_part.json";
#[cfg(test)]
@@ -288,8 +387,10 @@ impl IndexPart {
metadata_bytes: Vec<u8>,
) -> Self {
Self {
version: Self::LATEST_VERSION,
timeline_layers,
missing_layers,
layer_metadata: HashMap::default(),
disk_consistent_lsn,
metadata_bytes,
}
@@ -304,35 +405,68 @@ impl IndexPart {
remote_timeline: RemoteTimeline,
) -> anyhow::Result<Self> {
let metadata_bytes = remote_timeline.metadata.to_bytes()?;
let mut layer_metadata = HashMap::new();
let mut missing_layers = HashSet::new();
separate_paths_and_metadata(
timeline_path,
&remote_timeline.missing_layers,
&mut missing_layers,
&mut layer_metadata,
)
.context("Failed to convert missing layers' paths to relative ones")?;
let mut timeline_layers = HashSet::new();
separate_paths_and_metadata(
timeline_path,
&remote_timeline.timeline_layers,
&mut timeline_layers,
&mut layer_metadata,
)
.context("Failed to convert timeline layers' paths to relative ones")?;
Ok(Self {
timeline_layers: to_relative_paths(timeline_path, remote_timeline.timeline_layers)
.context("Failed to convert timeline layers' paths to relative ones")?,
missing_layers: to_relative_paths(timeline_path, remote_timeline.missing_layers)
.context("Failed to convert missing layers' paths to relative ones")?,
version: Self::LATEST_VERSION,
timeline_layers,
missing_layers,
layer_metadata,
disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(),
metadata_bytes,
})
}
}
fn to_local_paths(
timeline_path: &Path,
paths: impl IntoIterator<Item = RelativePath>,
) -> HashSet<PathBuf> {
paths
.into_iter()
.map(|path| path.as_path(timeline_path))
.collect()
/// Serialized form of [`LayerFileMetadata`].
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
pub struct IndexLayerMetadata {
file_size: Option<u64>,
}
fn to_relative_paths(
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &'_ LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
}
}
}
fn separate_paths_and_metadata(
timeline_path: &Path,
paths: impl IntoIterator<Item = PathBuf>,
) -> anyhow::Result<HashSet<RelativePath>> {
paths
.into_iter()
.map(|path| RelativePath::new(timeline_path, path))
.collect()
input: &HashMap<PathBuf, LayerFileMetadata>,
output: &mut HashSet<RelativePath>,
layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
) -> anyhow::Result<()> {
for (path, metadata) in input {
let rel_path = RelativePath::new(timeline_path, path)?;
let metadata = IndexLayerMetadata::from(metadata);
layer_metadata.insert(rel_path.clone(), metadata);
output.insert(rel_path);
}
Ok(())
}
#[cfg(test)]
@@ -357,13 +491,13 @@ mod tests {
DEFAULT_PG_VERSION,
);
let remote_timeline = RemoteTimeline {
timeline_layers: HashSet::from([
timeline_path.join("layer_1"),
timeline_path.join("layer_2"),
timeline_layers: HashMap::from([
(timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
]),
missing_layers: HashSet::from([
timeline_path.join("missing_1"),
timeline_path.join("missing_2"),
missing_layers: HashMap::from([
(timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
]),
metadata: metadata.clone(),
awaits_download: false,
@@ -485,13 +619,13 @@ mod tests {
let conversion_result = IndexPart::from_remote_timeline(
&timeline_path,
RemoteTimeline {
timeline_layers: HashSet::from([
PathBuf::from("bad_path"),
timeline_path.join("layer_2"),
timeline_layers: HashMap::from([
(PathBuf::from("bad_path"), LayerFileMetadata::new(1)),
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
]),
missing_layers: HashSet::from([
timeline_path.join("missing_1"),
timeline_path.join("missing_2"),
missing_layers: HashMap::from([
(timeline_path.join("missing_1"), LayerFileMetadata::new(3)),
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
]),
metadata: metadata.clone(),
awaits_download: false,
@@ -502,13 +636,13 @@ mod tests {
let conversion_result = IndexPart::from_remote_timeline(
&timeline_path,
RemoteTimeline {
timeline_layers: HashSet::from([
timeline_path.join("layer_1"),
timeline_path.join("layer_2"),
timeline_layers: HashMap::from([
(timeline_path.join("layer_1"), LayerFileMetadata::new(1)),
(timeline_path.join("layer_2"), LayerFileMetadata::new(2)),
]),
missing_layers: HashSet::from([
PathBuf::from("bad_path"),
timeline_path.join("missing_2"),
missing_layers: HashMap::from([
(PathBuf::from("bad_path"), LayerFileMetadata::new(3)),
(timeline_path.join("missing_2"), LayerFileMetadata::new(4)),
]),
metadata,
awaits_download: false,
@@ -516,4 +650,63 @@ mod tests {
);
assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory");
}
#[test]
fn v0_indexpart_is_parsed() {
let example = r#"{
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["not_a_real_layer_but_adding_coverage"],
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
version: 0,
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
layer_metadata: HashMap::default(),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v1_indexpart_is_parsed() {
let example = r#"{
"version":1,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["not_a_real_layer_but_adding_coverage"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
layer_metadata: HashMap::from([
(RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata {
file_size: Some(25600000),
}),
(RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: Some(9007199254741001),
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
}
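The two tests above hinge on how serde treats absent fields. Below is a minimal, self-contained sketch of that behaviour — not the actual IndexPart definition; the field names are simplified and the exact `#[serde(default)]` placement is an assumption — showing why a v0 `index_part.json` without `layer_metadata` still deserializes with no migration.

use std::collections::HashMap;

use serde::Deserialize;

// Simplified stand-ins for IndexPart / IndexLayerMetadata, for illustration only.
#[derive(Debug, Deserialize)]
struct IndexPartSketch {
    #[serde(default)] // absent in v0 documents -> defaults to 0
    version: u32,
    timeline_layers: Vec<String>,
    #[serde(default)] // absent in v0 documents -> defaults to an empty map
    layer_metadata: HashMap<String, LayerMetaSketch>,
}

#[derive(Debug, Deserialize)]
struct LayerMetaSketch {
    file_size: Option<u64>,
}

fn main() {
    // A "v0" document: no version, no layer_metadata. It still parses, so older
    // index_part.json files need no migration when the new fields are introduced.
    let v0 = r#"{"timeline_layers":["layer_1"],"missing_layers":[]}"#;
    let parsed: IndexPartSketch = serde_json::from_str(v0).unwrap();
    assert_eq!(parsed.version, 0);
    assert!(parsed.layer_metadata.is_empty());
}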

View File

@@ -69,14 +69,25 @@ pub(super) async fn upload_timeline_layers<'a>(
.map(|meta| meta.disk_consistent_lsn());
let already_uploaded_layers = remote_timeline
.map(|timeline| timeline.stored_files())
.cloned()
.map(|timeline| {
timeline
.stored_files()
.keys()
.cloned()
.collect::<std::collections::HashSet<_>>()
})
.unwrap_or_default();
let layers_to_upload = upload
.layers_to_upload
.difference(&already_uploaded_layers)
.cloned()
.iter()
.filter_map(|(k, v)| {
if !already_uploaded_layers.contains(k) {
Some((k.to_owned(), v.to_owned()))
} else {
None
}
})
.collect::<Vec<_>>();
if layers_to_upload.is_empty() {
@@ -98,7 +109,7 @@ pub(super) async fn upload_timeline_layers<'a>(
let mut upload_tasks = layers_to_upload
.into_iter()
.map(|source_path| async move {
.map(|(source_path, known_metadata)| async move {
let source_file = match fs::File::open(&source_path).await.with_context(|| {
format!(
"Failed to upen a source file for layer '{}'",
@@ -109,7 +120,7 @@ pub(super) async fn upload_timeline_layers<'a>(
Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)),
};
let source_size = source_file
let fs_size = source_file
.metadata()
.await
.with_context(|| {
@@ -119,10 +130,24 @@ pub(super) async fn upload_timeline_layers<'a>(
)
})
.map_err(UploadError::Other)?
.len() as usize;
.len();
// FIXME: this looks bad
if let Some(metadata_size) = known_metadata.file_size() {
if metadata_size != fs_size {
return Err(UploadError::Other(anyhow::anyhow!(
"File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"
)));
}
} else {
// this is a silly state we would like to avoid
}
let fs_size = usize::try_from(fs_size).with_context(|| format!("File {source_path:?} size {fs_size} could not be converted to usize"))
.map_err(UploadError::Other)?;
match storage
.upload_storage_object(Box::new(source_file), source_size, &source_path)
.upload_storage_object(Box::new(source_file), fs_size, &source_path)
.await
.with_context(|| format!("Failed to upload layer file for {sync_id}"))
{
@@ -136,8 +161,11 @@ pub(super) async fn upload_timeline_layers<'a>(
while let Some(upload_result) = upload_tasks.next().await {
match upload_result {
Ok(uploaded_path) => {
upload.layers_to_upload.remove(&uploaded_path);
upload.uploaded_layers.insert(uploaded_path);
let metadata = upload
.layers_to_upload
.remove(&uploaded_path)
.expect("metadata should always exist, assuming no double uploads");
upload.uploaded_layers.insert(uploaded_path, metadata);
}
Err(e) => match e {
UploadError::Other(e) => {
@@ -262,7 +290,7 @@ mod tests {
assert_eq!(
upload
.uploaded_layers
.iter()
.keys()
.cloned()
.collect::<BTreeSet<_>>(),
layer_files
@@ -357,7 +385,7 @@ mod tests {
assert_eq!(
upload
.uploaded_layers
.iter()
.keys()
.cloned()
.collect::<BTreeSet<_>>(),
layer_files
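The size check added in this file boils down to comparing the layer's current on-disk length with the length recorded when the layer was first collected. A hedged standalone sketch of just that comparison — the `LayerFileMetadata` here is a stripped-down stand-in, not the real type:

use std::path::Path;

// Stand-in with only the field this sketch needs.
struct LayerFileMetadata {
    file_size: Option<u64>,
}

fn verified_size_for_upload(path: &Path, known: &LayerFileMetadata) -> anyhow::Result<u64> {
    let fs_size = std::fs::metadata(path)?.len();
    if let Some(expected) = known.file_size {
        anyhow::ensure!(
            expected == fs_size,
            "File {path:?} has its current FS size {fs_size} different from initially determined {expected}"
        );
    }
    // If the size was never recorded, there is nothing to verify against and we fall through.
    Ok(fs_size)
}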

View File

@@ -17,7 +17,6 @@ use tracing::*;
use utils::crashsafe_dir::path_with_suffix_extension;
use std::cmp::min;
use std::collections::hash_map;
use std::collections::hash_map::Entry;
use std::collections::BTreeSet;
use std::collections::HashMap;
@@ -46,6 +45,7 @@ use crate::tenant_config::TenantConfOpt;
use crate::virtual_file::VirtualFile;
use crate::walredo::WalRedoManager;
use crate::{CheckpointConfig, TEMP_FILE_SUFFIX};
pub use pageserver_api::models::TenantState;
use toml_edit;
use utils::{
@@ -59,13 +59,14 @@ pub mod block_io;
mod delta_layer;
mod disk_btree;
pub(crate) mod ephemeral_file;
mod filename;
pub mod filename;
mod image_layer;
mod inmemory_layer;
mod layer_map;
pub mod layer_map;
pub mod metadata;
mod par_fsync;
mod storage_layer;
pub mod storage_layer;
mod timeline;
@@ -119,18 +120,6 @@ pub struct Tenant {
upload_layers: bool,
}
/// A state of a tenant in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TenantState {
/// Tenant is fully operational, its background jobs might be running or not.
Active { background_jobs_running: bool },
/// A tenant is recognized by pageserver, but not yet ready to operate:
/// e.g. not present locally and being downloaded or being read into memory from the file system.
Paused,
/// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
Broken,
}
/// A repository corresponds to one .neon directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
impl Tenant {
@@ -156,17 +145,18 @@ impl Tenant {
/// Lists timelines the tenant contains.
/// Up to the tenant's implementation to omit certain timelines that are not considered ready for use.
pub fn list_timelines(&self) -> Vec<(TimelineId, Arc<Timeline>)> {
pub fn list_timelines(&self) -> Vec<Arc<Timeline>> {
self.timelines
.lock()
.unwrap()
.iter()
.map(|(timeline_id, timeline_entry)| (*timeline_id, Arc::clone(timeline_entry)))
.values()
.map(Arc::clone)
.collect()
}
/// Create a new, empty timeline. The caller is responsible for loading data into it
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
/// This is used to create the initial 'main' timeline during bootstrapping,
/// or when importing a new base backup. The caller is expected to load an
/// initial image of the datadir to the new timeline after this.
pub fn create_empty_timeline(
&self,
new_timeline_id: TimelineId,
@@ -246,12 +236,12 @@ impl Tenant {
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > *lsn {
// can we safely just branch from the ancestor instead?
anyhow::bail!(
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
lsn,
ancestor_timeline_id,
ancestor_ancestor_lsn,
);
bail!(
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
lsn,
ancestor_timeline_id,
ancestor_ancestor_lsn,
);
}
}
@@ -357,7 +347,7 @@ impl Tenant {
ensure!(
!children_exist,
"Cannot detach timeline which has child timelines"
"Cannot delete timeline which has child timelines"
);
let timeline_entry = match timelines.entry(timeline_id) {
Entry::Occupied(e) => e,
@@ -401,16 +391,19 @@ impl Tenant {
timeline_id,
metadata.pg_version()
);
let timeline = self
.initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor)
.with_context(|| format!("Failed to initialize timeline {timeline_id}"))?;
match timelines_accessor.entry(timeline.timeline_id) {
hash_map::Entry::Occupied(_) => anyhow::bail!(
"Found freshly initialized timeline {} in the tenant map",
timeline.timeline_id
let ancestor = metadata
.ancestor_timeline()
.and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id))
.cloned();
match timelines_accessor.entry(timeline_id) {
Entry::Occupied(_) => warn!(
"Timeline {}/{} already exists in the tenant map, skipping its initialization",
self.tenant_id, timeline_id
),
hash_map::Entry::Vacant(v) => {
Entry::Vacant(v) => {
let timeline = self
.initialize_new_timeline(timeline_id, metadata, ancestor)
.with_context(|| format!("Failed to initialize timeline {timeline_id}"))?;
v.insert(timeline);
}
}
@@ -610,21 +603,14 @@ impl Tenant {
&self,
new_timeline_id: TimelineId,
new_metadata: TimelineMetadata,
timelines: &mut MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
ancestor: Option<Arc<Timeline>>,
) -> anyhow::Result<Arc<Timeline>> {
let ancestor = match new_metadata.ancestor_timeline() {
Some(ancestor_timeline_id) => Some(
timelines
.get(&ancestor_timeline_id)
.cloned()
.with_context(|| {
format!(
"Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found"
)
})?,
),
None => None,
};
if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
anyhow::ensure!(
ancestor.is_some(),
"Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found"
)
}
let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn();
let pg_version = new_metadata.pg_version();
@@ -768,7 +754,7 @@ impl Tenant {
})
.with_context(|| {
format!(
"Failed to fsync on firts save for config {}",
"Failed to fsync on first save for config {}",
target_config_path.display()
)
})?;
@@ -922,6 +908,7 @@ impl Tenant {
Ok(totals)
}
/// Branch an existing timeline
fn branch_timeline(
&self,
src: TimelineId,
@@ -997,7 +984,7 @@ impl Tenant {
dst_prev,
Some(src),
start_lsn,
*src_timeline.latest_gc_cutoff_lsn.read(),
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
src_timeline.initdb_lsn,
src_timeline.pg_version,
);
@@ -1081,8 +1068,12 @@ impl Tenant {
)
})?;
let ancestor = new_metadata
.ancestor_timeline()
.and_then(|ancestor_timeline_id| timelines.get(&ancestor_timeline_id))
.cloned();
let new_timeline = self
.initialize_new_timeline(new_timeline_id, new_metadata, timelines)
.initialize_new_timeline(new_timeline_id, new_metadata, ancestor)
.with_context(|| {
format!(
"Failed to initialize timeline {}/{}",
@@ -1091,11 +1082,11 @@ impl Tenant {
})?;
match timelines.entry(new_timeline_id) {
hash_map::Entry::Occupied(_) => anyhow::bail!(
Entry::Occupied(_) => bail!(
"Found freshly initialized timeline {} in the tenant map",
new_timeline_id
),
hash_map::Entry::Vacant(v) => {
Entry::Vacant(v) => {
v.insert(Arc::clone(&new_timeline));
}
}
@@ -1106,12 +1097,22 @@ impl Tenant {
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
/// to get bootstrap data for timeline initialization.
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) -> Result<()> {
info!("running initdb in {}... ", initdbpath.display());
fn run_initdb(
conf: &'static PageServerConf,
initdb_target_dir: &Path,
pg_version: u32,
) -> Result<()> {
let initdb_bin_path = conf.pg_bin_dir(pg_version).join("initdb");
let initdb_lib_dir = conf.pg_lib_dir(pg_version);
info!(
"running {} in {}, libdir: {}",
initdb_bin_path.display(),
initdb_target_dir.display(),
initdb_lib_dir.display(),
);
let initdb_path = conf.pg_bin_dir(pg_version).join("initdb");
let initdb_output = Command::new(initdb_path)
.args(&["-D", &initdbpath.to_string_lossy()])
let initdb_output = Command::new(initdb_bin_path)
.args(&["-D", &initdb_target_dir.to_string_lossy()])
.args(&["-U", &conf.superuser])
.args(&["-E", "utf8"])
.arg("--no-instructions")
@@ -1119,8 +1120,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32)
// so no need to fsync it
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
.stdout(Stdio::null())
.output()
.context("failed to execute initdb")?;

View File

@@ -556,7 +556,7 @@ impl DeltaLayer {
/// Create a DeltaLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
pub fn new_for_path<F>(path: &Path, file: F) -> Result<Self>
where
F: FileExt,

View File

@@ -177,7 +177,7 @@ impl fmt::Display for ImageFileName {
///
/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
/// global config, and paths to layer files are constructed using the tenant/timeline
/// path from the config. But in the 'dump_layerfile' binary, we need to construct a Layer
/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer
/// struct for a file on disk, without having a page server running, so that we have no
/// config. In that case, we use the Path variant to hold the full path to the file on
/// disk.

View File

@@ -357,7 +357,7 @@ impl ImageLayer {
/// Create an ImageLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
pub fn new_for_path<F>(path: &Path, file: F) -> Result<ImageLayer>
where
F: std::os::unix::prelude::FileExt,

View File

@@ -62,6 +62,8 @@ pub struct LayerMap {
struct LayerRTreeObject {
layer: Arc<dyn Layer>,
envelope: AABB<[IntKey; 2]>,
}
// Representation of Key as numeric type.
@@ -197,9 +199,16 @@ impl PartialEq for LayerRTreeObject {
impl RTreeObject for LayerRTreeObject {
type Envelope = AABB<[IntKey; 2]>;
fn envelope(&self) -> Self::Envelope {
let key_range = self.layer.get_key_range();
let lsn_range = self.layer.get_lsn_range();
AABB::from_corners(
self.envelope
}
}
impl LayerRTreeObject {
fn new(layer: Arc<dyn Layer>) -> Self {
let key_range = layer.get_key_range();
let lsn_range = layer.get_lsn_range();
let envelope = AABB::from_corners(
[
IntKey::from(key_range.start.to_i128()),
IntKey::from(lsn_range.start.0 as i128),
@@ -208,7 +217,8 @@ impl RTreeObject for LayerRTreeObject {
IntKey::from(key_range.end.to_i128() - 1),
IntKey::from(lsn_range.end.0 as i128 - 1),
], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive
)
);
LayerRTreeObject { layer, envelope }
}
}
@@ -338,7 +348,7 @@ impl LayerMap {
if layer.get_key_range() == (Key::MIN..Key::MAX) {
self.l0_delta_layers.push(layer.clone());
}
self.historic_layers.insert(LayerRTreeObject { layer });
self.historic_layers.insert(LayerRTreeObject::new(layer));
NUM_ONDISK_LAYERS.inc();
}
@@ -362,7 +372,7 @@ impl LayerMap {
}
assert!(self
.historic_layers
.remove(&LayerRTreeObject { layer })
.remove(&LayerRTreeObject::new(layer))
.is_some());
NUM_ONDISK_LAYERS.dec();
}
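The change above moves the envelope computation from every `envelope()` call to construction time, so the R-tree's frequent envelope lookups become a cheap field read. A minimal sketch of the same pattern against the rstar crate, with plain i64 coordinates standing in for the pageserver's key/LSN types:

use rstar::{RTree, RTreeObject, AABB};

struct CachedEnvelope {
    id: u64,
    envelope: AABB<[i64; 2]>, // computed once at construction
}

impl CachedEnvelope {
    fn new(id: u64, key_range: std::ops::Range<i64>, lsn_range: std::ops::Range<i64>) -> Self {
        // AABB corners are inclusive, while the ranges are exclusive at the end.
        let envelope = AABB::from_corners(
            [key_range.start, lsn_range.start],
            [key_range.end - 1, lsn_range.end - 1],
        );
        CachedEnvelope { id, envelope }
    }
}

impl RTreeObject for CachedEnvelope {
    type Envelope = AABB<[i64; 2]>;

    fn envelope(&self) -> Self::Envelope {
        self.envelope // a cheap field read instead of recomputing the bounding box
    }
}

fn main() {
    let mut tree = RTree::new();
    tree.insert(CachedEnvelope::new(1, 0..100, 10..20));
    let hits: Vec<_> = tree
        .locate_in_envelope_intersecting(&AABB::from_corners([50, 12], [50, 12]))
        .map(|o| o.id)
        .collect();
    assert_eq!(hits, vec![1]);
}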

View File

@@ -52,7 +52,10 @@ use crate::task_mgr::TaskKind;
use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task};
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::{page_cache, storage_sync};
use crate::{
page_cache,
storage_sync::{self, index::LayerFileMetadata},
};
pub struct Timeline {
conf: &'static PageServerConf,
@@ -343,7 +346,9 @@ impl Timeline {
match cached_lsn.cmp(&lsn) {
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn
Ordering::Greater => {
unreachable!("the returned lsn should never be after the requested lsn")
}
}
Some((cached_lsn, cached_img))
}
@@ -473,10 +478,6 @@ impl Timeline {
}
/// Mutate the timeline with a [`TimelineWriter`].
///
/// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter
/// is a generic type in this trait. But that doesn't currently work in
/// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html
pub fn writer(&self) -> TimelineWriter<'_> {
TimelineWriter {
tl: self,
@@ -625,7 +626,7 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
drop(tenant_conf_guard);
let self_clone = Arc::clone(self);
let _ = spawn_connection_manager_task(
spawn_connection_manager_task(
self.conf.broker_etcd_prefix.clone(),
self_clone,
walreceiver_connect_timeout,
@@ -726,10 +727,10 @@ impl Timeline {
Ok(())
}
pub fn layer_removal_guard(&self) -> Result<MutexGuard<()>, anyhow::Error> {
pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
self.layer_removal_cs
.try_lock()
.map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}"))
.map_err(|e| anyhow!("cannot lock compaction critical section {e}"))
}
/// Retrieve current logical size of the timeline.
@@ -1192,8 +1193,8 @@ impl Timeline {
self.create_image_layers(&partitioning, self.initdb_lsn, true)?
} else {
// normal case, write out a L0 delta layer file.
let delta_path = self.create_delta_layer(&frozen_layer)?;
HashSet::from([delta_path])
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
HashMap::from([(delta_path, metadata)])
};
fail_point!("flush-frozen-before-sync");
@@ -1228,7 +1229,7 @@ impl Timeline {
fn update_disk_consistent_lsn(
&self,
disk_consistent_lsn: Lsn,
layer_paths_to_upload: HashSet<PathBuf>,
layer_paths_to_upload: HashMap<PathBuf, LayerFileMetadata>,
) -> Result<()> {
// If we were able to advance 'disk_consistent_lsn', save it the metadata file.
// After crash, we will restart WAL streaming and processing from that point.
@@ -1297,7 +1298,10 @@ impl Timeline {
}
// Write out the given frozen in-memory layer as a new L0 delta file
fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result<PathBuf> {
fn create_delta_layer(
&self,
frozen_layer: &InMemoryLayer,
) -> Result<(PathBuf, LayerFileMetadata)> {
// Write it out
let new_delta = frozen_layer.write_to_disk()?;
let new_delta_path = new_delta.path();
@@ -1323,12 +1327,13 @@ impl Timeline {
// update the timeline's physical size
let sz = new_delta_path.metadata()?.len();
self.metrics.current_physical_size_gauge.add(sz);
// update metrics
self.metrics.num_persistent_files_created.inc_by(1);
self.metrics.persistent_bytes_written.inc_by(sz);
Ok(new_delta_path)
Ok((new_delta_path, LayerFileMetadata::new(sz)))
}
pub fn compact(&self) -> anyhow::Result<()> {
@@ -1394,7 +1399,7 @@ impl Timeline {
storage_sync::schedule_layer_upload(
self.tenant_id,
self.timeline_id,
HashSet::from_iter(layer_paths_to_upload),
layer_paths_to_upload,
None,
);
}
@@ -1475,10 +1480,9 @@ impl Timeline {
partitioning: &KeyPartitioning,
lsn: Lsn,
force: bool,
) -> Result<HashSet<PathBuf>> {
) -> Result<HashMap<PathBuf, LayerFileMetadata>> {
let timer = self.metrics.create_images_time_histo.start_timer();
let mut image_layers: Vec<ImageLayer> = Vec::new();
let mut layer_paths_to_upload = HashSet::new();
for partition in partitioning.parts.iter() {
if force || self.time_for_new_image_layer(partition, lsn)? {
let img_range =
@@ -1500,7 +1504,6 @@ impl Timeline {
}
}
let image_layer = image_layer_writer.finish()?;
layer_paths_to_upload.insert(image_layer.path());
image_layers.push(image_layer);
}
}
@@ -1514,15 +1517,25 @@ impl Timeline {
//
// Compaction creates multiple image layers. It would be better to create them all
// and fsync them all in parallel.
let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone());
all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
let all_paths = image_layers
.iter()
.map(|layer| layer.path())
.chain(std::iter::once(
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
))
.collect::<Vec<_>>();
par_fsync::par_fsync(&all_paths)?;
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
let mut layers = self.layers.write().unwrap();
for l in image_layers {
self.metrics
.current_physical_size_gauge
.add(l.path().metadata()?.len());
let path = l.path();
let metadata = path.metadata()?;
layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
self.metrics.current_physical_size_gauge.add(metadata.len());
layers.insert_historic(Arc::new(l));
}
drop(layers);
@@ -1773,16 +1786,16 @@ impl Timeline {
}
let mut layers = self.layers.write().unwrap();
let mut new_layer_paths = HashSet::with_capacity(new_layers.len());
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
for l in new_layers {
let new_delta_path = l.path();
// update the timeline's physical size
self.metrics
.current_physical_size_gauge
.add(new_delta_path.metadata()?.len());
let metadata = new_delta_path.metadata()?;
new_layer_paths.insert(new_delta_path);
// update the timeline's physical size
self.metrics.current_physical_size_gauge.add(metadata.len());
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
layers.insert_historic(Arc::new(l));
}
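The fsync block above follows the usual durability rule: after writing new layer files, both the files and the directory holding them need to be synced, otherwise the new directory entries may be lost on a crash. A sequential sketch of the same idea (the pageserver parallelises this via par_fsync):

use std::fs::File;
use std::path::Path;

fn fsync_files_and_dir(files: &[&Path], dir: &Path) -> std::io::Result<()> {
    for path in files.iter().copied().chain(std::iter::once(dir)) {
        // On Linux, a directory can be opened read-only and fsync'ed like a regular file.
        File::open(path)?.sync_all()?;
    }
    Ok(())
}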

View File

@@ -24,7 +24,7 @@ pub mod defaults {
// This parameter determines L1 layer file size.
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;

View File

@@ -1,7 +1,7 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use std::collections::{hash_map, HashMap, HashSet};
use std::collections::{hash_map, HashMap};
use std::ffi::OsStr;
use std::fs;
use std::path::{Path, PathBuf};
@@ -14,15 +14,15 @@ use remote_storage::GenericRemoteStorage;
use crate::config::{PageServerConf, METADATA_FILE_NAME};
use crate::http::models::TenantInfo;
use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex};
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::{
ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState,
};
use crate::tenant_config::TenantConfOpt;
use crate::walredo::PostgresRedoManager;
use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX};
use crate::TEMP_FILE_SUFFIX;
use utils::crashsafe_dir::{self, path_with_suffix_extension};
use utils::id::{TenantId, TimelineId};
@@ -70,34 +70,54 @@ pub fn init_tenant_mgr(
.remote_storage_config
.as_ref()
.expect("remote storage without config");
let mut broken_tenants = HashMap::new();
let mut ready_tenants = HashMap::new();
for (tenant_id, tenant_attach_data) in local_tenant_files.into_iter() {
match tenant_attach_data {
TenantAttachData::Ready(t) => {
ready_tenants.insert(tenant_id, t);
}
TenantAttachData::Broken(e) => {
broken_tenants.insert(tenant_id, TenantAttachData::Broken(e));
}
}
}
let SyncStartupData {
remote_index,
local_timeline_init_statuses,
} = storage_sync::spawn_storage_sync_task(
conf,
local_tenant_files,
ready_tenants,
storage,
storage_config.max_concurrent_syncs,
storage_config.max_sync_errors,
)
.context("Failed to spawn the storage sync thread")?;
(
remote_index,
local_timeline_init_statuses.filter_map(|init_status| match init_status {
LocalTimelineInitStatus::LocallyComplete(metadata) => Some(metadata),
LocalTimelineInitStatus::NeedsSync => None,
}),
)
let n = local_timeline_init_statuses.0.len();
let mut synced_timelines = local_timeline_init_statuses.0.into_iter().fold(
HashMap::<TenantId, TenantAttachData>::with_capacity(n),
|mut new_values, (tenant_id, old_values)| {
let new_timeline_values = new_values
.entry(tenant_id)
.or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
if let TenantAttachData::Ready(t) = new_timeline_values {
for (timeline_id, old_value) in old_values {
if let LocalTimelineInitStatus::LocallyComplete(metadata) = old_value {
t.insert(timeline_id, TimelineLocalFiles::ready(metadata));
}
}
}
new_values
},
);
synced_timelines.extend(broken_tenants);
(remote_index, synced_timelines)
} else {
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
(
RemoteIndex::default(),
local_tenant_files.filter_map(|(metadata, _)| Some(metadata)),
)
(RemoteIndex::default(), local_tenant_files)
};
attach_local_tenants(conf, &remote_index, tenants_to_attach);
Ok(remote_index)
@@ -107,46 +127,79 @@ pub fn init_tenant_mgr(
/// Ignores other timelines that might be present for tenant, but were not passed as a parameter.
/// Attempts to load as many entities as possible: if a certain timeline fails during the load, the tenant is marked as "Broken",
/// and the load continues.
///
/// For successful tenant attach, it first has to have a `timelines/` subdirectory and a tenant config file that's loaded into memory successfully.
/// If either of the conditions fails, the tenant will be added to memory with the [`TenantState::Broken`] state; otherwise we start to load its timelines.
/// Alternatively, a tenant is considered loaded successfully if it's already in the pageserver's memory (i.e. was loaded before).
///
/// Attach happens on startup and after successful timeline downloads
/// (some subset of timeline files, always including its metadata, after which the new timeline needs to be registered).
pub fn attach_local_tenants(
conf: &'static PageServerConf,
remote_index: &RemoteIndex,
tenants_to_attach: TenantTimelineValues<TimelineMetadata>,
tenants_to_attach: HashMap<TenantId, TenantAttachData>,
) {
let _entered = info_span!("attach_local_tenants").entered();
let number_of_tenants = tenants_to_attach.0.len();
let number_of_tenants = tenants_to_attach.len();
for (tenant_id, local_timelines) in tenants_to_attach.0 {
info!(
"Attaching {} timelines for {tenant_id}",
local_timelines.len()
);
debug!("Timelines to attach: {local_timelines:?}");
let tenant = load_local_tenant(conf, tenant_id, remote_index);
{
match tenants_state::write_tenants().entry(tenant_id) {
hash_map::Entry::Occupied(_) => {
error!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state");
continue;
}
hash_map::Entry::Vacant(v) => {
v.insert(Arc::clone(&tenant));
}
for (tenant_id, local_timelines) in tenants_to_attach {
let mut tenants_accessor = tenants_state::write_tenants();
let tenant = match tenants_accessor.entry(tenant_id) {
hash_map::Entry::Occupied(o) => {
info!("Tenant {tenant_id} was found in pageserver's memory");
Arc::clone(o.get())
}
}
if tenant.current_state() == TenantState::Broken {
warn!("Skipping timeline load for broken tenant {tenant_id}")
} else {
let has_timelines = !local_timelines.is_empty();
match tenant.init_attach_timelines(local_timelines) {
Ok(()) => {
info!("successfully loaded local timelines for tenant {tenant_id}");
tenant.activate(has_timelines);
hash_map::Entry::Vacant(v) => {
info!("Tenant {tenant_id} was not found in pageserver's memory, loading it");
let tenant = Arc::new(Tenant::new(
conf,
TenantConfOpt::default(),
Arc::new(PostgresRedoManager::new(conf, tenant_id)),
tenant_id,
remote_index.clone(),
conf.remote_storage_config.is_some(),
));
match local_timelines {
TenantAttachData::Broken(_) => {
tenant.set_state(TenantState::Broken);
}
TenantAttachData::Ready(_) => {
match Tenant::load_tenant_config(conf, tenant_id) {
Ok(tenant_conf) => {
tenant.update_tenant_config(tenant_conf);
tenant.activate(false);
}
Err(e) => {
error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
tenant.set_state(TenantState::Broken);
}
};
}
}
Err(e) => {
error!("Failed to attach tenant timelines: {e:?}");
tenant.set_state(TenantState::Broken);
v.insert(Arc::clone(&tenant));
tenant
}
};
drop(tenants_accessor);
match local_timelines {
TenantAttachData::Broken(e) => warn!("{}", e),
TenantAttachData::Ready(ref timelines) => {
info!("Attaching {} timelines for {tenant_id}", timelines.len());
debug!("Timelines to attach: {local_timelines:?}");
let has_timelines = !timelines.is_empty();
let timelines_to_attach = timelines
.iter()
.map(|(&k, v)| (k, v.metadata().to_owned()))
.collect();
match tenant.init_attach_timelines(timelines_to_attach) {
Ok(()) => {
info!("successfully loaded local timelines for tenant {tenant_id}");
tenant.activate(has_timelines);
}
Err(e) => {
error!("Failed to attach tenant timelines: {e:?}");
tenant.set_state(TenantState::Broken);
}
}
}
}
@@ -155,32 +208,6 @@ pub fn attach_local_tenants(
info!("Processed {number_of_tenants} local tenants during attach")
}
fn load_local_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_index: &RemoteIndex,
) -> Arc<Tenant> {
let tenant = Arc::new(Tenant::new(
conf,
TenantConfOpt::default(),
Arc::new(PostgresRedoManager::new(conf, tenant_id)),
tenant_id,
remote_index.clone(),
conf.remote_storage_config.is_some(),
));
match Tenant::load_tenant_config(conf, tenant_id) {
Ok(tenant_conf) => {
tenant.update_tenant_config(tenant_conf);
tenant.activate(false);
}
Err(e) => {
error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
tenant.set_state(TenantState::Broken);
}
}
tenant
}
///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
@@ -454,16 +481,21 @@ pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
.collect()
}
#[derive(Debug)]
pub enum TenantAttachData {
Ready(HashMap<TimelineId, TimelineLocalFiles>),
Broken(anyhow::Error),
}
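As an illustration of the Ready/Broken split above: the per-tenant scan result either carries the collected timelines or the error that broke the tenant, so one bad tenant no longer aborts the whole startup. A simplified sketch with the types reduced to plain strings and maps:

use std::collections::HashMap;

enum AttachDataSketch {
    Ready(HashMap<String, u64>), // timeline id -> collected state, simplified
    Broken(anyhow::Error),
}

fn attach_all(tenants: HashMap<String, AttachDataSketch>) {
    for (tenant_id, data) in tenants {
        match data {
            AttachDataSketch::Broken(e) => {
                eprintln!("tenant {tenant_id} is broken, skipping its timelines: {e:#}");
            }
            AttachDataSketch::Ready(timelines) => {
                println!("attaching {} timelines for tenant {tenant_id}", timelines.len());
            }
        }
    }
}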
/// Attempts to collect information about all tenant and timelines, existing on the local FS.
/// If finds any, deletes all temporary files and directories, created before. Also removes empty directories,
/// that may appear due to such removals.
/// Does not fail on particular timeline or tenant collection errors, rather logging them and ignoring the entities.
fn local_tenant_timeline_files(
config: &'static PageServerConf,
) -> anyhow::Result<TenantTimelineValues<(TimelineMetadata, HashSet<PathBuf>)>> {
) -> anyhow::Result<HashMap<TenantId, TenantAttachData>> {
let _entered = info_span!("local_tenant_timeline_files").entered();
let mut local_tenant_timeline_files = TenantTimelineValues::new();
let mut local_tenant_timeline_files = HashMap::new();
let tenants_dir = config.tenants_path();
for tenants_dir_entry in fs::read_dir(&tenants_dir)
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
@@ -485,19 +517,31 @@ fn local_tenant_timeline_files(
}
} else {
match collect_timelines_for_tenant(config, &tenant_dir_path) {
Ok((tenant_id, collected_files)) => {
Ok((tenant_id, TenantAttachData::Broken(e))) => {
local_tenant_timeline_files.entry(tenant_id).or_insert(TenantAttachData::Broken(e));
},
Ok((tenant_id, TenantAttachData::Ready(collected_files))) => {
if collected_files.is_empty() {
match remove_if_empty(&tenant_dir_path) {
Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()),
Ok(false) => {
// insert empty timeline entry: it has some non-temporary files inside that we cannot remove
// so make obvious for HTTP API callers, that something exists there and try to load the tenant
let _ = local_tenant_timeline_files.0.entry(tenant_id).or_default();
let _ = local_tenant_timeline_files.entry(tenant_id).or_insert_with(|| TenantAttachData::Ready(HashMap::new()));
},
Err(e) => error!("Failed to remove empty tenant directory: {e:?}"),
}
} else {
local_tenant_timeline_files.0.entry(tenant_id).or_default().extend(collected_files.into_iter())
match local_tenant_timeline_files.entry(tenant_id) {
hash_map::Entry::Vacant(entry) => {
entry.insert(TenantAttachData::Ready(collected_files));
}
hash_map::Entry::Occupied(entry) =>{
if let TenantAttachData::Ready(old_timelines) = entry.into_mut() {
old_timelines.extend(collected_files);
}
},
}
}
},
Err(e) => error!(
@@ -520,7 +564,7 @@ fn local_tenant_timeline_files(
info!(
"Collected files for {} tenants",
local_tenant_timeline_files.0.len()
local_tenant_timeline_files.len(),
);
Ok(local_tenant_timeline_files)
}
@@ -558,14 +602,10 @@ fn is_temporary(path: &Path) -> bool {
}
}
#[allow(clippy::type_complexity)]
fn collect_timelines_for_tenant(
config: &'static PageServerConf,
tenant_path: &Path,
) -> anyhow::Result<(
TenantId,
HashMap<TimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
)> {
) -> anyhow::Result<(TenantId, TenantAttachData)> {
let tenant_id = tenant_path
.file_name()
.and_then(OsStr::to_str)
@@ -574,6 +614,17 @@ fn collect_timelines_for_tenant(
.context("Could not parse tenant id out of the tenant dir name")?;
let timelines_dir = config.timelines_path(&tenant_id);
if !timelines_dir.as_path().is_dir() {
return Ok((
tenant_id,
TenantAttachData::Broken(anyhow::anyhow!(
"Tenant {} has no timelines directory at {}",
tenant_id,
timelines_dir.display()
)),
));
}
let mut tenant_timelines = HashMap::new();
for timelines_dir_entry in fs::read_dir(&timelines_dir)
.with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))?
@@ -596,7 +647,10 @@ fn collect_timelines_for_tenant(
} else {
match collect_timeline_files(&timeline_dir) {
Ok((timeline_id, metadata, timeline_files)) => {
tenant_timelines.insert(timeline_id, (metadata, timeline_files));
tenant_timelines.insert(
timeline_id,
TimelineLocalFiles::collected(metadata, timeline_files),
);
}
Err(e) => {
error!(
@@ -625,25 +679,25 @@ fn collect_timelines_for_tenant(
}
if tenant_timelines.is_empty() {
match remove_if_empty(&timelines_dir) {
Ok(true) => info!(
"Removed empty tenant timelines directory {}",
timelines_dir.display()
),
Ok(false) => (),
Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"),
}
// this is normal, we've removed all broken, empty and temporary timeline dirs
// but should allow the tenant to stay functional and allow creating new timelines
// on a restart, we require tenants to have the timelines dir, so leave it on disk
debug!("Tenant {tenant_id} has no timelines loaded");
}
Ok((tenant_id, tenant_timelines))
Ok((tenant_id, TenantAttachData::Ready(tenant_timelines)))
}
// discover timeline files and extract timeline metadata
// NOTE: ephemeral files are excluded from the list
fn collect_timeline_files(
timeline_dir: &Path,
) -> anyhow::Result<(TimelineId, TimelineMetadata, HashSet<PathBuf>)> {
let mut timeline_files = HashSet::new();
) -> anyhow::Result<(
TimelineId,
TimelineMetadata,
HashMap<PathBuf, LayerFileMetadata>,
)> {
let mut timeline_files = HashMap::new();
let mut timeline_metadata_path = None;
let timeline_id = timeline_dir
@@ -656,7 +710,9 @@ fn collect_timeline_files(
fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
for entry in timeline_dir_entries {
let entry_path = entry.context("Failed to list timeline dir entry")?.path();
if entry_path.is_file() {
let metadata = entry_path.metadata()?;
if metadata.is_file() {
if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) {
timeline_metadata_path = Some(entry_path);
} else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
@@ -671,7 +727,8 @@ fn collect_timeline_files(
)
})?;
} else {
timeline_files.insert(entry_path);
let layer_metadata = LayerFileMetadata::new(metadata.len());
timeline_files.insert(entry_path, layer_metadata);
}
}
}
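The scan above now records each layer file's size at discovery time. Roughly, stripped of the metadata parsing and ephemeral-file handling, it amounts to the following sketch (the metadata file name is assumed here):

use std::collections::HashMap;
use std::path::{Path, PathBuf};

const METADATA_FILE_NAME: &str = "metadata"; // assumed name of the timeline metadata file

fn collect_layer_sizes(timeline_dir: &Path) -> anyhow::Result<HashMap<PathBuf, u64>> {
    let mut layer_sizes = HashMap::new();
    for entry in std::fs::read_dir(timeline_dir)? {
        let path = entry?.path();
        let metadata = path.metadata()?;
        let is_metadata_file =
            path.file_name().and_then(|n| n.to_str()) == Some(METADATA_FILE_NAME);
        if metadata.is_file() && !is_metadata_file {
            // The size recorded here later becomes the layer's LayerFileMetadata.
            layer_sizes.insert(path, metadata.len());
        }
    }
    Ok(layer_sizes)
}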

View File

@@ -31,7 +31,6 @@ use etcd_broker::Client;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use std::future::Future;
use std::sync::Arc;
use tokio::sync::watch;
use tracing::*;
use url::Url;
@@ -88,37 +87,44 @@ pub fn is_etcd_client_initialized() -> bool {
/// That may lead to certain events not being observed by the listener.
#[derive(Debug)]
pub struct TaskHandle<E> {
events_receiver: watch::Receiver<TaskEvent<E>>,
join_handle: Option<tokio::task::JoinHandle<anyhow::Result<()>>>,
events_receiver: watch::Receiver<TaskStateUpdate<E>>,
cancellation: watch::Sender<()>,
}
#[derive(Debug, Clone)]
pub enum TaskEvent<E> {
Update(TaskStateUpdate<E>),
End(anyhow::Result<()>),
}
#[derive(Debug, Clone)]
pub enum TaskStateUpdate<E> {
Init,
Started,
NewEvent(E),
End,
Progress(E),
}
impl<E: Clone> TaskHandle<E> {
/// Initializes the task, starting it immediately after the creation.
pub fn spawn<Fut>(
task: impl FnOnce(Arc<watch::Sender<TaskEvent<E>>>, watch::Receiver<()>) -> Fut + Send + 'static,
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, watch::Receiver<()>) -> Fut
+ Send
+ 'static,
) -> Self
where
Fut: Future<Output = Result<(), String>> + Send,
E: Sync + Send + 'static,
Fut: Future<Output = anyhow::Result<()>> + Send,
E: Send + Sync + 'static,
{
let (cancellation, cancellation_receiver) = watch::channel(());
let (events_sender, events_receiver) = watch::channel(TaskEvent::Started);
let events_sender = Arc::new(events_sender);
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
let sender = Arc::clone(&events_sender);
let _ = WALRECEIVER_RUNTIME.spawn(async move {
events_sender.send(TaskEvent::Started).ok();
task(sender, cancellation_receiver).await
let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
events_sender.send(TaskStateUpdate::Started).ok();
task(events_sender, cancellation_receiver).await
});
TaskHandle {
join_handle: Some(join_handle),
events_receiver,
cancellation,
}
@@ -126,15 +132,45 @@ impl<E: Clone> TaskHandle<E> {
async fn next_task_event(&mut self) -> TaskEvent<E> {
match self.events_receiver.changed().await {
Ok(()) => self.events_receiver.borrow().clone(),
Err(_task_channel_part_dropped) => TaskEvent::End,
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
Err(_task_channel_part_dropped) => {
TaskEvent::End(match self.join_handle.take() {
Some(jh) => {
if !jh.is_finished() {
warn!("sender is dropped while join handle is still alive");
}
jh.await
.map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
.and_then(|x| x)
}
None => {
// Another option is to have an enum, join handle or result and give away the reference to it
Err(anyhow::anyhow!("Task was joined more than once"))
}
})
}
}
}
/// Aborts current task, waiting for it to finish.
pub async fn shutdown(mut self) {
self.cancellation.send(()).ok();
// wait until the sender is dropped
while self.events_receiver.changed().await.is_ok() {}
pub async fn shutdown(self) {
match self.join_handle {
Some(jh) => {
self.cancellation.send(()).ok();
match jh.await {
Ok(Ok(())) => debug!("Shutdown success"),
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
Err(join_error) => {
if join_error.is_cancelled() {
error!("Shutdown task was cancelled");
} else {
error!("Shutdown task join error: {join_error}")
}
}
}
}
None => {}
}
}
}
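The reworked handle above pairs a watch channel (progress updates, possibly coalesced) with the JoinHandle (the task's final result, recovered once the sender side is dropped). A stripped-down, self-contained sketch of that shape with illustrative names, not the actual walreceiver types:

use tokio::sync::watch;

struct SketchHandle {
    updates: watch::Receiver<u32>,
    join: Option<tokio::task::JoinHandle<anyhow::Result<()>>>,
}

impl SketchHandle {
    fn spawn() -> Self {
        let (tx, rx) = watch::channel(0);
        let join = tokio::spawn(async move {
            for i in 1..=3u32 {
                tx.send(i).ok(); // publish progress; intermediate values may be coalesced
            }
            Ok(()) // dropping `tx` here is what eventually wakes the receiver with an error
        });
        SketchHandle { updates: rx, join: Some(join) }
    }

    /// Returns Some(final result) once the task is done, None for a progress update.
    async fn next_event(&mut self) -> Option<anyhow::Result<()>> {
        match self.updates.changed().await {
            Ok(()) => {
                println!("progress: {}", *self.updates.borrow());
                None
            }
            // Sender gone: join the task to learn how it actually finished.
            Err(_) => Some(match self.join.take() {
                Some(handle) => handle
                    .await
                    .unwrap_or_else(|e| Err(anyhow::anyhow!("failed to join task: {e}"))),
                None => Err(anyhow::anyhow!("task was joined more than once")),
            }),
        }
    }
}

#[tokio::main]
async fn main() {
    let mut handle = SketchHandle::spawn();
    loop {
        if let Some(result) = handle.next_event().await {
            println!("task finished: {result:?}");
            break;
        }
    }
}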

View File

@@ -16,10 +16,10 @@ use std::{
time::Duration,
};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::task_mgr::WALRECEIVER_RUNTIME;
use crate::tenant::Timeline;
use crate::{task_mgr, walreceiver::TaskStateUpdate};
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use etcd_broker::{
@@ -145,19 +145,26 @@ async fn connection_manager_loop_step(
let wal_connection = walreceiver_state.wal_connection.as_mut()
.expect("Should have a connection, as checked by the corresponding select! guard");
match wal_connection_update {
TaskEvent::Started => {},
TaskEvent::NewEvent(status) => {
if status.has_processed_wal {
// We have advanced last_record_lsn by processing the WAL received
// from this safekeeper. This is good enough to clean unsuccessful
// retries history and allow reconnecting to this safekeeper without
// sleeping for a long time.
walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
TaskEvent::Update(c) => {
match c {
TaskStateUpdate::Init | TaskStateUpdate::Started => {},
TaskStateUpdate::Progress(status) => {
if status.has_processed_wal {
// We have advanced last_record_lsn by processing the WAL received
// from this safekeeper. This is good enough to clean unsuccessful
// retries history and allow reconnecting to this safekeeper without
// sleeping for a long time.
walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
}
wal_connection.status = status.to_owned();
}
}
wal_connection.status = status;
},
TaskEvent::End => {
debug!("WAL receiving task finished");
TaskEvent::End(walreceiver_task_result) => {
match walreceiver_task_result {
Ok(()) => debug!("WAL receiving task finished"),
Err(e) => error!("wal receiver task finished with an error: {e:?}"),
}
walreceiver_state.drop_old_connection(false).await;
},
}
@@ -363,13 +370,13 @@ impl WalreceiverState {
async move {
super::walreceiver_connection::handle_walreceiver_connection(
timeline,
&new_wal_source_connstr,
events_sender.as_ref(),
new_wal_source_connstr,
events_sender,
cancellation,
connect_timeout,
)
.await
.map_err(|e| format!("walreceiver connection handling failure: {e:#}"))
.context("walreceiver connection handling failure")
}
.instrument(info_span!("walreceiver_connection", id = %id))
});
@@ -885,7 +892,7 @@ mod tests {
status: connection_status.clone(),
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskEvent::NewEvent(connection_status.clone()))
.send(TaskStateUpdate::Progress(connection_status.clone()))
.ok();
Ok(())
}),
@@ -1145,7 +1152,7 @@ mod tests {
status: connection_status.clone(),
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskEvent::NewEvent(connection_status.clone()))
.send(TaskStateUpdate::Progress(connection_status.clone()))
.ok();
Ok(())
}),
@@ -1233,7 +1240,7 @@ mod tests {
status: connection_status.clone(),
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskEvent::NewEvent(connection_status.clone()))
.send(TaskStateUpdate::Progress(connection_status.clone()))
.ok();
Ok(())
}),

View File

@@ -12,14 +12,15 @@ use chrono::{NaiveDateTime, Utc};
use fail::fail_point;
use futures::StreamExt;
use postgres::{SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::v14::xlog_utils::normalize_lsn;
use postgres_ffi::WAL_SEGMENT_SIZE;
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use tokio::{pin, select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
use tracing::{debug, error, info, trace, warn};
use super::TaskEvent;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
use crate::{
task_mgr,
task_mgr::TaskKind,
@@ -55,8 +56,8 @@ pub struct WalConnectionStatus {
/// messages as we go.
pub async fn handle_walreceiver_connection(
timeline: Arc<Timeline>,
wal_source_connstr: &str,
events_sender: &watch::Sender<TaskEvent<WalConnectionStatus>>,
wal_source_connstr: String,
events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
mut cancellation: watch::Receiver<()>,
connect_timeout: Duration,
) -> anyhow::Result<()> {
@@ -81,7 +82,7 @@ pub async fn handle_walreceiver_connection(
streaming_lsn: None,
commit_lsn: None,
};
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
return Ok(());
}
@@ -112,8 +113,7 @@ pub async fn handle_walreceiver_connection(
_ = connection_cancellation.changed() => info!("Connection cancelled"),
}
Ok(())
}
.instrument(info_span!("walreceiver connection")),
},
);
// Immediately increment the gauge, then create a job to decrement it on task exit.
@@ -134,7 +134,7 @@ pub async fn handle_walreceiver_connection(
connection_status.latest_connection_update = Utc::now().naive_utc();
connection_status.latest_wal_update = Utc::now().naive_utc();
connection_status.commit_lsn = Some(end_of_wal);
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
return Ok(());
}
@@ -158,6 +158,14 @@ pub async fn handle_walreceiver_connection(
// There might be some padding after the last full record, skip it.
startpoint += startpoint.calc_padding(8u32);
// If the starting point is at a WAL page boundary, skip past the page header. We don't need the page headers
// for anything, and in some corner cases, the compute node might have never generated the WAL for page headers
//. That happens if you create a branch at page boundary: the start point of the branch is at the page boundary,
// but when the compute node first starts on the branch, we normalize the first REDO position to just after the page
// header (see generate_pg_control()), so the WAL for the page header is never streamed from the compute node
// to the safekeepers.
startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE);
info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}...");
let query = format!("START_REPLICATION PHYSICAL {startpoint}");
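To make the comment above concrete, here is a rough sketch of what the page-boundary normalisation needs to do. The constants are assumed defaults (8 KiB WAL pages, 16 MiB segments, 24/40-byte short/long page headers), not the actual postgres_ffi definitions:

const XLOG_BLCKSZ: u64 = 8192; // assumed WAL page size
const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024; // assumed default segment size
const SHORT_PAGE_HEADER: u64 = 24; // SizeOfXLogShortPHD
const LONG_PAGE_HEADER: u64 = 40; // SizeOfXLogLongPHD, used on the first page of a segment

fn normalize_lsn_sketch(lsn: u64) -> u64 {
    if lsn % XLOG_BLCKSZ != 0 {
        lsn // not on a page boundary, nothing to skip
    } else if lsn % WAL_SEGMENT_SIZE == 0 {
        lsn + LONG_PAGE_HEADER // a segment start carries the long page header
    } else {
        lsn + SHORT_PAGE_HEADER // an ordinary page boundary, skip the short header
    }
}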
@@ -202,7 +210,7 @@ pub async fn handle_walreceiver_connection(
}
&_ => {}
};
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
warn!("Wal connection event listener dropped, aborting the connection: {e}");
return Ok(());
}
@@ -268,7 +276,8 @@ pub async fn handle_walreceiver_connection(
if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg {
// We have successfully processed at least one WAL record.
connection_status.has_processed_wal = true;
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone()))
{
warn!("Wal connection event listener dropped, aborting the connection: {e}");
return Ok(());
}

View File

@@ -1471,12 +1471,6 @@ SendProposerElected(Safekeeper *sk)
*/
th = &sk->voteResponse.termHistory;
/*
* If any WAL is present on the sk, it must be authorized by some term.
* OTOH, without any WAL there are no term switches in the log.
*/
Assert((th->n_entries == 0) ==
(sk->voteResponse.flushLsn == InvalidXLogRecPtr));
/* We must start somewhere. */
Assert(propTermHistory.n_entries >= 1);

View File

@@ -5,11 +5,11 @@ edition = "2021"
[dependencies]
anyhow = "1.0"
async-trait = "0.1"
atty = "0.2.14"
base64 = "0.13.0"
bstr = "0.2.17"
bstr = "1.0"
bytes = { version = "1.0.1", features = ['serde'] }
clap = "3.0"
clap = "4.0"
futures = "0.3.13"
git-version = "0.3.5"
hashbrown = "0.12"
@@ -22,7 +22,11 @@ once_cell = "1.13.0"
parking_lot = "0.12"
pin-project-lite = "0.2.7"
rand = "0.8.3"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
reqwest = { version = "0.11", default-features = false, features = [
"blocking",
"json",
"rustls-tls",
] }
routerify = "3"
rustls = "0.20.0"
rustls-pemfile = "1"
@@ -33,17 +37,20 @@ sha2 = "0.10.2"
socket2 = "0.4.4"
thiserror = "1.0.30"
tokio = { version = "1.17", features = ["macros"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-rustls = "0.23.0"
tracing = "0.1.36"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
url = "2.2.2"
uuid = { version = "0.8.2", features = ["v4", "serde"]}
x509-parser = "0.13.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
x509-parser = "0.14"
utils = { path = "../libs/utils" }
metrics = { path = "../libs/metrics" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
[dev-dependencies]
rcgen = "0.8.14"
rstest = "0.12"
async-trait = "0.1"
rcgen = "0.10"
rstest = "0.15"
tokio-postgres-rustls = "0.9.0"

View File

@@ -11,13 +11,24 @@ use crate::{
compute, http, mgmt, stream, url,
waiters::{self, Waiter, Waiters},
};
use metrics::{register_int_counter_vec, IntCounterVec};
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
static CPLANE_WAITERS: Lazy<Waiters<mgmt::ComputeReady>> = Lazy::new(Default::default);
static AUTH_METHOD_USED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_auth_method_used_total",
"Number of authentication requests served.",
&["method_name"],
)
.unwrap()
});
/// Give caller an opportunity to wait for the cloud's reply.
pub async fn with_waiter<R, T, E>(
psql_session_id: impl Into<String>,
@@ -171,6 +182,11 @@ impl BackendType<'_, ClientCredentials<'_>> {
// support SNI or other means of passing the project name.
// We now expect to see a very specific payload in the place of password.
if creds.project().is_none() {
AUTH_METHOD_USED_COUNTER
.with_label_values(&["password_hack"])
.inc();
warn!("project name not specified, resorting to the password hack auth flow");
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
.await?
@@ -179,6 +195,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
// Finally we may finish the initialization of `creds`.
// TODO: add missing type safety to ClientCredentials.
info!(project = &payload.project, "received missing parameter");
creds.project = Some(payload.project.into());
let mut config = match &self {
@@ -196,6 +213,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
// We should use a password from payload as well.
config.password(payload.password);
info!("user successfully authenticated (using the password hack)");
return Ok(compute::NodeInfo {
reported_auth_ok: false,
config,
@@ -203,19 +221,40 @@ impl BackendType<'_, ClientCredentials<'_>> {
}
}
match self {
let res = match self {
Console(endpoint, creds) => {
AUTH_METHOD_USED_COUNTER
.with_label_values(&["console"])
.inc();
info!(
user = creds.user,
project = creds.project(),
"performing authentication using the console"
);
console::Api::new(&endpoint, extra, &creds)
.handle_user(client)
.await
}
Postgres(endpoint, creds) => {
AUTH_METHOD_USED_COUNTER
.with_label_values(&["postgres"])
.inc();
info!("performing mock authentication using a local postgres instance");
postgres::Api::new(&endpoint, &creds)
.handle_user(client)
.await
}
// NOTE: this auth backend doesn't use client credentials.
Link(url) => link::handle_user(&url, client).await,
}
Link(url) => {
AUTH_METHOD_USED_COUNTER.with_label_values(&["link"]).inc();
info!("performing link authentication");
link::handle_user(&url, client).await
}
}?;
info!("user successfully authenticated");
Ok(res)
}
}
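The counter added above follows the standard labelled-counter pattern: one counter family, one label value per authentication method, incremented on each request. A standalone sketch against the plain prometheus crate (the in-tree metrics crate re-exports the same macro names):

use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, Encoder, IntCounterVec, TextEncoder};

static AUTH_METHOD_USED: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_auth_method_used_total",
        "Number of authentication requests served.",
        &["method_name"]
    )
    .unwrap()
});

fn record_auth(method: &str) {
    AUTH_METHOD_USED.with_label_values(&[method]).inc();
}

fn main() {
    record_auth("console");
    record_auth("link");
    record_auth("console");

    // Render the default registry; each label value seen so far becomes one sample.
    let mut buf = Vec::new();
    TextEncoder::new().encode(&prometheus::gather(), &mut buf).unwrap();
    println!("{}", String::from_utf8(buf).unwrap());
}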

View File

@@ -8,35 +8,20 @@ use crate::{
http, scram,
stream::PqStream,
};
use futures::TryFutureExt;
use serde::{Deserialize, Serialize};
use std::future::Future;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{error, info, info_span};
const REQUEST_FAILED: &str = "Console request failed";
#[derive(Debug, Error)]
pub enum TransportError {
#[error("Console responded with a malformed JSON: {0}")]
BadResponse(#[from] serde_json::Error),
#[error("{}", REQUEST_FAILED)]
pub struct TransportError(#[from] std::io::Error);
/// HTTP status (other than 200) returned by the console.
#[error("Console responded with an HTTP status: {0}")]
HttpStatus(reqwest::StatusCode),
#[error(transparent)]
Io(#[from] std::io::Error),
}
impl UserFacingError for TransportError {
fn to_string_client(&self) -> String {
use TransportError::*;
match self {
HttpStatus(_) => self.to_string(),
_ => REQUEST_FAILED.to_owned(),
}
}
}
impl UserFacingError for TransportError {}
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
impl From<reqwest::Error> for TransportError {
@@ -148,10 +133,11 @@ impl<'a> Api<'a> {
}
async fn get_auth_info(&self) -> Result<AuthInfo, GetAuthInfoError> {
let request_id = uuid::Uuid::new_v4().to_string();
let req = self
.endpoint
.get("proxy_get_role_secret")
.header("X-Request-ID", uuid::Uuid::new_v4().to_string())
.header("X-Request-ID", &request_id)
.query(&[("session_id", self.extra.session_id)])
.query(&[
("application_name", self.extra.application_name),
@@ -160,27 +146,30 @@ impl<'a> Api<'a> {
])
.build()?;
// TODO: use a proper logger
println!("cplane request: {}", req.url());
let span = info_span!("http", id = request_id, url = req.url().as_str());
info!(parent: &span, "request auth info");
let msg = self
.endpoint
.checked_execute(req)
.and_then(|r| r.json::<GetRoleSecretResponse>())
.await
.map_err(|e| {
error!(parent: &span, "{e}");
e
})?;
let resp = self.endpoint.execute(req).await?;
if !resp.status().is_success() {
return Err(TransportError::HttpStatus(resp.status()).into());
}
let response: GetRoleSecretResponse = serde_json::from_str(&resp.text().await?)?;
scram::ServerSecret::parse(&response.role_secret)
scram::ServerSecret::parse(&msg.role_secret)
.map(AuthInfo::Scram)
.ok_or(GetAuthInfoError::BadSecret)
}
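`checked_execute` is the proxy's own endpoint helper, so its exact signature is internal; with plain `reqwest`, the same "tag the request, trace it, fail on non-2xx, deserialize" sequence looks roughly like the sketch below. The function name is illustrative, and reqwest's `json` feature plus uuid's `v4` feature are assumed.

use serde::Deserialize;
use tracing::{error, info, info_span};

#[derive(Deserialize)]
struct GetRoleSecretResponse {
    role_secret: String,
}

async fn get_role_secret(
    client: &reqwest::Client,
    url: &str,
    session_id: &str,
) -> Result<GetRoleSecretResponse, reqwest::Error> {
    let request_id = uuid::Uuid::new_v4().to_string();
    let span = info_span!("http", id = %request_id, url);
    info!(parent: &span, "request auth info");

    let result = async {
        client
            .get(url)
            .header("X-Request-ID", request_id.as_str())
            .query(&[("session_id", session_id)])
            .send()
            .await?
            // Replaces the manual `resp.status().is_success()` check.
            .error_for_status()?
            .json::<GetRoleSecretResponse>()
            .await
    }
    .await;

    // Log any failure inside the request's span before bubbling it up.
    result.map_err(|e| {
        error!(parent: &span, "{e}");
        e
    })
}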
/// Wake up the compute node and return the corresponding connection info.
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
let request_id = uuid::Uuid::new_v4().to_string();
let req = self
.endpoint
.get("proxy_wake_compute")
.header("X-Request-ID", uuid::Uuid::new_v4().to_string())
.header("X-Request-ID", &request_id)
.query(&[("session_id", self.extra.session_id)])
.query(&[
("application_name", self.extra.application_name),
@@ -188,19 +177,21 @@ impl<'a> Api<'a> {
])
.build()?;
// TODO: use a proper logger
println!("cplane request: {}", req.url());
let resp = self.endpoint.execute(req).await?;
if !resp.status().is_success() {
return Err(TransportError::HttpStatus(resp.status()).into());
}
let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await?)?;
let span = info_span!("http", id = request_id, url = req.url().as_str());
info!(parent: &span, "request wake-up");
let msg = self
.endpoint
.checked_execute(req)
.and_then(|r| r.json::<GetWakeComputeResponse>())
.await
.map_err(|e| {
error!(parent: &span, "{e}");
e
})?;
// Unfortunately, ownership won't let us use `Option::ok_or` here.
let (host, port) = match parse_host_port(&response.address) {
None => return Err(WakeComputeError::BadComputeAddress(response.address)),
let (host, port) = match parse_host_port(&msg.address) {
None => return Err(WakeComputeError::BadComputeAddress(msg.address)),
Some(x) => x,
};
@@ -227,15 +218,18 @@ where
GetAuthInfo: Future<Output = Result<AuthInfo, GetAuthInfoError>>,
WakeCompute: Future<Output = Result<ComputeConnCfg, WakeComputeError>>,
{
info!("fetching user's authentication info");
let auth_info = get_auth_info(endpoint).await?;
let flow = AuthFlow::new(client);
let scram_keys = match auth_info {
AuthInfo::Md5(_) => {
// TODO: decide if we should support MD5 in api v2
info!("auth endpoint chooses MD5");
return Err(auth::AuthError::bad_auth_method("MD5"));
}
AuthInfo::Scram(secret) => {
info!("auth endpoint chooses SCRAM");
let scram = auth::Scram(&secret);
Some(compute::ScramKeys {
client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(),
@@ -259,3 +253,15 @@ fn parse_host_port(input: &str) -> Option<(&str, u16)> {
let (host, port) = input.split_once(':')?;
Some((host, port.parse().ok()?))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_host_port() {
let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
assert_eq!(host, "127.0.0.1");
assert_eq!(port, 5432);
}
}

View File

@@ -1,6 +1,7 @@
use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, info_span};
use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
#[derive(Debug, Error)]
@@ -50,17 +51,20 @@ pub async fn handle_user(
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<compute::NodeInfo> {
let psql_session_id = new_psql_session_id();
let span = info_span!("link", psql_session_id = &psql_session_id);
let greeting = hello_message(link_uri, &psql_session_id);
let db_info = super::with_waiter(psql_session_id, |waiter| async {
// Give user a URL to spawn a new database
// Give user a URL to spawn a new database.
info!(parent: &span, "sending the auth URL to the user");
client
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?
.write_message(&Be::NoticeResponse(&greeting))
.await?;
// Wait for web console response (see `mgmt`)
// Wait for web console response (see `mgmt`).
info!(parent: &span, "waiting for console's reply...");
waiter.await?.map_err(LinkAuthError::AuthFailed)
})
.await?;
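`with_waiter` parks this connection until the management handler (`mgmt`) reports the web console's decision for the generated `psql_session_id`. The sketch below shows only the registration/notification idea behind such a waiter registry, not the actual `waiters::Waiters` implementation.

use std::collections::HashMap;
use std::sync::Mutex;
use tokio::sync::oneshot;

struct Waiters<T> {
    pending: Mutex<HashMap<String, oneshot::Sender<T>>>,
}

impl<T> Waiters<T> {
    fn new() -> Self {
        Self {
            pending: Mutex::new(HashMap::new()),
        }
    }

    // Called by the link flow before the auth URL is shown to the user.
    fn register(&self, psql_session_id: String) -> oneshot::Receiver<T> {
        let (tx, rx) = oneshot::channel();
        self.pending.lock().unwrap().insert(psql_session_id, tx);
        rx
    }

    // Called by the mgmt handler once the web console reports the result.
    fn notify(&self, psql_session_id: &str, value: T) {
        if let Some(tx) = self.pending.lock().unwrap().remove(psql_session_id) {
            let _ = tx.send(value);
        }
    }
}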

View File

@@ -3,6 +3,7 @@
use crate::error::UserFacingError;
use std::borrow::Cow;
use thiserror::Error;
use tracing::info;
use utils::pq_proto::StartupMessageParams;
#[derive(Debug, Error, PartialEq, Eq, Clone)]
@@ -54,13 +55,10 @@ impl<'a> ClientCredentials<'a> {
let dbname = get_param("database")?;
// Project name might be passed via PG's command-line options.
let project_a = params.options_raw().and_then(|options| {
for opt in options {
if let Some(value) = opt.strip_prefix("project=") {
return Some(Cow::Borrowed(value));
}
}
None
let project_a = params.options_raw().and_then(|mut options| {
options
.find_map(|opt| opt.strip_prefix("project="))
.map(Cow::Borrowed)
});
// Alternative project name is in fact a subdomain from SNI.
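The rewrite above swaps the explicit loop for `find_map` plus `strip_prefix`; both forms scan the startup options and take the first `project=<name>` entry. A standalone sketch (function name and test are illustrative):

use std::borrow::Cow;

fn project_from_options<'a>(options: impl IntoIterator<Item = &'a str>) -> Option<Cow<'a, str>> {
    options
        .into_iter()
        // `strip_prefix` returns the remainder only when the prefix matches,
        // so `find_map` stops at the first `project=...` option.
        .find_map(|opt| opt.strip_prefix("project="))
        .map(Cow::Borrowed)
}

#[test]
fn finds_project_option() {
    let opts = ["-c search_path=public", "project=my-project"];
    assert_eq!(
        project_from_options(opts),
        Some(Cow::Borrowed("my-project"))
    );
}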
@@ -85,6 +83,13 @@ impl<'a> ClientCredentials<'a> {
}
.transpose()?;
info!(
user = user,
dbname = dbname,
project = project.as_deref(),
"credentials"
);
Ok(Self {
user,
dbname,

View File

@@ -4,6 +4,7 @@ use parking_lot::Mutex;
use std::net::SocketAddr;
use tokio::net::TcpStream;
use tokio_postgres::{CancelToken, NoTls};
use tracing::info;
use utils::pq_proto::CancelKeyData;
/// Enables serving `CancelRequest`s.
@@ -18,8 +19,9 @@ impl CancelMap {
.lock()
.get(&key)
.and_then(|x| x.clone())
.with_context(|| format!("unknown session: {:?}", key))?;
.with_context(|| format!("query cancellation key not found: {key}"))?;
info!("cancelling query per user's request using key {key}");
cancel_closure.try_cancel_query().await
}
@@ -41,17 +43,29 @@ impl CancelMap {
self.0
.lock()
.try_insert(key, None)
.map_err(|_| anyhow!("session already exists: {:?}", key))?;
.map_err(|_| anyhow!("query cancellation key already exists: {key}"))?;
// This will guarantee that the session gets dropped
// as soon as the future is finished.
scopeguard::defer! {
self.0.lock().remove(&key);
info!("dropped query cancellation key {key}");
}
info!("registered new query cancellation key {key}");
let session = Session::new(key, self);
f(session).await
}
#[cfg(test)]
fn contains(&self, session: &Session) -> bool {
self.0.lock().contains_key(&session.key)
}
#[cfg(test)]
fn is_empty(&self) -> bool {
self.0.lock().is_empty()
}
}
/// This should've been a [`std::future::Future`], but
@@ -92,10 +106,13 @@ impl<'a> Session<'a> {
fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self {
Self { key, cancel_map }
}
}
impl Session<'_> {
/// Store the cancel token for the given session.
/// This enables query cancellation in [`crate::proxy::handshake`].
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
info!("enabling query cancellation for this session");
self.cancel_map
.0
.lock()
@@ -104,3 +121,39 @@ impl<'a> Session<'a> {
self.key
}
}
#[cfg(test)]
mod tests {
use super::*;
use once_cell::sync::Lazy;
#[tokio::test]
async fn check_session_drop() -> anyhow::Result<()> {
static CANCEL_MAP: Lazy<CancelMap> = Lazy::new(Default::default);
let (tx, rx) = tokio::sync::oneshot::channel();
let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move {
assert!(CANCEL_MAP.contains(&session));
tx.send(()).expect("failed to send");
futures::future::pending::<()>().await; // sleep forever
Ok(())
}));
// Wait until the task has been spawned.
rx.await.context("failed to hear from the task")?;
// Drop the session's entry by cancelling the task.
task.abort();
let error = task.await.expect_err("task should have failed");
if !error.is_cancelled() {
anyhow::bail!(error);
}
// Check that the session has been dropped.
assert!(CANCEL_MAP.is_empty());
Ok(())
}
}
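The property this new test pins down is that `scopeguard::defer!` removes the cancellation key on every exit path, including when the owning future is dropped after `task.abort()`. A condensed, synchronous sketch of that cleanup guarantee, with the key type simplified from `CancelKeyData` to `u64`:

use std::collections::HashMap;
use std::sync::Mutex;

fn with_key<R>(map: &Mutex<HashMap<u64, ()>>, key: u64, f: impl FnOnce() -> R) -> R {
    map.lock().unwrap().insert(key, ());

    // The deferred block runs when this scope is left, mirroring how the
    // async version cleans up when its future is dropped.
    scopeguard::defer! {
        map.lock().unwrap().remove(&key);
    }

    f()
}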

View File

@@ -5,6 +5,7 @@ use std::{io, net::SocketAddr};
use thiserror::Error;
use tokio::net::TcpStream;
use tokio_postgres::NoTls;
use tracing::{error, info};
use utils::pq_proto::StartupMessageParams;
#[derive(Debug, Error)]
@@ -54,6 +55,7 @@ impl NodeInfo {
use tokio_postgres::config::Host;
let connect_once = |host, port| {
info!("trying to connect to a compute node at {host}:{port}");
TcpStream::connect((host, port)).and_then(|socket| async {
let socket_addr = socket.peer_addr()?;
// This prevents the load balancer from severing the connection.
@@ -72,7 +74,11 @@ impl NodeInfo {
if ports.len() > 1 && ports.len() != hosts.len() {
return Err(io::Error::new(
io::ErrorKind::Other,
format!("couldn't connect: bad compute config, ports and hosts entries' count does not match: {:?}", self.config),
format!(
"couldn't connect: bad compute config, \
ports and hosts entries' count does not match: {:?}",
self.config
),
));
}
@@ -88,7 +94,7 @@ impl NodeInfo {
Ok(socket) => return Ok(socket),
Err(err) => {
// We can't return an error here yet, as there might be more hosts to try.
println!("failed to connect to compute `{host}:{port}`: {err}");
error!("failed to connect to a compute node at {host}:{port}: {err}");
connection_error = Some(err);
}
}
@@ -160,8 +166,8 @@ impl NodeInfo {
.ok_or(ConnectionError::FailedToFetchPgVersion)?
.into();
info!("connected to user's compute node at {socket_addr}");
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
let db = PostgresConnection { stream, version };
Ok((db, cancel_closure))
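The connection loop above tries every host:port pair from the compute config, logs each failure at error level, and only gives up once all candidates are exhausted, returning the last error seen. A self-contained sketch of that strategy (function name and fallback error message are illustrative):

use std::io;
use tokio::net::TcpStream;
use tracing::{error, info};

async fn connect_any(hosts: &[(&str, u16)]) -> io::Result<TcpStream> {
    let mut last_error = None;

    for &(host, port) in hosts {
        info!("trying to connect to a compute node at {host}:{port}");
        match TcpStream::connect((host, port)).await {
            Ok(socket) => return Ok(socket),
            // Don't bail out yet: there might be more hosts to try.
            Err(err) => {
                error!("failed to connect to a compute node at {host}:{port}: {err}");
                last_error = Some(err);
            }
        }
    }

    Err(last_error.unwrap_or_else(|| {
        io::Error::new(io::ErrorKind::Other, "no compute node hosts to try")
    }))
}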

Some files were not shown because too many files have changed in this diff.