persistent_range_query: add layer_map_test

persistent_range_query: add stress test
persistent_range_query: more refs
2026-03-08 02:40:37 +00:00 · 2022-11-24 04:47:19 +02:00 · 2022-11-24 03:50:18 +02:00 · 2022-11-24 03:45:02 +02:00 · 2022-11-24 02:31:48 +02:00 · 2022-11-24 02:11:06 +02:00
197 changed files with 10361 additions and 3375 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -55,6 +55,22 @@ runs:
        name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon

+    - name: Download Neon binaries for the previous release
+      if: inputs.build_type != 'remote'
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
+        path: /tmp/neon-previous
+        prefix: latest
+
+    - name: Download compatibility snapshot for Postgres 14
+      if: inputs.build_type != 'remote'
+      uses: ./.github/actions/download
+      with:
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
+        path: /tmp/compatibility_snapshot_pg14
+        prefix: latest
+
    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
      uses: actions/checkout@v3
@@ -73,22 +89,18 @@ runs:
      shell: bash -euxo pipefail {0}
      run: ./scripts/pysync

-    - name: Download compatibility snapshot for Postgres 14
-      uses: ./.github/actions/download
-      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
-        path: /tmp/compatibility_snapshot_pg14
-        prefix: latest
-
    - name: Run pytest
      env:
        NEON_BIN: /tmp/neon/bin
+        COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin
+        COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
        TEST_OUTPUT: /tmp/test_output
        BUILD_TYPE: ${{ inputs.build_type }}
        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
-        ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
+        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
+        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
      shell: bash -euxo pipefail {0}
      run: |
        # PLATFORM will be embedded in the perf test report
@@ -111,7 +123,12 @@ runs:
          exit 1
        fi
        if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
+          # -n4 uses four processes to run tests via pytest-xdist
          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
+
+          # --dist=loadgroup sends all tests marked with the same @pytest.mark.xdist_group
+          # to the same worker, so that @pytest.mark.order keeps working under xdist
+          EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS"
        fi

        if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
@@ -146,9 +163,9 @@ runs:
        # --verbose prints name of each test (helpful when there are
        # multiple tests in one file)
        # -rA prints summary in the end
-        # -n4 uses four processes to run tests via pytest-xdist
        # -s is not used to prevent pytest from capturing output, because tests are running
        # in parallel and logs are mixed between different tests
+        #
        mkdir -p $TEST_OUTPUT/allure/results
        "${cov_prefix[@]}" ./scripts/pytest \
          --junitxml=$TEST_OUTPUT/junit.xml \
@@ -168,12 +185,12 @@ runs:
      uses: ./.github/actions/upload
      with:
        name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
-        # The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
-        path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
+        # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
+        path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
        prefix: latest

    - name: Create Allure report
-      if: always()
+      if: success() || failure()
      uses: ./.github/actions/allure-report
      with:
        action: store
--- a/.github/ansible/.gitignore
+++ b/.github/ansible/.gitignore
@@ -1,5 +1,3 @@
-zenith_install.tar.gz
-.zenith_current_version
 neon_install.tar.gz
 .neon_current_version

--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -0,0 +1,35 @@
+storage:
+  vars:
+    bucket_name: neon-prod-storage-ap-southeast-1
+    bucket_region: ap-southeast-1
+    console_mgmt_base_url: http://console-release.local
+    etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: ap-southeast-1
+    ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
+    console_region_id: aws-ap-southeast-1
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-064de8ea28bdb495b
+        pageserver-1.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-0b180defcaeeb6b93
+
+    safekeepers:
+      hosts:
+        safekeeper-0.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-0d6f1dc5161eef894
+        safekeeper-1.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-0e338adda8eb2d19f
+        safekeeper-2.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-04fb63634e4679eb9
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -0,0 +1,35 @@
+storage:
+  vars:
+    bucket_name: neon-prod-storage-eu-central-1
+    bucket_region: eu-central-1
+    console_mgmt_base_url: http://console-release.local
+    etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: eu-central-1
+    ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
+    console_region_id: aws-eu-central-1
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.eu-central-1.aws.neon.tech:
+          ansible_host:  i-0cd8d316ecbb715be
+        pageserver-1.eu-central-1.aws.neon.tech:
+          ansible_host:  i-090044ed3d383fef0
+
+    safekeepers:
+      hosts:
+        safekeeper-0.eu-central-1.aws.neon.tech:
+          ansible_host:  i-0b238612d2318a050
+        safekeeper-1.eu-central-1.aws.neon.tech:
+          ansible_host:  i-07b9c45e5c2637cd4
+        safekeeper-2.eu-central-1.aws.neon.tech:
+          ansible_host:  i-020257302c3c93d88
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ b/.github/ansible/prod.us-east-2.hosts.yaml
@@ -0,0 +1,36 @@
+storage:
+  vars:
+    bucket_name: neon-prod-storage-us-east-2
+    bucket_region: us-east-2
+    console_mgmt_base_url: http://console-release.local
+    etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: us-east-2
+    ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
+    console_region_id: aws-us-east-2
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.us-east-2.aws.neon.tech:
+          ansible_host:  i-062227ba7f119eb8c
+        pageserver-1.us-east-2.aws.neon.tech:
+          ansible_host:  i-0b3ec0afab5968938
+
+    safekeepers:
+      hosts:
+        safekeeper-0.us-east-2.aws.neon.tech:
+          ansible_host:  i-0e94224750c57d346
+        safekeeper-1.us-east-2.aws.neon.tech:
+          ansible_host:  i-06d113fb73bfddeb0
+        safekeeper-2.us-east-2.aws.neon.tech:
+          ansible_host:  i-09f66c8e04afff2e8
+          
--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -22,6 +22,10 @@ storage:
          console_region_id: aws-us-west-2
        zenith-1-ps-3:
          console_region_id: aws-us-west-2
+        zenith-1-ps-4:
+          console_region_id: aws-us-west-2
+        zenith-1-ps-5:
+          console_region_id: aws-us-west-2

    safekeepers:
      hosts:
--- a/.github/ansible/ssm_config
+++ b/.github/ansible/ssm_config
@@ -1,3 +1,2 @@
 ansible_connection: aws_ssm
-ansible_aws_ssm_bucket_name: neon-dev-bucket
 ansible_python_interpreter: /usr/bin/python3
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -0,0 +1,33 @@
+storage:
+  vars:
+    bucket_name: neon-dev-storage-eu-west-1
+    bucket_region: eu-west-1
+    console_mgmt_base_url: http://console-staging.local
+    etcd_endpoints: etcd-0.eu-west-1.aws.neon.build:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: eu-west-1
+    ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
+    console_region_id: aws-eu-west-1
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.eu-west-1.aws.neon.build:
+          ansible_host: i-01d496c5041c7f34c
+
+    safekeepers:
+      hosts:
+        safekeeper-0.eu-west-1.aws.neon.build:
+          ansible_host: i-05226ef85722831bf
+        safekeeper-1.eu-west-1.aws.neon.build:
+          ansible_host: i-06969ee1bf2958bfc
+        safekeeper-2.eu-west-1.aws.neon.build:
+          ansible_host: i-087892e9625984a0b
--- a/.github/ansible/staging.hosts.yaml
+++ b/.github/ansible/staging.hosts.yaml
@@ -3,7 +3,7 @@ storage:
    bucket_name: zenith-staging-storage-us-east-1
    bucket_region: us-east-1
    console_mgmt_base_url: http://console-staging.local
-    etcd_endpoints: zenith-us-stage-etcd.local:2379
+    etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      remote_storage:
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -14,6 +14,7 @@ storage:
    hostname_suffix: ""
    remote_user: ssm-user
    ansible_aws_ssm_region: us-east-2
+    ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
    console_region_id: aws-us-east-2

  children:
@@ -21,6 +22,8 @@ storage:
      hosts:
        pageserver-0.us-east-2.aws.neon.build:
          ansible_host: i-0c3e70929edb5d691
+        pageserver-1.us-east-2.aws.neon.build:
+          ansible_host: i-0565a8b4008aa3f40

    safekeepers:
      hosts:
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-staging.local/management/api/v2"
+  domain: "*.eu-west-1.aws.neon.build"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: dev
+  zenith_region: eu-west-1
+  zenith_region_slug: eu-west-1
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-release.local/management/api/v2"
+  domain: "*.ap-southeast-1.aws.neon.tech"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: prod
+  zenith_region: ap-southeast-1
+  zenith_region_slug: ap-southeast-1
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-release.local/management/api/v2"
+  domain: "*.eu-central-1.aws.neon.tech"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: prod
+  zenith_region: eu-central-1
+  zenith_region_slug: eu-central-1
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-release.local/management/api/v2"
+  domain: "*.us-east-2.aws.neon.tech"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: prod
+  zenith_region: us-east-2
+  zenith_region_slug: us-east-2
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -144,7 +144,9 @@ jobs:
        # neon-captest-new: Run pgbench in a freshly created project
        # neon-captest-reuse: Same, but reusing existing project
        # neon-captest-prefetch: Same, with prefetching enabled (new project)
-        platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ]
+        # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
+        # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
+        platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
        db_size: [ 10gb ]
        include:
          - platform: neon-captest-new
@@ -164,7 +166,7 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
      PLATFORM: ${{ matrix.platform }}

-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
      options: --init
@@ -207,8 +209,11 @@ jobs:
          rds-aurora)
            CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
            ;;
+          rds-postgres)
+            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
+            ;;
          *)
-            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'"
+            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -265,7 +270,7 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

    - name: Create Allure report
-      if: always()
+      if: success() || failure()
      uses: ./.github/actions/allure-report
      with:
        action: generate
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -18,8 +18,8 @@ env:

 jobs:
  tag:
-    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
+    runs-on: [ self-hosted, dev, x64 ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
      build-tag: ${{steps.build-tag.outputs.tag}}

@@ -46,7 +46,7 @@ jobs:
        id: build-tag

  build-neon:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -127,8 +127,8 @@ jobs:
            target/
          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
          key: |
-            v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
-            v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
+            v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
+            v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-

      - name: Cache postgres v14 build
        id: cache_pg_14
@@ -236,7 +236,7 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -268,34 +268,8 @@ jobs:
        if: matrix.build_type == 'debug'
        uses: ./.github/actions/save-coverage-data

-  upload-latest-artifacts:
-    runs-on: dev
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
-      options: --init
-    needs: [ regress-tests ]
-    if: github.ref_name == 'main'
-    steps:
-      - name: Copy Neon artifact to the latest directory
-        shell: bash -euxo pipefail {0}
-        env:
-          BUCKET: neon-github-public-dev
-          PREFIX: artifacts/${{ github.run_id }}
-        run: |
-          for build_type in debug release; do
-            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
-
-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
-            if [ -z "${S3_KEY}" ]; then
-              echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
-              exit 1
-            fi
-
-            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
-          done
-
  benchmarks:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -326,12 +300,12 @@ jobs:
      # while coverage is currently collected for the debug ones

  merge-allure-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    needs: [ regress-tests, benchmarks ]
-    if: always()
+    if: success() || failure()
    strategy:
      fail-fast: false
      matrix:
@@ -364,7 +338,7 @@ jobs:
          DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json

  coverage-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -389,7 +363,7 @@ jobs:
            !~/.cargo/registry/src
            ~/.cargo/git/
            target/
-          key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
+          key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}

      - name: Get Neon artifact
        uses: ./.github/actions/download
@@ -441,15 +415,19 @@ jobs:
        shell: bash -euxo pipefail {0}

  trigger-e2e-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ build-neon ]
+    needs: [ push-docker-hub, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
+          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
+          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
+          # to place a job run status update later.
          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}

          REMOTE_REPO="${{ github.repository_owner }}/cloud"
@@ -475,12 +453,14 @@ jobs:
              \"inputs\": {
                \"ci_job_name\": \"neon-cloud-e2e\",
                \"commit_hash\": \"$COMMIT_SHA\",
-                \"remote_repo\": \"${{ github.repository }}\"
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
              }
            }"

  neon-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -498,7 +478,7 @@ jobs:
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}

  compute-tools-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -512,28 +492,8 @@ jobs:
      - name: Kaniko build compute tools
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}

-  compute-node-image:
-    runs-on: dev
-    container: gcr.io/kaniko-project/executor:v1.9.0-debug
-    needs: [ tag ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-        # compute-node uses postgres 14, which is default now
-        # cloud repo depends on this image name, thus duplicating it
-        # remove compute-node when cloud repo is updated
-      - name: Kaniko build compute node with extensions v14 (compatibility)
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}
-
  compute-node-image-v14:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -549,9 +509,8 @@ jobs:
      - name: Kaniko build compute node with extensions v14
        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}

-
  compute-node-image-v15:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -567,18 +526,58 @@ jobs:
      - name: Kaniko build compute node with extensions v15
        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}

+  test-images:
+    needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    runs-on: [ self-hosted, dev, x64 ]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      # `neondatabase/neon` contains multiple binaries; all of them feed the same version input into the same version formatting library.
+      # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
+      # Regular pageserver version string looks like
+      #   Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
+      # Bad versions might look like:
+      #   Neon page server git-env:local failpoints: true, features: ["testing"]
+      # Ensure that we don't have bad versions.
+      - name: Verify image versions
+        shell: bash # ensure no set -e for better error messages
+        run: |
+          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+
+          echo "Pageserver version string: $pageserver_version"
+
+          if echo "$pageserver_version" | grep -q 'git-env:local' ; then
+            echo "Pageserver version should not be the default Dockerfile one"
+            exit 1
+          fi
+
+          if echo "$pageserver_version" | grep -q '"testing"' ; then
+            echo "Pageserver version should have no testing feature enabled"
+            exit 1
+          fi
+
+      - name: Verify docker-compose example
+        run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
+
+      - name: Print logs and clean up
+        if: always()
+        run: |
+          docker compose -f ./docker-compose/docker-compose.yml logs || true  # best effort: a logs failure must not skip the "down" below
+          docker compose -f ./docker-compose/docker-compose.yml down
+
  promote-images:
-    runs-on: dev
-    needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    runs-on: [ self-hosted, dev, x64 ]
+    needs: [ tag, test-images ]
    if: github.event_name != 'workflow_dispatch'
    container: amazon/aws-cli
    strategy:
      fail-fast: false
      matrix:
-        # compute-node uses postgres 14, which is default now
-        # cloud repo depends on this image name, thus duplicating it
-        # remove compute-node when cloud repo is updated
-        name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
+        name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]

    steps:
      - name: Promote image to latest
@@ -587,7 +586,7 @@ jobs:
          aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"

  push-docker-hub:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ promote-images, tag ]
    container: golang:1.19-bullseye

@@ -608,9 +607,6 @@ jobs:
      - name: Pull compute tools image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools

-      - name: Pull compute node image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node
-
      - name: Pull compute node v14 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14

@@ -625,11 +621,10 @@ jobs:
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'
        run: |
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -643,9 +638,6 @@ jobs:
      - name: Push compute tools image to Docker Hub
        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}

-      - name: Push compute node image to Docker Hub
-        run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
-
      - name: Push compute node v14 image to Docker Hub
        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}

@@ -662,7 +654,6 @@ jobs:
        run: |
          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest

@@ -745,7 +736,7 @@ jobs:
          rm -f neon_install.tar.gz .neon_current_version

  deploy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
@@ -756,9 +747,80 @@ jobs:
    defaults:
      run:
        shell: bash
-    env:
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+    strategy:
+      matrix:
+        target_region: [ us-east-2 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Redeploy
+        run: |
+          export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
+          cd "$(pwd)/.github/ansible"
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            ./get_binaries.sh
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            RELEASE=true ./get_binaries.sh
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+          ansible-galaxy collection install sivel.toiletwater
+          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
+          rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-pr-test-new:
+    runs-on: [ self-hosted, dev, x64 ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
+    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
+    needs: [ push-docker-hub, tag, regress-tests ]
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') && 
+      github.event_name != 'workflow_dispatch'
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        target_region: [ eu-west-1 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Redeploy
+        run: |
+          export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
+          cd "$(pwd)/.github/ansible"
+
+          ./get_binaries.sh
+
+          ansible-galaxy collection install sivel.toiletwater
+          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
+          rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-prod-new:
+    runs-on: prod
+    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
+    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
+    needs: [ push-docker-hub, tag, regress-tests ]
+    if: |
+      (github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -781,11 +843,11 @@ jobs:
          fi

          ansible-galaxy collection install sivel.toiletwater
-          ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
+          ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
          rm -f neon_install.tar.gz .neon_current_version

  deploy-proxy:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -827,16 +889,23 @@ jobs:
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

  deploy-proxy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    needs: [ push-docker-hub, tag, regress-tests ]
    if: |
      (github.ref_name == 'main') &&
      github.event_name != 'workflow_dispatch'
    defaults:
      run:
        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  us-east-2
+            target_cluster: dev-us-east-2-beta
+          - target_region:  eu-west-1
+            target_cluster: dev-eu-west-1-zeta
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -847,15 +916,52 @@ jobs:
      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
-          aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}

      - name: Re-deploy proxy
        run: |
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

-  promote-compatibility-test-snapshot:
-    runs-on: dev
+  deploy-proxy-prod-new:
+    runs-on: prod
+    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
+    needs: [ push-docker-hub, tag, regress-tests ]
+    if: |
+      (github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  us-east-2
+            target_cluster: prod-us-east-2-delta
+          - target_region: eu-central-1
+            target_cluster: prod-eu-central-1-gamma
+          - target_region: ap-southeast-1
+            target_cluster: prod-ap-southeast-1-epsilon
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
+
+      - name: Re-deploy proxy
+        run: |
+          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+
+  promote-compatibility-data:
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -868,9 +974,24 @@ jobs:
          BUCKET: neon-github-public-dev
          PREFIX: artifacts/latest
        run: |
+          # Update compatibility snapshot for the release
          for build_type in debug release; do
            OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
            NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst

            time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
          done
+
+          # Update Neon artifact for the release (reuse already uploaded artifact)
+          for build_type in debug release; do
+            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
+            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
+
+            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+            if [ -z "${S3_KEY}" ]; then
+              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
+              exit 1
+            fi
+
+            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
+          done
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -106,7 +106,7 @@ jobs:
            !~/.cargo/registry/src
            ~/.cargo/git
            target
-          key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
+          key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust

      - name: Run cargo clippy
        run: ./run_clippy.sh
@@ -115,7 +115,7 @@ jobs:
        run: cargo build --locked --all --all-targets

  check-rust-dependencies:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,7 @@
 [submodule "vendor/postgres-v14"]
 	path = vendor/postgres-v14
 	url = https://github.com/neondatabase/postgres.git
-	branch = main
+	branch = REL_14_STABLE_neon
 [submodule "vendor/postgres-v15"]
 	path = vendor/postgres-v15
 	url = https://github.com/neondatabase/postgres.git
--- a/11
+++ b/11
@@ -0,0 +1,11 @@
+/compute_tools/ @neondatabase/control-plane
+/control_plane/ @neondatabase/compute @neondatabase/storage
+/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
+/libs/postgres_ffi/ @neondatabase/compute 
+/libs/remote_storage/ @neondatabase/storage 
+/libs/safekeeper_api/ @neondatabase/safekeepers  
+/pageserver/ @neondatabase/compute @neondatabase/storage 
+/pgxn/ @neondatabase/compute
+/proxy/ @neondatabase/control-plane 
+/safekeeper/ @neondatabase/safekeepers
+/vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -317,12 +317,6 @@ dependencies = [
 "generic-array",
 ]

-[[package]]
-name = "boxfnonce"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
-
 [[package]]
 name = "bstr"
 version = "1.0.1"
@@ -600,6 +594,7 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
+ "url",
 "utils",
 "workspace_hack",
 ]
@@ -849,16 +844,6 @@ dependencies = [
 "syn",
 ]

-[[package]]
-name = "daemonize"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
-dependencies = [
- "boxfnonce",
- "libc",
-]
-
 [[package]]
 name = "darling"
 version = "0.14.1"
@@ -2140,7 +2125,6 @@ dependencies = [
 "crc32c",
 "criterion",
 "crossbeam-utils",
- "daemonize",
 "etcd_broker",
 "fail",
 "futures",
@@ -2161,6 +2145,7 @@ dependencies = [
 "postgres-types",
 "postgres_ffi",
 "pprof",
+ "pq_proto",
 "rand",
 "regex",
 "remote_storage",
@@ -2173,6 +2158,7 @@ dependencies = [
 "svg_fmt",
 "tar",
 "tempfile",
+ "tenant_size_model",
 "thiserror",
 "tokio",
 "tokio-postgres",
@@ -2190,6 +2176,7 @@ name = "pageserver_api"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "byteorder",
 "bytes",
 "const_format",
 "postgres_ffi",
@@ -2268,6 +2255,14 @@ version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"

+[[package]]
+name = "persistent_range_query"
+version = "0.1.0"
+dependencies = [
+ "rand",
+ "workspace_hack",
+]
+
 [[package]]
 name = "petgraph"
 version = "0.6.2"
@@ -2452,6 +2447,21 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"

+[[package]]
+name = "pq_proto"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bytes",
+ "pin-project-lite",
+ "postgres-protocol",
+ "rand",
+ "serde",
+ "tokio",
+ "tracing",
+ "workspace_hack",
+]
+
 [[package]]
 name = "prettyplease"
 version = "0.1.21"
@@ -2584,6 +2594,7 @@ dependencies = [
 "once_cell",
 "parking_lot 0.12.1",
 "pin-project-lite",
+ "pq_proto",
 "rand",
 "rcgen",
 "reqwest",
@@ -3087,7 +3098,6 @@ dependencies = [
 "clap 4.0.15",
 "const_format",
 "crc32c",
- "daemonize",
 "etcd_broker",
 "fs2",
 "git-version",
@@ -3095,11 +3105,13 @@ dependencies = [
 "humantime",
 "hyper",
 "metrics",
+ "nix 0.25.0",
 "once_cell",
 "parking_lot 0.12.1",
 "postgres",
 "postgres-protocol",
 "postgres_ffi",
+ "pq_proto",
 "regex",
 "remote_storage",
 "safekeeper_api",
@@ -3548,6 +3560,13 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "tenant_size_model"
+version = "0.1.0"
+dependencies = [
+ "workspace_hack",
+]
+
 [[package]]
 name = "termcolor"
 version = "1.1.3"
@@ -4053,9 +4072,7 @@ dependencies = [
 "metrics",
 "nix 0.25.0",
 "once_cell",
- "pin-project-lite",
- "postgres",
- "postgres-protocol",
+ "pq_proto",
 "rand",
 "routerify",
 "rustls",
@@ -4380,6 +4397,9 @@ dependencies = [
 "crossbeam-utils",
 "either",
 "fail",
+ "futures-channel",
+ "futures-task",
+ "futures-util",
 "hashbrown",
 "indexmap",
 "libc",
@@ -4393,6 +4413,7 @@ dependencies = [
 "rand",
 "regex",
 "regex-syntax",
+ "reqwest",
 "scopeguard",
 "serde",
 "stable_deref_trait",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,6 +25,10 @@ members = [
 # Besides, debug info should not affect the performance.
 debug = true

+# Disable debug symbols for all dependencies (every package outside this workspace) to reduce binary size
+[profile.release.package."*"]
+debug = false
+
 [profile.release-line-debug]
 inherits = "release"
 debug = 1 # true = 2 = all symbols, 1 = line only
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -13,7 +13,7 @@ ARG TAG=pinned
 FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
-    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
+    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev

 #########################################################################################
 #
@@ -24,7 +24,7 @@ RUN apt update &&  \
 FROM build-deps AS pg-build
 COPY vendor/postgres-v14 postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
    # Install headers
--- a/Dockerfile.compute-node-v15
+++ b/Dockerfile.compute-node-v15
@@ -13,7 +13,7 @@ ARG TAG=pinned
 FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
-    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
+    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev

 #########################################################################################
 #
@@ -24,7 +24,7 @@ RUN apt update &&  \
 FROM build-deps AS pg-build
 COPY vendor/postgres-v15 postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
    # Install headers
--- a/Dockerfile.compute-node.legacy
+++ b/Dockerfile.compute-node.legacy
@@ -1,88 +0,0 @@
-#
-# Legacy version of the Dockerfile for the compute node.
-# Used by e2e CI. Building Dockerfile.compute-node will take
-# unreasonable ammount of time without v2 runners.
-#
-# TODO: remove once cloud repo CI is moved to v2 runners.
-#
-
-
-# Allow specifiyng different compute-tools tag and image repo, so we are
-# able to use different images
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-ARG IMAGE=compute-tools
-ARG TAG=latest
-
-#
-# Image with pre-built tools
-#
-FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
-# Only to get ready compute_ctl binary as deppendency
-
-#
-# Image with Postgres build deps
-#
-FROM debian:bullseye-slim AS build-deps
-
-RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-                                          libcurl4-openssl-dev libossp-uuid-dev
-
-#
-# Image with built Postgres
-#
-FROM build-deps AS pg-build
-
-# Add user postgres
-RUN adduser postgres
-RUN mkdir /pg && chown postgres:postgres /pg
-
-# Copy source files
-# version 14 is default for now
-COPY ./vendor/postgres-v14 /pg/
-COPY ./pgxn /pg/
-
-# Build and install Postgres locally
-RUN mkdir /pg/compute_build && cd /pg/compute_build && \
-    ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
-    # Install main binaries and contribs
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
-    # Install headers
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
-
-# Install neon contrib
-RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
-
-USER postgres
-WORKDIR /pg
-
-#
-# Final compute node image to be exported
-#
-FROM debian:bullseye-slim
-
-# libreadline-dev is required to run psql
-RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
-
-# Add user postgres
-RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
-    echo "postgres:test_console_pass" | chpasswd && \
-    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
-    chown -R postgres:postgres /var/db/postgres && \
-    chmod 0750 /var/db/postgres/compute
-
-# Copy ready Postgres binaries
-COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
-
-# Copy binaries from compute-tools
-COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
-
-# XXX: temporary symlink for compatibility with old control-plane
-RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
-
-# Add postgres shared objects to the search path
-RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
-
-USER postgres
-
-ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/38
+++ b/38
@@ -20,18 +20,18 @@ else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif

-# Seccomp BPF is only available for Linux
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
+	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
-endif
-
-# macOS with brew-installed openssl requires explicit paths
-# It can be configured with OPENSSL_PREFIX variable
-UNAME_S := $(shell uname -s)
-ifeq ($(UNAME_S),Darwin)
-    OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-    PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+else ifeq ($(UNAME_S),Darwin)
+	# macOS with brew-installed openssl requires explicit paths
+	# It can be configured with OPENSSL_PREFIX variable
+	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	# macOS ships its own bison and flex, but they are too old and make the postgres-v14 target fail
+	# The brew formulae are keg-only (not symlinked into HOMEBREW_PREFIX), so put their bin dirs on PATH explicitly
+	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
 endif

 # Use -C option so that when PostgreSQL "make install" installs the
@@ -73,7 +73,8 @@ $(POSTGRES_INSTALL_DIR)/build/v14/config.status:
 	+@echo "Configuring Postgres v14 build"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
 	(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
-	$(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \
+	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
+		CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
 		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)

@@ -81,7 +82,8 @@ $(POSTGRES_INSTALL_DIR)/build/v15/config.status:
 	+@echo "Configuring Postgres v15 build"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
 	(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
-	$(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \
+	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
+		CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
 		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)

@@ -111,6 +113,8 @@ postgres-v14: postgres-v14-configure \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
 	+@echo "Compiling libpq v14"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
+	+@echo "Compiling pg_prewarm v14"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
 	+@echo "Compiling pg_buffercache v14"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect v14"
@@ -123,6 +127,8 @@ postgres-v15: postgres-v15-configure \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
 	+@echo "Compiling libpq v15"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
+	+@echo "Compiling pg_prewarm v15"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
 	+@echo "Compiling pg_buffercache v15"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect v15"
@@ -151,6 +157,11 @@ neon-pg-ext-v14: postgres-v14
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+	+@echo "Compiling neon_walredo v14"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
+	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
 	+@echo "Compiling neon_test_utils" v14
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
@@ -163,6 +174,11 @@ neon-pg-ext-v15: postgres-v15
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+	+@echo "Compiling neon_walredo v15"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
+	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
 	+@echo "Compiling neon_test_utils" v15
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf etcd openssl
+brew install protobuf etcd openssl flex bison
 ```

 2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -125,24 +125,23 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 # Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
 > ./target/debug/neon_local init
-Starting pageserver at '127.0.0.1:64000' in '.neon'
-
-Pageserver started
-Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7
-Stopping pageserver gracefully...done!
+Starting pageserver at '127.0.0.1:64000' in '.neon'.
+pageserver started, pid: 2545906
+Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
+Stopped pageserver 1 process with pid 2545906

 # start pageserver and safekeeper
 > ./target/debug/neon_local start
-Starting etcd broker using /usr/bin/etcd
-Starting pageserver at '127.0.0.1:64000' in '.neon'
-
-Pageserver started
-Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
-Safekeeper started
+Starting etcd broker using "/usr/bin/etcd"
+etcd started, pid: 2545996
+Starting pageserver at '127.0.0.1:64000' in '.neon'.
+pageserver started, pid: 2546005
+Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
+safekeeper 1 started, pid: 2546041

 # start postgres compute node
 > ./target/debug/neon_local pg start main
-Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
+Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
 Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

@@ -223,10 +222,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
 ```sh
 git clone --recursive https://github.com/neondatabase/neon.git

-# either:
 CARGO_BUILD_FLAGS="--features=testing" make
-# or:
-make debug

 ./scripts/pytest
 ```
--- a/cli-v2-story.md
+++ b/cli-v2-story.md
@@ -1,188 +0,0 @@
-Create a new Zenith repository in the current directory:
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init
-    The files belonging to this database system will be owned by user "heikki".
-    This user must also own the server process.
-    
-    The database cluster will be initialized with locale "en_GB.UTF-8".
-    The default database encoding has accordingly been set to "UTF8".
-    The default text search configuration will be set to "english".
-    
-    Data page checksums are disabled.
-    
-    creating directory tmp ... ok
-    creating subdirectories ... ok
-    selecting dynamic shared memory implementation ... posix
-    selecting default max_connections ... 100
-    selecting default shared_buffers ... 128MB
-    selecting default time zone ... Europe/Helsinki
-    creating configuration files ... ok
-    running bootstrap script ... ok
-    performing post-bootstrap initialization ... ok
-    syncing data to disk ... ok
-    
-    initdb: warning: enabling "trust" authentication for local connections
-    You can change this by editing pg_hba.conf or using the option -A, or
-    --auth-local and --auth-host, the next time you run initdb.
-    new zenith repository was created in .zenith
-
-Initially, there is only one branch:
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
-      main
-
-Start a local Postgres instance on the branch:
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main
-    Creating data directory from snapshot at 0/15FFB08...
-    waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG:  starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
-    2021-04-13 09:27:43.920 EEST [984664] LOG:  listening on IPv6 address "::1", port 5432
-    2021-04-13 09:27:43.920 EEST [984664] LOG:  listening on IPv4 address "127.0.0.1", port 5432
-    2021-04-13 09:27:43.927 EEST [984664] LOG:  listening on Unix socket "/tmp/.s.PGSQL.5432"
-    2021-04-13 09:27:43.939 EEST [984665] LOG:  database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
-    2021-04-13 09:27:43.939 EEST [984665] LOG:  creating missing WAL directory "pg_wal/archive_status"
-    2021-04-13 09:27:44.189 EEST [984665] LOG:  database system was not properly shut down; automatic recovery in progress
-    2021-04-13 09:27:44.195 EEST [984665] LOG:  invalid record length at 0/15FFB80: wanted 24, got 0
-    2021-04-13 09:27:44.195 EEST [984665] LOG:  redo is not required
-    2021-04-13 09:27:44.225 EEST [984664] LOG:  database system is ready to accept connections
-     done
-    server started
-
-Run some commands against it:
-
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);" 
-    CREATE TABLE
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');" 
-    INSERT 0 1
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-    (1 row)
-
-Create a new branch called 'experimental'. We create it from the
-current end of the 'main' branch, but you could specify a different
-LSN as the start point instead.
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main
-    branching at end of WAL: 0/161F478
-    
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch 
-      experimental
-      main
-
-Start another Postgres instance off the 'experimental' branch:
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
-    Creating data directory from snapshot at 0/15FFB08...
-    waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG:  starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
-    2021-04-13 09:28:41.875 EEST [984766] LOG:  listening on IPv6 address "::1", port 5433
-    2021-04-13 09:28:41.875 EEST [984766] LOG:  listening on IPv4 address "127.0.0.1", port 5433
-    2021-04-13 09:28:41.883 EEST [984766] LOG:  listening on Unix socket "/tmp/.s.PGSQL.5433"
-    2021-04-13 09:28:41.896 EEST [984767] LOG:  database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
-    2021-04-13 09:28:42.265 EEST [984767] LOG:  database system was not properly shut down; automatic recovery in progress
-    2021-04-13 09:28:42.269 EEST [984767] LOG:  redo starts at 0/15FFB80
-    2021-04-13 09:28:42.272 EEST [984767] LOG:  invalid record length at 0/161F4B0: wanted 24, got 0
-    2021-04-13 09:28:42.272 EEST [984767] LOG:  redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
-    2021-04-13 09:28:42.321 EEST [984766] LOG:  database system is ready to accept connections
-     done
-    server started
-
-Insert some a row on the 'experimental' branch:
-
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-    (1 row)
-    
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')" 
-    INSERT 0 1
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-     inserted on experimental
-    (2 rows)
-    
-See that the other Postgres instance is still running on 'main' branch on port 5432:
-
-
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-    (1 row)
-
-
-
-
-Everything is stored in the .zenith directory:
-
-    ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/
-    total 12
-    drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs
-    drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs
-    drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines
-
-The 'datadirs' directory contains the datadirs of the running instances:
-
-    ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/
-    total 8
-    drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e
-    drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76
-    ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/
-    total 124
-    drwxr-xr-x 5 heikki heikki  4096 Apr 13 09:27 base
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 global
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_commit_ts
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_dynshmem
-    -rw------- 1 heikki heikki  4760 Apr 13 09:27 pg_hba.conf
-    -rw------- 1 heikki heikki  1636 Apr 13 09:27 pg_ident.conf
-    drwxr-xr-x 4 heikki heikki  4096 Apr 13 09:32 pg_logical
-    drwxr-xr-x 4 heikki heikki  4096 Apr 13 09:27 pg_multixact
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_notify
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_replslot
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_serial
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_snapshots
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_stat
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:34 pg_stat_tmp
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_subtrans
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_tblspc
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_twophase
-    -rw------- 1 heikki heikki     3 Apr 13 09:27 PG_VERSION
-    lrwxrwxrwx 1 heikki heikki    52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_xact
-    -rw------- 1 heikki heikki    88 Apr 13 09:27 postgresql.auto.conf
-    -rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf
-    -rw------- 1 heikki heikki    96 Apr 13 09:27 postmaster.opts
-    -rw------- 1 heikki heikki   149 Apr 13 09:27 postmaster.pid
-
-Note how 'pg_wal' is just a symlink to the 'timelines' directory. The
-datadir is ephemeral, you can delete it at any time, and it can be reconstructed
-from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull
-the repository, the 'datadirs' are not included. (They are like git working trees)
-
-    ~/git-sandbox/zenith (cli-v2)$ killall -9 postgres
-    ~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/*
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
-    Creating data directory from snapshot at 0/15FFB08...
-    waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG:  starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
-    2021-04-13 09:37:05.477 EEST [985340] LOG:  listening on IPv6 address "::1", port 5433
-    2021-04-13 09:37:05.477 EEST [985340] LOG:  listening on IPv4 address "127.0.0.1", port 5433
-    2021-04-13 09:37:05.487 EEST [985340] LOG:  listening on Unix socket "/tmp/.s.PGSQL.5433"
-    2021-04-13 09:37:05.498 EEST [985341] LOG:  database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
-    2021-04-13 09:37:05.808 EEST [985341] LOG:  database system was not properly shut down; automatic recovery in progress
-    2021-04-13 09:37:05.813 EEST [985341] LOG:  redo starts at 0/15FFB80
-    2021-04-13 09:37:05.815 EEST [985341] LOG:  invalid record length at 0/161F770: wanted 24, got 0
-    2021-04-13 09:37:05.815 EEST [985341] LOG:  redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
-    2021-04-13 09:37:05.866 EEST [985340] LOG:  database system is ready to accept connections
-     done
-    server started
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-     inserted on experimental
-    (2 rows)
-
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -65,7 +65,7 @@ impl GenericOption {
            let name = match self.name.as_str() {
                "safekeepers" => "neon.safekeepers",
                "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
-                "wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
+                "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
                it => it,
            };

--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -4,20 +4,21 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
+anyhow = "1.0"
 clap = "4.0"
 comfy-table = "6.1"
 git-version = "0.3.5"
-tar = "0.4.38"
+nix = "0.25"
+once_cell = "1.13.0"
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+regex = "1"
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_with = "2.0"
-toml = "0.5"
-once_cell = "1.13.0"
-regex = "1"
-anyhow = "1.0"
+tar = "0.4.38"
 thiserror = "1"
-nix = "0.25"
-reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+toml = "0.5"
+url = "2.2.2"

 # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
 # instead, so that recompile times are better.
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -0,0 +1,279 @@
+//! Spawns and kills background processes that are needed by Neon CLI.
+//! Applies common set-up such as log and pid files (if needed) to every process.
+//!
+//! Neon CLI does not run in background, so it needs to store the information about
+//! spawned processes, which it does in this module.
+//! We do that by storing the pid of the process in the "${process_name}.pid" file.
+//! The pid file can be created by the process itself
+//! (Neon storage binaries do that and also ensure that a lock is taken on that file)
+//! or we create such a file after starting the process
+//! (non-Neon binaries don't necessarily follow our pidfile conventions).
+//! The pid stored in the file is later used to stop the service.
+//!
+//! See [`lock_file`] module for more info.
+
+use std::ffi::OsStr;
+use std::io::Write;
+use std::path::Path;
+use std::process::{Child, Command};
+use std::time::Duration;
+use std::{fs, io, thread};
+
+use anyhow::{anyhow, bail, Context, Result};
+use nix::errno::Errno;
+use nix::sys::signal::{kill, Signal};
+use nix::unistd::Pid;
+
+use utils::lock_file;
+
+// These constants control the loop used to poll for process start / stop.
+//
+// The loop waits for at most 10 seconds, polling every 100 ms.
+// Once a second, it prints a dot ("."), to give the user an indication that
+// it's waiting. If the process hasn't started/stopped after 5 seconds,
+// it prints a notice that it's taking long, but keeps waiting.
+//
+const RETRY_UNTIL_SECS: u64 = 10;
+const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
+const RETRY_INTERVAL_MILLIS: u64 = 100;
+const DOT_EVERY_RETRIES: u64 = 10;
+const NOTICE_AFTER_RETRIES: u64 = 50;
+
+/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
+/// it itself.
+pub enum InitialPidFile<'t> {
+    /// Create a pidfile, to allow future CLI invocations to manipulate the process.
+    Create(&'t Path),
+    /// The process will create the pidfile itself, need to wait for that event.
+    Expect(&'t Path),
+}
+
+/// Start a background child process using the parameters given.
+pub fn start_process<F, S: AsRef<OsStr>>(
+    process_name: &str,
+    datadir: &Path,
+    command: &Path,
+    args: &[S],
+    initial_pid_file: InitialPidFile,
+    process_status_check: F,
+) -> anyhow::Result<Child>
+where
+    F: Fn() -> anyhow::Result<bool>,
+{
+    let log_path = datadir.join(format!("{process_name}.log"));
+    let process_log_file = fs::OpenOptions::new()
+        .create(true)
+        .write(true)
+        .append(true)
+        .open(&log_path)
+        .with_context(|| {
+            format!("Could not open {process_name} log file {log_path:?} for writing")
+        })?;
+    let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
+        format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
+    })?;
+
+    let mut command = Command::new(command);
+    let background_command = command
+        .stdout(process_log_file)
+        .stderr(same_file_for_stderr)
+        .args(args);
+    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
+
+    let mut spawned_process = filled_cmd.spawn().with_context(|| {
+        format!("Could not spawn {process_name}, see console output and log files for details.")
+    })?;
+    let pid = spawned_process.id();
+    let pid = Pid::from_raw(
+        i32::try_from(pid)
+            .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
+    );
+
+    let pid_file_to_check = match initial_pid_file {
+        InitialPidFile::Create(target_pid_file_path) => {
+            match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
+                lock_file::LockCreationResult::Created { .. } => {
+                    // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
+                    // as this CLI invocation exits, so it's a bit useless, but doesn't do any harm either.
+                }
+                lock_file::LockCreationResult::AlreadyLocked { .. } => {
+                    anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
+                }
+                lock_file::LockCreationResult::CreationFailed(e) => {
+                    return Err(e.context(format!(
+                    "Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
+                )))
+                }
+            }
+            None
+        }
+        InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
+    };
+
+    for retries in 0..RETRIES {
+        match process_started(pid, pid_file_to_check, &process_status_check) {
+            Ok(true) => {
+                println!("\n{process_name} started, pid: {pid}");
+                return Ok(spawned_process);
+            }
+            Ok(false) => {
+                if retries == NOTICE_AFTER_RETRIES {
+                    // The process is taking a long time to start up. Keep waiting, but
+                    // print a message
+                    print!("\n{process_name} has not started yet, continuing to wait");
+                }
+                if retries % DOT_EVERY_RETRIES == 0 {
+                    print!(".");
+                    io::stdout().flush().unwrap();
+                }
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
+            }
+            Err(e) => {
+                println!("{process_name} failed to start: {e:#}");
+                if let Err(e) = spawned_process.kill() {
+                    println!("Could not stop {process_name} subprocess: {e:#}")
+                };
+                return Err(e);
+            }
+        }
+    }
+    println!();
+    anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
+}
+
+/// Stops the process, using the pid file given. Also returns Ok if the process is not running.
+pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
+    if !pid_file.exists() {
+        println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
+        return Ok(());
+    }
+    let pid = read_pidfile(pid_file)?;
+
+    let sig = if immediate {
+        print!("Stopping {process_name} with pid {pid} immediately..");
+        Signal::SIGQUIT
+    } else {
+        print!("Stopping {process_name} with pid {pid} gracefully..");
+        Signal::SIGTERM
+    };
+    io::stdout().flush().unwrap();
+    match kill(pid, sig) {
+        Ok(()) => (),
+        Err(Errno::ESRCH) => {
+            println!(
+                "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
+            );
+            return Ok(());
+        }
+        Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"),
+    }
+
+    // Wait until process is gone
+    for retries in 0..RETRIES {
+        match process_has_stopped(pid) {
+            Ok(true) => {
+                println!("\n{process_name} stopped");
+                if let Err(e) = fs::remove_file(pid_file) {
+                    if e.kind() != io::ErrorKind::NotFound {
+                        eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
+                    }
+                }
+                return Ok(());
+            }
+            Ok(false) => {
+                if retries == NOTICE_AFTER_RETRIES {
+                    // The process is taking a long time to shut down. Keep waiting, but
+                    // print a message
+                    print!("\n{process_name} has not stopped yet, continuing to wait");
+                }
+                if retries % DOT_EVERY_RETRIES == 0 {
+                    print!(".");
+                    io::stdout().flush().unwrap();
+                }
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
+            }
+            Err(e) => {
+                println!("{process_name} with pid {pid} failed to stop: {e:#}");
+                return Err(e);
+            }
+        }
+    }
+    println!();
+    anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
+}
+
+fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
+    let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
+
+    // Pass through these environment variables to the command
+    for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
+        if let Some(val) = std::env::var_os(var) {
+            filled_cmd = filled_cmd.env(var, val);
+        }
+    }
+
+    filled_cmd
+}
+
+fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
+    for env_key in [
+        "AWS_ACCESS_KEY_ID",
+        "AWS_SECRET_ACCESS_KEY",
+        "AWS_SESSION_TOKEN",
+    ] {
+        if let Ok(value) = std::env::var(env_key) {
+            cmd = cmd.env(env_key, value);
+        }
+    }
+    cmd
+}
+
+fn process_started<F>(
+    pid: Pid,
+    pid_file_to_check: Option<&Path>,
+    status_check: &F,
+) -> anyhow::Result<bool>
+where
+    F: Fn() -> anyhow::Result<bool>,
+{
+    match status_check() {
+        Ok(true) => match pid_file_to_check {
+            Some(pid_file_path) => {
+                if pid_file_path.exists() {
+                    let pid_in_file = read_pidfile(pid_file_path)?;
+                    Ok(pid_in_file == pid)
+                } else {
+                    Ok(false)
+                }
+            }
+            None => Ok(true),
+        },
+        Ok(false) => Ok(false),
+        Err(e) => anyhow::bail!("process failed to start: {e}"),
+    }
+}
+
+/// Read a PID file
+///
+/// We expect a file that contains a single integer.
+fn read_pidfile(pidfile: &Path) -> Result<Pid> {
+    let pid_str = fs::read_to_string(pidfile)
+        .with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
+    let pid: i32 = pid_str
+        .parse()
+        .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
+    if pid < 1 {
+        bail!("pidfile {pidfile:?} contained bad value '{pid}'");
+    }
+    Ok(Pid::from_raw(pid))
+}
+
+fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
+    match kill(pid, None) {
+        // Process exists, keep waiting
+        Ok(_) => Ok(false),
+        // Process not found, we're done
+        Err(Errno::ESRCH) => Ok(true),
+        Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"),
+    }
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use control_plane::compute::ComputeControlPlane;
 use control_plane::local_env::{EtcdBroker, LocalEnv};
+use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
-use control_plane::storage::PageServerNode;
 use control_plane::{etcd, local_env};
 use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -12,15 +12,14 @@ use std::time::Duration;

 use anyhow::{Context, Result};
 use utils::{
-    connstring::connection_host_port,
    id::{TenantId, TimelineId},
    lsn::Lsn,
    postgres_backend::AuthType,
 };

 use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
+use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;
-use crate::storage::PageServerNode;

 //
 // ComputeControlPlane
@@ -300,7 +299,8 @@ impl PostgresNode {

        // Configure the node to fetch pages from pageserver
        let pageserver_connstr = {
-            let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
+            let config = &self.pageserver.pg_connection_config;
+            let (host, port) = (config.host(), config.port());

            // Set up authentication
            //
@@ -343,7 +343,7 @@ impl PostgresNode {
        //   To be able to restore database in case of pageserver node crash, safekeeper should not
        //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
        //   (if they are not able to upload WAL to S3).
-        conf.append("max_replication_write_lag", "500MB");
+        conf.append("max_replication_write_lag", "15MB");
        conf.append("max_replication_flush_lag", "10GB");

        if !self.env.safekeepers.is_empty() {
--- a/control_plane/src/connection.rs
+++ b/control_plane/src/connection.rs
@@ -0,0 +1,57 @@
+use url::Url;
+
+#[derive(Debug)]
+pub struct PgConnectionConfig {
+    url: Url,
+}
+
+impl PgConnectionConfig {
+    pub fn host(&self) -> &str {
+        self.url.host_str().expect("BUG: no host")
+    }
+
+    pub fn port(&self) -> u16 {
+        self.url.port().expect("BUG: no port")
+    }
+
+    /// Return a `<host>:<port>` string.
+    pub fn raw_address(&self) -> String {
+        format!("{}:{}", self.host(), self.port())
+    }
+
+    /// Connect using postgres protocol with TLS disabled.
+    pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
+        postgres::Client::connect(self.url.as_str(), postgres::NoTls)
+    }
+}
+
+impl std::str::FromStr for PgConnectionConfig {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let mut url: Url = s.parse()?;
+
+        match url.scheme() {
+            "postgres" | "postgresql" => {}
+            other => anyhow::bail!("invalid scheme: {other}"),
+        }
+
+        // It's not a valid connection url if host is unavailable.
+        if url.host().is_none() {
+            anyhow::bail!(url::ParseError::EmptyHost);
+        }
+
+        // E.g. `postgres:bar`.
+        if url.cannot_be_a_base() {
+            anyhow::bail!("URL cannot be a base");
+        }
+
+        // Set the default PG port if it's missing.
+        if url.port().is_none() {
+            url.set_port(Some(5432))
+                .expect("BUG: couldn't set the default port");
+        }
+
+        Ok(Self { url })
+    }
+}
--- a/control_plane/src/etcd.rs
+++ b/control_plane/src/etcd.rs
@@ -1,99 +1,75 @@
-use std::{
-    fs,
-    path::PathBuf,
-    process::{Command, Stdio},
-};
+use std::{fs, path::PathBuf};

 use anyhow::Context;
-use nix::{
-    sys::signal::{kill, Signal},
-    unistd::Pid,
-};

-use crate::{local_env, read_pidfile};
+use crate::{background_process, local_env};

 pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    let etcd_broker = &env.etcd_broker;
-    println!(
-        "Starting etcd broker using {}",
-        etcd_broker.etcd_binary_path.display()
+    print!(
+        "Starting etcd broker using {:?}",
+        etcd_broker.etcd_binary_path
    );

    let etcd_data_dir = env.base_data_dir.join("etcd");
-    fs::create_dir_all(&etcd_data_dir).with_context(|| {
-        format!(
-            "Failed to create etcd data dir: {}",
-            etcd_data_dir.display()
-        )
-    })?;
+    fs::create_dir_all(&etcd_data_dir)
+        .with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;

-    let etcd_stdout_file =
-        fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
-            format!(
-                "Failed to create etcd stout file in directory {}",
-                etcd_data_dir.display()
-            )
-        })?;
-    let etcd_stderr_file =
-        fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
-            format!(
-                "Failed to create etcd stderr file in directory {}",
-                etcd_data_dir.display()
-            )
-        })?;
    let client_urls = etcd_broker.comma_separated_endpoints();
+    let args = [
+        format!("--data-dir={}", etcd_data_dir.display()),
+        format!("--listen-client-urls={client_urls}"),
+        format!("--advertise-client-urls={client_urls}"),
+        // Set --quota-backend-bytes to keep the etcd virtual memory
+        // size smaller. Our test etcd clusters are very small.
+        // See https://github.com/etcd-io/etcd/issues/7910
+        "--quota-backend-bytes=100000000".to_string(),
+        // etcd doesn't compact (vacuum) with default settings,
+        // enable it to prevent space exhaustion.
+        "--auto-compaction-mode=revision".to_string(),
+        "--auto-compaction-retention=1".to_string(),
+    ];

-    let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
-        .args(&[
-            format!("--data-dir={}", etcd_data_dir.display()),
-            format!("--listen-client-urls={client_urls}"),
-            format!("--advertise-client-urls={client_urls}"),
-            // Set --quota-backend-bytes to keep the etcd virtual memory
-            // size smaller. Our test etcd clusters are very small.
-            // See https://github.com/etcd-io/etcd/issues/7910
-            "--quota-backend-bytes=100000000".to_string(),
-            // etcd doesn't compact (vacuum) with default settings,
-            // enable it to prevent space exhaustion.
-            "--auto-compaction-mode=revision".to_string(),
-            "--auto-compaction-retention=1".to_string(),
-        ])
-        .stdout(Stdio::from(etcd_stdout_file))
-        .stderr(Stdio::from(etcd_stderr_file))
-        .spawn()
-        .context("Failed to spawn etcd subprocess")?;
-    let pid = etcd_process.id();
+    let pid_file_path = etcd_pid_file_path(env);

-    let etcd_pid_file_path = etcd_pid_file_path(env);
-    fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
-        format!(
-            "Failed to create etcd pid file at {}",
-            etcd_pid_file_path.display()
-        )
-    })?;
+    let client = reqwest::blocking::Client::new();
+
+    background_process::start_process(
+        "etcd",
+        &etcd_data_dir,
+        &etcd_broker.etcd_binary_path,
+        &args,
+        background_process::InitialPidFile::Create(&pid_file_path),
+        || {
+            for broker_endpoint in &etcd_broker.broker_endpoints {
+                let request = broker_endpoint
+                    .join("health")
+                    .with_context(|| {
+                        format!(
+                            "Failed to append /health path to broker endopint {}",
+                            broker_endpoint
+                        )
+                    })
+                    .and_then(|url| {
+                        client.get(&url.to_string()).build().with_context(|| {
+                            format!("Failed to construct request to etcd endpoint {url}")
+                        })
+                    })?;
+                if client.execute(request).is_ok() {
+                    return Ok(true);
+                }
+            }
+
+            Ok(false)
+        },
+    )
+    .context("Failed to spawn etcd subprocess")?;

    Ok(())
 }

 pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
-    let etcd_path = &env.etcd_broker.etcd_binary_path;
-    println!("Stopping etcd broker at {}", etcd_path.display());
-
-    let etcd_pid_file_path = etcd_pid_file_path(env);
-    let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
-        format!(
-            "Failed to read etcd pid file at {}",
-            etcd_pid_file_path.display()
-        )
-    })?);
-
-    kill(pid, Signal::SIGTERM).with_context(|| {
-        format!(
-            "Failed to stop etcd with pid {pid} at {}",
-            etcd_pid_file_path.display()
-        )
-    })?;
-
-    Ok(())
+    background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
 }

 fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -6,59 +6,12 @@
 // Intended to be used in integration tests and in CLI tools for
 // local installations.
 //
-use anyhow::{anyhow, bail, Context, Result};
-use std::fs;
-use std::path::Path;
-use std::process::Command;

+mod background_process;
 pub mod compute;
+pub mod connection;
 pub mod etcd;
 pub mod local_env;
+pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
-pub mod storage;
-
-/// Read a PID file
-///
-/// We expect a file that contains a single integer.
-/// We return an i32 for compatibility with libc and nix.
-pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
-    let pid_str = fs::read_to_string(pidfile)
-        .with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
-    let pid: i32 = pid_str
-        .parse()
-        .map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
-    if pid < 1 {
-        bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
-    }
-    Ok(pid)
-}
-
-fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
-    let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
-
-    let var = "LLVM_PROFILE_FILE";
-    if let Some(val) = std::env::var_os(var) {
-        cmd.env(var, val);
-    }
-
-    const RUST_LOG_KEY: &str = "RUST_LOG";
-    if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
-        cmd.env(RUST_LOG_KEY, rust_log_value)
-    } else {
-        cmd
-    }
-}
-
-fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
-    for env_key in [
-        "AWS_ACCESS_KEY_ID",
-        "AWS_SECRET_ACCESS_KEY",
-        "AWS_SESSION_TOKEN",
-    ] {
-        if let Ok(value) = std::env::var(env_key) {
-            cmd = cmd.env(env_key, value);
-        }
-    }
-    cmd
-}
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -226,12 +226,12 @@ impl LocalEnv {
        }
    }

-    pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
-        Ok(self.neon_distrib_dir.join("pageserver"))
+    pub fn pageserver_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("pageserver")
    }

-    pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
-        Ok(self.neon_distrib_dir.join("safekeeper"))
+    pub fn safekeeper_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("safekeeper")
    }

    pub fn pg_data_dirs_path(&self) -> PathBuf {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,33 +1,27 @@
 use std::collections::HashMap;
-use std::fs::File;
+use std::fs::{self, File};
 use std::io::{BufReader, Write};
 use std::num::NonZeroU64;
 use std::path::{Path, PathBuf};
-use std::process::Command;
-use std::time::Duration;
-use std::{io, result, thread};
+use std::process::Child;
+use std::{io, result};

+use crate::connection::PgConnectionConfig;
 use anyhow::{bail, Context};
-use nix::errno::Errno;
-use nix::sys::signal::{kill, Signal};
-use nix::unistd::Pid;
 use pageserver_api::models::{
    TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
 };
-use postgres::{Config, NoTls};
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::{
-    connstring::connection_address,
    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
    lsn::Lsn,
    postgres_backend::AuthType,
 };

-use crate::local_env::LocalEnv;
-use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
+use crate::{background_process, local_env::LocalEnv};

 #[derive(Error, Debug)]
 pub enum PageserverHttpError {
@@ -75,7 +69,7 @@ impl ResponseErrorMessageExt for Response {
 //
 #[derive(Debug)]
 pub struct PageServerNode {
-    pub pg_connection_config: Config,
+    pub pg_connection_config: PgConnectionConfig,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
@@ -101,7 +95,7 @@ impl PageServerNode {
    }

    /// Construct libpq connection string for connecting to the pageserver.
-    fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
+    fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
        format!("postgresql://no_user:{password}@{listen_addr}/no_db")
            .parse()
            .unwrap()
@@ -161,7 +155,15 @@ impl PageServerNode {
            init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
        }

-        self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
+        let mut pageserver_process = self
+            .start_node(&init_config_overrides, &self.env.base_data_dir, true)
+            .with_context(|| {
+                format!(
+                    "Failed to start a process for pageserver {}",
+                    self.env.pageserver.id,
+                )
+            })?;
+
        let init_result = self
            .try_init_timeline(create_tenant, initial_timeline_id, pg_version)
            .context("Failed to create initial tenant and timeline for pageserver");
@@ -171,7 +173,29 @@ impl PageServerNode {
            }
            Err(e) => eprintln!("{e:#}"),
        }
-        self.stop(false)?;
+        match pageserver_process.kill() {
+            Err(e) => {
+                eprintln!(
+                    "Failed to stop pageserver {} process with pid {}: {e:#}",
+                    self.env.pageserver.id,
+                    pageserver_process.id(),
+                )
+            }
+            Ok(()) => {
+                println!(
+                    "Stopped pageserver {} process with pid {}",
+                    self.env.pageserver.id,
+                    pageserver_process.id(),
+                );
+                // cleanup after pageserver startup, since we do not call regular `stop_process` during init
+                let pid_file = self.pid_file();
+                if let Err(e) = fs::remove_file(&pid_file) {
+                    if e.kind() != io::ErrorKind::NotFound {
+                        eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
+                    }
+                }
+            }
+        }
        init_result
    }

@@ -196,11 +220,14 @@ impl PageServerNode {
        self.env.pageserver_data_dir()
    }

-    pub fn pid_file(&self) -> PathBuf {
+    /// The pid file is created by the pageserver process, with its pid stored inside.
+    /// Other pageservers cannot lock the same file and overwrite it for as long as the current
+    /// pageserver runs. (Unless someone removes the file manually; never do that!)
+    fn pid_file(&self) -> PathBuf {
        self.repo_path().join("pageserver.pid")
    }

-    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
        self.start_node(config_overrides, &self.repo_path(), false)
    }

@@ -209,10 +236,10 @@ impl PageServerNode {
        config_overrides: &[&str],
        datadir: &Path,
        update_config: bool,
-    ) -> anyhow::Result<()> {
-        println!(
+    ) -> anyhow::Result<Child> {
+        print!(
            "Starting pageserver at '{}' in '{}'",
-            connection_address(&self.pg_connection_config),
+            self.pg_connection_config.raw_address(),
            datadir.display()
        );
        io::stdout().flush()?;
@@ -220,10 +247,7 @@ impl PageServerNode {
        let mut args = vec![
            "-D",
            datadir.to_str().with_context(|| {
-                format!(
-                    "Datadir path '{}' cannot be represented as a unicode string",
-                    datadir.display()
-                )
+                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
            })?,
        ];

@@ -235,48 +259,18 @@ impl PageServerNode {
            args.extend(["-c", config_override]);
        }

-        let mut cmd = Command::new(self.env.pageserver_bin()?);
-        let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
-        filled_cmd = fill_aws_secrets_vars(filled_cmd);
-
-        if !filled_cmd.status()?.success() {
-            bail!(
-                "Pageserver failed to start. See console output and '{}' for details.",
-                datadir.join("pageserver.log").display()
-            );
-        }
-
-        // It takes a while for the page server to start up. Wait until it is
-        // open for business.
-        const RETRIES: i8 = 15;
-        for retries in 1..RETRIES {
-            match self.check_status() {
-                Ok(()) => {
-                    println!("\nPageserver started");
-                    return Ok(());
-                }
-                Err(err) => {
-                    match err {
-                        PageserverHttpError::Transport(err) => {
-                            if err.is_connect() && retries < 5 {
-                                print!(".");
-                                io::stdout().flush().unwrap();
-                            } else {
-                                if retries == 5 {
-                                    println!() // put a line break after dots for second message
-                                }
-                                println!("Pageserver not responding yet, err {err} retrying ({retries})...");
-                            }
-                        }
-                        PageserverHttpError::Response(msg) => {
-                            bail!("pageserver failed to start: {msg} ")
-                        }
-                    }
-                    thread::sleep(Duration::from_secs(1));
-                }
-            }
-        }
-        bail!("pageserver failed to start in {RETRIES} seconds");
+        background_process::start_process(
+            "pageserver",
+            datadir,
+            &self.env.pageserver_bin(),
+            &args,
+            background_process::InitialPidFile::Expect(&self.pid_file()),
+            || match self.check_status() {
+                Ok(()) => Ok(true),
+                Err(PageserverHttpError::Transport(_)) => Ok(false),
+                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+            },
+        )
    }

    ///
@@ -288,69 +282,18 @@ impl PageServerNode {
    /// If the server is not running, returns success
    ///
    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        let pid_file = self.pid_file();
-        if !pid_file.exists() {
-            println!("Pageserver is already stopped");
-            return Ok(());
-        }
-        let pid = Pid::from_raw(read_pidfile(&pid_file)?);
-
-        let sig = if immediate {
-            print!("Stopping pageserver immediately..");
-            Signal::SIGQUIT
-        } else {
-            print!("Stopping pageserver gracefully..");
-            Signal::SIGTERM
-        };
-        io::stdout().flush().unwrap();
-        match kill(pid, sig) {
-            Ok(_) => (),
-            Err(Errno::ESRCH) => {
-                println!("Pageserver with pid {pid} does not exist, but a PID file was found");
-                return Ok(());
-            }
-            Err(err) => bail!(
-                "Failed to send signal to pageserver with pid {pid}: {}",
-                err.desc()
-            ),
-        }
-
-        // Wait until process is gone
-        for i in 0..600 {
-            let signal = None; // Send no signal, just get the error code
-            match kill(pid, signal) {
-                Ok(_) => (), // Process exists, keep waiting
-                Err(Errno::ESRCH) => {
-                    // Process not found, we're done
-                    println!("done!");
-                    return Ok(());
-                }
-                Err(err) => bail!(
-                    "Failed to send signal to pageserver with pid {}: {}",
-                    pid,
-                    err.desc()
-                ),
-            };
-
-            if i % 10 == 0 {
-                print!(".");
-                io::stdout().flush().unwrap();
-            }
-            thread::sleep(Duration::from_millis(100));
-        }
-
-        bail!("Failed to stop pageserver with pid {pid}");
+        background_process::stop_process(immediate, "pageserver", &self.pid_file())
    }

    pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
-        let mut client = self.pg_connection_config.connect(NoTls).unwrap();
+        let mut client = self.pg_connection_config.connect_no_tls().unwrap();

        println!("Pageserver query: '{sql}'");
        client.simple_query(sql).unwrap()
    }

    pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
-        self.pg_connection_config.connect(NoTls)
+        self.pg_connection_config.connect_no_tls()
    }

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
@@ -419,6 +362,11 @@ impl PageServerNode {
                .map(|x| x.parse::<NonZeroU64>())
                .transpose()
                .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+            trace_read_requests: settings
+                .remove("trace_read_requests")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'trace_read_requests' as bool")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -481,6 +429,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<NonZeroU64>())
                    .transpose()
                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+                trace_read_requests: settings
+                    .get("trace_read_requests")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'trace_read_requests' as bool")?,
            })
            .send()?
            .error_from_body()?;
@@ -549,7 +502,7 @@ impl PageServerNode {
        pg_wal: Option<(Lsn, PathBuf)>,
        pg_version: u32,
    ) -> anyhow::Result<()> {
-        let mut client = self.pg_connection_config.connect(NoTls).unwrap();
+        let mut client = self.pg_connection_config.connect_no_tls().unwrap();

        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,23 +1,21 @@
 use std::io::Write;
 use std::path::PathBuf;
-use std::process::Command;
+use std::process::Child;
 use std::sync::Arc;
-use std::time::Duration;
-use std::{io, result, thread};
+use std::{io, result};

-use anyhow::bail;
-use nix::errno::Errno;
-use nix::sys::signal::{kill, Signal};
-use nix::unistd::Pid;
-use postgres::Config;
+use anyhow::Context;
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
-use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
+use utils::{http::error::HttpErrorBody, id::NodeId};

-use crate::local_env::{LocalEnv, SafekeeperConf};
-use crate::storage::PageServerNode;
-use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
+use crate::connection::PgConnectionConfig;
+use crate::pageserver::PageServerNode;
+use crate::{
+    background_process,
+    local_env::{LocalEnv, SafekeeperConf},
+};

 #[derive(Error, Debug)]
 pub enum SafekeeperHttpError {
@@ -63,7 +61,7 @@ pub struct SafekeeperNode {

    pub conf: SafekeeperConf,

-    pub pg_connection_config: Config,
+    pub pg_connection_config: PgConnectionConfig,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
@@ -87,15 +85,15 @@ impl SafekeeperNode {
    }

    /// Construct libpq connection string for connecting to this safekeeper.
-    fn safekeeper_connection_config(port: u16) -> Config {
+    fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
        // TODO safekeeper authentication not implemented yet
-        format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
+        format!("postgresql://no_user@127.0.0.1:{port}/no_db")
            .parse()
            .unwrap()
    }

    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
-        env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
+        env.safekeeper_data_dir(&format!("sk{sk_id}"))
    }

    pub fn datadir_path(&self) -> PathBuf {
@@ -106,91 +104,78 @@ impl SafekeeperNode {
        self.datadir_path().join("safekeeper.pid")
    }

-    pub fn start(&self) -> anyhow::Result<()> {
+    pub fn start(&self) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
-            connection_address(&self.pg_connection_config),
+            self.pg_connection_config.raw_address(),
            self.datadir_path().display()
        );
        io::stdout().flush().unwrap();

        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
+        let id = self.id;
+        let datadir = self.datadir_path();

-        let mut cmd = Command::new(self.env.safekeeper_bin()?);
-        fill_rust_env_vars(
-            cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
-                .args(&["--id", self.id.to_string().as_ref()])
-                .args(&["--listen-pg", &listen_pg])
-                .args(&["--listen-http", &listen_http])
-                .arg("--daemonize"),
-        );
+        let id_string = id.to_string();
+        let mut args = vec![
+            "-D",
+            datadir.to_str().with_context(|| {
+                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
+            })?,
+            "--id",
+            &id_string,
+            "--listen-pg",
+            &listen_pg,
+            "--listen-http",
+            &listen_http,
+        ];
        if !self.conf.sync {
-            cmd.arg("--no-sync");
+            args.push("--no-sync");
        }

        let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
        if !comma_separated_endpoints.is_empty() {
-            cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
+            args.extend(["--broker-endpoints", &comma_separated_endpoints]);
        }
        if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
-            cmd.args(&["--broker-etcd-prefix", prefix]);
+            args.extend(["--broker-etcd-prefix", prefix]);
        }
+
+        let mut backup_threads = String::new();
        if let Some(threads) = self.conf.backup_threads {
-            cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
+            backup_threads = threads.to_string();
+            args.extend(["--backup-threads", &backup_threads]);
+        } else {
+            drop(backup_threads);
        }
+
        if let Some(ref remote_storage) = self.conf.remote_storage {
-            cmd.args(&["--remote-storage", remote_storage]);
+            args.extend(["--remote-storage", remote_storage]);
        }
+
+        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
-            cmd.arg("--auth-validation-public-key-path");
-            // PathBuf is better be passed as is, not via `String`.
-            cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
+            args.extend([
+                "--auth-validation-public-key-path",
+                key_path.to_str().with_context(|| {
+                    format!("Key path {key_path:?} cannot be represented as a unicode string")
+                })?,
+            ]);
        }

-        fill_aws_secrets_vars(&mut cmd);
-
-        if !cmd.status()?.success() {
-            bail!(
-                "Safekeeper failed to start. See '{}' for details.",
-                self.datadir_path().join("safekeeper.log").display()
-            );
-        }
-
-        // It takes a while for the safekeeper to start up. Wait until it is
-        // open for business.
-        const RETRIES: i8 = 15;
-        for retries in 1..RETRIES {
-            match self.check_status() {
-                Ok(_) => {
-                    println!("\nSafekeeper started");
-                    return Ok(());
-                }
-                Err(err) => {
-                    match err {
-                        SafekeeperHttpError::Transport(err) => {
-                            if err.is_connect() && retries < 5 {
-                                print!(".");
-                                io::stdout().flush().unwrap();
-                            } else {
-                                if retries == 5 {
-                                    println!() // put a line break after dots for second message
-                                }
-                                println!(
-                                    "Safekeeper not responding yet, err {} retrying ({})...",
-                                    err, retries
-                                );
-                            }
-                        }
-                        SafekeeperHttpError::Response(msg) => {
-                            bail!("safekeeper failed to start: {} ", msg)
-                        }
-                    }
-                    thread::sleep(Duration::from_secs(1));
-                }
-            }
-        }
-        bail!("safekeeper failed to start in {} seconds", RETRIES);
+        background_process::start_process(
+            &format!("safekeeper {id}"),
+            &datadir,
+            &self.env.safekeeper_bin(),
+            &args,
+            background_process::InitialPidFile::Expect(&self.pid_file()),
+            || match self.check_status() {
+                Ok(()) => Ok(true),
+                Err(SafekeeperHttpError::Transport(_)) => Ok(false),
+                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+            },
+        )
    }

    ///
@@ -202,63 +187,11 @@ impl SafekeeperNode {
    /// If the server is not running, returns success
    ///
    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        let pid_file = self.pid_file();
-        if !pid_file.exists() {
-            println!("Safekeeper {} is already stopped", self.id);
-            return Ok(());
-        }
-        let pid = read_pidfile(&pid_file)?;
-        let pid = Pid::from_raw(pid);
-
-        let sig = if immediate {
-            print!("Stopping safekeeper {} immediately..", self.id);
-            Signal::SIGQUIT
-        } else {
-            print!("Stopping safekeeper {} gracefully..", self.id);
-            Signal::SIGTERM
-        };
-        io::stdout().flush().unwrap();
-        match kill(pid, sig) {
-            Ok(_) => (),
-            Err(Errno::ESRCH) => {
-                println!(
-                    "Safekeeper with pid {} does not exist, but a PID file was found",
-                    pid
-                );
-                return Ok(());
-            }
-            Err(err) => bail!(
-                "Failed to send signal to safekeeper with pid {}: {}",
-                pid,
-                err.desc()
-            ),
-        }
-
-        // Wait until process is gone
-        for i in 0..600 {
-            let signal = None; // Send no signal, just get the error code
-            match kill(pid, signal) {
-                Ok(_) => (), // Process exists, keep waiting
-                Err(Errno::ESRCH) => {
-                    // Process not found, we're done
-                    println!("done!");
-                    return Ok(());
-                }
-                Err(err) => bail!(
-                    "Failed to send signal to pageserver with pid {}: {}",
-                    pid,
-                    err.desc()
-                ),
-            };
-
-            if i % 10 == 0 {
-                print!(".");
-                io::stdout().flush().unwrap();
-            }
-            thread::sleep(Duration::from_millis(100));
-        }
-
-        bail!("Failed to stop safekeeper with pid {}", pid);
+        background_process::stop_process(
+            immediate,
+            &format!("safekeeper {}", self.id),
+            &self.pid_file(),
+        )
    }

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -0,0 +1,13 @@
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG COMPUTE_IMAGE=compute-node-v14
+ARG TAG=latest
+
+FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
+
+USER root
+RUN apt-get update &&       \
+    apt-get install -y curl \
+                       jq   \
+                       netcat
+
+USER postgres
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -2,6 +2,7 @@ version: '3'

 services:
  etcd:
+    restart: always
    image: quay.io/coreos/etcd:v3.5.4
    ports:
      - 2379:2379
@@ -9,7 +10,7 @@ services:
    environment:
      # This signifficantly speeds up etcd and we anyway don't data persistency there.
      ETCD_UNSAFE_NO_FSYNC: "1"
-    command: 
+    command:
      - "etcd"
      - "--auto-compaction-mode=revision"
      - "--auto-compaction-retention=1"
@@ -24,6 +25,7 @@ services:
      - "--quota-backend-bytes=134217728" # 128 MB

  minio:
+    restart: always
    image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
    ports:
      - 9000:9000
@@ -41,7 +43,7 @@ services:
    entrypoint:
      - "/bin/sh"
      - "-c"
-    command: 
+    command:
      - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
             echo 'Waiting to start minio...' && sleep 1;
         done;
@@ -51,7 +53,8 @@ services:
      - minio

  pageserver:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - BROKER_ENDPOINT='http://etcd:2379'
      - AWS_ACCESS_KEY_ID=minio
@@ -77,7 +80,8 @@ services:
      - minio_create_buckets

  safekeeper1:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
      - SAFEKEEPER_ID=1
@@ -106,7 +110,8 @@ services:
      - minio_create_buckets

  safekeeper2:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
      - SAFEKEEPER_ID=2
@@ -135,7 +140,8 @@ services:
      - minio_create_buckets

  safekeeper3:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
      - SAFEKEEPER_ID=3
@@ -164,18 +170,21 @@ services:
      - minio_create_buckets

  compute:
+    restart: always
    build:
-      context: ./image/compute
+      context: ./compute_wrapper/
      args:
-        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
+        - TAG=${TAG:-latest}
        - http_proxy=$http_proxy
        - https_proxy=$https_proxy
    environment:
      - PG_VERSION=${PG_VERSION:-14}
      #- RUST_BACKTRACE=1
+    # Mount the test files directly, for faster editing cycle.
    volumes:
-      - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
-      - ./compute/shell/:/shell/
+      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute_wrapper/shell/:/shell/
    ports:
      - 55433:55433 # pg protocol handler
      - 3080:3080 # http endpoints
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# A basic test to ensure Docker images are built correctly.
+# Builds a wrapper around the compute, starts all services and runs a simple SQL query.
+# Repeats the process for all currently supported Postgres versions.
+
+# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
+# to verify custom image builds (e.g. pre-published ones).
+# Their defaults point at the DockerHub `neondatabase/neon:latest` image.
+
+# XXX: Currently does not work on M1 Macs: only x86_64 Docker images are built, and the M1 Docker emulation layer has no seccomp support.
+
+set -eux -o pipefail
+
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
+
+COMPUTE_CONTAINER_NAME=docker-compose-compute-1
+SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
+
+cleanup() {
+    echo "show container information"
+    docker ps
+    docker compose -f $COMPOSE_FILE logs
+    echo "stop containers..."
+    docker compose -f $COMPOSE_FILE down
+}
+
+echo "clean up containers if exists"
+cleanup
+
+for pg_version in 14 15; do
+    echo "start containers (pg_version=$pg_version)."
+    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
+
+    echo "wait until the compute is ready. timeout after 60s. "
+    cnt=0
+    while sleep 1; do
+        # check timeout
+        cnt=`expr $cnt + 1`
+        if [ $cnt -gt 60 ]; then
+            echo "timeout before the compute is ready."
+            cleanup
+            exit 1
+        fi
+
+        # check if the compute is ready
+        set +o pipefail
+        result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
+        set -o pipefail
+        if [ $result -eq 1 ]; then
+            echo "OK. The compute is ready to connect."
+            echo "execute simple queries."
+            docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
+            cleanup
+            break
+        fi
+    done
+done
--- a/docker-compose/image/compute/Dockerfile
+++ b/docker-compose/image/compute/Dockerfile
@@ -1,10 +0,0 @@
-ARG COMPUTE_IMAGE=compute-node-v14:latest
-FROM neondatabase/${COMPUTE_IMAGE}
-
-USER root
-RUN apt-get update &&       \
-    apt-get install -y curl \
-                       jq   \
-                       netcat
-
-USER postgres
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -37,7 +37,7 @@

 - [Source view](./sourcetree.md)
  - [docker.md](./docker.md) — Docker images and building pipeline.
-  - [Error handling and logging]()
+  - [Error handling and logging](./error-handling.md)
  - [Testing]()
    - [Unit testing]()
    - [Integration testing]()
--- a/docs/error-handling.md
+++ b/docs/error-handling.md
@@ -0,0 +1,198 @@
+# Error handling and logging
+
+## Logging errors
+
+The principle is that errors are logged when they are handled. If you
+just propagate an error to the caller in a function, you don't need to
+log it; the caller will. But if you consume an error in a function,
+you *must* log it (if it needs to be logged at all).
+
+For example:
+
+```rust
+fn read_motd_file() -> std::io::Result<String> {
+    let mut f = File::open("/etc/motd")?;
+    let mut result = String::new();
+    f.read_to_string(&mut result)?;
+    result
+}
+```
+
+Opening or reading the file could fail, but there is no need to log
+the error here. The function merely propagates the error to the
+caller, and it is up to the caller to log the error or propagate it
+further, if the failure is not expected. But if, for example, it is
+normal that the "/etc/motd" file doesn't exist, the caller can choose
+to silently ignore the error, or log it as an INFO or DEBUG level
+message:
+
+```rust
+/// Returns the message of the day, falling back to a default proverb when
+/// "/etc/motd" cannot be read.
+fn get_message_of_the_day() -> String {
+    // Get the motd from /etc/motd, or return the default proverb
+    match read_motd_file() {
+        Ok(motd) => motd,
+        Err(err) => {
+            // It's normal that /etc/motd doesn't exist, but if we fail to
+            // read it for some other reason, that's unexpected. The message
+            // of the day isn't very important though, so we just WARN and
+            // continue with the default in any case.
+            if err.kind() != std::io::ErrorKind::NotFound {
+                tracing::warn!("could not read \"/etc/motd\": {err:?}");
+            }
+            // The function returns an owned `String`, so the literal is converted.
+            "An old error is always more popular than a new truth. - German proverb".to_string()
+        }
+    }
+}
+```
+
+## Error types
+
+We use the `anyhow` crate widely. It contains many convenient macros
+like `bail!` and `ensure!` to construct and return errors, and to
+propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
+
+A downside of `anyhow::Error` is that the caller cannot distinguish
+between different error cases. Most errors are propagated all the way
+to the mgmt API handler function, or the main loop that handles a
+connection with the compute node, and they are all handled the same
+way: the error is logged and returned to the client as an HTTP or
+libpq error.
+
+But in some cases, we need to distinguish between errors and handle
+them differently. For example, attaching a tenant to the pageserver
+could fail either because the tenant has already been attached, or
+because we could not load its metadata from cloud storage. The first
+case is more or less expected. The console sends the Attach request to
+the pageserver, and the pageserver completes the operation, but the
+network connection might be lost before the console receives the
+response. The console will retry the operation in that case, but the
+tenant has already been attached. It is important that the pageserver
+responds with the HTTP 403 Already Exists error in that case, rather
+than a generic HTTP 500 Internal Server Error.
+
+If you need to distinguish between different kinds of errors, create a
+new `Error` type. The `thiserror` crate is useful for that. But in
+most cases `anyhow::Error` is good enough.
+
+## Panics
+
+Depending on where a panic happens, it can cause the whole pageserver
+or safekeeper to restart, or just a single tenant. In either case,
+that is pretty bad and causes an outage. Avoid panics. Never use
+`unwrap()` or other calls that might panic, to verify inputs from the
+network or from disk.
+
+It is acceptable to use functions that might panic, like `unwrap()`, if
+it is obvious that it cannot panic. For example, if you have just
+checked that a variable is not None, it is OK to call `unwrap()` on it,
+but it is still preferable to use `expect("reason")` instead to explain
+why the function cannot fail.
+
+`assert!` and `panic!` are reserved for checking clear invariants and
+very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
+or `bail!` instead.
+
+## Error levels
+
+`tracing::Level` doesn't provide very clear guidelines on what the
+different levels mean, or when to use which level. Here is how we use
+them:
+
+### Error
+
+Examples:
+- could not open file "foobar"
+- invalid tenant id
+
+Errors are not expected to happen during normal operation. Incorrect
+inputs from client can cause ERRORs. For example, if a client tries to
+call a mgmt API that doesn't exist, or if a compute node passes
+an LSN that has already been garbage collected away.
+
+These should *not* happen during normal operations. "Normal
+operations" is not a very precise concept. But for example, disk
+errors are not expected to happen when the system is working, so those
+count as Errors. However, if a TCP connection to a compute node is
+lost, that is not considered an Error, because it doesn't affect the
+pageserver's or safekeeper's operation in any way, and happens fairly
+frequently when compute nodes are shut down, or are killed abruptly
+because of errors in the compute.
+
+**Errors are monitored, and always need human investigation to determine
+the cause.**
+
+Whether something should be logged at ERROR, WARNING or INFO level can
+depend on the callers and clients. For example, it might be unexpected
+and a sign of a serious issue if the console calls the
+"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
+would be appropriate in that case. But if the console routinely calls
+the API after deleting a timeline, to check if the deletion has
+completed, then it would be totally normal and an INFO or DEBUG level
+message would be more appropriate. If a message is logged as an ERROR,
+but it in fact happens frequently in production and never requires any
+action, it should probably be demoted to an INFO level message.
+
+### Warn
+
+Examples:
+- could not remove temporary file "foobar.temp"
+- unrecognized file "foobar" in timeline directory
+
+Warnings are similar to Errors, in that they should not happen
+when the system is operating normally. The difference between Error and
+Warning is that an Error means that the operation failed, whereas Warning
+means that something unexpected happened, but the operation continued anyway.
+For example, if deleting a file fails because the file already didn't exist,
+it should be logged as Warning.
+
+> **Note:** The python regression tests, under `test_regress`, check the
+> pageserver log after each test for any ERROR and WARN lines. If there are
+> any ERRORs or WARNs that have not been explicitly listed in the test as
+> allowed, the test is marked as failed. This is to catch unexpected errors
+> e.g. in background operations, that don't cause immediate misbehaviour in
+> the tested functionality.
+
+### Info
+
+Info level is used to log useful information when the system is
+operating normally. Info level is appropriate e.g. for logging state
+changes, background operations, and network connections.
+
+Examples:
+- "system is shutting down"
+- "tenant was created"
+- "retrying S3 upload"
+
+### Debug & Trace
+
+Debug and Trace level messages are not printed to the log in our normal
+production configuration, but could be enabled for a specific server or
+tenant, to aid debugging. (Although we don't actually have that
+capability as of this writing).
+
+## Context
+
+We use logging "spans" to hold context information about the current
+operation. Almost every operation happens on a particular tenant and
+timeline, so we enter a span with the "tenant_id" and "timeline_id"
+very early when processing an incoming API request, for example. All
+background operations should also run in a span containing at least
+those two fields, and any other parameters or information that might
+be useful when debugging an error that might happen when performing
+the operation.
+
+TODO: Spans are not captured in the Error when it is created, but when
+the error is logged. It would be more useful to capture them at Error
+creation. We should consider using `tracing_error::SpanTrace` to do
+that.
+
+## Error message style
+
+PostgreSQL has a style guide for writing error messages:
+
+https://www.postgresql.org/docs/current/error-style-guide.html
+
+Follow that guide when writing error messages in the PostgreSQL
+extension. We don't follow it strictly in the pageserver and
+safekeeper, but the advice in the PostgreSQL style guide is generally
+good, and you can't go wrong by following it.
--- a/docs/rfcs/020-pageserver-s3-coordination.md
+++ b/docs/rfcs/020-pageserver-s3-coordination.md
@@ -0,0 +1,246 @@
+# Coordinating access of multiple pageservers to the same s3 data
+
+## Motivation
+
+There are some blind spots around coordinating access of multiple pageservers
+to the same s3 data. Currently this is applicable only to tenant relocation
+case, but in the future we'll need to solve similar problems for
+replica/standby pageservers.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+Pageserver
+
+## The problem
+
+### Relocation
+
+During relocation both pageservers can write to s3. This should be ok for all
+data except the `index_part.json`. For index part it causes problems during
+compaction/gc because they remove files from index/s3.
+
+Imagine this case:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant S3
+    participant PS2
+
+    PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
+    PS2->>S3: Attach called, sees L1, L2
+    PS1->>S3: Compaction comes <br/> Removes L1, adds L3
+    note over S3: Index now L2, L3
+    PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
+    note over S3: Index now L1, L2, L4
+```
+
+At this point it is not possible to restore from the index: it contains L1,
+which is no longer available in s3, and doesn't contain L3, which was added by
+compaction on the first pageserver. So if any of the pageservers restarts,
+initial sync will fail (or, in the on-demand world, it will fail a bit later,
+during a page request that needs the missing layer).
+
+### Standby pageserver
+
+Another related case is standby pageserver. In this case second pageserver can
+be used as a replica to scale reads and serve as a failover target in case
+first one fails.
+
+In this mode second pageserver needs to have the same picture of s3 files to
+be able to load layers on-demand. To accomplish that second pageserver
+cannot run gc/compaction jobs. Instead it needs to receive updates for index
+contents. (There is no need to run walreceiver on the second pageserver then).
+
+## Observations
+
+- If both pageservers ingest wal then their layer set diverges, because layer
+  file generation is not deterministic
+- If one of the pageservers does not ingest wal (and just picks up layer
+  updates) then it lags behind and cannot really answer queries in the same
+  pace as the primary one
+- Can compaction help make layers deterministic? E g we do not upload level
+  zero layers and construction of higher levels should be deterministic.
+  This way we can guarantee that layer creation by timeout wont mess things up.
+  This way one pageserver uploads data and second one can just ingest it.
+  But we still need some form of election
+
+## Solutions
+
+### Manual orchestration
+
+One possible solution for relocation case is to orchestrate background jobs
+from outside. The oracle who runs migration can turn off background jobs on
+PS1 before migration and then run migration -> enable them on PS2. The problem
+comes if migration fails. In this case in order to resume background jobs
+oracle needs to guarantee that PS2 doesn't run background jobs, and if it doesn't
+respond then PS1 is stuck, unable to run compaction/gc. This cannot be solved
+without human ensuring that no upload from PS2 can happen. In order to be able
+to resolve this automatically CAS is required on S3 side so pageserver can
+avoid overwriting index part if it is no longer the leading one
+
+Note that flag that disables background jobs needs to be persistent, because
+otherwise pageserver restart will clean it
+
+### Avoid index_part.json
+
+Index part consists of two parts, list of layers and metadata. List of layers
+can be easily obtained by `ListObjects` S3 API method. But what to do with
+metadata? Create metadata instance for each checkpoint and add some counter
+to the file name?
+
+Back to potentially long s3 ls.
+
+### Coordination based approach
+
+Do it the way safekeepers choose a leader for WAL upload: ping each other and
+decide based on some heuristic, e.g. the smallest node id. During relocation PS1 sends
+"resign" ping message so others can start election without waiting for a timeout.
+
+This still leaves metadata question open and non deterministic layers are a
+problem as well
+
+### Avoid metadata file
+
+One way to eliminate metadata file is to store it in layer files under some
+special key. This may resonate with intention to keep all relation sizes in
+some special segment to avoid initial download during size calculation.
+Maybe with that we can even store pre calculated value.
+
+As a downside each checkpoint gets 512 bytes larger.
+
+If we entirely avoid metadata file this opens up many approaches
+
+* * *
+
+During discussion it seems that we converged on the approach consisting of:
+
+- index files stored per pageserver in the same timeline directory. With that
+  index file name starts to look like: `<pageserver_node_id>_index_part.json`.
+  In such set up there are no concurrent overwrites of index file by different
+  pageservers.
+- For replica pageservers the solution would be for primary to broadcast index
+  changes to any followers with an ability to check index files in s3 and
+  restore the full state. To properly merge changes with index files we can use
+  a counter that is persisted in an index file, is incremented on every change
+  to it and passed along with broadcasted change. This way we can determine
+  whether we need to apply change to the index state or not.
+- Responsibility for running background jobs is assigned externally. Pageserver
+  keeps locally persistent flag for each tenant that indicates whether this
+  pageserver is considered as primary one or not. TODO what happens if we
+  crash and cannot start for some extended period of time? Control plane can
+  assign ownership to some other pageserver. Pageserver needs some way to check
+  if it's still the blessed one. Maybe by explicit request to control plane on
+  start.
+
+Requirement for deterministic layer generation was considered overly strict
+because of two reasons:
+
+- It can limit possible optimizations e g when pageserver wants to reshuffle
+  some data locally and doesnt want to coordinate this
+- The deterministic algorithm itself can change so during deployments for some
+  time there will be two different version running at the same time which can
+  cause non determinism
+
+### External elections
+
+The above case with lost state in this schema with externally managed
+leadership is represented like this:
+
+Note that here we keep objects list in the index file.
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant CP as Control Plane
+    participant S3
+    participant PS2
+
+    note over PS1,PS2: PS1 starts up and still a leader
+    PS1->>CP: Am I still the leader for Tenant X?
+    activate CP
+    CP->>PS1: Yes
+    deactivate CP
+    PS1->>S3: Fetch PS1 index.
+    note over PS1: Continue operations, start background jobs
+    note over PS1,PS2: PS1 starts up and is not a leader anymore
+    PS1->>CP: Am I still the leader for Tenant X?
+    CP->>PS1: No
+    PS1->>PS2: Subscribe to index changes
+    PS1->>S3: Fetch PS1 and PS2 indexes
+    note over PS1: Combine index file to include layers <br> from both indexes to be able <br> to see newer files from leader (PS2)
+    note over PS1: Continue operations, do not start background jobs
+```
+
+### Internal elections
+
+To manage leadership internally we can use broker to exchange pings so nodes
+can decide on the leader roles. In case multiple pageservers are active leader
+is the one with lowest node id.
+
+Operations with internally managed elections:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant S3
+
+    note over PS1: Starts up
+    note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
+    PS1->>S3: Fetch indexes from s3
+    alt there is a leader
+        note over PS1: do not start background jobs, <br> continue applying index updates
+    else there is no leader
+        note over PS1: start background jobs, <br> broadcast index changes
+    end
+
+    note over PS1,S3: Then the picture is similar to external elections <br> the difference is that follower can become a leader <br> if there are no pings after some timeout new leader gets elected
+```
+
+### Eviction
+
+When two pageservers operate on a tenant for an extended period of time, the
+follower doesn't perform write operations in s3. When a layer is evicted, the
+follower relies on updates from the primary to get info about the layers it
+needs to cover the range of the evicted layer.
+
+Note that it wont match evicted layer exactly, so layers will overlap and
+lookup code needs to correctly handle that.
+
+### Relocation flow
+
+Actions become:
+
+- Attach tenant to new pageserver
+- New pageserver becomes follower since previous one is still leading
+- New pageserver starts replicating from safekeepers but does not upload layers
+- Detach is called on the old one
+- New pageserver becomes leader after it realizes that old one disappeared
+
+### Index File
+
+Using `s3 ls` on startup simplifies things, but we still need metadata, so we
+need to fetch index files anyway. If they contain list of files we can combine
+them and avoid costly `s3 ls`
+
+### Remaining issues
+
+- More than one remote consistent lsn for safekeepers to know
+
+Anything else?
+
+### Proposed solution
+
+To recap: at the meeting we converged on the approach with external elections, but I
+think it will be overall harder to manage and will introduce a dependency on
+control plane for pageserver. Using separate index files for each pageserver
+consisting of log of operations and a metadata snapshot should be enough.
+
+### What we need to get there?
+
+- Change index file structure to contain log of changes instead of just the
+  file list
+- Implement pinging/elections for pageservers
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -52,6 +52,10 @@ PostgreSQL extension that implements storage manager API and network communicati

 PostgreSQL extension that contains functions needed for testing and debugging.

+`/pgxn/neon_walredo`:
+
+Library to run Postgres as a "WAL redo process" in the pageserver.
+
 `/safekeeper`:

 The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
@@ -79,6 +83,16 @@ A subject for future modularization.
 `/libs/metrics`:
 Helpers for exposing Prometheus metrics from the server.

+### Adding dependencies
+When you add a Cargo dependency, you should update the hakari manifest by running the commands below and committing the updated `Cargo.lock` and `workspace_hack/`. There may be no changes, that's fine.
+
+```bash
+cargo hakari generate
+cargo hakari manage-deps
+```
+
+If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
+
 ## Using Python
 Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
 so manual installation of dependencies is not recommended.
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -9,6 +9,7 @@ serde_with = "2.0"
 const_format = "0.2.21"
 anyhow = { version = "1.0", features = ["backtrace"] }
 bytes = "1.0.1"
+byteorder = "1.4.3"

 utils = { path = "../utils" }
 postgres_ffi = { path = "../postgres_ffi" }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,5 +1,6 @@
 use std::num::NonZeroU64;

+use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::{
@@ -9,7 +10,7 @@ use utils::{

 use crate::reltag::RelTag;
 use anyhow::bail;
-use bytes::{Buf, BufMut, Bytes, BytesMut};
+use bytes::{BufMut, Bytes, BytesMut};

 /// A state of a tenant in pageserver's memory.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -72,6 +73,7 @@ pub struct TenantCreateRequest {
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
 }

 #[serde_as]
@@ -111,6 +113,7 @@ pub struct TenantConfigRequest {
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
 }

 impl TenantConfigRequest {
@@ -129,6 +132,7 @@ impl TenantConfigRequest {
            walreceiver_connect_timeout: None,
            lagging_wal_timeout: None,
            max_lsn_wal_lag: None,
+            trace_read_requests: None,
        }
    }
 }
@@ -225,6 +229,7 @@ pub struct TimelineGcRequest {
 }

 // Wrapped in libpq CopyData
+#[derive(PartialEq, Eq)]
 pub enum PagestreamFeMessage {
    Exists(PagestreamExistsRequest),
    Nblocks(PagestreamNblocksRequest),
@@ -241,21 +246,21 @@ pub enum PagestreamBeMessage {
    DbSize(PagestreamDbSizeResponse),
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamExistsRequest {
    pub latest: bool,
    pub lsn: Lsn,
    pub rel: RelTag,
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamNblocksRequest {
    pub latest: bool,
    pub lsn: Lsn,
    pub rel: RelTag,
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetPageRequest {
    pub latest: bool,
    pub lsn: Lsn,
@@ -263,7 +268,7 @@ pub struct PagestreamGetPageRequest {
    pub blkno: u32,
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamDbSizeRequest {
    pub latest: bool,
    pub lsn: Lsn,
@@ -296,52 +301,98 @@ pub struct PagestreamDbSizeResponse {
 }

 impl PagestreamFeMessage {
-    pub fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
+    /// Encodes this request in the pagestream wire format.
+    ///
+    /// Layout: a one-byte message tag (0 = Exists, 1 = Nblocks, 2 = GetPage,
+    /// 3 = DbSize), a one-byte `latest` flag, the LSN as a `u64`, and then the
+    /// request-specific fields, in the same order in which `parse` reads them.
+    pub fn serialize(&self) -> Bytes {
+        // Fields shared by every request: tag, `latest` flag and LSN.
+        fn put_header(buf: &mut BytesMut, tag: u8, latest: bool, lsn: u64) {
+            buf.put_u8(tag);
+            buf.put_u8(u8::from(latest));
+            buf.put_u64(lsn);
+        }
+
+        // Relation identifier used by the relation-based requests.
+        fn put_rel(buf: &mut BytesMut, rel: &RelTag) {
+            buf.put_u32(rel.spcnode);
+            buf.put_u32(rel.dbnode);
+            buf.put_u32(rel.relnode);
+            buf.put_u8(rel.forknum);
+        }
+
+        let mut buf = BytesMut::new();
+
+        match self {
+            Self::Exists(req) => {
+                put_header(&mut buf, 0, req.latest, req.lsn.0);
+                put_rel(&mut buf, &req.rel);
+            }
+
+            Self::Nblocks(req) => {
+                put_header(&mut buf, 1, req.latest, req.lsn.0);
+                put_rel(&mut buf, &req.rel);
+            }
+
+            Self::GetPage(req) => {
+                put_header(&mut buf, 2, req.latest, req.lsn.0);
+                put_rel(&mut buf, &req.rel);
+                buf.put_u32(req.blkno);
+            }
+
+            Self::DbSize(req) => {
+                put_header(&mut buf, 3, req.latest, req.lsn.0);
+                buf.put_u32(req.dbnode);
+            }
+        }
+
+        buf.into()
+    }
+
+    /// Parses one frontend request from `body`.
+    ///
+    /// Reads a one-byte message tag followed by the big-endian fields of the
+    /// matching request type. Returns an error if any read from `body` fails
+    /// or if the tag is not one of 0 (Exists), 1 (Nblocks), 2 (GetPage) or
+    /// 3 (DbSize).
+    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
        // TODO these gets can fail

        // these correspond to the NeonMessageTag enum in pagestore_client.h
        //
        // TODO: consider using protobuf or serde bincode for less error prone
        // serialization.
-        let msg_tag = body.get_u8();
+        let msg_tag = body.read_u8()?;
        match msg_tag {
            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
+                    spcnode: body.read_u32::<BigEndian>()?,
+                    dbnode: body.read_u32::<BigEndian>()?,
+                    relnode: body.read_u32::<BigEndian>()?,
+                    forknum: body.read_u8()?,
                },
            })),
            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
+                    spcnode: body.read_u32::<BigEndian>()?,
+                    dbnode: body.read_u32::<BigEndian>()?,
+                    relnode: body.read_u32::<BigEndian>()?,
+                    forknum: body.read_u8()?,
                },
            })),
            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
+                    spcnode: body.read_u32::<BigEndian>()?,
+                    dbnode: body.read_u32::<BigEndian>()?,
+                    relnode: body.read_u32::<BigEndian>()?,
+                    forknum: body.read_u8()?,
                },
-                blkno: body.get_u32(),
+                blkno: body.read_u32::<BigEndian>()?,
            })),
            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
-                dbnode: body.get_u32(),
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+                dbnode: body.read_u32::<BigEndian>()?,
            })),
-            _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
+            _ => bail!("unknown smgr message tag: {:?}", msg_tag),
        }
    }
 }
@@ -380,3 +431,58 @@ impl PagestreamBeMessage {
        bytes.into()
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use bytes::Buf;
+
+    use super::*;
+
+    /// Every `PagestreamFeMessage` variant must come back unchanged from a
+    /// `serialize` -> `parse` round trip.
+    #[test]
+    fn test_pagestream() {
+        // All relation-based requests share the same relation tag and LSN.
+        let rel = || RelTag {
+            forknum: 1,
+            spcnode: 2,
+            dbnode: 3,
+            relnode: 4,
+        };
+        let lsn = || Lsn(4);
+
+        let round_trip_cases = vec![
+            PagestreamFeMessage::Exists(PagestreamExistsRequest {
+                latest: true,
+                lsn: lsn(),
+                rel: rel(),
+            }),
+            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
+                latest: false,
+                lsn: lsn(),
+                rel: rel(),
+            }),
+            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                latest: true,
+                lsn: lsn(),
+                rel: rel(),
+                blkno: 7,
+            }),
+            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                latest: true,
+                lsn: lsn(),
+                dbnode: 7,
+            }),
+        ];
+
+        for original in round_trip_cases {
+            let encoded = original.serialize();
+            let mut reader = encoded.reader();
+            let decoded = PagestreamFeMessage::parse(&mut reader).unwrap();
+            // `assert!` rather than `assert_eq!`: the enum derives only
+            // `PartialEq`/`Eq`, not `Debug`.
+            assert!(original == decoded);
+        }
+    }
+}
--- a/libs/persistent_range_query/Cargo.toml
+++ b/libs/persistent_range_query/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "persistent_range_query"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[dev-dependencies]
+rand = "0.8.3"
--- a/libs/persistent_range_query/src/lib.rs
+++ b/libs/persistent_range_query/src/lib.rs
@@ -0,0 +1,78 @@
+use std::ops::Range;
+
+pub mod naive;
+pub mod ops;
+pub mod segment_tree;
+
+/// Aggregated answer to a query over a contiguous range of keys.
+///
+/// Should be a monoid:
+/// * Identity element: for all a: combine(new_for_empty_range(), a) = combine(a, new_for_empty_range()) = a
+/// * Associativity: for all a, b, c: combine(combine(a, b), c) == combine(a, combine(b, c))
+pub trait RangeQueryResult<Key>: Sized + Clone {
+    // Clone is equivalent to combine with an empty range.
+
+    /// Returns the result for a range that contains no keys; this is the
+    /// identity element of `combine`.
+    fn new_for_empty_range() -> Self;
+
+    /// Combines the results of two adjacent ranges into one result.
+    // Contract: left_range.end == right_range.start
+    // left_range.start == left_range.end == right_range.start == right_range.end is still possible
+    fn combine(
+        left: &Self,
+        left_range: &Range<Key>,
+        right: &Self,
+        right_range: &Range<Key>,
+    ) -> Self;
+
+    /// NOTE(review): presumably the in-place counterpart of `combine`, leaving
+    /// the combined result in `left` — confirm against the implementations.
+    fn add(left: &mut Self, left_range: &Range<Key>, right: &Self, right_range: &Range<Key>);
+}
+
+/// Source of initial query results; an implementation is handed to
+/// `PersistentVecStorage::new` together with the full key range.
+///
+/// NOTE(review): the name suggests results are computed on demand per range
+/// rather than up front — confirm against the storage implementations.
+pub trait LazyRangeInitializer<Result: RangeQueryResult<Key>, Key> {
+    /// Returns the initial query result for `range`.
+    fn get(&self, range: &Range<Key>) -> Result;
+}
+
+/// An update that can be applied to the query result of a key range.
+///
+/// Should be a monoid:
+/// * Identity element: for all op: compose(no_op(), op) == compose(op, no_op()) == op
+/// * Associativity: for all op_1, op_2, op_3: compose(compose(op_1, op_2), op_3) == compose(op_1, compose(op_2, op_3))
+///
+/// Should left act on Result:
+/// * Identity operation: for all r: no_op().apply(r) == r
+/// * Compatibility: for all op_1, op_2, r: op_1.apply(op_2.apply(r)) == compose(op_1, op_2).apply(r)
+pub trait RangeModification<Key> {
+    /// The query result type this modification acts on.
+    type Result: RangeQueryResult<Key>;
+
+    /// Returns the identity modification.
+    fn no_op() -> Self;
+    /// Returns whether this modification is the identity.
+    fn is_no_op(&self) -> bool;
+    /// NOTE(review): presumably true when the modification replaces the
+    /// previous value outright, making the earlier result irrelevant — confirm.
+    fn is_reinitialization(&self) -> bool;
+    /// Applies this modification in place to `result`, the query result for `range`.
+    fn apply(&self, result: &mut Self::Result, range: &Range<Key>);
+    /// NOTE(review): presumably rewrites `earlier` in place into the
+    /// modification equivalent to `earlier` followed by `later` — confirm.
+    fn compose(later: &Self, earlier: &mut Self);
+}
+
+/// A version of the storage that can answer range queries.
+pub trait VecReadableVersion<Modification: RangeModification<Key>, Key> {
+    /// Returns the query result for the key range `keys`.
+    fn get(&self, keys: &Range<Key>) -> Modification::Result;
+}
+
+/// Shorthand for "cloneable and readable": the bound required of
+/// `PersistentVecStorage::FrozenVersion`.
+// TODO: use trait alias when stabilized
+pub trait VecFrozenVersion<Modification: RangeModification<Key>, Key>:
+    Clone + VecReadableVersion<Modification, Key>
+{
+}
+
+// Blanket impl: every `Clone + VecReadableVersion` type is automatically a
+// `VecFrozenVersion`, so the trait never has to be implemented by hand.
+impl<
+        T: Clone + VecReadableVersion<Modification, Key>,
+        Modification: RangeModification<Key>,
+        Key,
+    > VecFrozenVersion<Modification, Key> for T
+{
+}
+
+/// A mutable, queryable storage over a range of keys that can hand out
+/// cloneable read-only versions of itself via `freeze`.
+pub trait PersistentVecStorage<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key,
+>: VecReadableVersion<Modification, Key>
+{
+    /// Creates a storage covering `all_keys`, with initial results supplied by
+    /// `initializer`.
+    fn new(all_keys: Range<Key>, initializer: Initializer) -> Self;
+
+    /// The type of version returned by `freeze`.
+    type FrozenVersion: VecFrozenVersion<Modification, Key>;
+
+    /// Applies `modification` to the key range `keys`.
+    fn modify(&mut self, keys: &Range<Key>, modification: &Modification);
+    /// Returns a readable, cloneable version of the storage.
+    /// NOTE(review): presumably a snapshot that later `modify` calls do not
+    /// affect — confirm against the implementations.
+    fn freeze(&mut self) -> Self::FrozenVersion;
+}
--- a/libs/persistent_range_query/src/naive.rs
+++ b/libs/persistent_range_query/src/naive.rs
@@ -0,0 +1,115 @@
+use crate::{
+    LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
+    VecReadableVersion,
+};
+use std::marker::PhantomData;
+use std::ops::Range;
+use std::rc::Rc;
+
+/// Frozen snapshot of a naive storage: a shared copy of all per-element results.
+pub struct NaiveFrozenVersion<Modification: RangeModification<Key>, Key> {
+    // Key range covered by the per-element results.
+    all_keys: Range<Key>,
+    // Per-element results; clones of the snapshot share them through the `Rc`.
+    // NOTE(review): the `Box` inside the `Rc` looks like a redundant extra allocation
+    // (`Rc<Vec<_>>` would do) — changing it also requires updating the code that builds it.
+    values: Rc<Box<Vec<Modification::Result>>>,
+}
+
+/// Key type that can be mapped to positions in a flat vector of per-element results.
+pub trait IndexableKey: Clone {
+    /// Returns the position of `key` relative to the start of `all_keys`.
+    ///
+    /// This module also passes range end bounds (including `all_keys.end`) as `key`.
+    fn index(all_keys: &Range<Self>, key: &Self) -> usize;
+    /// Returns the key range of the single element stored at position `index`.
+    fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self>;
+}
+
+/// Folds the per-element results stored in `values` over the range `keys`, left to right.
+///
+/// `values[i]` is the result for `IndexableKey::element_range(all_keys, i)`.
+///
+/// # Panics
+///
+/// Panics if `keys` maps to positions outside of `values`.
+fn get<Modification: RangeModification<Key>, Key: IndexableKey>(
+    all_keys: &Range<Key>,
+    values: &[Modification::Result],
+    keys: &Range<Key>,
+) -> Modification::Result {
+    let mut result = Modification::Result::new_for_empty_range();
+    // Range described by `result` so far; it starts out empty at `keys.start`.
+    let mut result_range = keys.start.clone()..keys.start.clone();
+    let first = IndexableKey::index(all_keys, &keys.start);
+    let last = IndexableKey::index(all_keys, &keys.end);
+    for index in first..last {
+        let element_range = IndexableKey::element_range(all_keys, index);
+        Modification::Result::add(&mut result, &result_range, &values[index], &element_range);
+        result_range.end = element_range.end;
+    }
+    result
+}
+
+// Reading a frozen version folds over its shared per-element results.
+impl<Modification: RangeModification<Key>, Key: IndexableKey> VecReadableVersion<Modification, Key>
+    for NaiveFrozenVersion<Modification, Key>
+{
+    /// Returns the combined result of all elements in `keys`, as of the time of freezing.
+    fn get(&self, keys: &Range<Key>) -> Modification::Result {
+        get::<Modification, Key>(&self.all_keys, &self.values, keys)
+    }
+}
+
+// `Clone` is implemented by hand because `#[derive(Clone)]` would also require
+// `Modification: Clone`.
+impl<Modification: RangeModification<Key>, Key: Clone> Clone
+    for NaiveFrozenVersion<Modification, Key>
+{
+    /// Clones the key range and shares the values by bumping the `Rc` reference count.
+    fn clone(&self) -> Self {
+        let Self { all_keys, values } = self;
+        Self {
+            all_keys: all_keys.clone(),
+            values: Rc::clone(values),
+        }
+    }
+}
+
+// TODO: is it at all possible to store previous versions in this struct,
+// without any Rc<>?
+/// Reference implementation that keeps one result per element in a plain `Vec`.
+pub struct NaiveVecStorage<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: IndexableKey,
+> {
+    // Key range covered by the storage.
+    all_keys: Range<Key>,
+    // Per-element results of the current (mutable) version.
+    last_version: Vec<Modification::Result>,
+    // The initializer is only used during construction; this marker just keeps the
+    // type parameter in use.
+    _initializer: PhantomData<Initializer>,
+}
+
+// Reading the live storage folds over the per-element results of the current version.
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: IndexableKey,
+    > VecReadableVersion<Modification, Key> for NaiveVecStorage<Modification, Initializer, Key>
+{
+    /// Returns the combined result of all elements in `keys` for the current version.
+    fn get(&self, keys: &Range<Key>) -> Modification::Result {
+        get::<Modification, Key>(&self.all_keys, &self.last_version, keys)
+    }
+}
+
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: IndexableKey,
+    > PersistentVecStorage<Modification, Initializer, Key>
+    for NaiveVecStorage<Modification, Initializer, Key>
+{
+    /// Eagerly asks `initializer` for the result of every single element of `all_keys`.
+    fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
+        // Number of elements. Do not derive it from `Vec::capacity()`: `with_capacity(n)` only
+        // guarantees a capacity of *at least* `n`, and the capacity of a vector of zero-sized
+        // elements is `usize::MAX`.
+        let len = IndexableKey::index(&all_keys, &all_keys.end);
+        let values: Vec<Modification::Result> = (0..len)
+            .map(|index| initializer.get(&IndexableKey::element_range(&all_keys, index)))
+            .collect();
+        NaiveVecStorage {
+            all_keys,
+            last_version: values,
+            _initializer: PhantomData,
+        }
+    }
+
+    type FrozenVersion = NaiveFrozenVersion<Modification, Key>;
+
+    /// Applies `modification` to every element in `keys`, one element at a time.
+    fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
+        let first = IndexableKey::index(&self.all_keys, &keys.start);
+        let last = IndexableKey::index(&self.all_keys, &keys.end);
+        for index in first..last {
+            let element_range = IndexableKey::element_range(&self.all_keys, index);
+            modification.apply(&mut self.last_version[index], &element_range);
+        }
+    }
+
+    /// Snapshots the current version by copying all per-element results.
+    fn freeze(&mut self) -> Self::FrozenVersion {
+        NaiveFrozenVersion::<Modification, Key> {
+            all_keys: self.all_keys.clone(),
+            values: Rc::new(Box::new(self.last_version.clone())),
+        }
+    }
+}
--- a/libs/persistent_range_query/src/ops/mod.rs
+++ b/libs/persistent_range_query/src/ops/mod.rs
@@ -0,0 +1,14 @@
+pub mod rsq;
+
+/// Describes an initial state in which every element has the same value.
+#[derive(Copy, Clone, Debug)]
+pub struct SameElementsInitializer<T> {
+    initial_element_value: T,
+}
+
+impl<T> SameElementsInitializer<T> {
+    /// Creates an initializer that uses `initial_element_value` for every element.
+    pub fn new(initial_element_value: T) -> Self {
+        Self {
+            initial_element_value,
+        }
+    }
+}
--- a/libs/persistent_range_query/src/ops/rsq.rs
+++ b/libs/persistent_range_query/src/ops/rsq.rs
@@ -0,0 +1,118 @@
+//! # Range Sum Query
+
+use crate::ops::SameElementsInitializer;
+use crate::{LazyRangeInitializer, RangeModification, RangeQueryResult};
+use std::borrow::Borrow;
+use std::ops::{Add, AddAssign, Range};
+
+// TODO: commutative Add
+
+/// Query result holding the sum of all elements of a range.
+#[derive(Clone, Copy, Debug)]
+pub struct SumResult<T> {
+    sum: T,
+}
+
+impl<T> SumResult<T> {
+    /// Returns the stored sum.
+    pub fn sum(&self) -> &T {
+        &self.sum
+    }
+}
+
+// Sums combine by addition; the key ranges are irrelevant for that and are ignored.
+impl<T, Key> RangeQueryResult<Key> for SumResult<T>
+where
+    T: Clone + for<'a> AddAssign<&'a T> + From<u8>,
+    for<'a> &'a T: Add<&'a T, Output = T>,
+{
+    /// The sum over an empty range is zero.
+    fn new_for_empty_range() -> Self {
+        SumResult { sum: T::from(0u8) }
+    }
+
+    /// Returns a new result holding `left.sum + right.sum`.
+    fn combine(
+        left: &Self,
+        _left_range: &Range<Key>,
+        right: &Self,
+        _right_range: &Range<Key>,
+    ) -> Self {
+        let sum = &left.sum + &right.sum;
+        SumResult { sum }
+    }
+
+    /// Adds `right.sum` to `left.sum` in place.
+    fn add(left: &mut Self, _left_range: &Range<Key>, right: &Self, _right_range: &Range<Key>) {
+        left.sum += &right.sum;
+    }
+}
+
+/// Values for which the sum over a range of identical elements can be computed directly.
+pub trait SumOfSameElements<Key> {
+    /// Returns the sum over `keys` when every element equals `initial_element_value`.
+    fn sum(initial_element_value: &Self, keys: &Range<Key>) -> Self;
+}
+
+// The stored value may be owned (`T`) or borrowed (`&T`): anything that implements
+// `Borrow<T>` works as the initial element value.
+impl<T: SumOfSameElements<Key>, TB: Borrow<T>, Key> LazyRangeInitializer<SumResult<T>, Key>
+    for SameElementsInitializer<TB>
+where
+    SumResult<T>: RangeQueryResult<Key>,
+{
+    /// Returns the sum over `range` when every element equals the initial element value.
+    fn get(&self, range: &Range<Key>) -> SumResult<T> {
+        SumResult {
+            sum: SumOfSameElements::sum(self.initial_element_value.borrow(), range),
+        }
+    }
+}
+
+/// Range modification for sum queries.
+#[derive(Copy, Clone, Debug)]
+pub enum AddAssignModification<T> {
+    /// Leave the elements unchanged.
+    None,
+    /// Add the value to every element of the range.
+    Add(T),
+    /// Set every element of the range to the value.
+    Assign(T),
+}
+
+impl<T: Clone + for<'a> AddAssign<&'a T>, Key> RangeModification<Key> for AddAssignModification<T>
+where
+    SumResult<T>: RangeQueryResult<Key>,
+    for<'a> SameElementsInitializer<&'a T>: LazyRangeInitializer<SumResult<T>, Key>,
+{
+    type Result = SumResult<T>;
+
+    fn no_op() -> Self {
+        AddAssignModification::None
+    }
+
+    fn is_no_op(&self) -> bool {
+        matches!(self, AddAssignModification::None)
+    }
+
+    /// Only `Assign` discards the previous contents of the range.
+    fn is_reinitialization(&self) -> bool {
+        matches!(self, AddAssignModification::Assign(_))
+    }
+
+    /// Updates the sum stored in `result` for a range in which every element was modified.
+    ///
+    /// The contribution of the whole range is computed as the sum of a range of identical
+    /// elements, via `SameElementsInitializer`.
+    fn apply(&self, result: &mut SumResult<T>, range: &Range<Key>) {
+        use AddAssignModification::*;
+        match self {
+            None => {}
+            Add(x) => {
+                let to_add = SameElementsInitializer::new(x).get(range).sum;
+                result.sum += &to_add;
+            }
+            Assign(x) => {
+                result.sum = SameElementsInitializer::new(x).get(range).sum;
+            }
+        }
+    }
+
+    /// Folds `later` into `earlier` so that `earlier` describes both, applied in order.
+    fn compose(later: &Self, earlier: &mut Self) {
+        use AddAssignModification::*;
+        match (later, earlier) {
+            // Nothing happened earlier: the composition is just `later`.
+            (_, e @ None) => *e = later.clone(),
+            (None, _) => {}
+            // A later assignment overrides whatever happened before.
+            (Assign(_), e) => *e = later.clone(),
+            (Add(x), Add(y)) => *y += x,
+            // Assigning `value` and then adding `x` equals assigning `value + x`.
+            (Add(x), Assign(value)) => *value += x,
+        }
+    }
+}
--- a/libs/persistent_range_query/src/segment_tree.rs
+++ b/libs/persistent_range_query/src/segment_tree.rs
@@ -0,0 +1,255 @@
+//! # Segment Tree
+//! It is a folklore data structure from competitive programming. Do not confuse it with the interval tree.
+
+use crate::{LazyRangeInitializer, PersistentVecStorage, RangeQueryResult, VecReadableVersion};
+use std::ops::Range;
+use std::rc::Rc;
+
+/// Key type whose ranges can be split in two at a midpoint.
+pub trait MidpointableKey: Clone + Ord + Sized {
+    /// Returns the key at which `range` is split into `start..midpoint` and `midpoint..end`.
+    fn midpoint(range: &Range<Self>) -> Self;
+}
+
+/// [`crate::RangeModification`] with an additional `Clone` bound, as required by the tree.
+pub trait RangeModification<Key>: Clone + crate::RangeModification<Key> {}
+
+// TODO: use trait alias when stabilized
+// Blanket implementation: every `Clone + crate::RangeModification` type qualifies.
+impl<T: Clone + crate::RangeModification<Key>, Key> RangeModification<Key> for T {}
+
+/// One node of the segment tree. The key range it covers is not stored in the node;
+/// it is passed along while traversing the tree.
+#[derive(Debug)]
+struct Node<Modification: RangeModification<Key>, Key> {
+    // Query result for the whole range of this node.
+    result: Modification::Result,
+    // Modification that is already reflected in `result` but still has to be applied to
+    // the children.
+    modify_children: Modification,
+    // Children are created on demand; `None` means the child has not been created yet
+    // (or was dropped by a reinitialization).
+    left: Option<Rc<Self>>,
+    right: Option<Rc<Self>>,
+}
+
+// Implemented by hand: `#[derive(Clone)]` would add a `Key: Clone` bound that is not needed.
+impl<Modification: RangeModification<Key>, Key> Clone for Node<Modification, Key> {
+    /// Copies this node only; the children stay shared through their `Rc`s.
+    fn clone(&self) -> Self {
+        let Node {
+            result,
+            modify_children,
+            left,
+            right,
+        } = self;
+        Node {
+            result: result.clone(),
+            modify_children: modify_children.clone(),
+            left: left.clone(),
+            right: right.clone(),
+        }
+    }
+}
+
+impl<Modification: RangeModification<Key>, Key> Node<Modification, Key> {
+    /// Creates a childless node for `range` in its initial state.
+    fn new<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
+        range: &Range<Key>,
+        initializer: &Initializer,
+    ) -> Self {
+        Node {
+            result: initializer.get(range),
+            modify_children: Modification::no_op(),
+            left: None,
+            right: None,
+        }
+    }
+
+    /// Applies `modification` to the whole `range` of this node.
+    ///
+    /// The node's own result is updated immediately; for the children the modification is
+    /// only recorded in `modify_children`.
+    pub fn apply(&mut self, modification: &Modification, range: &Range<Key>) {
+        modification.apply(&mut self.result, range);
+        Modification::compose(modification, &mut self.modify_children);
+        if self.modify_children.is_reinitialization() {
+            // The pending modification overrides the children entirely, so drop them;
+            // they are re-created from the initializer when needed.
+            self.left = None;
+            self.right = None;
+        }
+    }
+
+    /// Makes sure both children exist and pushes the pending modification down to them.
+    ///
+    /// Afterwards `modify_children` is a no-op.
+    pub fn force_children<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
+        &mut self,
+        initializer: &Initializer,
+        range_left: &Range<Key>,
+        range_right: &Range<Key>,
+    ) {
+        let left = self
+            .left
+            .get_or_insert_with(|| Rc::new(Node::new(range_left, initializer)));
+        let right = self
+            .right
+            .get_or_insert_with(|| Rc::new(Node::new(range_right, initializer)));
+        if self.modify_children.is_no_op() {
+            // Nothing to push down. By the identity laws of `RangeModification` applying a
+            // no-op changes nothing, so do not `Rc::make_mut` the children: that would copy
+            // nodes that are still shared with other versions for no reason.
+            return;
+        }
+        Rc::make_mut(left).apply(&self.modify_children, range_left);
+        Rc::make_mut(right).apply(&self.modify_children, range_right);
+        self.modify_children = Modification::no_op();
+    }
+
+    /// Recomputes this node's result from the results of its two children.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a modification is still pending for the children or if a child is missing.
+    pub fn recalculate_from_children(&mut self, range_left: &Range<Key>, range_right: &Range<Key>) {
+        assert!(self.modify_children.is_no_op());
+        assert!(self.left.is_some());
+        assert!(self.right.is_some());
+        self.result = Modification::Result::combine(
+            &self.left.as_ref().unwrap().result,
+            range_left,
+            &self.right.as_ref().unwrap().result,
+            range_right,
+        );
+    }
+}
+
+/// Splits `range` at its midpoint into `start..mid` and `mid..end`.
+fn split_range<Key: MidpointableKey>(range: &Range<Key>) -> (Range<Key>, Range<Key>) {
+    let start = range.start.clone();
+    let mid = Key::midpoint(range);
+    let left = start..mid.clone();
+    let right = mid..range.end.clone();
+    (left, right)
+}
+
+/// One version of the segment tree: a root node plus what is needed to traverse the tree
+/// and to create missing nodes.
+pub struct PersistentSegmentTreeVersion<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: Clone,
+> {
+    // Root node; it covers `all_keys`.
+    root: Rc<Node<Modification, Key>>,
+    // Key range covered by the whole tree.
+    all_keys: Range<Key>,
+    // Used to create nodes that do not exist yet; shared between clones.
+    initializer: Rc<Initializer>,
+}
+
+// Implemented by hand so that `Modification` and `Initializer` do not get extra bounds
+// from `#[derive(Clone)]`.
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: Clone,
+    > Clone for PersistentSegmentTreeVersion<Modification, Initializer, Key>
+{
+    /// Clones the key range and shares the root and the initializer through their `Rc`s.
+    fn clone(&self) -> Self {
+        let Self {
+            root,
+            all_keys,
+            initializer,
+        } = self;
+        Self {
+            root: Rc::clone(root),
+            all_keys: all_keys.clone(),
+            initializer: Rc::clone(initializer),
+        }
+    }
+}
+
+/// Returns the query result for the part of `keys` that lies inside `node_keys`.
+///
+/// `node` is taken mutably because pending modifications are pushed down along the way;
+/// nodes shared with other versions are copied by `Rc::make_mut` before they are touched.
+fn get<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: MidpointableKey,
+>(
+    node: &mut Rc<Node<Modification, Key>>,
+    node_keys: &Range<Key>,
+    initializer: &Initializer,
+    keys: &Range<Key>,
+) -> Modification::Result {
+    let is_disjoint = node_keys.end <= keys.start || keys.end <= node_keys.start;
+    if is_disjoint {
+        return Modification::Result::new_for_empty_range();
+    }
+    let is_covered = keys.start <= node_keys.start && node_keys.end <= keys.end;
+    if is_covered {
+        return node.result.clone();
+    }
+    // Partial overlap: descend into both halves.
+    let (left_keys, right_keys) = split_range(node_keys);
+    let node = Rc::make_mut(node);
+    node.force_children(initializer, &left_keys, &right_keys);
+    let mut result = get(node.left.as_mut().unwrap(), &left_keys, initializer, keys);
+    let right_result = get(node.right.as_mut().unwrap(), &right_keys, initializer, keys);
+    // NOTE(review): the ranges passed here are the full child ranges, not their
+    // intersections with `keys` — confirm that this is what `add` expects.
+    Modification::Result::add(&mut result, &left_keys, &right_result, &right_keys);
+    result
+}
+
+/// Applies `modification` to the part of `keys` that lies inside `node_keys`.
+///
+/// Nodes shared with other versions are copied by `Rc::make_mut` before they are changed.
+fn modify<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: MidpointableKey,
+>(
+    node: &mut Rc<Node<Modification, Key>>,
+    node_keys: &Range<Key>,
+    initializer: &Initializer,
+    keys: &Range<Key>,
+    modification: &Modification,
+) {
+    if modification.is_no_op() {
+        return;
+    }
+    if node_keys.end <= keys.start || keys.end <= node_keys.start {
+        // The node lies completely outside of `keys`.
+        return;
+    }
+    let node = Rc::make_mut(node);
+    let is_covered = keys.start <= node_keys.start && node_keys.end <= keys.end;
+    if is_covered {
+        node.apply(modification, node_keys);
+        return;
+    }
+    // Partial overlap: update both halves, then rebuild this node's result from them.
+    let (left_keys, right_keys) = split_range(node_keys);
+    node.force_children(initializer, &left_keys, &right_keys);
+    let left_child = node.left.as_mut().unwrap();
+    modify(left_child, &left_keys, initializer, keys, modification);
+    let right_child = node.right.as_mut().unwrap();
+    modify(right_child, &right_keys, initializer, keys, modification);
+    node.recalculate_from_children(&left_keys, &right_keys);
+}
+
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: MidpointableKey,
+    > VecReadableVersion<Modification, Key>
+    for PersistentSegmentTreeVersion<Modification, Initializer, Key>
+{
+    /// Returns the query result for `keys` in this version.
+    ///
+    /// The traversal runs on a temporary clone of the root `Rc`, so `self` itself is not
+    /// modified by the query.
+    fn get(&self, keys: &Range<Key>) -> Modification::Result {
+        get(
+            &mut self.root.clone(), // TODO: do not always force a branch
+            &self.all_keys,
+            self.initializer.as_ref(),
+            keys,
+        )
+    }
+}
+
+/// Mutable segment tree; a thin wrapper around its current version.
+pub struct PersistentSegmentTree<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: MidpointableKey,
+>(PersistentSegmentTreeVersion<Modification, Initializer, Key>);
+
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: MidpointableKey,
+    > VecReadableVersion<Modification, Key>
+    for PersistentSegmentTree<Modification, Initializer, Key>
+{
+    /// Returns the query result for `keys` in the current version.
+    fn get(&self, keys: &Range<Key>) -> Modification::Result {
+        self.0.get(keys)
+    }
+}
+
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: MidpointableKey,
+    > PersistentVecStorage<Modification, Initializer, Key>
+    for PersistentSegmentTree<Modification, Initializer, Key>
+{
+    /// Creates a tree that consists of a single root node covering `all_keys`.
+    fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
+        let root = Rc::new(Node::new(&all_keys, &initializer));
+        let version = PersistentSegmentTreeVersion {
+            root,
+            all_keys,
+            initializer: Rc::new(initializer),
+        };
+        PersistentSegmentTree(version)
+    }
+
+    type FrozenVersion = PersistentSegmentTreeVersion<Modification, Initializer, Key>;
+
+    /// Applies `modification` to `keys` in the current version.
+    fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
+        let current = &mut self.0;
+        modify(
+            &mut current.root, // TODO: do not always force a branch
+            &current.all_keys,
+            current.initializer.as_ref(),
+            keys,
+            modification,
+        )
+    }
+
+    /// Returns the current version; its nodes stay shared with the live tree.
+    fn freeze(&mut self) -> Self::FrozenVersion {
+        self.0.clone()
+    }
+}
--- a/libs/persistent_range_query/tests/layer_map_test.rs
+++ b/libs/persistent_range_query/tests/layer_map_test.rs
@@ -0,0 +1,295 @@
+use persistent_range_query::naive::{IndexableKey, NaiveVecStorage};
+use persistent_range_query::ops::SameElementsInitializer;
+use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
+use persistent_range_query::{
+    LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
+    VecReadableVersion,
+};
+use std::cmp::Ordering;
+use std::ops::Range;
+
+/// Key type used by this test.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
+struct PageIndex(u32);
+/// Layers are identified by their name in this test.
+type LayerId = String;
+
+impl IndexableKey for PageIndex {
+    /// Offset of `key` from the first key of `all_keys`.
+    fn index(all_keys: &Range<Self>, key: &Self) -> usize {
+        let position = key.0 as usize;
+        let base = all_keys.start.0 as usize;
+        position - base
+    }
+
+    /// One-page range of the element at position `index`.
+    fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
+        let first = all_keys.start.0 + index as u32;
+        PageIndex(first)..PageIndex(first + 1)
+    }
+}
+
+impl MidpointableKey for PageIndex {
+    /// Start of the range plus half of its length, rounded down.
+    fn midpoint(range: &Range<Self>) -> Self {
+        let Range { start, end } = range;
+        let half_len = (end.0 - start.0) / 2;
+        PageIndex(start.0 + half_len)
+    }
+}
+
+/// Query result of the layer map test.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct LayerMapInformation {
+    // Only make sense for a range of length 1.
+    last_layer: Option<LayerId>,
+    last_image_layer: Option<LayerId>,
+    // Work for all ranges
+    // Pair of a delta layer count and a page range that is reported with it.
+    max_delta_layers: (usize, Range<PageIndex>),
+}
+
+impl LayerMapInformation {
+    /// Returns `(last_layer, last_image_layer)`.
+    fn last_layers(&self) -> (&Option<LayerId>, &Option<LayerId>) {
+        (&self.last_layer, &self.last_image_layer)
+    }
+
+    /// Returns the stored delta layer count together with its page range.
+    fn max_delta_layers(&self) -> &(usize, Range<PageIndex>) {
+        &self.max_delta_layers
+    }
+}
+
+/// Joins two page ranges when they are adjacent.
+///
+/// An empty range yields the other one; ranges that do not touch yield `left`.
+fn merge_ranges(left: &Range<PageIndex>, right: &Range<PageIndex>) -> Range<PageIndex> {
+    match (left.is_empty(), right.is_empty()) {
+        (true, _) => right.clone(),
+        (false, true) => left.clone(),
+        (false, false) if left.end == right.start => left.start..right.end,
+        (false, false) => left.clone(),
+    }
+}
+
+impl RangeQueryResult<PageIndex> for LayerMapInformation {
+    /// No layers and a delta count of zero over an empty page range.
+    fn new_for_empty_range() -> Self {
+        LayerMapInformation {
+            last_layer: None,
+            last_image_layer: None,
+            max_delta_layers: (0, PageIndex(0)..PageIndex(0)),
+        }
+    }
+
+    /// Combines the results of two ranges.
+    ///
+    /// The layer fields prefer the left value; the larger delta count wins, and on a tie
+    /// the two page ranges are merged.
+    fn combine(
+        left: &Self,
+        _left_range: &Range<PageIndex>,
+        right: &Self,
+        _right_range: &Range<PageIndex>,
+    ) -> Self {
+        // Note that either range may be empty.
+        let last_layer = left
+            .last_layer
+            .clone()
+            .or_else(|| right.last_layer.clone());
+        let last_image_layer = left
+            .last_image_layer
+            .clone()
+            .or_else(|| right.last_image_layer.clone());
+        let (left_count, left_pages) = &left.max_delta_layers;
+        let (right_count, right_pages) = &right.max_delta_layers;
+        let max_delta_layers = match left_count.cmp(right_count) {
+            Ordering::Less => right.max_delta_layers.clone(),
+            Ordering::Greater => left.max_delta_layers.clone(),
+            Ordering::Equal => (*left_count, merge_ranges(left_pages, right_pages)),
+        };
+        LayerMapInformation {
+            last_layer,
+            last_image_layer,
+            max_delta_layers,
+        }
+    }
+
+    /// In-place variant of `combine`: stores the combined result in `left`.
+    fn add(
+        left: &mut Self,
+        left_range: &Range<PageIndex>,
+        right: &Self,
+        right_range: &Range<PageIndex>,
+    ) {
+        let combined = Self::combine(left, left_range, right, right_range);
+        *left = combined;
+    }
+}
+
+/// One or more delta layers added on top of each other.
+#[derive(Clone, Debug)]
+struct AddDeltaLayers {
+    // The most recently added delta layer.
+    last_layer: LayerId,
+    // Number of delta layers added.
+    count: usize,
+}
+
+/// Range modification of the layer map test: an optional image layer followed by
+/// optional delta layers.
+#[derive(Clone, Debug)]
+struct LayerMapModification {
+    add_image_layer: Option<LayerId>,
+    add_delta_layers: Option<AddDeltaLayers>,
+}
+
+impl LayerMapModification {
+    /// Modification that adds the image layer `layer` and no delta layers.
+    fn add_image_layer(layer: impl Into<LayerId>) -> Self {
+        LayerMapModification {
+            add_image_layer: Some(layer.into()),
+            add_delta_layers: None,
+        }
+    }
+
+    /// Modification that adds the single delta layer `layer`.
+    fn add_delta_layer(layer: impl Into<LayerId>) -> Self {
+        LayerMapModification {
+            add_image_layer: None,
+            add_delta_layers: Some(AddDeltaLayers {
+                last_layer: layer.into(),
+                count: 1,
+            }),
+        }
+    }
+}
+
+impl RangeModification<PageIndex> for LayerMapModification {
+    type Result = LayerMapInformation;
+
+    fn no_op() -> Self {
+        LayerMapModification {
+            add_image_layer: None,
+            add_delta_layers: None,
+        }
+    }
+
+    fn is_no_op(&self) -> bool {
+        let adds_image = self.add_image_layer.is_some();
+        let adds_deltas = self.add_delta_layers.is_some();
+        !(adds_image || adds_deltas)
+    }
+
+    /// Adding an image layer resets everything that was known about the range.
+    fn is_reinitialization(&self) -> bool {
+        self.add_image_layer.is_some()
+    }
+
+    /// Applies the image layer first (if any), then the delta layers (if any).
+    fn apply(&self, result: &mut Self::Result, range: &Range<PageIndex>) {
+        if let Some(image) = self.add_image_layer.as_ref() {
+            result.last_layer = Some(image.clone());
+            result.last_image_layer = Some(image.clone());
+            result.max_delta_layers = (0, range.clone());
+        }
+        if let Some(deltas) = self.add_delta_layers.as_ref() {
+            result.last_layer = Some(deltas.last_layer.clone());
+            result.max_delta_layers.0 += deltas.count;
+        }
+    }
+
+    /// Folds `later` into `earlier`.
+    ///
+    /// A later image layer replaces `earlier` completely; later delta layers are added to
+    /// the delta layers recorded in `earlier`.
+    fn compose(later: &Self, earlier: &mut Self) {
+        match (&later.add_image_layer, &later.add_delta_layers) {
+            (Some(_), _) => *earlier = later.clone(),
+            (None, Some(deltas)) => {
+                let merged = earlier.add_delta_layers.get_or_insert(AddDeltaLayers {
+                    last_layer: LayerId::default(),
+                    count: 0,
+                });
+                merged.last_layer = deltas.last_layer.clone();
+                merged.count += deltas.count;
+            }
+            (None, None) => {}
+        }
+    }
+}
+
+// Initial state of the layer map: no layers anywhere.
+impl LazyRangeInitializer<LayerMapInformation, PageIndex> for SameElementsInitializer<()> {
+    /// Returns a result without layers and with a delta count of zero over all of `range`.
+    fn get(&self, range: &Range<PageIndex>) -> LayerMapInformation {
+        LayerMapInformation {
+            last_layer: None,
+            last_image_layer: None,
+            max_delta_layers: (0, range.clone()),
+        }
+    }
+}
+
+/// Scenario shared by all storage implementations: two image layers and two delta layers
+/// are added over 100 pages, with a snapshot taken before the last delta layer.
+fn test_layer_map<
+    S: PersistentVecStorage<LayerMapModification, SameElementsInitializer<()>, PageIndex>,
+>() {
+    let mut s = S::new(
+        PageIndex(0)..PageIndex(100),
+        SameElementsInitializer::new(()),
+    );
+    s.modify(
+        &(PageIndex(0)..PageIndex(70)),
+        &LayerMapModification::add_image_layer("Img0..70"),
+    );
+    s.modify(
+        &(PageIndex(50)..PageIndex(100)),
+        &LayerMapModification::add_image_layer("Img50..100"),
+    );
+    s.modify(
+        &(PageIndex(10)..PageIndex(60)),
+        &LayerMapModification::add_delta_layer("Delta10..60"),
+    );
+    // Snapshot that must not observe the delta layer added below.
+    let s_before_last_delta = s.freeze();
+    s.modify(
+        &(PageIndex(20)..PageIndex(80)),
+        &LayerMapModification::add_delta_layer("Delta20..80"),
+    );
+
+    // Point queries: last layer and last image layer of single pages.
+    assert_eq!(
+        s.get(&(PageIndex(5)..PageIndex(6))).last_layers(),
+        (&Some("Img0..70".to_owned()), &Some("Img0..70".to_owned()))
+    );
+    assert_eq!(
+        s.get(&(PageIndex(15)..PageIndex(16))).last_layers(),
+        (
+            &Some("Delta10..60".to_owned()),
+            &Some("Img0..70".to_owned())
+        )
+    );
+    assert_eq!(
+        s.get(&(PageIndex(25)..PageIndex(26))).last_layers(),
+        (
+            &Some("Delta20..80".to_owned()),
+            &Some("Img0..70".to_owned())
+        )
+    );
+    assert_eq!(
+        s.get(&(PageIndex(65)..PageIndex(66))).last_layers(),
+        (
+            &Some("Delta20..80".to_owned()),
+            &Some("Img50..100".to_owned())
+        )
+    );
+    assert_eq!(
+        s.get(&(PageIndex(95)..PageIndex(96))).last_layers(),
+        (
+            &Some("Img50..100".to_owned()),
+            &Some("Img50..100".to_owned())
+        )
+    );
+
+    // Range queries over all pages, in the live storage and in the snapshot.
+    assert_eq!(
+        s.get(&(PageIndex(0)..PageIndex(100))).max_delta_layers(),
+        &(2, PageIndex(20)..PageIndex(60)),
+    );
+    assert_eq!(
+        *s_before_last_delta
+            .get(&(PageIndex(0)..PageIndex(100)))
+            .max_delta_layers(),
+        (1, PageIndex(10)..PageIndex(60)),
+    );
+
+    // Range queries over sub-ranges.
+    assert_eq!(
+        *s.get(&(PageIndex(10)..PageIndex(30))).max_delta_layers(),
+        (2, PageIndex(20)..PageIndex(30))
+    );
+    assert_eq!(
+        *s.get(&(PageIndex(10)..PageIndex(20))).max_delta_layers(),
+        (1, PageIndex(10)..PageIndex(20))
+    );
+
+    assert_eq!(
+        *s.get(&(PageIndex(70)..PageIndex(80))).max_delta_layers(),
+        (1, PageIndex(70)..PageIndex(80))
+    );
+    assert_eq!(
+        *s_before_last_delta
+            .get(&(PageIndex(70)..PageIndex(80)))
+            .max_delta_layers(),
+        (0, PageIndex(70)..PageIndex(80))
+    );
+}
+
+/// Runs the layer map scenario against the naive storage.
+#[test]
+fn test_naive() {
+    test_layer_map::<NaiveVecStorage<_, _, _>>();
+}
+
+/// Runs the layer map scenario against the persistent segment tree.
+#[test]
+fn test_segment_tree() {
+    test_layer_map::<PersistentSegmentTree<_, _, _>>();
+}
--- a/libs/persistent_range_query/tests/rsq_test.rs
+++ b/libs/persistent_range_query/tests/rsq_test.rs
@@ -0,0 +1,116 @@
+use persistent_range_query::naive::*;
+use persistent_range_query::ops::rsq::AddAssignModification::Add;
+use persistent_range_query::ops::rsq::*;
+use persistent_range_query::ops::SameElementsInitializer;
+use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
+use persistent_range_query::{PersistentVecStorage, VecReadableVersion};
+use rand::{Rng, SeedableRng};
+use std::ops::Range;
+
+/// Key type used by the range sum query tests.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+struct K(u16);
+
+impl IndexableKey for K {
+    /// Offset of `key` from the first key of `all_keys`.
+    fn index(all_keys: &Range<Self>, key: &Self) -> usize {
+        let position = key.0 as usize;
+        let base = all_keys.start.0 as usize;
+        position - base
+    }
+
+    /// One-key range of the element at position `index`.
+    fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
+        let first = all_keys.start.0 + index as u16;
+        K(first)..K(first + 1)
+    }
+}
+
+impl SumOfSameElements<K> for i32 {
+    /// Element value multiplied by the number of keys in `keys`.
+    fn sum(initial_element_value: &Self, keys: &Range<K>) -> Self {
+        let len = keys.end.0 - keys.start.0;
+        initial_element_value * Self::from(len)
+    }
+}
+
+impl MidpointableKey for K {
+    /// Start of the range plus half of its length, rounded down.
+    fn midpoint(range: &Range<Self>) -> Self {
+        let Range { start, end } = range;
+        let half_len = (end.0 - start.0) / 2;
+        K(start.0 + half_len)
+    }
+}
+
+/// Small hand-checked scenario shared by all storage implementations: 12 zeros are
+/// modified with `Add` and `Assign`, and a snapshot is taken after the first step.
+fn test_storage<
+    S: PersistentVecStorage<AddAssignModification<i32>, SameElementsInitializer<i32>, K>,
+>() {
+    let mut s = S::new(K(0)..K(12), SameElementsInitializer::new(0i32));
+    assert_eq!(*s.get(&(K(0)..K(12))).sum(), 0);
+
+    s.modify(&(K(2)..K(5)), &AddAssignModification::Add(3));
+    assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 3 + 3);
+    // Snapshot that must not observe the modifications below.
+    let s_old = s.freeze();
+
+    s.modify(&(K(3)..K(6)), &AddAssignModification::Assign(10));
+    assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 10 + 10);
+
+    s.modify(&(K(4)..K(7)), &AddAssignModification::Add(2));
+    assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 12 + 12 + 2);
+
+    assert_eq!(*s.get(&(K(4)..K(6))).sum(), 12 + 12);
+    // Only key 4 was non-zero when the snapshot was taken.
+    assert_eq!(*s_old.get(&(K(4)..K(6))).sum(), 3);
+}
+
+/// Runs the range sum scenario against the naive storage.
+#[test]
+fn test_naive() {
+    test_storage::<NaiveVecStorage<_, _, _>>();
+}
+
+/// Runs the range sum scenario against the persistent segment tree.
+#[test]
+fn test_segment_tree() {
+    test_storage::<PersistentSegmentTree<_, _, _>>();
+}
+
+/// Randomized cross-check of the segment tree against the naive storage.
+///
+/// Both storages receive the same random `Add` modifications and must answer random range
+/// queries identically. Snapshots taken before each round must keep returning the
+/// checksum that was observed when they were frozen.
+#[test]
+fn test_stress() {
+    const LEN: u16 = 17_238;
+    const OPERATIONS: i32 = 20_000;
+
+    // Fixed seed: the test is deterministic.
+    let mut rng = rand::rngs::StdRng::seed_from_u64(0);
+    let mut naive: NaiveVecStorage<AddAssignModification<i32>, _, _> =
+        NaiveVecStorage::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
+    let mut segm_tree: PersistentSegmentTree<AddAssignModification<i32>, _, _> =
+        PersistentSegmentTree::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
+
+    // Returns a random, possibly empty, range inside `0..LEN`.
+    fn gen_range(rng: &mut impl Rng) -> Range<K> {
+        let l: u16 = rng.gen_range(0..LEN);
+        let r: u16 = rng.gen_range(0..LEN);
+        if l <= r {
+            K(l)..K(r)
+        } else {
+            K(r)..K(l)
+        }
+    }
+
+    for _ in 0..2 {
+        let checksum_range = gen_range(&mut rng);
+        let checksum_before: i32 = *naive.get(&checksum_range).sum();
+        assert_eq!(checksum_before, *segm_tree.get(&checksum_range).sum());
+
+        let naive_before = naive.freeze();
+        let segm_tree_before = segm_tree.freeze();
+        assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
+        // Check the *frozen* segment tree here; the live one was already checked above.
+        assert_eq!(
+            checksum_before,
+            *segm_tree_before.get(&checksum_range).sum()
+        );
+
+        for _ in 0..OPERATIONS {
+            {
+                let range = gen_range(&mut rng);
+                assert_eq!(naive.get(&range).sum(), segm_tree.get(&range).sum());
+            }
+            {
+                let range = gen_range(&mut rng);
+                let val = rng.gen_range(-10i32..=10i32);
+                let op = Add(val);
+                naive.modify(&range, &op);
+                segm_tree.modify(&range, &op);
+            }
+        }
+
+        // The snapshots must be unaffected by all modifications made since freezing.
+        assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
+        assert_eq!(
+            checksum_before,
+            *segm_tree_before.get(&checksum_range).sum()
+        );
+    }
+}
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "pq_proto"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+anyhow = "1.0"
+bytes = "1.0.1"
+pin-project-lite = "0.2.7"
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+rand = "0.8.3"
+serde = { version = "1.0", features = ["derive"] }
+tokio = { version = "1.17", features = ["macros"] }
+tracing = "0.1"
+
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/utils/src/pq_proto.rs
+++ b/libs/utils/src/pq_proto.rs
@@ -2,7 +2,9 @@
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.

-use crate::sync::{AsyncishRead, SyncFuture};
+// Tools for calling certain async methods in sync contexts.
+pub mod sync;
+
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use postgres_protocol::PG_EPOCH;
@@ -16,6 +18,7 @@ use std::{
    str,
    time::{Duration, SystemTime},
 };
+use sync::{AsyncishRead, SyncFuture};
 use tokio::io::AsyncReadExt;
 use tracing::{trace, warn};

@@ -198,7 +201,7 @@ impl FeMessage {
    ///
    /// ```
    /// # use std::io;
-    /// # use utils::pq_proto::FeMessage;
+    /// # use pq_proto::FeMessage;
    /// #
    /// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
    /// #     Ok(())
@@ -302,6 +305,7 @@ impl FeStartupPacket {
                Err(e) => return Err(e.into()),
            };

+            #[allow(clippy::manual_range_contains)]
            if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
                bail!("invalid message length");
            }
--- a/libs/pq_proto/src/sync.rs
+++ b/libs/pq_proto/src/sync.rs
@@ -29,7 +29,7 @@ impl<S, T: Future> SyncFuture<S, T> {
    /// Example:
    ///
    /// ```
-    /// # use utils::sync::SyncFuture;
+    /// # use pq_proto::sync::SyncFuture;
    /// # use std::future::Future;
    /// # use tokio::io::AsyncReadExt;
    /// #
--- a/libs/tenant_size_model/.gitignore
+++ b/libs/tenant_size_model/.gitignore
@@ -0,0 +1,3 @@
+*.dot
+*.png
+*.svg
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "tenant_size_model"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[dependencies]
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/tenant_size_model/Makefile
+++ b/libs/tenant_size_model/Makefile
@@ -0,0 +1,13 @@
+# Default target: render scenarios 1-4 as both SVG and PNG.
+all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png
+
+# Build the simulator binary whenever its manifest or sources change.
+../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs
+	cargo build --bin tenant_size_model
+
+# Render a graphviz description with `dot`.
+%.svg: %.dot
+	dot -Tsvg $< > $@
+
+%.png: %.dot
+	dot -Tpng $< > $@
+
+# The pattern stem ($*) is passed to the binary as its scenario argument.
+%.dot: ../../target/debug/tenant_size_model
+	../../target/debug/tenant_size_model $* > $@
--- a/libs/tenant_size_model/README.md
+++ b/libs/tenant_size_model/README.md
@@ -0,0 +1,7 @@
+# Logical size + WAL pricing
+
+This is a simulator to calculate the tenant size in different scenarios,
+using the "Logical size + WAL" method. The Makefile produces diagrams used in a
+private presentation:
+
+https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -0,0 +1,382 @@
+use std::borrow::Cow;
+use std::collections::HashMap;
+
+/// Pricing model or history size builder.
+///
+/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
+/// type.
+pub struct Storage<K: 'static> {
+    /// Every segment created so far. Segments refer to each other by index into this vector.
+    segments: Vec<Segment>,
+
+    /// Mapping from the branch name to the index of a segment describing its latest state.
+    branches: HashMap<K, usize>,
+}
+
+/// Snapshot of a branch.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Segment {
+    /// Previous segment index into [`Storage::segments`], if any.
+    parent: Option<usize>,
+
+    /// Description of how we got to this state.
+    ///
+    /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
+    /// modifying a branch directly.
+    pub op: Cow<'static, str>,
+
+    /// LSN before this state
+    start_lsn: u64,
+
+    /// LSN at this state
+    pub end_lsn: u64,
+
+    /// Logical size before this state
+    start_size: u64,
+
+    /// Logical size at this state. Can be None in the last Segment of a branch.
+    pub end_size: Option<u64>,
+
+    /// Indices to [`Storage::segments`]
+    ///
+    /// FIXME: this could be an Option<usize>
+    children_after: Vec<usize>,
+
+    /// Determined by `retention_period` given to [`Storage::calculate`]
+    pub needed: bool,
+}
+
+//
+//
+//
+//
+//                 *-g--*---D--->
+//                /
+//               /
+//              /                 *---b----*-B--->
+//             /                 /
+//            /                 /
+//      -----*--e---*-----f----* C
+//           E                  \
+//                               \
+//                                *--a---*---A-->
+//
+// If A and B need to be retained, is it cheaper to store
+// snapshot at C+a+b, or snapshots at A and B ?
+//
+// If D also needs to be retained, which is cheaper:
+//
+// 1. E+g+e+f+a+b
+// 2. D+C+a+b
+// 3. D+A+B
+
+/// [`Segment`] which has had its size calculated.
+pub struct SegmentSize {
+    /// Index of the described segment in the vector returned by [`Storage::into_segments`].
+    pub seg_id: usize,
+
+    /// How the history of this segment was chosen to be retained.
+    pub method: SegmentMethod,
+
+    /// Size attributed to this segment alone, excluding its children.
+    this_size: u64,
+
+    /// Calculated sizes of the segments that follow this one.
+    pub children: Vec<SegmentSize>,
+}
+
+impl SegmentSize {
+    /// Size of this node plus the totals of all nodes below it.
+    fn total(&self) -> u64 {
+        let below: u64 = self.children.iter().map(SegmentSize::total).sum();
+        self.this_size + below
+    }
+
+    /// Sum of the totals of all children. A node using [`SegmentMethod::SnapshotAfter`]
+    /// additionally counts its own size.
+    pub fn total_children(&self) -> u64 {
+        let below: u64 = self.children.iter().map(SegmentSize::total).sum();
+        match self.method {
+            SnapshotAfter => self.this_size + below,
+            Wal | WalNeeded | Skipped => below,
+        }
+    }
+}
+
+/// Different methods to retain history from a particular state
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum SegmentMethod {
+    /// A snapshot is stored at the end of the segment; costed at the segment's `end_size`.
+    SnapshotAfter,
+    /// The WAL of a segment not marked `needed` is stored; costed at `end_lsn - start_lsn`.
+    Wal,
+    /// Assigned to segments marked `needed`. Costed at `end_lsn - start_lsn` by
+    /// `size_from_wal`, or at `start_size` by `size_from_snapshot_later`.
+    WalNeeded,
+    /// Nothing is stored for the segment itself; costed at zero.
+    Skipped,
+}
+
+use SegmentMethod::*;
+
+impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
+    /// Builds a storage whose only segment is an empty root (LSN 0, logical size 0) that
+    /// `initial_branch` points at.
+    pub fn new(initial_branch: K) -> Storage<K> {
+        // Root of the segment tree: no parent and nothing written yet.
+        let root = Segment {
+            parent: None,
+            op: Cow::Borrowed(""),
+            start_lsn: 0,
+            end_lsn: 0,
+            start_size: 0,
+            end_size: Some(0),
+            children_after: vec![],
+            needed: false,
+        };
+
+        let mut branches = HashMap::new();
+        branches.insert(initial_branch, 0);
+
+        Storage {
+            segments: vec![root],
+            branches,
+        }
+    }
+
+    /// Appends a segment ending at the absolute `lsn` to the tip of `branch`.
+    ///
+    /// `size` is the logical size at `lsn`, if known. Panics if the branch is unknown, if `lsn`
+    /// is not greater than the LSN of the current tip, or if the current tip has no known size.
+    pub fn insert_point<Q: ?Sized>(
+        &mut self,
+        branch: &Q,
+        op: Cow<'static, str>,
+        lsn: u64,
+        size: Option<u64>,
+    ) where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        // One lookup serves both reading the current tip and repointing it below.
+        let tip = self.branches.get_mut(branch).unwrap();
+        let tip_id = *tip;
+        let appended_id = self.segments.len();
+
+        let lastseg = &mut self.segments[tip_id];
+        assert!(lsn > lastseg.end_lsn);
+
+        let appended = Segment {
+            parent: Some(tip_id),
+            op,
+            start_lsn: lastseg.end_lsn,
+            end_lsn: lsn,
+            start_size: lastseg.end_size.unwrap(),
+            end_size: size,
+            children_after: Vec::new(),
+            needed: false,
+        };
+        lastseg.children_after.push(appended_id);
+        self.segments.push(appended);
+
+        *tip = appended_id;
+    }
+
+    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
+    ///
+    /// `lsn_bytes` is added to the LSN of the branch tip and `size_bytes` is a signed delta
+    /// applied to its logical size.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the branch is unknown, if the branch tip has no known size, or if applying
+    /// `size_bytes` would make the logical size negative or larger than `u64::MAX`.
+    pub fn modify_branch<Q: ?Sized>(
+        &mut self,
+        branch: &Q,
+        op: Cow<'static, str>,
+        lsn_bytes: u64,
+        size_bytes: i64,
+    ) where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        let lastseg_id = *self.branches.get(branch).unwrap();
+        let newseg_id = self.segments.len();
+        let lastseg = &mut self.segments[lastseg_id];
+
+        let start_size = lastseg.end_size.unwrap();
+        // i128 holds every u64 and every i64, so the sum cannot overflow; the checked
+        // conversion back rejects a negative or too large result instead of wrapping silently.
+        let end_size = u64::try_from(i128::from(start_size) + i128::from(size_bytes))
+            .expect("logical size must stay within the u64 range");
+
+        let newseg = Segment {
+            op,
+            parent: Some(lastseg_id),
+            start_lsn: lastseg.end_lsn,
+            end_lsn: lastseg.end_lsn + lsn_bytes,
+            start_size,
+            end_size: Some(end_size),
+            children_after: Vec::new(),
+            needed: false,
+        };
+        lastseg.children_after.push(newseg_id);
+
+        self.segments.push(newseg);
+        *self.branches.get_mut(branch).expect("read already") = newseg_id;
+    }
+
+    /// Records an "insert" on `branch`: the LSN advances by `bytes` and the logical size grows
+    /// by `bytes`.
+    pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
+    where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
+    }
+
+    /// Records an "update" on `branch`: the LSN advances by `bytes` while the logical size
+    /// stays the same.
+    pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
+    where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        self.modify_branch(branch, "update".into(), bytes, 0i64);
+    }
+
+    /// Records a "delete" on `branch`: the LSN advances by `bytes` and the logical size shrinks
+    /// by `bytes`.
+    pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
+    where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
+    }
+
+    /// Creates the branch `name` at the current tip of `parent`.
+    ///
+    /// No segment is created: the new name points at the same segment that `parent` points at
+    /// right now. If `name` already exists, it is repointed to that segment.
+    ///
+    /// Panics if the parent branch cannot be found.
+    pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K)
+    where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        // Find the segment the parent currently points at
+        let branchseg_id = *self
+            .branches
+            .get(parent)
+            .expect("should had found the parent by key");
+
+        // Create branch name for it
+        self.branches.insert(name, branchseg_id);
+    }
+
+    /// Marks the segments that must be retained and computes the size tree from the root.
+    ///
+    /// For every branch the cutoff is the `start_lsn` of its last segment minus
+    /// `retention_period` (saturating at zero). Starting from the last segment and following
+    /// `parent` links, segments are marked `needed` until one ends before the cutoff or the
+    /// root has been marked. The returned tree is rooted at segment 0.
+    pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
+        // Phase 1: Mark all the segments that need to be retained
+        for (_branch, &last_seg_id) in self.branches.iter() {
+            let last_seg = &self.segments[last_seg_id];
+            let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
+            let mut seg_id = last_seg_id;
+            loop {
+                let seg = &mut self.segments[seg_id];
+                if seg.end_lsn < cutoff_lsn {
+                    break;
+                }
+                seg.needed = true;
+                if let Some(prev_seg_id) = seg.parent {
+                    seg_id = prev_seg_id;
+                } else {
+                    break;
+                }
+            }
+        }
+
+        // Phase 2: For each oldest segment in a chain that needs to be retained,
+        // calculate if we should store snapshot or WAL
+        self.size_from_snapshot_later(0)
+    }
+
+    /// Sizes the subtree rooted at `seg_id`, storing this segment as WAL.
+    ///
+    /// The segment itself costs `end_lsn - start_lsn`. Every child is sized as WAL as well; a
+    /// child that is not `needed` is also sized with [`Self::size_from_snapshot_later`] and the
+    /// cheaper result is kept, with ties going to the snapshot-later result.
+    fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
+        let seg = &self.segments[seg_id];
+
+        let this_size = seg.end_lsn - seg.start_lsn;
+
+        let mut children = Vec::new();
+
+        // try both ways
+        for &child_id in seg.children_after.iter() {
+            // try each child both ways
+            let child = &self.segments[child_id];
+            let p1 = self.size_from_wal(child_id);
+
+            // A needed child is always continued with WAL; only others may be skipped/snapshotted.
+            let p = if !child.needed {
+                let p2 = self.size_from_snapshot_later(child_id);
+                if p1.total() < p2.total() {
+                    p1
+                } else {
+                    p2
+                }
+            } else {
+                p1
+            };
+            children.push(p);
+        }
+        SegmentSize {
+            seg_id,
+            method: if seg.needed { WalNeeded } else { Wal },
+            this_size,
+            children,
+        }
+    }
+
+    /// Sizes the subtree rooted at `seg_id` without storing the WAL of this segment.
+    ///
+    /// * If the segment is `needed`, it is costed at its `start_size` and its children are
+    ///   handled exactly as in [`Self::size_from_wal`].
+    /// * Otherwise two alternatives are built and the cheaper one is returned, with ties going
+    ///   to the snapshot:
+    ///   1. `Skipped` (cost 0, children sized recursively with this function), available only
+    ///      when no direct child is `needed`;
+    ///   2. `SnapshotAfter` (cost `end_size`, children sized as WAL), considered when a direct
+    ///      child is `needed` or the segment has at least two children.
+    fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
+        // If this is needed, then it's time to do the snapshot and continue
+        // with wal method.
+        let seg = &self.segments[seg_id];
+        //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
+        if seg.needed {
+            let mut children = Vec::new();
+
+            for &child_id in seg.children_after.iter() {
+                // try each child both ways
+                let child = &self.segments[child_id];
+                let p1 = self.size_from_wal(child_id);
+
+                let p = if !child.needed {
+                    let p2 = self.size_from_snapshot_later(child_id);
+                    if p1.total() < p2.total() {
+                        p1
+                    } else {
+                        p2
+                    }
+                } else {
+                    p1
+                };
+                children.push(p);
+            }
+            SegmentSize {
+                seg_id,
+                method: WalNeeded,
+                this_size: seg.start_size,
+                children,
+            }
+        } else {
+            // If any of the direct children are "needed", need to be able to reconstruct here
+            let mut children_needed = false;
+            for &child in seg.children_after.iter() {
+                let seg = &self.segments[child];
+                if seg.needed {
+                    children_needed = true;
+                    break;
+                }
+            }
+
+            // Alternative 1: store nothing here and defer the decision to the children.
+            let method1 = if !children_needed {
+                let mut children = Vec::new();
+                for child in seg.children_after.iter() {
+                    children.push(self.size_from_snapshot_later(*child));
+                }
+                Some(SegmentSize {
+                    seg_id,
+                    method: Skipped,
+                    this_size: 0,
+                    children,
+                })
+            } else {
+                None
+            };
+
+            // If this a junction, consider snapshotting here
+            let method2 = if children_needed || seg.children_after.len() >= 2 {
+                let mut children = Vec::new();
+                for child in seg.children_after.iter() {
+                    children.push(self.size_from_wal(*child));
+                }
+                Some(SegmentSize {
+                    seg_id,
+                    method: SnapshotAfter,
+                    this_size: seg.end_size.unwrap(),
+                    children,
+                })
+            } else {
+                None
+            };
+
+            match (method1, method2) {
+                // Cannot happen: method1 is None only when `children_needed` is true, and
+                // that same condition makes method2 Some.
+                (None, None) => panic!(),
+                (Some(method), None) => method,
+                (None, Some(method)) => method,
+                (Some(method1), Some(method2)) => {
+                    if method1.total() < method2.total() {
+                        method1
+                    } else {
+                        method2
+                    }
+                }
+            }
+        }
+    }
+
+    /// Consumes the storage and returns all of its segments. The position of a segment in the
+    /// returned vector is the index used by `SegmentSize::seg_id`.
+    pub fn into_segments(self) -> Vec<Segment> {
+        self.segments
+    }
+}
--- a/libs/tenant_size_model/src/main.rs
+++ b/libs/tenant_size_model/src/main.rs
@@ -0,0 +1,268 @@
+//! Tenant size model testing ground.
+//!
+//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
+//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
+//! into pngs.
+
+use tenant_size_model::{Segment, SegmentSize, Storage};
+
+// Scenario 1: main branch only, with some updates on it. Sized with a retention period of 1000.
+fn scenario_1() -> (Vec<Segment>, SegmentSize) {
+    // Create main branch
+    let mut storage = Storage::new("main");
+
+    // Bulk load 5 GB of data to it
+    storage.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    let size = storage.calculate(1000);
+
+    (storage.into_segments(), size)
+}
+
+// Scenario 2: main branch with some updates, then a child branch with one update and one
+// more update on main after the branch point.
+fn scenario_2() -> (Vec<Segment>, SegmentSize) {
+    // Create main branch
+    let mut storage = Storage::new("main");
+
+    // Bulk load 5 GB of data to it
+    storage.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    // Branch
+    storage.branch("main", "child");
+    storage.update("child", 1_000);
+
+    // More updates on parent
+    storage.update("main", 1_000);
+
+    let size = storage.calculate(1000);
+
+    (storage.into_segments(), size)
+}
+
+// Scenario 3: like scenario 2, but with five updates on main after the branch point.
+fn scenario_3() -> (Vec<Segment>, SegmentSize) {
+    // Create main branch
+    let mut storage = Storage::new("main");
+
+    // Bulk load 5 GB of data to it
+    storage.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    // Branch
+    storage.branch("main", "child");
+    storage.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    let size = storage.calculate(1000);
+
+    (storage.into_segments(), size)
+}
+
+// Scenario 4: diverged branches. Like scenario 3, but with eight updates on main after the
+// branch point.
+fn scenario_4() -> (Vec<Segment>, SegmentSize) {
+    // Create main branch
+    let mut storage = Storage::new("main");
+
+    // Bulk load 5 GB of data to it
+    storage.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    // Branch
+    storage.branch("main", "child");
+    storage.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..8 {
+        storage.update("main", 1_000);
+    }
+
+    let size = storage.calculate(1000);
+
+    (storage.into_segments(), size)
+}
+
+// Scenario 5: three branches. "b" and "c" are branched off "a" at different points, and
+// the result is sized with a retention period of 5000.
+fn scenario_5() -> (Vec<Segment>, SegmentSize) {
+    let mut storage = Storage::new("a");
+    storage.insert("a", 5000);
+    storage.branch("a", "b");
+    storage.update("b", 4000);
+    storage.update("a", 2000);
+    storage.branch("a", "c");
+    storage.insert("c", 4000);
+    storage.insert("a", 2000);
+
+    let size = storage.calculate(5000);
+
+    (storage.into_segments(), size)
+}
+
+// Scenario 6: branches keyed by `Option<u128>` ids, modified through `modify_branch` with
+// explicit LSN and size deltas and an empty operation name.
+fn scenario_6() -> (Vec<Segment>, SegmentSize) {
+    use std::borrow::Cow;
+
+    const NO_OP: Cow<'static, str> = Cow::Borrowed("");
+
+    let branches = [
+        Some(0x7ff1edab8182025f15ae33482edb590a_u128),
+        Some(0xb1719e044db05401a05a2ed588a3ad3f),
+        Some(0xb68d6691c895ad0a70809470020929ef),
+    ];
+
+    // compared to other scenarios, this one uses bytes instead of kB
+
+    // The initial branch is keyed by `None`; the real branches hang off it.
+    let mut storage = Storage::new(None);
+
+    storage.branch(&None, branches[0]); // at 0
+    storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
+    storage.branch(&branches[0], branches[1]); // at 108951064
+    storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
+    storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
+    storage.branch(&branches[0], branches[2]); // at 283415424
+    storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
+    storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
+
+    let size = storage.calculate(100_000);
+
+    (storage.into_segments(), size)
+}
+
+/// Runs the scenario named by the first command line argument (scenario "1" when no
+/// argument is given) and prints its graphviz graph to stdout.
+fn main() {
+    let args: Vec<String> = std::env::args().collect();
+    let scenario = args.get(1).map_or("1", String::as_str);
+
+    // Pick the scenario builder first, then run it.
+    let build: fn() -> (Vec<Segment>, SegmentSize) = match scenario {
+        "1" => scenario_1,
+        "2" => scenario_2,
+        "3" => scenario_3,
+        "4" => scenario_4,
+        "5" => scenario_5,
+        "6" => scenario_6,
+        other => {
+            eprintln!("invalid scenario {}", other);
+            std::process::exit(1);
+        }
+    };
+    let (segments, size) = build();
+
+    graphviz_tree(&segments, &size);
+}
+
+/// Prints the graphviz node for `node`, then, for each child, the child's subtree followed by
+/// the edge leading to that child.
+fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
+    use tenant_size_model::SegmentMethod::*;
+
+    let seg_id = node.seg_id;
+    let seg = segments.get(seg_id).unwrap();
+    let lsn = seg.end_lsn;
+    let size = seg.end_size.unwrap_or(0);
+
+    println!("  {{");
+    println!("    node [width=0.1 height=0.1 shape=oval]");
+
+    let tenant_size = node.total_children();
+    let penwidth = if seg.needed { 6 } else { 3 };
+
+    // Only this attribute differs between the retention methods.
+    let style = match node.method {
+        SnapshotAfter => "style=filled",
+        Wal | WalNeeded => "color=\"black\"",
+        Skipped => "color=\"gray\"",
+    };
+    let label = format!("lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}");
+
+    println!("    \"seg{seg_id}\" [label=\"{label}\" {style} penwidth={penwidth}]");
+    println!("  }}");
+
+    // Recurse. Much of the data is actually on the edge
+    for child in node.children.iter() {
+        let child_id = child.seg_id;
+        graphviz_recurse(segments, child);
+
+        let (edge_color, edge_width) = match child.method {
+            SnapshotAfter | Skipped => ("gray", ""),
+            Wal => ("black", " penwidth=3"),
+            WalNeeded => ("black", " penwidth=6"),
+        };
+
+        println!("  {{");
+        println!("    edge [] ");
+        print!("    \"seg{seg_id}\" -> \"seg{child_id}\" [color={edge_color}{edge_width}");
+
+        let next = segments.get(child_id).unwrap();
+        let lsn_delta = next.end_lsn - seg.end_lsn;
+
+        if next.op.is_empty() {
+            // Unnamed operation: show the LSN and the signed size change instead.
+            let size_delta = next.end_size.unwrap_or(0) as i128 - size as i128;
+            print!(" label=\"{lsn_delta} / {size_delta}\"");
+        } else {
+            print!(" label=\"{}: {lsn_delta}\"", next.op);
+        }
+        println!("]");
+        println!("  }}");
+    }
+}
+
+/// Prints the complete graphviz `digraph` for the calculated size tree to stdout.
+fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
+    // Fixed graph header, one output line per entry.
+    const PREAMBLE: [&str; 6] = [
+        "digraph G {",
+        "  fontname=\"Helvetica,Arial,sans-serif\"",
+        "  node [fontname=\"Helvetica,Arial,sans-serif\"]",
+        "  edge [fontname=\"Helvetica,Arial,sans-serif\"]",
+        "  graph [center=1 rankdir=LR]",
+        "  edge [dir=none]",
+    ];
+    for line in PREAMBLE {
+        println!("{line}");
+    }
+
+    graphviz_recurse(segments, tree);
+
+    println!("}}");
+}
+
+// Regression test: pins the tenant size calculated for each scenario.
+#[test]
+fn scenarios_return_same_size() {
+    type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
+    // Each entry records its own source line so that a failure names the offending row.
+    let truths: &[(u32, ScenarioFn, _)] = &[
+        (line!(), scenario_1, 8000),
+        (line!(), scenario_2, 9000),
+        (line!(), scenario_3, 13000),
+        (line!(), scenario_4, 16000),
+        (line!(), scenario_5, 17000),
+        (line!(), scenario_6, 333_792_000),
+    ];
+
+    for (line, scenario, expected) in truths {
+        let (_, size) = scenario();
+        assert_eq!(*expected, size.total_children(), "scenario on line {line}");
+    }
+}
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -9,9 +9,6 @@ anyhow = "1.0"
 bincode = "1.3"
 bytes = "1.0.1"
 hyper = { version = "0.14.7", features = ["full"] }
-pin-project-lite = "0.2.7"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 routerify = "3"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
@@ -33,8 +30,8 @@ once_cell = "1.13.0"
 strum = "0.24"
 strum_macros = "0.24"

-
 metrics = { path = "../metrics" }
+pq_proto = { path = "../pq_proto" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [dev-dependencies]
--- a/libs/utils/src/connstring.rs
+++ b/libs/utils/src/connstring.rs
@@ -1,52 +0,0 @@
-use postgres::Config;
-
-pub fn connection_host_port(config: &Config) -> (String, u16) {
-    assert_eq!(
-        config.get_hosts().len(),
-        1,
-        "only one pair of host and port is supported in connection string"
-    );
-    assert_eq!(
-        config.get_ports().len(),
-        1,
-        "only one pair of host and port is supported in connection string"
-    );
-    let host = match &config.get_hosts()[0] {
-        postgres::config::Host::Tcp(host) => host.as_ref(),
-        postgres::config::Host::Unix(host) => host.to_str().unwrap(),
-    };
-    (host.to_owned(), config.get_ports()[0])
-}
-
-pub fn connection_address(config: &Config) -> String {
-    let (host, port) = connection_host_port(config);
-    format!("{}:{}", host, port)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_connection_host_port() {
-        let config: Config = "postgresql://no_user@localhost:64000/no_db"
-            .parse()
-            .unwrap();
-        assert_eq!(
-            connection_host_port(&config),
-            ("localhost".to_owned(), 64000)
-        );
-    }
-
-    #[test]
-    #[should_panic(expected = "only one pair of host and port is supported in connection string")]
-    fn test_connection_host_port_multiple_ports() {
-        let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
-            .parse()
-            .unwrap();
-        assert_eq!(
-            connection_host_port(&config),
-            ("localhost".to_owned(), 64000)
-        );
-    }
-}
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -204,6 +204,17 @@ pub struct TenantId(Id);

 id_newtype!(TenantId);

+/// Neon Connection Id identifies long-lived connections (for example a pagestream
+/// connection with the page_service). It is used for better logging and tracing.
+///
+/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
+/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
+/// See [`Id`] for alternative ways to serialize it.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
+pub struct ConnectionId(Id);
+
+id_newtype!(ConnectionId);
+
 // A pair uniquely identifying Neon instance.
 #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct TenantTimelineId {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,8 +1,6 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.

-#![allow(clippy::manual_range_contains)]
-
 /// `Lsn` type implements common tasks on Log Sequence Numbers
 pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
@@ -17,10 +15,6 @@ pub mod vec_map;
 pub mod bin_ser;
 pub mod postgres_backend;
 pub mod postgres_backend_async;
-pub mod pq_proto;
-
-// dealing with connstring parsing and handy access to it's parts
-pub mod connstring;

 // helper functions for creating and fsyncing
 pub mod crashsafe;
@@ -39,13 +33,12 @@ pub mod sock_split;
 // common log initialisation routine
 pub mod logging;

+pub mod lock_file;
+
 // Misc
 pub mod accum;
 pub mod shutdown;

-// Tools for calling certain async methods in sync contexts
-pub mod sync;
-
 // Utility for binding TcpListeners with proper socket options.
 pub mod tcp_listener;

@@ -55,6 +48,25 @@ pub mod nonblock;
 // Default signal handling
 pub mod signals;

+/// use with fail::cfg("$name", "return(2000)")
+///
+/// When the failpoint `$name` yields a value, that value is parsed as a number of
+/// milliseconds and the current task sleeps for that long. Expands to an `.await`, so it can
+/// only be used inside async code.
+///
+/// Panics if the failpoint yields no value or one that does not parse as `u64`.
+#[macro_export]
+macro_rules! failpoint_sleep_millis_async {
+    ($name:literal) => {{
+        let should_sleep: Option<std::time::Duration> = (|| {
+            fail::fail_point!($name, |v: Option<_>| {
+                let millis = v.unwrap().parse::<u64>().unwrap();
+                // Fully qualified: paths in a macro body are resolved at the call site, so a
+                // bare `Duration` would require every caller to import it.
+                Some(std::time::Duration::from_millis(millis))
+            });
+            None
+        })();
+        if let Some(d) = should_sleep {
+            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
+            tokio::time::sleep(d).await;
+            tracing::info!("failpoint {:?}: sleep done", $name);
+        }
+    }};
+}
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -0,0 +1,81 @@
+//! A module to create and read lock files. A lock file ensures that only one
+//! process is running at a time, in a particular directory.
+//!
+//! File locking is done using [`fcntl::flock`], which means that holding the
+//! lock on file only prevents acquiring another lock on it; all other
+//! operations are still possible on files. Other processes can still open, read,
+//! write, or remove the file, for example.
+//! If the file is removed while a process is holding a lock on it,
+//! the process that holds the lock does not get any error or notification.
+//! Furthermore, you can create a new file with the same name and lock the new file,
+//! while the old process is still running.
+//! Deleting the lock file while the locking process is still running is a bad idea!
+
+use std::{fs, os::unix::prelude::AsRawFd, path::Path};
+
+use anyhow::Context;
+use nix::fcntl;
+
+use crate::crashsafe;
+
+/// Outcome of [`create_lock_file`].
+pub enum LockCreationResult {
+    /// The lock was acquired and the contents were written to the lock file.
+    Created {
+        /// The contents that were written.
+        new_lock_contents: String,
+        /// Open handle of the locked file.
+        file: fs::File,
+    },
+    /// Locking failed with `EAGAIN`; the lock file could still be read.
+    AlreadyLocked {
+        /// The contents read from the lock file.
+        existing_lock_contents: String,
+    },
+    /// Opening, locking, writing, syncing or reading the lock file failed.
+    CreationFailed(anyhow::Error),
+}
+
+/// Creates a lock file in the path given and writes the given contents into the file.
+///
+/// The file is created if it does not exist and locked exclusively without blocking. On
+/// success it is truncated, `contents` is written through the locked handle (so the data
+/// always ends up in the file the lock is held on, even if the path gets replaced in the
+/// meantime), and the file and its parent directory are fsynced.
+///
+/// Returns [`LockCreationResult::AlreadyLocked`] with the current file contents when locking
+/// fails with `EAGAIN`, and [`LockCreationResult::CreationFailed`] on any other error.
+///
+/// Note: The lock is automatically released when the file is closed. You might want to use
+/// Box::leak to make sure it lives until the end of the program.
+pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
+    let lock_file = match fs::OpenOptions::new()
+        .create(true) // O_CREAT
+        .write(true)
+        .open(lock_file_path)
+        .context("Failed to open lock file")
+    {
+        Ok(file) => file,
+        Err(e) => return LockCreationResult::CreationFailed(e),
+    };
+
+    match fcntl::flock(
+        lock_file.as_raw_fd(),
+        fcntl::FlockArg::LockExclusiveNonblock,
+    ) {
+        Ok(()) => {
+            match lock_file
+                .set_len(0)
+                .context("Failed to truncate lockfile")
+                .and_then(|()| {
+                    // Write through the locked descriptor instead of re-opening the path:
+                    // the path may no longer refer to the file we hold the lock on.
+                    std::io::Write::write_all(&mut &lock_file, contents.as_bytes()).with_context(
+                        || format!("Failed to write '{contents}' contents into lockfile"),
+                    )
+                })
+                .and_then(|()| {
+                    crashsafe::fsync_file_and_parent(lock_file_path)
+                        .context("Failed to fsync lockfile")
+                }) {
+                Ok(()) => LockCreationResult::Created {
+                    new_lock_contents: contents,
+                    file: lock_file,
+                },
+                Err(e) => LockCreationResult::CreationFailed(e),
+            }
+        }
+        // Somebody else holds the lock: report what they wrote into the file.
+        Err(nix::errno::Errno::EAGAIN) => {
+            match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
+                Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
+                    existing_lock_contents,
+                },
+                Err(e) => LockCreationResult::CreationFailed(e),
+            }
+        }
+        Err(e) => {
+            LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
+        }
+    }
+}
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,10 +1,6 @@
-use std::{
-    fs::{File, OpenOptions},
-    path::Path,
-    str::FromStr,
-};
+use std::str::FromStr;

-use anyhow::{Context, Result};
+use anyhow::Context;
 use strum_macros::{EnumString, EnumVariantNames};

 #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
@@ -25,19 +21,8 @@ impl LogFormat {
        })
    }
 }
-pub fn init(
-    log_filename: impl AsRef<Path>,
-    daemonize: bool,
-    log_format: LogFormat,
-) -> Result<File> {
-    // Don't open the same file for output multiple times;
-    // the different fds could overwrite each other's output.
-    let log_file = OpenOptions::new()
-        .create(true)
-        .append(true)
-        .open(&log_filename)
-        .with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;

+pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
    let default_filter_str = "info";

    // We fall back to printing all spans at info-level or above if
@@ -45,50 +30,16 @@ pub fn init(
    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));

-    let x: File = log_file.try_clone().unwrap();
    let base_logger = tracing_subscriber::fmt()
        .with_env_filter(env_filter)
        .with_target(false)
        .with_ansi(false)
-        .with_writer(move || -> Box<dyn std::io::Write> {
-            // we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
-            // if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
-            // for example to be in line with docker log command which expects logs comimg from stdout
-            if daemonize {
-                Box::new(x.try_clone().unwrap())
-            } else {
-                Box::new(std::io::stdout())
-            }
-        });
+        .with_writer(std::io::stdout);

    match log_format {
        LogFormat::Json => base_logger.json().init(),
        LogFormat::Plain => base_logger.init(),
    }

-    Ok(log_file)
-}
-
-// #[cfg(test)]
-// Due to global logger, can't run tests in same process.
-// So until there's a non-global one, the tests are in ../tests/ as separate files.
-#[macro_export(local_inner_macros)]
-macro_rules! test_init_file_logger {
-    ($log_level:expr, $log_format:expr) => {{
-        use std::str::FromStr;
-        std::env::set_var("RUST_LOG", $log_level);
-
-        let tmp_dir = tempfile::TempDir::new().unwrap();
-        let log_file_path = tmp_dir.path().join("logfile");
-
-        let log_format = $crate::logging::LogFormat::from_str($log_format).unwrap();
-        let _log_file = $crate::logging::init(&log_file_path, true, log_format).unwrap();
-
-        let log_file = std::fs::OpenOptions::new()
-            .read(true)
-            .open(&log_file_path)
-            .unwrap();
-
-        log_file
-    }};
+    Ok(())
 }
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -13,7 +13,7 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Serialize, Deserialize)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
 #[serde(transparent)]
 pub struct Lsn(pub u64);

@@ -138,7 +138,7 @@ impl FromStr for Lsn {
    ///
    /// If the input string is missing the '/' character, then use `Lsn::from_hex`
    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut splitter = s.split('/');
+        let mut splitter = s.trim().split('/');
        if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
        {
            let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
@@ -270,6 +270,11 @@ mod tests {
        );
        assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
        assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
+
+        let expected_lsn = Lsn(0x3C490F8);
+        assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
+        assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
+        assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
    }

    #[test]
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -3,10 +3,10 @@
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.

-use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use crate::sock_split::{BidiStream, ReadStream, WriteStream};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
+use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 use std::fmt;
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -4,9 +4,9 @@
 //! is rather narrow, but we can extend it once required.

 use crate::postgres_backend::AuthType;
-use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use anyhow::{bail, Context, Result};
 use bytes::{Bytes, BytesMut};
+use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use std::future::Future;
 use std::net::SocketAddr;
--- a/libs/utils/tests/logger_json_test.rs
+++ b/libs/utils/tests/logger_json_test.rs
@@ -1,36 +0,0 @@
-// This could be in ../src/logging.rs but since the logger is global, these
-// can't be run in threads of the same process
-use std::fs::File;
-use std::io::{BufRead, BufReader, Lines};
-use tracing::*;
-use utils::test_init_file_logger;
-
-fn read_lines(file: File) -> Lines<BufReader<File>> {
-    BufReader::new(file).lines()
-}
-
-#[test]
-fn test_json_format_has_message_and_custom_field() {
-    std::env::set_var("RUST_LOG", "info");
-
-    let log_file = test_init_file_logger!("info", "json");
-
-    let custom_field: &str = "hi";
-    trace!(custom = %custom_field, "test log message");
-    debug!(custom = %custom_field, "test log message");
-    info!(custom = %custom_field, "test log message");
-    warn!(custom = %custom_field, "test log message");
-    error!(custom = %custom_field, "test log message");
-
-    let lines = read_lines(log_file);
-    for line in lines {
-        let content = line.unwrap();
-        let json_object = serde_json::from_str::<serde_json::Value>(&content).unwrap();
-
-        assert_eq!(json_object["fields"]["custom"], "hi");
-        assert_eq!(json_object["fields"]["message"], "test log message");
-
-        assert_ne!(json_object["level"], "TRACE");
-        assert_ne!(json_object["level"], "DEBUG");
-    }
-}
--- a/libs/utils/tests/logger_plain_test.rs
+++ b/libs/utils/tests/logger_plain_test.rs
@@ -1,36 +0,0 @@
-// This could be in ../src/logging.rs but since the logger is global, these
-// can't be run in threads of the same process
-use std::fs::File;
-use std::io::{BufRead, BufReader, Lines};
-use tracing::*;
-use utils::test_init_file_logger;
-
-fn read_lines(file: File) -> Lines<BufReader<File>> {
-    BufReader::new(file).lines()
-}
-
-#[test]
-fn test_plain_format_has_message_and_custom_field() {
-    std::env::set_var("RUST_LOG", "warn");
-
-    let log_file = test_init_file_logger!("warn", "plain");
-
-    let custom_field: &str = "hi";
-    trace!(custom = %custom_field, "test log message");
-    debug!(custom = %custom_field, "test log message");
-    info!(custom = %custom_field, "test log message");
-    warn!(custom = %custom_field, "test log message");
-    error!(custom = %custom_field, "test log message");
-
-    let lines = read_lines(log_file);
-    for line in lines {
-        let content = line.unwrap();
-        serde_json::from_str::<serde_json::Value>(&content).unwrap_err();
-        assert!(content.contains("custom=hi"));
-        assert!(content.contains("test log message"));
-
-        assert!(!content.contains("TRACE"));
-        assert!(!content.contains("DEBUG"));
-        assert!(!content.contains("INFO"));
-    }
-}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,62 +12,61 @@ testing = ["fail/failpoints"]
 profiling = ["pprof"]

 [dependencies]
+amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
+anyhow = { version = "1.0", features = ["backtrace"] }
 async-stream = "0.3"
 async-trait = "0.1"
-chrono = "0.4.19"
-rand = "0.8.3"
-regex = "1.4.5"
-bytes = "1.0.1"
 byteorder = "1.4.3"
+bytes = "1.0.1"
+chrono = "0.4.19"
+clap = { version = "4.0", features = ["string"] }
+close_fds = "0.3.2"
+const_format = "0.2.21"
+crc32c = "0.6.0"
+crossbeam-utils = "0.8.5"
+fail = "0.5.0"
 futures = "0.3.13"
+git-version = "0.3.5"
 hex = "0.4.3"
+humantime = "2.1.0"
+humantime-serde = "1.1.1"
 hyper = "0.14"
 itertools = "0.10.3"
-clap = { version = "4.0", features = ["string"] }
-daemonize = "0.4.1"
-tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+nix = "0.25"
+num-traits = "0.2.15"
+once_cell = "1.13.0"
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-anyhow = { version = "1.0", features = ["backtrace"] }
-crc32c = "0.6.0"
-thiserror = "1.0"
-tar = "0.4.33"
-humantime = "2.1.0"
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
+rand = "0.8.3"
+regex = "1.4.5"
+rstar = "0.9.3"
+scopeguard = "1.1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
-humantime-serde = "1.1.1"
-
-pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
-
-toml_edit = { version = "0.14", features = ["easy"] }
-scopeguard = "1.1.0"
-const_format = "0.2.21"
-tracing = "0.1.36"
 signal-hook = "0.3.10"
+svg_fmt = "0.4.1"
+tar = "0.4.33"
+thiserror = "1.0"
+tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
+toml_edit = { version = "0.14", features = ["easy"] }
+tracing = "0.1.36"
 url = "2"
-nix = "0.25"
-once_cell = "1.13.0"
-crossbeam-utils = "0.8.5"
-fail = "0.5.0"
-git-version = "0.3.5"
-rstar = "0.9.3"
-num-traits = "0.2.15"
-amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
+walkdir = "2.3.2"

-pageserver_api = { path = "../libs/pageserver_api" }
-postgres_ffi = { path = "../libs/postgres_ffi" }
 etcd_broker = { path = "../libs/etcd_broker" }
 metrics = { path = "../libs/metrics" }
-utils = { path = "../libs/utils" }
+pageserver_api = { path = "../libs/pageserver_api" }
+postgres_ffi = { path = "../libs/postgres_ffi" }
+pq_proto = { path = "../libs/pq_proto" }
 remote_storage = { path = "../libs/remote_storage" }
+tenant_size_model = { path = "../libs/tenant_size_model" }
+utils = { path = "../libs/utils" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
-close_fds = "0.3.2"
-walkdir = "2.3.2"
-svg_fmt = "0.4.1"

 [dev-dependencies]
 criterion = "0.4"
@@ -77,3 +76,7 @@ tempfile = "3.2"
 [[bench]]
 name = "bench_layer_map"
 harness = false
+
+[[bench]]
+name = "bench_walredo"
+harness = false
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -1,38 +1,37 @@
 //! Main entry point for the Page Server executable.

-use remote_storage::GenericRemoteStorage;
 use std::{env, ops::ControlFlow, path::Path, str::FromStr};
+
+use anyhow::{anyhow, Context};
+use clap::{Arg, ArgAction, Command};
+use fail::FailScenario;
+use nix::unistd::Pid;
 use tracing::*;

-use anyhow::{anyhow, bail, Context, Result};
-
-use clap::{Arg, ArgAction, Command};
-use daemonize::Daemonize;
-
-use fail::FailScenario;
 use metrics::set_build_info_metric;
-
 use pageserver::{
    config::{defaults::*, PageServerConf},
-    http, page_cache, page_image_cache, page_service, profiling, task_mgr,
+    http, page_cache, page_service, profiling, task_mgr,
    task_mgr::TaskKind,
    task_mgr::{
        BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
    },
-    tenant_mgr, virtual_file, LOG_FILE_NAME,
+    tenant_mgr, virtual_file,
 };
+use remote_storage::GenericRemoteStorage;
 use utils::{
    auth::JwtAuth,
-    logging,
+    lock_file, logging,
    postgres_backend::AuthType,
    project_git_version,
-    shutdown::exit_now,
    signals::{self, Signal},
    tcp_listener,
 };

 project_git_version!(GIT_VERSION);

+const PID_FILE_NAME: &str = "pageserver.pid";
+
 const FEATURES: &[&str] = &[
    #[cfg(feature = "testing")]
    "testing",
@@ -65,6 +64,7 @@ fn main() -> anyhow::Result<()> {
    let workdir = workdir
        .canonicalize()
        .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
+
    let cfg_file_path = workdir.join("pageserver.toml");

    // Set CWD to workdir for non-daemon modes
@@ -75,8 +75,6 @@ fn main() -> anyhow::Result<()> {
        )
    })?;

-    let daemonize = arg_matches.get_flag("daemonize");
-
    let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
        ControlFlow::Continue(conf) => conf,
        ControlFlow::Break(()) => {
@@ -101,9 +99,8 @@ fn main() -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
    page_cache::init(conf.page_cache_size);
-    page_image_cache::init(64 * conf.page_cache_size); // temporary hack for benchmarking

-    start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
+    start_pageserver(conf).context("Failed to start pageserver")?;

    scenario.teardown();
    Ok(())
@@ -198,12 +195,48 @@ fn initialize_config(
    })
 }

-fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
-    // Initialize logger
-    let log_file = logging::init(LOG_FILE_NAME, daemonize, conf.log_format)?;
-
+fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
+    logging::init(conf.log_format)?;
    info!("version: {}", version());

+    // If any failpoints were set from FAILPOINTS environment variable,
+    // print them to the log for debugging purposes
+    let failpoints = fail::list();
+    if !failpoints.is_empty() {
+        info!(
+            "started with failpoints: {}",
+            failpoints
+                .iter()
+                .map(|(name, actions)| format!("{name}={actions}"))
+                .collect::<Vec<String>>()
+                .join(";")
+        )
+    }
+
+    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
+    let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
+        lock_file::LockCreationResult::Created {
+            new_lock_contents,
+            file,
+        } => {
+            info!("Created lock file at {lock_file_path:?} with contents {new_lock_contents}");
+            file
+        }
+        lock_file::LockCreationResult::AlreadyLocked {
+            existing_lock_contents,
+        } => anyhow::bail!(
+            "Could not lock pid file; pageserver is already running in {:?} with PID {}",
+            conf.workdir,
+            existing_lock_contents
+        ),
+        lock_file::LockCreationResult::CreationFailed(e) => {
+            return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
+        }
+    };
+    // ensure that the lock file is held even if the main thread of the process panics:
+    // the lock file must be released only when the current process is gone
+    let _ = Box::leak(Box::new(lock_file));
+
    // TODO: Check that it looks like a valid repository before going further

    // bind sockets before daemonizing so we report errors early and do not return until we are listening
@@ -219,33 +252,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
    );
    let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;

-    // NB: Don't spawn any threads before daemonizing!
-    if daemonize {
-        info!("daemonizing...");
-
-        // There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
-        // that we will see any accidental manual fprintf's or backtraces.
-        let stdout = log_file
-            .try_clone()
-            .with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
-        let stderr = log_file;
-
-        let daemonize = Daemonize::new()
-            .pid_file("pageserver.pid")
-            .working_directory(".")
-            .stdout(stdout)
-            .stderr(stderr);
-
-        // XXX: The parent process should exit abruptly right after
-        // it has spawned a child to prevent coverage machinery from
-        // dumping stats into a `profraw` file now owned by the child.
-        // Otherwise, the coverage data will be damaged.
-        match daemonize.exit_action(|| exit_now(0)).start() {
-            Ok(_) => info!("Success, daemonized"),
-            Err(err) => bail!("{err}. could not daemonize. bailing."),
-        }
-    }
-
    let signals = signals::install_shutdown_handlers()?;

    // start profiler (if enabled)
@@ -348,14 +354,6 @@ fn cli() -> Command {
    Command::new("Neon page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(version())
-        .arg(
-
-            Arg::new("daemonize")
-                .short('d')
-                .long("daemonize")
-                .action(ArgAction::SetTrue)
-                .help("Run in the background"),
-        )
        .arg(
            Arg::new("init")
                .long("init")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -8,7 +8,9 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use remote_storage::RemoteStorageConfig;
 use std::env;
 use utils::crashsafe::path_with_suffix_extension;
+use utils::id::ConnectionId;

+use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
 use std::time::Duration;
@@ -48,6 +50,9 @@ pub mod defaults {

    pub const DEFAULT_LOG_FORMAT: &str = "plain";

+    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
+        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
+
    ///
    /// Default built-in configuration file.
    ///
@@ -67,6 +72,9 @@ pub mod defaults {
 #initial_superuser_name = '{DEFAULT_SUPERUSER}'

 #log_format = '{DEFAULT_LOG_FORMAT}'
+
+#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -132,6 +140,9 @@ pub struct PageServerConf {
    pub broker_endpoints: Vec<Url>,

    pub log_format: LogFormat,
+
+    /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
+    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
 }

 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -200,6 +211,8 @@ struct PageServerConfigBuilder {
    broker_endpoints: BuilderValue<Vec<Url>>,

    log_format: BuilderValue<LogFormat>,
+
+    concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
 }

 impl Default for PageServerConfigBuilder {
@@ -228,6 +241,8 @@ impl Default for PageServerConfigBuilder {
            broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
            broker_endpoints: Set(Vec::new()),
            log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
+
+            concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
        }
    }
 }
@@ -304,6 +319,10 @@ impl PageServerConfigBuilder {
        self.log_format = BuilderValue::Set(log_format)
    }

+    pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
+        self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let broker_endpoints = self
            .broker_endpoints
@@ -349,6 +368,11 @@ impl PageServerConfigBuilder {
                .broker_etcd_prefix
                .ok_or(anyhow!("missing broker_etcd_prefix"))?,
            log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
+            concurrent_tenant_size_logical_size_queries: self
+                .concurrent_tenant_size_logical_size_queries
+                .ok_or(anyhow!(
+                    "missing concurrent_tenant_size_logical_size_queries"
+                ))?,
        })
    }
 }
@@ -391,6 +415,22 @@ impl PageServerConf {
        )
    }

+    pub fn traces_path(&self) -> PathBuf {
+        self.workdir.join("traces")
+    }
+
+    pub fn trace_path(
+        &self,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        connection_id: &ConnectionId,
+    ) -> PathBuf {
+        self.traces_path()
+            .join(tenant_id.to_string())
+            .join(timeline_id.to_string())
+            .join(connection_id.to_string())
+    }
+
    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
    pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
@@ -476,6 +516,12 @@ impl PageServerConf {
                "log_format" => builder.log_format(
                    LogFormat::from_config(&parse_toml_string(key, item)?)?
                ),
+                "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
+                    let input = parse_toml_string(key, item)?;
+                    let permits = input.parse::<usize>().with_context(|| format!("expected a number of initial permits, not {input:?}"))?;
+                    let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
+                    ConfigurableSemaphore::new(permits)
+                }),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -568,8 +614,9 @@ impl PageServerConf {
        PathBuf::from(format!("../tmp_check/test_{test_name}"))
    }

-    #[cfg(test)]
    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
+        let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
+
        PageServerConf {
            id: NodeId(0),
            wait_lsn_timeout: Duration::from_secs(60),
@@ -580,7 +627,7 @@ impl PageServerConf {
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
            superuser: "cloud_admin".to_string(),
            workdir: repo_dir,
-            pg_distrib_dir: PathBuf::new(),
+            pg_distrib_dir,
            auth_type: AuthType::Trust,
            auth_validation_public_key_path: None,
            remote_storage_config: None,
@@ -589,6 +636,7 @@ impl PageServerConf {
            broker_endpoints: Vec::new(),
            broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
            log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
+            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
        }
    }
 }
@@ -654,6 +702,58 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
        .collect()
 }

+/// Configurable semaphore permits setting.
+///
+/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
+/// semaphore cannot be distinguished, leading any feature using these to await forever (or until
+/// new permits are added).
+#[derive(Debug, Clone)]
+pub struct ConfigurableSemaphore {
+    initial_permits: NonZeroUsize,
+    inner: std::sync::Arc<tokio::sync::Semaphore>,
+}
+
+impl ConfigurableSemaphore {
+    pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
+        Some(x) => x,
+        None => panic!("const unwrap is not yet stable"),
+    };
+
+    /// Initialize using a non-zero number of permits.
+    ///
+    /// Requires a non-zero number of initial permits, because using permits == 0 is a crude way to
+    /// disable a feature such as [`Tenant::gather_size_inputs`]: any future awaiting the semaphore
+    /// would behave like [`futures::future::pending`], just waiting until new permits are added.
+    pub fn new(initial_permits: NonZeroUsize) -> Self {
+        ConfigurableSemaphore {
+            initial_permits,
+            inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
+        }
+    }
+}
+
+impl Default for ConfigurableSemaphore {
+    fn default() -> Self {
+        Self::new(Self::DEFAULT_INITIAL)
+    }
+}
+
+impl PartialEq for ConfigurableSemaphore {
+    fn eq(&self, other: &Self) -> bool {
+        // the number of permits can be increased at runtime, so we cannot really fulfill the
+        // PartialEq value equality otherwise
+        self.initial_permits == other.initial_permits
+    }
+}
+
+impl Eq for ConfigurableSemaphore {}
+
+impl ConfigurableSemaphore {
+    pub fn inner(&self) -> &std::sync::Arc<tokio::sync::Semaphore> {
+        &self.inner
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::{
@@ -725,6 +825,7 @@ log_format = 'json'
                    .expect("Failed to parse a valid broker endpoint URL")],
                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
                log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
+                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -770,6 +871,7 @@ log_format = 'json'
                    .expect("Failed to parse a valid broker endpoint URL")],
                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
                log_format: LogFormat::Json,
+                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -354,6 +354,54 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

+  /v1/tenant/{tenant_id}/size:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: |
+        Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: object
+                required:
+                  - id
+                  - size
+                properties:
+                  id:
+                    type: string
+                    format: hex
+                  size:
+                    type: integer
+                    description: |
+                      Size metric in bytes.
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}/timeline/:
    parameters:
      - name: tenant_id
@@ -619,6 +667,7 @@ components:
        - disk_consistent_lsn
        - awaits_download
        - state
+        - latest_gc_cutoff_lsn
      properties:
        timeline_id:
          type: string
@@ -663,6 +712,9 @@ components:
          type: boolean
        state:
          type: string
+        latest_gc_cutoff_lsn:
+          type: string
+          format: hex

        # These 'local' and 'remote' fields just duplicate some of the fields
        # above. They are kept for backwards-compatibility. They can be removed,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -227,13 +227,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,

    let state = get_state(&request);

-    let timelines = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
+    let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
        let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
        Ok(tenant.list_timelines())
-    })
-    .await
-    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
+    })?;

    let mut response_data = Vec::with_capacity(timelines.len());
    for timeline in timelines {
@@ -523,9 +520,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
    check_permission(&request, Some(tenant_id))?;

    // if tenant is in progress of downloading it can be absent in global tenant map
-    let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
-        .await
-        .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
+    let tenant = tenant_mgr::get_tenant(tenant_id, false);

    let state = get_state(&request);
    let remote_index = &state.remote_index;
@@ -571,6 +566,44 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
    )
 }

+async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?;
+
+    // this can be a long operation; it is currently not backed by any request coalescing or similar
+    let inputs = tenant
+        .gather_size_inputs()
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
+
+    /// Private response type with the additional "unstable" `inputs` field.
+    ///
+    /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
+    /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
+    #[serde_with::serde_as]
+    #[derive(serde::Serialize)]
+    struct TenantHistorySize {
+        #[serde_as(as = "serde_with::DisplayFromStr")]
+        id: TenantId,
+        /// Size is a mixture of WAL and logical size, so the unit is bytes.
+        size: u64,
+        inputs: crate::tenant::size::ModelInputs,
+    }
+
+    json_response(
+        StatusCode::OK,
+        TenantHistorySize {
+            id: tenant_id,
+            size,
+            inputs,
+        },
+    )
+}
+
 // Helper function to standardize the error messages we produce on bad durations
 //
 // Intended to be used with anyhow's `with_context`, e.g.:
@@ -585,6 +618,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    check_permission(&request, None)?;

    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    tracing::debug!("tenant create: {:?}", request_data.trace_read_requests);
    let remote_index = get_state(&request).remote_index.clone();

    let mut tenant_conf = TenantConfOpt::default();
@@ -626,6 +660,9 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
    }
+    if let Some(trace_read_requests) = request_data.trace_read_requests {
+        tenant_conf.trace_read_requests = Some(trace_read_requests);
+    }

    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -713,6 +750,9 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
    }
+    if let Some(trace_read_requests) = request_data.trace_read_requests {
+        tenant_conf.trace_read_requests = Some(trace_read_requests);
+    }

    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -792,14 +832,14 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
    let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

-    let _span_guard =
-        info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());

    // Use tenant's pitr setting
    let pitr = tenant.get_pitr_interval();
    let result = tenant
        .gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
+        .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
+        .await
        // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
        // better once the types support it.
        .map_err(ApiError::InternalServerError)?;
@@ -835,6 +875,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
        .map_err(ApiError::NotFound)?;
    timeline
        .checkpoint(CheckpointConfig::Forced)
+        .await
        .map_err(ApiError::InternalServerError)?;

    json_response(StatusCode::OK, ())
@@ -898,6 +939,7 @@ pub fn make_router(
        .get("/v1/tenant", tenant_list_handler)
        .post("/v1/tenant", tenant_create_handler)
        .get("/v1/tenant/:tenant_id", tenant_status)
+        .get("/v1/tenant/:tenant_id/size", tenant_size_handler)
        .put("/v1/tenant/config", tenant_config_handler)
        .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
        .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -5,7 +5,6 @@ pub mod import_datadir;
 pub mod keyspace;
 pub mod metrics;
 pub mod page_cache;
-pub mod page_image_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod profiling;
@@ -16,6 +15,7 @@ pub mod tenant;
 pub mod tenant_config;
 pub mod tenant_mgr;
 pub mod tenant_tasks;
+pub mod trace;
 pub mod virtual_file;
 pub mod walingest;
 pub mod walreceiver;
@@ -44,8 +44,6 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
 pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

-pub const LOG_FILE_NAME: &str = "pageserver.log";
-
 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 /// Config for the Repository checkpointer
@@ -82,7 +80,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {

    // There should be nothing left, but let's be sure
    task_mgr::shutdown_tasks(None, None, None).await;
-
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -31,6 +31,7 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
    "compact",
    "create images",
    "init logical size",
+    "logical size",
    "load layer map",
    "gc",
 ];
@@ -365,6 +366,7 @@ pub struct TimelineMetrics {
    pub compact_time_histo: Histogram,
    pub create_images_time_histo: Histogram,
    pub init_logical_size_histo: Histogram,
+    pub logical_size_histo: Histogram,
    pub load_layer_map_histo: Histogram,
    pub last_record_gauge: IntGauge,
    pub wait_lsn_time_histo: Histogram,
@@ -397,6 +399,9 @@ impl TimelineMetrics {
        let init_logical_size_histo = STORAGE_TIME
            .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
            .unwrap();
+        let logical_size_histo = STORAGE_TIME
+            .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id])
+            .unwrap();
        let load_layer_map_histo = STORAGE_TIME
            .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
            .unwrap();
@@ -428,6 +433,7 @@ impl TimelineMetrics {
            compact_time_histo,
            create_images_time_histo,
            init_logical_size_histo,
+            logical_size_histo,
            load_layer_map_histo,
            last_record_gauge,
            wait_lsn_time_histo,
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -108,10 +108,10 @@ enum CacheKey {
 }

 #[derive(Debug, PartialEq, Eq, Hash, Clone)]
-pub struct MaterializedPageHashKey {
-    pub tenant_id: TenantId,
-    pub timeline_id: TimelineId,
-    pub key: Key,
+struct MaterializedPageHashKey {
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    key: Key,
 }

 #[derive(Clone)]
--- a/pageserver/src/page_image_cache.rs
+++ b/pageserver/src/page_image_cache.rs
@@ -1,345 +0,0 @@
-//!
-//! Global page image cache
-//!
-//! Unlike page_cache it holds only most recent version of reconstructed page images.
-//! And it uses invalidation mechanism to avoid layer ap lookups.
-
-use crate::page_cache::MaterializedPageHashKey;
-use crate::pgdatadir_mapping::{rel_block_to_key, BlockNumber};
-use crate::repository::Key;
-use crate::tenant::Timeline;
-use crate::virtual_file::VirtualFile;
-use anyhow::{bail, Result};
-use bytes::Bytes;
-use once_cell::sync::OnceCell;
-use pageserver_api::reltag::RelTag;
-use std::collections::hash_map::DefaultHasher;
-use std::hash::{Hash, Hasher};
-use std::os::unix::fs::FileExt;
-use std::sync::{Arc, Condvar, Mutex};
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
-
-static PAGE_CACHE: OnceCell<Mutex<PageImageCache>> = OnceCell::new();
-const TEST_PAGE_CACHE_SIZE: usize = 50;
-pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
-
-enum PageImageState {
-    Vacant,                        // entry is not used
-    Loaded(bool),                  // page is loaded or has failed
-    Loading(Option<Arc<Condvar>>), // page in process of loading, Condvar is created on demand when some thread need to wait load completion
-}
-
-struct CacheEntry {
-    key: MaterializedPageHashKey,
-
-    // next+prev are used for LRU L2-list and next is also used for L1 free pages list
-    next: usize,
-    prev: usize,
-
-    collision: usize, // L1 hash collision chain
-
-    access_count: u32,
-    state: PageImageState,
-}
-
-pub struct PageImageCache {
-    free_list: usize, // L1 list of free entries
-    pages: Vec<CacheEntry>,
-    hash_table: Vec<usize>, // indexes in pages array
-    file: Arc<VirtualFile>,
-}
-
-///
-/// Initialize the page cache. This must be called once at page server startup.
-///
-pub fn init(size: usize) {
-    if PAGE_CACHE
-        .set(Mutex::new(PageImageCache::new(size)))
-        .is_err()
-    {
-        panic!("page cache already initialized");
-    }
-}
-
-///
-/// Get a handle to the page cache.
-///
-pub fn get() -> &'static Mutex<PageImageCache> {
-    //
-    // In unit tests, page server startup doesn't happen and no one calls
-    // page_image_cache::init(). Initialize it here with a tiny cache, so that the
-    // page cache is usable in unit tests.
-    //
-    if cfg!(test) {
-        PAGE_CACHE.get_or_init(|| Mutex::new(PageImageCache::new(TEST_PAGE_CACHE_SIZE)))
-    } else {
-        PAGE_CACHE.get().expect("page cache not initialized")
-    }
-}
-
-fn hash<T: Hash>(t: &T) -> usize {
-    let mut s = DefaultHasher::new();
-    t.hash(&mut s);
-    s.finish() as usize
-}
-
-impl PageImageCache {
-    fn new(size: usize) -> Self {
-        let mut pages: Vec<CacheEntry> = Vec::with_capacity(size + 1);
-        let hash_table = vec![0usize; size];
-        let file = Arc::new(
-            VirtualFile::open_with_options(
-                &std::path::PathBuf::from("page.cache"),
-                std::fs::OpenOptions::new()
-                    .read(true)
-                    .write(true)
-                    .create(true)
-                    .truncate(true),
-            )
-            .unwrap(),
-        );
-        // Dummy key
-        let dummy_key = MaterializedPageHashKey {
-            key: Key::MIN,
-            tenant_id: TenantId::from([0u8; 16]),
-            timeline_id: TimelineId::from([0u8; 16]),
-        };
-
-        // LRU list head
-        pages.push(CacheEntry {
-            key: dummy_key.clone(),
-            next: 0,
-            prev: 0,
-            access_count: 0,
-            collision: 0,
-            state: PageImageState::Vacant,
-        });
-
-        // Construct L1 free page list
-        for i in 0..size {
-            pages.push(CacheEntry {
-                key: dummy_key.clone(),
-                next: i + 2, // build L1-list of free pages
-                prev: 0,
-                access_count: 0,
-                collision: 0,
-                state: PageImageState::Vacant,
-            });
-        }
-        pages[size - 1].next = 0; // en of free page list
-
-        PageImageCache {
-            free_list: 1,
-            pages,
-            hash_table,
-            file,
-        }
-    }
-
-    // Unlink from L2-list
-    fn unlink(&mut self, index: usize) {
-        let next = self.pages[index].next;
-        let prev = self.pages[index].prev;
-        self.pages[next].prev = prev;
-        self.pages[prev].next = next;
-    }
-
-    // Link in L2-list after specified element
-    fn link_after(&mut self, after: usize, index: usize) {
-        let next = self.pages[after].next;
-        self.pages[index].prev = after;
-        self.pages[index].next = next;
-        self.pages[next].prev = index;
-        self.pages[after].next = index;
-    }
-
-    fn prune(&mut self, index: usize) {
-        self.pages[index].prev = index;
-        self.pages[index].next = index;
-    }
-
-    fn is_empty(&self, index: usize) -> bool {
-        self.pages[index].next == index
-    }
-}
-
-// Remove entry from cache: o page invalidation or drop relation
-pub fn remove(key: Key, tenant_id: TenantId, timeline_id: TimelineId) {
-    let key = MaterializedPageHashKey {
-        key,
-        tenant_id,
-        timeline_id,
-    };
-    let this = get();
-    let mut cache = this.lock().unwrap();
-    let h = hash(&key) % cache.hash_table.len();
-    let mut index = cache.hash_table[h];
-    let mut prev = 0usize;
-    while index != 0 {
-        if cache.pages[index].key == key {
-            if !cache.is_empty(index) {
-                cache.pages[index].state = PageImageState::Vacant;
-                // Remove from LRU list
-                cache.unlink(index);
-                // Insert entry in free list
-                cache.pages[index].next = cache.free_list;
-                cache.free_list = index;
-            } else {
-                // Page is process of loading: we can not remove it righ now,
-                // so just mark for deletion
-                cache.pages[index].next = 0; // make is_empty == false
-            }
-            // Remove from hash table
-            if prev == 0 {
-                cache.hash_table[h] = cache.pages[index].collision;
-            } else {
-                cache.pages[prev].collision = cache.pages[index].collision;
-            }
-            break;
-        }
-        prev = index;
-        index = cache.pages[index].collision;
-    }
-    // It's Ok if image not found
-}
-
-// Find or load page image in the cache
-pub fn lookup(timeline: &Timeline, rel: RelTag, blkno: BlockNumber, lsn: Lsn) -> Result<Bytes> {
-    let key = MaterializedPageHashKey {
-        key: rel_block_to_key(rel, blkno),
-        tenant_id: timeline.tenant_id,
-        timeline_id: timeline.timeline_id,
-    };
-    let this = get();
-    let mut cache = this.lock().unwrap();
-    let h = hash(&key) % cache.hash_table.len();
-
-    'lookup: loop {
-        let mut index = cache.hash_table[h];
-        while index != 0 {
-            if cache.pages[index].key == key {
-                // cache hit
-                match &cache.pages[index].state {
-                    PageImageState::Loaded(success) => {
-                        if *success {
-                            // Pin page
-                            if cache.pages[index].access_count == 0 {
-                                cache.unlink(index);
-                            }
-                            cache.pages[index].access_count += 1;
-                            let file = cache.file.clone();
-                            drop(cache);
-                            let mut buf = [0u8; PAGE_SZ];
-                            file.read_exact_at(&mut buf, index as u64 * PAGE_SZ as u64)?;
-                            cache = this.lock().unwrap();
-                            assert!(cache.pages[index].access_count > 0);
-                            cache.pages[index].access_count -= 1;
-                            if cache.pages[index].access_count == 0 {
-                                // Move to the head of LRU list
-                                cache.link_after(0, index);
-                            }
-                            return Ok(Bytes::from(buf.to_vec()));
-                        } else {
-                            return Err(anyhow::anyhow!("page loading failed earlier"));
-                        }
-                    }
-                    PageImageState::Loading(event) => {
-                        // Create event on which to sleep if not yet assigned
-                        let cv = match event {
-                            None => {
-                                let cv = Arc::new(Condvar::new());
-                                cache.pages[index].state =
-                                    PageImageState::Loading(Some(cv.clone()));
-                                cv
-                            }
-                            Some(cv) => cv.clone(),
-                        };
-                        cache = cv.wait(cache).unwrap();
-                        // Retry lookup
-                        continue 'lookup;
-                    }
-                    PageImageState::Vacant => bail!("Vacant entry is not expected here"),
-                };
-            }
-            index = cache.pages[index].collision;
-        }
-        let file = cache.file.clone();
-        // Cache miss
-        index = cache.free_list;
-        if index == 0 {
-            // no free items
-            let victim = cache.pages[0].prev; // take least recently used element from the tail of LRU list
-            assert!(victim != 0);
-            assert!(cache.pages[victim].access_count == 0);
-            // Remove victim from hash table
-            let h = hash(&cache.pages[victim].key) % cache.hash_table.len();
-            index = cache.hash_table[h];
-            let mut prev = 0usize;
-            while index != victim {
-                assert!(index != 0);
-                prev = index;
-                index = cache.pages[index].collision;
-            }
-            if prev == 0 {
-                cache.hash_table[h] = cache.pages[victim].collision;
-            } else {
-                cache.pages[prev].collision = cache.pages[victim].collision;
-            }
-            // and from LRU list
-            cache.unlink(victim);
-
-            index = victim;
-        } else {
-            // Use next free item
-            cache.free_list = cache.pages[index].next;
-        }
-        // Make is_empty(index) == true. If entry is removed in process of loaded,
-        // it will be updated so that !is_empty(index)
-        cache.prune(index);
-
-        // Insert in hash table
-        cache.pages[index].collision = cache.hash_table[h];
-        cache.hash_table[h] = index;
-
-        cache.pages[index].key = key;
-        cache.pages[index].state = PageImageState::Loading(None);
-        drop(cache); //release lock
-
-        // Load page
-        let result = timeline.get_rel_page_at_lsn(rel, blkno, lsn, true);
-        let mut success = false;
-        if let Ok(page) = &result {
-            success = true;
-            file.write_all_at(&page, index as u64 * PAGE_SZ as u64)?;
-        }
-        cache = this.lock().unwrap();
-        if let PageImageState::Loading(event) = &cache.pages[index].state {
-            // Are there some waiting threads?
-            if let Some(cv) = event {
-                // If so, then wakeup them
-                cv.notify_all();
-            }
-        } else {
-            bail!("Loading state is expected");
-        }
-        if cache.is_empty(index) {
-            // entry was not marked as deleted {
-            // Page is loaded
-
-            // match &res { ... } is same as `res.as_ref().ok().cloned()`
-            cache.pages[index].state = PageImageState::Loaded(success);
-            // Link the page to the head of LRU list
-            cache.link_after(0, index);
-        } else {
-            cache.pages[index].state = PageImageState::Vacant;
-            // Return page to free list
-            cache.pages[index].next = cache.free_list;
-            cache.free_list = index;
-        }
-        // only the first one gets the full error from `get_rel_page_at_lsn`
-        return result;
-    }
-}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,6 +10,7 @@
 //

 use anyhow::{bail, ensure, Context, Result};
+use bytes::Buf;
 use bytes::Bytes;
 use futures::{Stream, StreamExt};
 use pageserver_api::models::{
@@ -18,22 +19,23 @@ use pageserver_api::models::{
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
    PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
-
+use pq_proto::{BeMessage, FeMessage, RowDescriptor};
 use std::io;
 use std::net::TcpListener;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
+use tokio::pin;
 use tokio_util::io::StreamReader;
 use tokio_util::io::SyncIoBridge;
 use tracing::*;
+use utils::id::ConnectionId;
 use utils::{
    auth::{self, Claims, JwtAuth, Scope},
    id::{TenantId, TimelineId},
    lsn::Lsn,
    postgres_backend::AuthType,
    postgres_backend_async::{self, PostgresBackend},
-    pq_proto::{BeMessage, FeMessage, RowDescriptor},
    simple_rcu::RcuReadGuard,
 };

@@ -41,12 +43,12 @@ use crate::basebackup;
 use crate::config::{PageServerConf, ProfilingConfig};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
-use crate::page_image_cache;
 use crate::profiling::profpoint_start;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::Timeline;
 use crate::tenant_mgr;
+use crate::trace::Tracer;
 use crate::CheckpointConfig;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -74,6 +76,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                        FeMessage::CopyData(bytes) => bytes,
                        FeMessage::CopyDone => { break },
                        FeMessage::Sync => continue,
+                        FeMessage::Terminate => {
+                            let msg = format!("client terminated connection with Terminate message during COPY");
+                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                            break;
+                        }
                        m => {
                            let msg = format!("unexpected message {:?}", m);
                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
@@ -85,10 +93,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                    yield copy_data_bytes;
                }
                Ok(None) => {
-                    let msg = "client closed connection";
+                    let msg = "client closed connection during COPY";
                    pgb.write_message(&BeMessage::ErrorResponse(msg))?;
                    pgb.flush().await?;
-                    Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                }
                Err(e) => {
                    Err(io::Error::new(io::ErrorKind::Other, e))?;
@@ -269,6 +277,18 @@ impl PageServerHandler {
        //       so there is no need to reset the association
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

+        // Make request tracer if needed
+        let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
+        let mut tracer = if tenant.get_trace_read_requests() {
+            let connection_id = ConnectionId::generate();
+            let path = tenant
+                .conf
+                .trace_path(&tenant_id, &timeline_id, &connection_id);
+            Some(Tracer::new(path))
+        } else {
+            None
+        };
+
        // Check that the timeline exists
        let timeline = get_local_timeline(tenant_id, timeline_id)?;

@@ -301,7 +321,12 @@ impl PageServerHandler {

            trace!("query: {copy_data_bytes:?}");

-            let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
+            // Trace request if needed
+            if let Some(t) = tracer.as_mut() {
+                t.trace(&copy_data_bytes)
+            }
+
+            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;

            let response = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
@@ -368,14 +393,12 @@ impl PageServerHandler {
        pgb.write_message(&BeMessage::CopyInResponse)?;
        pgb.flush().await?;

-        // import_basebackup_from_tar() is not async, mainly because the Tar crate
-        // it uses is not async. So we need to jump through some hoops:
-        // - convert the input from client connection to a synchronous Read
-        // - use block_in_place()
-        let mut copyin_stream = Box::pin(copyin_stream(pgb));
-        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-        tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
-        timeline.initialize()?;
+        let copyin_stream = copyin_stream(pgb);
+        pin!(copyin_stream);
+
+        timeline
+            .import_basebackup_from_tar(&mut copyin_stream, base_lsn)
+            .await?;

        // Drain the rest of the Copy data
        let mut bytes_after_tar = 0;
@@ -440,7 +463,7 @@ impl PageServerHandler {
        // We only want to persist the data, and it doesn't matter if it's in the
        // shape of deltas or images.
        info!("flushing layers");
-        timeline.checkpoint(CheckpointConfig::Flush)?;
+        timeline.checkpoint(CheckpointConfig::Flush).await?;

        info!("done");
        Ok(())
@@ -582,12 +605,8 @@ impl PageServerHandler {
        // current profiling is based on a thread-local variable, so it doesn't work
        // across awaits
        let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
+        let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;

-        let page = if req.latest {
-            page_image_cache::lookup(timeline, req.rel, req.blkno, lsn)
-        } else {
-            timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, false)
-        }?;
        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
        }))
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1179,7 +1179,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    }
 }

-pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,8 +12,12 @@
 //!

 use anyhow::{bail, Context};
+use bytes::Bytes;
+use futures::Stream;
 use pageserver_api::models::TimelineState;
 use tokio::sync::watch;
+use tokio_util::io::StreamReader;
+use tokio_util::io::SyncIoBridge;
 use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;

@@ -29,6 +33,7 @@ use std::io::Write;
 use std::ops::Bound::Included;
 use std::path::Path;
 use std::path::PathBuf;
+use std::pin::Pin;
 use std::process::Command;
 use std::process::Stdio;
 use std::sync::Arc;
@@ -72,6 +77,8 @@ pub mod storage_layer;

 mod timeline;

+pub mod size;
+
 use storage_layer::Layer;

 pub use timeline::Timeline;
@@ -120,6 +127,9 @@ pub struct Tenant {

    /// Makes every timeline to backup their files to remote storage.
    upload_layers: bool,
+
+    /// Cached logical sizes updated on each [`Tenant::gather_size_inputs`].
+    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
 }

 /// A timeline with some of its files on disk, being initialized.
@@ -132,7 +142,7 @@ pub struct Tenant {
 pub struct UninitializedTimeline<'t> {
    owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
-    raw_timeline: Option<(Timeline, TimelineUninitMark)>,
+    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
 }

 /// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
@@ -164,7 +174,6 @@ impl UninitializedTimeline<'_> {
        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
        })?;
-        let new_timeline = Arc::new(new_timeline);

        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
        // TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least
@@ -192,6 +201,9 @@ impl UninitializedTimeline<'_> {
                })?;
                new_timeline.set_state(TimelineState::Active);
                v.insert(Arc::clone(&new_timeline));
+
+                new_timeline.maybe_spawn_flush_loop();
+
                new_timeline.launch_wal_receiver();
            }
        }
@@ -200,20 +212,28 @@ impl UninitializedTimeline<'_> {
    }

    /// Prepares timeline data by loading it from the basebackup archive.
-    pub fn import_basebackup_from_tar(
-        &self,
-        reader: impl std::io::Read,
+    pub async fn import_basebackup_from_tar(
+        self,
+        mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
        base_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<Arc<Timeline>> {
        let raw_timeline = self.raw_timeline()?;
-        import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context(
-            || {
-                format!(
-                    "Failed to import basebackup for timeline {}/{}",
-                    self.owning_tenant.tenant_id, self.timeline_id
-                )
-            },
-        )?;
+
+        // import_basebackup_from_tar() is not async, mainly because the Tar crate
+        // it uses is not async. So we need to jump through some hoops:
+        // - convert the input from client connection to a synchronous Read
+        // - use block_in_place()
+        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
+
+        tokio::task::block_in_place(|| {
+            import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
+                .context("Failed to import basebackup")
+        })?;
+
+        // The flush loop needs to be spawned in order for checkpoint to be able to flush.
+        // We want to run a proper checkpoint before we mark the timeline as available to the
+        // outside world, so spawn the flush loop here and skip that setup in initialize_with_lock.
+        raw_timeline.maybe_spawn_flush_loop();

        fail::fail_point!("before-checkpoint-new-timeline", |_| {
            bail!("failpoint before-checkpoint-new-timeline");
@@ -221,16 +241,15 @@ impl UninitializedTimeline<'_> {

        raw_timeline
            .checkpoint(CheckpointConfig::Flush)
-            .with_context(|| {
-                format!(
-                    "Failed to checkpoint after basebackup import for timeline {}/{}",
-                    self.owning_tenant.tenant_id, self.timeline_id
-                )
-            })?;
-        Ok(())
+            .await
+            .context("Failed to checkpoint after basebackup import")?;
+
+        let timeline = self.initialize()?;
+
+        Ok(timeline)
    }

-    fn raw_timeline(&self) -> anyhow::Result<&Timeline> {
+    fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
        Ok(&self
            .raw_timeline
            .as_ref()
@@ -442,14 +461,7 @@ impl Tenant {
                    .context("Cannot branch off the timeline that's not present in pageserver")?;

                if let Some(lsn) = ancestor_start_lsn.as_mut() {
-                    // Wait for the WAL to arrive and be processed on the parent branch up
-                    // to the requested branch point. The repository code itself doesn't
-                    // require it, but if we start to receive WAL on the new timeline,
-                    // decoding the new WAL might need to look up previous pages, relation
-                    // sizes etc. and that would get confused if the previous page versions
-                    // are not in the repository yet.
                    *lsn = lsn.align();
-                    ancestor_timeline.wait_lsn(*lsn).await?;

                    let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
                    if ancestor_ancestor_lsn > *lsn {
@@ -461,11 +473,19 @@ impl Tenant {
                            ancestor_ancestor_lsn,
                        );
                    }
+
+                    // Wait for the WAL to arrive and be processed on the parent branch up
+                    // to the requested branch point. The repository code itself doesn't
+                    // require it, but if we start to receive WAL on the new timeline,
+                    // decoding the new WAL might need to look up previous pages, relation
+                    // sizes etc. and that would get confused if the previous page versions
+                    // are not in the repository yet.
+                    ancestor_timeline.wait_lsn(*lsn).await?;
                }

                self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
            }
-            None => self.bootstrap_timeline(new_timeline_id, pg_version)?,
+            None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
        };

        // Have added new timeline into the tenant, now its background tasks are needed.
@@ -483,7 +503,7 @@ impl Tenant {
    /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
    /// to make tests more deterministic.
    /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
-    pub fn gc_iteration(
+    pub async fn gc_iteration(
        &self,
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
@@ -499,11 +519,13 @@ impl Tenant {
            .map(|x| x.to_string())
            .unwrap_or_else(|| "-".to_string());

-        STORAGE_TIME
-            .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
-            .observe_closure_duration(|| {
-                self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
-            })
+        {
+            let _timer = STORAGE_TIME
+                .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
+                .start_timer();
+            self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
+                .await
+        }
    }

    /// Perform one compaction iteration.
@@ -523,7 +545,6 @@ impl Tenant {
        let timelines = self.timelines.lock().unwrap();
        let timelines_to_compact = timelines
            .iter()
-            .filter(|(_, timeline)| timeline.is_active())
            .map(|(timeline_id, timeline)| (*timeline_id, timeline.clone()))
            .collect::<Vec<_>>();
        drop(timelines);
@@ -540,23 +561,24 @@ impl Tenant {
    ///
    /// Used at graceful shutdown.
    ///
-    pub fn checkpoint(&self) -> anyhow::Result<()> {
+    pub async fn checkpoint(&self) -> anyhow::Result<()> {
        // Scan through the hashmap and collect a list of all the timelines,
        // while holding the lock. Then drop the lock and actually perform the
        // checkpoints. We don't want to block everything else while the
        // checkpoint runs.
-        let timelines = self.timelines.lock().unwrap();
-        let timelines_to_checkpoint = timelines
-            .iter()
-            .map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline)))
-            .collect::<Vec<_>>();
-        drop(timelines);
+        let timelines_to_checkpoint = {
+            let timelines = self.timelines.lock().unwrap();
+            timelines
+                .iter()
+                .map(|(id, timeline)| (*id, Arc::clone(timeline)))
+                .collect::<Vec<_>>()
+        };

-        for (timeline_id, timeline) in &timelines_to_checkpoint {
-            let _entered =
-                info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id)
-                    .entered();
-            timeline.checkpoint(CheckpointConfig::Flush)?;
+        for (id, timeline) in &timelines_to_checkpoint {
+            timeline
+                .checkpoint(CheckpointConfig::Flush)
+                .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id))
+                .await?;
        }

        Ok(())
@@ -785,6 +807,13 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

+    pub fn get_trace_read_requests(&self) -> bool {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .trace_read_requests
+            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
+    }
+
    pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
        self.tenant_conf.write().unwrap().update(&new_tenant_conf);
    }
@@ -835,6 +864,7 @@ impl Tenant {
            remote_index,
            upload_layers,
            state,
+            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
        }
    }

@@ -956,8 +986,9 @@ impl Tenant {
    //                 +-----baz-------->
    //
    //
-    // 1. Grab 'gc_cs' mutex to prevent new timelines from being created
-    // 2. Scan all timelines, and on each timeline, make note of the
+    // 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's
+    //    `gc_infos` are being refreshed
+    // 2. Scan collected timelines, and on each timeline, make note of
    //    all the points where other timelines have been branched off.
    //    We will refrain from removing page versions at those LSNs.
    // 3. For each timeline, scan all layer files on the timeline.
@@ -968,7 +999,7 @@ impl Tenant {
    // - if a relation has a non-incremental persistent layer on a child branch, then we
    //   don't need to keep that in the parent anymore. But currently
    //   we do.
-    fn gc_iteration_internal(
+    async fn gc_iteration_internal(
        &self,
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
@@ -978,6 +1009,72 @@ impl Tenant {
        let mut totals: GcResult = Default::default();
        let now = Instant::now();

+        let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
+
+        utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
+
+        info!("starting on {} timelines", gc_timelines.len());
+
+        // Perform GC for each timeline.
+        //
+        // Note that we don't hold the GC lock here because we don't want
+        // to delay the branch creation task, which requires the GC lock.
+        // A timeline GC iteration can be slow because it may need to wait for
+        // compaction (both require `layer_removal_cs` lock),
+        // but the GC iteration can run concurrently with branch creation.
+        //
+        // See comments in [`Tenant::branch_timeline`] for more information
+        // about why branch creation task can run concurrently with timeline's GC iteration.
+        for timeline in gc_timelines {
+            if task_mgr::is_shutdown_requested() {
+                // We were requested to shut down. Stop and return with the progress we
+                // made.
+                break;
+            }
+
+            // If requested, force flush all in-memory layers to disk first,
+            // so that they too can be garbage collected. That's
+            // used in tests, so we want as deterministic results as possible.
+            if checkpoint_before_gc {
+                timeline.checkpoint(CheckpointConfig::Forced).await?;
+                info!(
+                    "timeline {} checkpoint_before_gc done",
+                    timeline.timeline_id
+                );
+            }
+
+            let result = timeline.gc()?;
+            totals += result;
+        }
+
+        totals.elapsed = now.elapsed();
+        Ok(totals)
+    }
+
+    /// Refreshes the Timeline::gc_info for all timelines, returning the
+    /// vector of timelines which have [`Timeline::get_last_record_lsn`] past
+    /// [`Tenant::get_gc_horizon`].
+    ///
+    /// This is usually executed as part of periodic gc, but can now be triggered more often.
+    pub fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
+        // Since this method can now be called at a different rate than the configured gc loop,
+        // these configuration values may take effect sooner than before, when they were only
+        // read from the gc task.
+        let horizon = self.get_gc_horizon();
+        let pitr = self.get_pitr_interval();
+
+        // refresh all timelines
+        let target_timeline_id = None;
+
+        self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+    }
+
+    fn refresh_gc_info_internal(
+        &self,
+        target_timeline_id: Option<TimelineId>,
+        horizon: u64,
+        pitr: Duration,
+    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
        // grab mutex to prevent new timelines from being created here.
        let gc_cs = self.gc_cs.lock().unwrap();

@@ -995,11 +1092,7 @@ impl Tenant {

            timelines
                .iter()
-                .filter(|(_, timeline)| timeline.is_active())
                .map(|(timeline_id, timeline_entry)| {
-                    // This is unresolved question for now, how to do gc in presence of remote timelines
-                    // especially when this is combined with branching.
-                    // Somewhat related: https://github.com/neondatabase/neon/issues/999
                    if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
                        // If target_timeline is specified, we only need to know branchpoints of its children
                        if let Some(timeline_id) = target_timeline_id {
@@ -1053,41 +1146,7 @@ impl Tenant {
            }
        }
        drop(gc_cs);
-
-        // Perform GC for each timeline.
-        //
-        // Note that we don't hold the GC lock here because we don't want
-        // to delay the branch creation task, which requires the GC lock.
-        // A timeline GC iteration can be slow because it may need to wait for
-        // compaction (both require `layer_removal_cs` lock),
-        // but the GC iteration can run concurrently with branch creation.
-        //
-        // See comments in [`Tenant::branch_timeline`] for more information
-        // about why branch creation task can run concurrently with timeline's GC iteration.
-        for timeline in gc_timelines {
-            if task_mgr::is_shutdown_requested() {
-                // We were requested to shut down. Stop and return with the progress we
-                // made.
-                break;
-            }
-
-            // If requested, force flush all in-memory layers to disk first,
-            // so that they too can be garbage collected. That's
-            // used in tests, so we want as deterministic results as possible.
-            if checkpoint_before_gc {
-                timeline.checkpoint(CheckpointConfig::Forced)?;
-                info!(
-                    "timeline {} checkpoint_before_gc done",
-                    timeline.timeline_id
-                );
-            }
-
-            let result = timeline.gc()?;
-            totals += result;
-        }
-
-        totals.elapsed = now.elapsed();
-        Ok(totals)
+        Ok(gc_timelines)
    }

    /// Branch an existing timeline
@@ -1191,14 +1250,15 @@ impl Tenant {

    /// - run initdb to init temporary instance and get bootstrap data
    /// - after initialization complete, remove the temp dir.
-    fn bootstrap_timeline(
+    async fn bootstrap_timeline(
        &self,
        timeline_id: TimelineId,
        pg_version: u32,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let timelines = self.timelines.lock().unwrap();
-        let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?;
-        drop(timelines);
+        let timeline_uninit_mark = {
+            let timelines = self.timelines.lock().unwrap();
+            self.create_timeline_uninit_mark(timeline_id, &timelines)?
+        };
        // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
        // temporary directory for basebackup files for the given timeline.
        let initdb_path = path_with_suffix_extension(
@@ -1248,25 +1308,35 @@ impl Tenant {

        let tenant_id = raw_timeline.owning_tenant.tenant_id;
        let unfinished_timeline = raw_timeline.raw_timeline()?;
-        import_datadir::import_timeline_from_postgres_datadir(
-            unfinished_timeline,
-            pgdata_path,
-            pgdata_lsn,
-        )
+
+        tokio::task::block_in_place(|| {
+            import_datadir::import_timeline_from_postgres_datadir(
+                unfinished_timeline,
+                pgdata_path,
+                pgdata_lsn,
+            )
+        })
        .with_context(|| {
            format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
        })?;

+        // Flush loop needs to be spawned in order for checkpoint to be able to flush.
+        // We want to run proper checkpoint before we mark timeline as available to outside world
+        // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
+        unfinished_timeline.maybe_spawn_flush_loop();
+
        fail::fail_point!("before-checkpoint-new-timeline", |_| {
            anyhow::bail!("failpoint before-checkpoint-new-timeline");
        });
+
        unfinished_timeline
-            .checkpoint(CheckpointConfig::Forced)
+            .checkpoint(CheckpointConfig::Forced).await
            .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;

-        let mut timelines = self.timelines.lock().unwrap();
-        let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?;
-        drop(timelines);
+        let timeline = {
+            let mut timelines = self.timelines.lock().unwrap();
+            raw_timeline.initialize_with_lock(&mut timelines, false)?
+        };

        info!(
            "created root timeline {} timeline.lsn {}",
@@ -1306,7 +1376,7 @@ impl Tenant {
                Ok(UninitializedTimeline {
                    owning_tenant: self,
                    timeline_id: new_timeline_id,
-                    raw_timeline: Some((new_timeline, uninit_mark)),
+                    raw_timeline: Some((Arc::new(new_timeline), uninit_mark)),
                })
            }
            Err(e) => {
@@ -1425,7 +1495,7 @@ impl Tenant {
            let timeline = UninitializedTimeline {
                owning_tenant: self,
                timeline_id,
-                raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
+                raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
            };
            match timeline.initialize_with_lock(&mut timelines_accessor, true) {
                Ok(initialized_timeline) => {
@@ -1446,6 +1516,25 @@ impl Tenant {

        Ok(())
    }
+
+    /// Gathers inputs from all of the timelines to produce a sizing model input.
+    ///
+    /// Future is cancellation safe. Only one calculation can be running at once per tenant.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
+    pub async fn gather_size_inputs(&self) -> anyhow::Result<size::ModelInputs> {
+        let logical_sizes_at_once = self
+            .conf
+            .concurrent_tenant_size_logical_size_queries
+            .inner();
+
+        // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
+        // are for testing/experimenting, we tolerate this.
+        //
+        // See issue #2748, condensed out of the initial PR review, for more details.
+        let mut shared_cache = self.cached_logical_sizes.lock().await;
+
+        size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await
+    }
 }

 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
@@ -1589,6 +1678,7 @@ pub mod harness {
                walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
                lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
                max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
+                trace_read_requests: Some(tenant_conf.trace_read_requests),
            }
        }
    }
@@ -1860,7 +1950,7 @@ mod tests {
        Ok(())
    }

-    fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
+    async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
        let mut lsn = start_lsn;
        #[allow(non_snake_case)]
        {
@@ -1881,7 +1971,7 @@ mod tests {
            writer.finish_write(lsn);
            lsn += 0x10;
        }
-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        {
            let writer = tline.writer();
            writer.put(
@@ -1898,24 +1988,26 @@ mod tests {
            )?;
            writer.finish_write(lsn);
        }
-        tline.checkpoint(CheckpointConfig::Forced)
+        tline.checkpoint(CheckpointConfig::Forced).await
    }

-    #[test]
-    fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
        let tenant =
            TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
                .load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
        // FIXME: this doesn't actually remove any layer currently, given how the checkpointing
        // and compaction works. But it does set the 'cutoff' point so that the cross check
        // below should fail.
-        tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
+        tenant
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .await?;

        // try to branch at lsn 25, should fail because we already garbage collected the data
        match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
@@ -1960,14 +2052,14 @@ mod tests {
    /*
    // FIXME: This currently fails to error out. Calling GC doesn't currently
    // remove the old value, we'd need to work a little harder
-    #[test]
-    fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
        let repo =
            RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
            .load();

        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
        let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
@@ -1980,43 +2072,47 @@ mod tests {
    }
     */

-    #[test]
-    fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
        let tenant =
            TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
-        tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
+        tenant
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .await?;
        assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());

        Ok(())
    }
-    #[test]
-    fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
        let tenant =
            TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");

-        make_some_layers(newtline.as_ref(), Lsn(0x60))?;
+        make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;

        // run gc on parent
-        tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
+        tenant
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .await?;

        // Check that the data is still accessible on the branch.
        assert_eq!(
@@ -2027,8 +2123,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn timeline_load() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn timeline_load() -> anyhow::Result<()> {
        const TEST_NAME: &str = "timeline_load";
        let harness = TenantHarness::create(TEST_NAME)?;
        {
@@ -2036,8 +2132,8 @@ mod tests {
            let tline = tenant
                .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
                .initialize()?;
-            make_some_layers(tline.as_ref(), Lsn(0x8000))?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
        }

        let tenant = harness.load();
@@ -2048,8 +2144,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn timeline_load_with_ancestor() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
        const TEST_NAME: &str = "timeline_load_with_ancestor";
        let harness = TenantHarness::create(TEST_NAME)?;
        // create two timelines
@@ -2059,8 +2155,8 @@ mod tests {
                .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
                .initialize()?;

-            make_some_layers(tline.as_ref(), Lsn(0x20))?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;

            tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;

@@ -2068,8 +2164,8 @@ mod tests {
                .get_timeline(NEW_TIMELINE_ID, true)
                .expect("Should have a local timeline");

-            make_some_layers(newtline.as_ref(), Lsn(0x60))?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
        }

        // check that both of them are initially unloaded
@@ -2129,8 +2225,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn test_images() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_images() -> anyhow::Result<()> {
        let tenant = TenantHarness::create("test_images")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2141,7 +2237,7 @@ mod tests {
        writer.finish_write(Lsn(0x10));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        tline.compact()?;

        let writer = tline.writer();
@@ -2149,7 +2245,7 @@ mod tests {
        writer.finish_write(Lsn(0x20));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        tline.compact()?;

        let writer = tline.writer();
@@ -2157,7 +2253,7 @@ mod tests {
        writer.finish_write(Lsn(0x30));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        tline.compact()?;

        let writer = tline.writer();
@@ -2165,7 +2261,7 @@ mod tests {
        writer.finish_write(Lsn(0x40));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        tline.compact()?;

        assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
@@ -2181,8 +2277,8 @@ mod tests {
    // Insert 1000 key-value pairs with increasing keys, checkpoint,
    // repeat 50 times.
    //
-    #[test]
-    fn test_bulk_insert() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_bulk_insert() -> anyhow::Result<()> {
        let tenant = TenantHarness::create("test_bulk_insert")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2215,7 +2311,7 @@ mod tests {
            let cutoff = tline.get_last_record_lsn();

            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
            tline.compact()?;
            tline.gc()?;
        }
@@ -2223,8 +2319,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn test_random_updates() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_random_updates() -> anyhow::Result<()> {
        let tenant = TenantHarness::create("test_random_updates")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2287,7 +2383,7 @@ mod tests {
            println!("checkpointing {}", lsn);
            let cutoff = tline.get_last_record_lsn();
            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
            tline.compact()?;
            tline.gc()?;
        }
@@ -2295,8 +2391,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn test_traverse_branches() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_traverse_branches() -> anyhow::Result<()> {
        let tenant = TenantHarness::create("test_traverse_branches")?.load();
        let mut tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2368,7 +2464,7 @@ mod tests {
            println!("checkpointing {}", lsn);
            let cutoff = tline.get_last_record_lsn();
            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
            tline.compact()?;
            tline.gc()?;
        }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -74,6 +74,7 @@ where
        };

        dstbuf.clear();
+        dstbuf.reserve(len);

        // Read the payload
        let mut remain = len;
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/delta_layer.rs
@@ -260,8 +260,9 @@ impl Layer for DeltaLayer {

            // Ok, 'offsets' now contains the offsets of all the entries we need to read
            let mut cursor = file.block_cursor();
+            let mut buf = Vec::new();
            for (entry_lsn, pos) in offsets {
-                let buf = cursor.read_blob(pos).with_context(|| {
+                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
                    format!(
                        "Failed to read blob from virtual file {}",
                        file.file.path.display()
@@ -610,9 +611,9 @@ impl DeltaLayer {
 ///
 /// 3. Call `finish`.
 ///
-pub struct DeltaLayerWriter {
+struct DeltaLayerWriterInner {
    conf: &'static PageServerConf,
-    path: PathBuf,
+    pub path: PathBuf,
    timeline_id: TimelineId,
    tenant_id: TenantId,

@@ -624,17 +625,17 @@ pub struct DeltaLayerWriter {
    blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
 }

-impl DeltaLayerWriter {
+impl DeltaLayerWriterInner {
    ///
    /// Start building a new delta layer.
    ///
-    pub fn new(
+    fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
        key_start: Key,
        lsn_range: Range<Lsn>,
-    ) -> Result<DeltaLayerWriter> {
+    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename. We don't know
        // the end key yet, so we cannot form the final filename yet. We will
        // rename it when we're done.
@@ -653,7 +654,7 @@ impl DeltaLayerWriter {
        let block_buf = BlockBuf::new();
        let tree_builder = DiskBtreeBuilder::new(block_buf);

-        Ok(DeltaLayerWriter {
+        Ok(Self {
            conf,
            path,
            timeline_id,
@@ -670,17 +671,17 @@ impl DeltaLayerWriter {
    ///
    /// The values must be appended in key, lsn order.
    ///
-    pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
+    fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
        self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
    }

-    pub fn put_value_bytes(
+    fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: &[u8],
        will_init: bool,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
        assert!(self.lsn_range.start <= lsn);

        let off = self.blob_writer.write_blob(val)?;
@@ -693,14 +694,14 @@ impl DeltaLayerWriter {
        Ok(())
    }

-    pub fn size(&self) -> u64 {
+    fn size(&self) -> u64 {
        self.blob_writer.size() + self.tree.borrow_writer().size()
    }

    ///
    /// Finish writing the delta layer.
    ///
-    pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+    fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -768,6 +769,102 @@ impl DeltaLayerWriter {
    }
 }

+/// A builder object for constructing a new delta layer.
+///
+/// Usage:
+///
+/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
+///
+/// 2. Write the contents by calling `put_value` for every page
+///    version to store in the layer.
+///
+/// 3. Call `finish`.
+///
+/// # Note
+///
+/// As described in https://github.com/neondatabase/neon/issues/2650, it's
+/// possible for the writer to drop before `finish` is actually called. So this
+/// could lead to odd temporary files in the directory, exhausting file system.
+/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
+/// implementation that cleans up the temporary file in failure. It's not
+/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves
+/// out some fields, making it impossible to implement `Drop`.
+///
+#[must_use]
+pub struct DeltaLayerWriter {
+    inner: Option<DeltaLayerWriterInner>,
+}
+
+impl DeltaLayerWriter {
+    ///
+    /// Start building a new delta layer.
+    ///
+    pub fn new(
+        conf: &'static PageServerConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        key_start: Key,
+        lsn_range: Range<Lsn>,
+    ) -> anyhow::Result<Self> {
+        Ok(Self {
+            inner: Some(DeltaLayerWriterInner::new(
+                conf,
+                timeline_id,
+                tenant_id,
+                key_start,
+                lsn_range,
+            )?),
+        })
+    }
+
+    ///
+    /// Append a key-value pair to the file.
+    ///
+    /// The values must be appended in key, lsn order.
+    ///
+    pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
+        self.inner.as_mut().unwrap().put_value(key, lsn, val)
+    }
+
+    pub fn put_value_bytes(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        val: &[u8],
+        will_init: bool,
+    ) -> anyhow::Result<()> {
+        self.inner
+            .as_mut()
+            .unwrap()
+            .put_value_bytes(key, lsn, val, will_init)
+    }
+
+    pub fn size(&self) -> u64 {
+        self.inner.as_ref().unwrap().size()
+    }
+
+    ///
+    /// Finish writing the delta layer.
+    ///
+    pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+        self.inner.take().unwrap().finish(key_end)
+    }
+}
+
+impl Drop for DeltaLayerWriter {
+    fn drop(&mut self) {
+        // `inner` is still set only if `finish` was never called; clean up the temporary file.
+        if let Some(inner) = self.inner.take() {
+            match inner.blob_writer.into_inner().into_inner() {
+                Ok(vfile) => vfile.remove(),
+                Err(err) => warn!(
+                    "error while flushing buffer of delta layer temporary file: {err}"
+                ),
+            }
+        }
+    }
+}
+
 ///
 /// Iterator over all key-value pairse stored in a delta layer
 ///
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/image_layer.rs
@@ -411,7 +411,7 @@ impl ImageLayer {
 ///
 /// 3. Call `finish`.
 ///
-pub struct ImageLayerWriter {
+struct ImageLayerWriterInner {
    conf: &'static PageServerConf,
    path: PathBuf,
    timeline_id: TimelineId,
@@ -423,14 +423,17 @@ pub struct ImageLayerWriter {
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 }

-impl ImageLayerWriter {
-    pub fn new(
+impl ImageLayerWriterInner {
+    ///
+    /// Start building a new image layer.
+    ///
+    fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
-    ) -> anyhow::Result<ImageLayerWriter> {
+    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename.
        // We'll atomically rename it to the final name when we're done.
        let path = ImageLayer::temp_path_for(
@@ -455,7 +458,7 @@ impl ImageLayerWriter {
        let block_buf = BlockBuf::new();
        let tree_builder = DiskBtreeBuilder::new(block_buf);

-        let writer = ImageLayerWriter {
+        let writer = Self {
            conf,
            path,
            timeline_id,
@@ -474,7 +477,7 @@ impl ImageLayerWriter {
    ///
    /// The page versions must be appended in blknum order.
    ///
-    pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
+    fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
        let off = self.blob_writer.write_blob(img)?;

@@ -485,7 +488,10 @@ impl ImageLayerWriter {
        Ok(())
    }

-    pub fn finish(self) -> anyhow::Result<ImageLayer> {
+    ///
+    /// Finish writing the image layer.
+    ///
+    fn finish(self) -> anyhow::Result<ImageLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -552,3 +558,76 @@ impl ImageLayerWriter {
        Ok(layer)
    }
 }
+
+/// A builder object for constructing a new image layer.
+///
+/// Usage:
+///
+/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
+///
+/// 2. Write the contents by calling `put_image` for every key-value
+///    pair in the key range.
+///
+/// 3. Call `finish`.
+///
+/// # Note
+///
+/// As described in https://github.com/neondatabase/neon/issues/2650, it's
+/// possible for the writer to be dropped before `finish` is actually called.
+/// This could leave stray temporary files in the directory, exhausting the file
+/// system. This structure wraps `ImageLayerWriterInner` and also contains a `Drop`
+/// implementation that cleans up the temporary file on failure. It's not
+/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
+/// out some fields, making it impossible to implement `Drop`.
+///
+#[must_use]
+pub struct ImageLayerWriter {
+    inner: Option<ImageLayerWriterInner>,
+}
+
+impl ImageLayerWriter {
+    ///
+    /// Start building a new image layer.
+    ///
+    pub fn new(
+        conf: &'static PageServerConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        key_range: &Range<Key>,
+        lsn: Lsn,
+    ) -> anyhow::Result<ImageLayerWriter> {
+        Ok(Self {
+            inner: Some(ImageLayerWriterInner::new(
+                conf,
+                timeline_id,
+                tenant_id,
+                key_range,
+                lsn,
+            )?),
+        })
+    }
+
+    ///
+    /// Write next value to the file.
+    ///
+    /// The page versions must be appended in blknum order.
+    ///
+    pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
+        self.inner.as_mut().unwrap().put_image(key, img)
+    }
+
+    ///
+    /// Finish writing the image layer.
+    ///
+    pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
+        self.inner.take().unwrap().finish()
+    }
+}
+
+impl Drop for ImageLayerWriter {
+    fn drop(&mut self) {
+        if let Some(inner) = self.inner.take() {
+            inner.blob_writer.into_inner().remove();
+        }
+    }
+}
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -0,0 +1,461 @@
+use std::cmp;
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+
+use anyhow::Context;
+use tokio::sync::Semaphore;
+
+use super::Tenant;
+use utils::id::TimelineId;
+use utils::lsn::Lsn;
+
+use tracing::*;
+
+/// Inputs to the actual tenant sizing model
+///
+/// Implements [`serde::Serialize`] but is not meant to be part of the public API; it is instead
+/// meant to be a transferable format between execution environments and developers.
+#[serde_with::serde_as]
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+pub struct ModelInputs {
+    updates: Vec<Update>,
+    retention_period: u64,
+    #[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
+    timeline_inputs: HashMap<TimelineId, TimelineInputs>,
+}
+
+/// Collects all LSNs relevant to the inputs. These will only be helpful in the serialized form as
+/// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
+#[serde_with::serde_as]
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+struct TimelineInputs {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    last_record: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    latest_gc_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    horizon_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pitr_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    next_gc_cutoff: Lsn,
+}
+
+/// Gathers the inputs for the tenant sizing model.
+///
+/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
+/// is updated on-demand, during the start of this calculation and separate from the
+/// [`Timeline::latest_gc_cutoff`].
+///
+/// For timelines in general:
+///
+/// ```ignore
+/// 0-----|---------|----|------------| · · · · · |·> lsn
+///   initdb_lsn  branchpoints*  next_gc_cutoff  latest
+/// ```
+///
+/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
+/// tenant size will be zero.
+pub(super) async fn gather_inputs(
+    tenant: &Tenant,
+    limit: &Arc<Semaphore>,
+    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+) -> anyhow::Result<ModelInputs> {
+    // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
+    // our advantage with `?` error handling.
+    let mut joinset = tokio::task::JoinSet::new();
+
+    let timelines = tenant
+        .refresh_gc_info()
+        .context("Failed to refresh gc_info before gathering inputs")?;
+
+    if timelines.is_empty() {
+        // All timelines are below tenant's gc_horizon; alternative would be to use
+        // Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
+        // missing GcInfo::retain_lsns or having obsolete values for cutoff's.
+        return Ok(ModelInputs {
+            updates: vec![],
+            retention_period: 0,
+            timeline_inputs: HashMap::new(),
+        });
+    }
+
+    // record the used/inserted cache keys here, so that unused entries can be pruned and the
+    // cache does not leak. after the initial run the cache should be quite stable, but live
+    // timelines will eventually require new lsns to be inspected.
+    let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new();
+
+    let mut updates = Vec::new();
+
+    // record the per timeline values used to determine `retention_period`
+    let mut timeline_inputs = HashMap::with_capacity(timelines.len());
+
+    // used to determine the `retention_period` for the size model
+    let mut max_cutoff_distance = None;
+
+    // this will probably conflict with on-demand downloaded layers, or at least force them all
+    // to be downloaded
+    for timeline in timelines {
+        let last_record_lsn = timeline.get_last_record_lsn();
+
+        let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
+            // there's a race between the update (holding tenant.gc_lock) and this read but it
+            // might not be an issue, because it's not for Timeline::gc
+            let gc_info = timeline.gc_info.read().unwrap();
+
+            // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
+            // new gc run, which we have no control over. however, unlike `Timeline::gc`,
+            // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
+            // actually removing files.
+            let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
+
+            // the minimum where we should find the next_gc_cutoff for our calculations.
+            //
+            // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
+            // want to query any logical size before initdb_lsn.
+            let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
+
+            let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
+                Some((next_gc_cutoff, LsnKind::GcCutOff))
+            } else {
+                None
+            };
+
+            // this assumes there are no other lsns than the branchpoints
+            let lsns = gc_info
+                .retain_lsns
+                .iter()
+                .inspect(|&&lsn| {
+                    trace!(
+                        timeline_id=%timeline.timeline_id,
+                        "retained lsn: {lsn:?}, is_before_ancestor_lsn={}",
+                        lsn < timeline.get_ancestor_lsn()
+                    )
+                })
+                .filter(|&&lsn| lsn > timeline.get_ancestor_lsn())
+                .copied()
+                .map(|lsn| (lsn, LsnKind::BranchPoint))
+                .chain(maybe_cutoff)
+                .collect::<Vec<_>>();
+
+            (
+                lsns,
+                gc_info.horizon_cutoff,
+                gc_info.pitr_cutoff,
+                next_gc_cutoff,
+            )
+        };
+
+        // update this to have a retention_period later for the tenant_size_model
+        // tenant_size_model compares this to the last segment's start_lsn
+        if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) {
+            match max_cutoff_distance.as_mut() {
+                Some(max) => {
+                    *max = std::cmp::max(*max, cutoff_distance);
+                }
+                _ => {
+                    max_cutoff_distance = Some(cutoff_distance);
+                }
+            }
+        }
+
+        // all timelines branch from something, because it might be impossible to pinpoint
+        // which is the tenant_size_model's "default" branch.
+        updates.push(Update {
+            lsn: timeline.get_ancestor_lsn(),
+            command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
+            timeline_id: timeline.timeline_id,
+        });
+
+        for (lsn, _kind) in &interesting_lsns {
+            if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
+                updates.push(Update {
+                    lsn: *lsn,
+                    timeline_id: timeline.timeline_id,
+                    command: Command::Update(*size),
+                });
+
+                needed_cache.insert((timeline.timeline_id, *lsn));
+            } else {
+                let timeline = Arc::clone(&timeline);
+                let parallel_size_calcs = Arc::clone(limit);
+                joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn));
+            }
+        }
+
+        // all timelines also have an end point if they have made any progress
+        if last_record_lsn > timeline.get_ancestor_lsn()
+            && !interesting_lsns
+                .iter()
+                .any(|(lsn, _)| lsn == &last_record_lsn)
+        {
+            updates.push(Update {
+                lsn: last_record_lsn,
+                command: Command::EndOfBranch,
+                timeline_id: timeline.timeline_id,
+            });
+        }
+
+        timeline_inputs.insert(
+            timeline.timeline_id,
+            TimelineInputs {
+                last_record: last_record_lsn,
+                // this is not used above, because it might not have updated recently enough
+                latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
+                horizon_cutoff,
+                pitr_cutoff,
+                next_gc_cutoff,
+            },
+        );
+    }
+
+    let mut have_any_error = false;
+
+    while let Some(res) = joinset.join_next().await {
+        // each of these comes as Result<Result<_, JoinError>, JoinError>
+        // because of spawn + spawn_blocking
+        let res = res.and_then(|inner| inner);
+        match res {
+            Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
+                debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
+
+                logical_size_cache.insert((timeline.timeline_id, lsn), size);
+                needed_cache.insert((timeline.timeline_id, lsn));
+
+                updates.push(Update {
+                    lsn,
+                    timeline_id: timeline.timeline_id,
+                    command: Command::Update(size),
+                });
+            }
+            Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
+                warn!(
+                    timeline_id=%timeline.timeline_id,
+                    "failed to calculate logical size at {lsn}: {error:#}"
+                );
+                have_any_error = true;
+            }
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures, nor should be");
+            }
+            Err(join_error) => {
+                // cannot really do anything, as this panic is likely a bug
+                error!("logical size query panicked: {join_error:#}");
+                have_any_error = true;
+            }
+        }
+    }
+
+    // prune any keys not needed anymore; we record every used key and added key.
+    logical_size_cache.retain(|key, _| needed_cache.contains(key));
+
+    if have_any_error {
+        // we cannot complete this round, because we are missing data.
+        // we have however cached all we were able to request calculation on.
+        anyhow::bail!("failed to calculate some logical_sizes");
+    }
+
+    // the data gathered to updates is per lsn, regardless of the branch, so we can use it to
+    // our advantage, not requiring a sorted container or graph walk.
+    //
+    // for branch points, which come as multiple updates at the same LSN, the Command::Update
+    // is needed before the Command::BranchFrom which creates a branch at that point. this is
+    // handled by the variant order in `Command`.
+    updates.sort_unstable();
+
+    let retention_period = match max_cutoff_distance {
+        Some(max) => max.0,
+        None => {
+            anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0")
+        }
+    };
+
+    Ok(ModelInputs {
+        updates,
+        retention_period,
+        timeline_inputs,
+    })
+}
+
+impl ModelInputs {
+    pub fn calculate(&self) -> anyhow::Result<u64> {
+        // Option<TimelineId> is used for "naming" the branches because it is assumed to be
+        // impossible to always determine a single main branch.
+        let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
+
+        for update in &self.updates {
+            let Update {
+                lsn,
+                command: op,
+                timeline_id,
+            } = update;
+            let Lsn(now) = *lsn;
+            match op {
+                Command::Update(sz) => {
+                    storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz));
+                }
+                Command::EndOfBranch => {
+                    storage.insert_point(&Some(*timeline_id), "".into(), now, None);
+                }
+                Command::BranchFrom(parent) => {
+                    storage.branch(parent, Some(*timeline_id));
+                }
+            }
+        }
+
+        Ok(storage.calculate(self.retention_period).total_children())
+    }
+}
+
+/// A point of interest in the tree of branches
+#[serde_with::serde_as]
+#[derive(
+    Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
+)]
+struct Update {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    lsn: utils::lsn::Lsn,
+    command: Command,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    timeline_id: TimelineId,
+}
+
+#[serde_with::serde_as]
+#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "snake_case")]
+enum Command {
+    Update(u64),
+    BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
+    EndOfBranch,
+}
+
+impl std::fmt::Debug for Command {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3
+        // linebreaks
+        match self {
+            Self::Update(arg0) => write!(f, "Update({arg0})"),
+            Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
+            Self::EndOfBranch => write!(f, "EndOfBranch"),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+enum LsnKind {
+    BranchPoint,
+    GcCutOff,
+}
+
+/// Newtype around the tuple that carries the timeline at lsn logical size calculation.
+struct TimelineAtLsnSizeResult(
+    Arc<crate::tenant::Timeline>,
+    utils::lsn::Lsn,
+    anyhow::Result<u64>,
+);
+
+#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
+async fn calculate_logical_size(
+    limit: Arc<tokio::sync::Semaphore>,
+    timeline: Arc<crate::tenant::Timeline>,
+    lsn: utils::lsn::Lsn,
+) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
+    let permit = tokio::sync::Semaphore::acquire_owned(limit)
+        .await
+        .expect("global semaphore should not had been closed");
+
+    tokio::task::spawn_blocking(move || {
+        let _permit = permit;
+        let size_res = timeline.calculate_logical_size(lsn);
+        TimelineAtLsnSizeResult(timeline, lsn, size_res)
+    })
+    .await
+}
+
+#[test]
+fn updates_sort() {
+    use std::str::FromStr;
+    use utils::id::TimelineId;
+    use utils::lsn::Lsn;
+
+    let ids = [
+        TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(),
+        TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(),
+        TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(),
+    ];
+
+    // try through all permutations
+    let ids = [
+        [&ids[0], &ids[1], &ids[2]],
+        [&ids[0], &ids[2], &ids[1]],
+        [&ids[1], &ids[0], &ids[2]],
+        [&ids[1], &ids[2], &ids[0]],
+        [&ids[2], &ids[0], &ids[1]],
+        [&ids[2], &ids[1], &ids[0]],
+    ];
+
+    for ids in ids {
+        // apply a fixture which uses a permutation of ids
+        let commands = [
+            Update {
+                lsn: Lsn(0),
+                command: Command::BranchFrom(None),
+                timeline_id: *ids[0],
+            },
+            Update {
+                lsn: Lsn::from_str("0/67E7618").unwrap(),
+                command: Command::Update(43696128),
+                timeline_id: *ids[0],
+            },
+            Update {
+                lsn: Lsn::from_str("0/67E7618").unwrap(),
+                command: Command::BranchFrom(Some(*ids[0])),
+                timeline_id: *ids[1],
+            },
+            Update {
+                lsn: Lsn::from_str("0/76BE4F0").unwrap(),
+                command: Command::Update(41844736),
+                timeline_id: *ids[1],
+            },
+            Update {
+                lsn: Lsn::from_str("0/10E49380").unwrap(),
+                command: Command::Update(42164224),
+                timeline_id: *ids[0],
+            },
+            Update {
+                lsn: Lsn::from_str("0/10E49380").unwrap(),
+                command: Command::BranchFrom(Some(*ids[0])),
+                timeline_id: *ids[2],
+            },
+            Update {
+                lsn: Lsn::from_str("0/11D74910").unwrap(),
+                command: Command::Update(42172416),
+                timeline_id: *ids[2],
+            },
+            Update {
+                lsn: Lsn::from_str("0/12051E98").unwrap(),
+                command: Command::Update(42196992),
+                timeline_id: *ids[0],
+            },
+        ];
+
+        let mut sorted = commands;
+
+        // these must sort in the same order, regardless of how the ids sort
+        // which is why the timeline_id is the last field
+        sorted.sort_unstable();
+
+        assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted);
+    }
+}
+
+#[test]
+fn verify_size_for_multiple_branches() {
+    // this is generated from integration test test_tenant_size_with_multiple_branches, but this way
+    // it has the stable lsn's
+    let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
+
+    let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
+
+    assert_eq!(inputs.calculate().unwrap(), 36_409_872);
+}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -16,7 +16,7 @@ use std::fs;
 use std::ops::{Deref, Range};
 use std::path::PathBuf;
 use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
-use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
+use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::{Duration, Instant, SystemTime};

 use crate::tenant::{
@@ -34,7 +34,6 @@ use crate::tenant::{
 use crate::config::{PageServerConf, METADATA_FILE_NAME};
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::TimelineMetrics;
-use crate::page_image_cache;
 use crate::pgdatadir_mapping::BlockNumber;
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
@@ -62,6 +61,13 @@ use crate::{
    storage_sync::{self, index::LayerFileMetadata},
 };

+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum FlushLoopState {
+    NotStarted,
+    Running,
+    Exited,
+}
+
 pub struct Timeline {
    conf: &'static PageServerConf,
    tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -122,8 +128,16 @@ pub struct Timeline {
    /// to avoid deadlock.
    write_lock: Mutex<()>,

-    /// Used to ensure that there is only task performing flushing at a time
-    layer_flush_lock: Mutex<()>,
+    /// Used to avoid multiple `flush_loop` tasks running
+    flush_loop_state: Mutex<FlushLoopState>,
+
+    /// layer_flush_start_tx can be used to wake up the layer-flushing task.
+    /// The value is a counter, incremented every time a new flush cycle is requested.
+    /// The flush cycle counter is sent back on the layer_flush_done channel when
+    /// the flush finishes. You can use that to wait for the flush to finish.
+    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
+    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
+    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,

    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -273,6 +287,11 @@ impl LogicalSize {
        self.size_added_after_initial
            .fetch_add(delta, AtomicOrdering::SeqCst);
    }
+
+    /// Returns the initialized (already calculated) value, if any.
+    fn initialized_size(&self) -> Option<u64> {
+        self.initial_logical_size.get().copied()
+    }
 }

 pub struct WalReceiverInfo {
@@ -462,15 +481,16 @@ impl Timeline {
    ///
    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
    /// know anything about them here in the repository.
-    pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
+    #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
+    pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
        match cconf {
            CheckpointConfig::Flush => {
                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers(true)
+                self.flush_frozen_layers_and_wait().await
            }
            CheckpointConfig::Forced => {
                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers(true)?;
+                self.flush_frozen_layers_and_wait().await?;
                self.compact()
            }
        }
@@ -620,24 +640,8 @@ impl Timeline {
                self.last_freeze_at.store(last_lsn);
                *(self.last_freeze_ts.write().unwrap()) = Instant::now();

-                // Launch a task to flush the frozen layer to disk, unless
-                // a task was already running. (If the task was running
-                // at the time that we froze the layer, it must've seen the
-                // the layer we just froze before it exited; see comments
-                // in flush_frozen_layers())
-                if let Ok(guard) = self.layer_flush_lock.try_lock() {
-                    drop(guard);
-                    let self_clone = Arc::clone(self);
-                    task_mgr::spawn(
-                        task_mgr::BACKGROUND_RUNTIME.handle(),
-                        task_mgr::TaskKind::LayerFlushTask,
-                        Some(self.tenant_id),
-                        Some(self.timeline_id),
-                        "layer flush task",
-                        false,
-                        async move { self_clone.flush_frozen_layers(false) },
-                    );
-                }
+                // Wake up the layer flusher
+                self.flush_frozen_layers();
            }
        }
        Ok(())
@@ -728,6 +732,9 @@ impl Timeline {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
        let (state, _) = watch::channel(TimelineState::Suspended);

+        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
+        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
+
        let mut result = Timeline {
            conf,
            tenant_conf,
@@ -755,8 +762,12 @@ impl Timeline {

            upload_layers: AtomicBool::new(upload_layers),

+            flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
+
+            layer_flush_start_tx,
+            layer_flush_done_tx,
+
            write_lock: Mutex::new(()),
-            layer_flush_lock: Mutex::new(()),
            layer_removal_cs: Mutex::new(()),

            gc_info: RwLock::new(GcInfo {
@@ -789,6 +800,48 @@ impl Timeline {
        result
    }

+    pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
+        let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
+        match *flush_loop_state {
+            FlushLoopState::NotStarted => (),
+            FlushLoopState::Running => {
+                info!(
+                    "skipping attempt to start flush_loop twice {}/{}",
+                    self.tenant_id, self.timeline_id
+                );
+                return;
+            }
+            FlushLoopState::Exited => {
+                warn!(
+                    "ignoring attempt to restart exited flush_loop {}/{}",
+                    self.tenant_id, self.timeline_id
+                );
+                return;
+            }
+        }
+
+        let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
+        let self_clone = Arc::clone(self);
+        info!("spawning flush loop");
+        task_mgr::spawn(
+                    task_mgr::BACKGROUND_RUNTIME.handle(),
+                    task_mgr::TaskKind::LayerFlushTask,
+                    Some(self.tenant_id),
+                    Some(self.timeline_id),
+                    "layer flush task",
+                    false,
+                    async move {
+                         self_clone.flush_loop(layer_flush_start_rx).await;
+                         let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
+                         assert_eq!(*flush_loop_state, FlushLoopState::Running);
+                         *flush_loop_state  = FlushLoopState::Exited;
+                         Ok(()) }
+                    .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
+                );
+
+        *flush_loop_state = FlushLoopState::Running;
+    }
+
    pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
        if !is_etcd_client_initialized() {
            if cfg!(test) {
@@ -980,9 +1033,26 @@ impl Timeline {
    /// Calculate the logical size of the database at the latest LSN.
    ///
    /// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
-    fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
-        info!("Calculating logical size for timeline {}", self.timeline_id);
-        let timer = self.metrics.init_logical_size_histo.start_timer();
+    pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
+        info!(
+            "Calculating logical size for timeline {} at {}",
+            self.timeline_id, up_to_lsn
+        );
+        let timer = if up_to_lsn == self.initdb_lsn {
+            if let Some(size) = self.current_logical_size.initialized_size() {
+                if size != 0 {
+                    // non-zero size means that the size has already been calculated by this method
+                    // after startup. if the logical size is for a new timeline without layers the
+                    // size will be zero, and we cannot use that, or this caching strategy until
+                    // pageserver restart.
+                    return Ok(size);
+                }
+            }
+
+            self.metrics.init_logical_size_histo.start_timer()
+        } else {
+            self.metrics.logical_size_histo.start_timer()
+        };
        let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?;
        debug!("calculated logical size: {logical_size}");
        timer.stop_and_record();
@@ -1268,53 +1338,95 @@ impl Timeline {
        drop(layers);
    }

-    /// Flush all frozen layers to disk.
-    ///
-    /// Only one task at a time can be doing layer-flushing for a
-    /// given timeline. If 'wait' is true, and another task is
-    /// currently doing the flushing, this function will wait for it
-    /// to finish. If 'wait' is false, this function will return
-    /// immediately instead.
-    fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> {
-        let flush_lock_guard = if wait {
-            self.layer_flush_lock.lock().unwrap()
-        } else {
-            match self.layer_flush_lock.try_lock() {
-                Ok(guard) => guard,
-                Err(TryLockError::WouldBlock) => return Ok(()),
-                Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
-            }
-        };
-
-        let timer = self.metrics.flush_time_histo.start_timer();
-
+    /// Layer flusher task's main loop.
+    ///
+    /// Each iteration waits until either `task_mgr::shutdown_watcher()` completes
+    /// (the loop then exits) or `layer_flush_start_rx` reports a change. After a
+    /// wake-up it flushes frozen layers from the front of `frozen_layers` one at
+    /// a time until none are left or a flush fails, then publishes
+    /// `(flush_counter, result)` on `layer_flush_done_tx`.
+    async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>) {
+        info!("started flush loop");
        loop {
-            let layers = self.layers.read().unwrap();
-            if let Some(frozen_layer) = layers.frozen_layers.front() {
-                let frozen_layer = Arc::clone(frozen_layer);
-                drop(layers); // to allow concurrent reads and writes
-                self.flush_frozen_layer(frozen_layer)?;
-            } else {
-                // Drop the 'layer_flush_lock' *before* 'layers'. That
-                // way, if you freeze a layer, and then call
-                // flush_frozen_layers(false), it is guaranteed that
-                // if another thread was busy flushing layers and the
-                // call therefore returns immediately, the other
-                // thread will have seen the newly-frozen layer and
-                // will flush that too (assuming no errors).
-                drop(flush_lock_guard);
-                drop(layers);
-                break;
+            tokio::select! {
+                _ = task_mgr::shutdown_watcher() => {
+                    info!("shutting down layer flush task");
+                    break;
+                },
+                // The result of `changed()` is discarded: any completion of
+                // this branch is treated as a wake-up.
+                _ = layer_flush_start_rx.changed() => {}
            }
+
+            trace!("waking up");
+            let timer = self.metrics.flush_time_histo.start_timer();
+            // Read the request counter before flushing; this is the value
+            // reported back together with the result below.
+            let flush_counter = *layer_flush_start_rx.borrow();
+            let result = loop {
+                let layer_to_flush = {
+                    let layers = self.layers.read().unwrap();
+                    layers.frozen_layers.front().cloned()
+                    // drop 'layers' lock to allow concurrent reads and writes
+                };
+                if let Some(layer_to_flush) = layer_to_flush {
+                    if let Err(err) = self.flush_frozen_layer(layer_to_flush).await {
+                        error!("could not flush frozen layer: {err:?}");
+                        break Err(err);
+                    }
+                    continue;
+                } else {
+                    break Ok(());
+                }
+            };
+            // Notify any listeners that we're done
+            let _ = self
+                .layer_flush_done_tx
+                .send_replace((flush_counter, result));
+
+            timer.stop_and_record();
+        }
+    }
+
+    /// Requests a flush of the frozen layers and waits for it to complete.
+    ///
+    /// Bumps the counter on `layer_flush_start_tx`, then watches
+    /// `layer_flush_done_tx` until a result tagged with a counter at least as
+    /// large as the one produced by this call shows up.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `flush_loop_state` is not `FlushLoopState::Running`, if the
+    /// observed flush result is an error, or if `rx.changed()` returns an error.
+    async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
+        let mut rx = self.layer_flush_done_tx.subscribe();
+
+        // Increment the flush cycle counter and wake up the flush task.
+        // Remember the new value, so that when we listen for the flush
+        // to finish, we know when the flush that we initiated has
+        // finished, instead of some other flush that was started earlier.
+        let mut my_flush_request = 0;
+
+        let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
+        if flush_loop_state != FlushLoopState::Running {
+            anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
        }

-        timer.stop_and_record();
+        self.layer_flush_start_tx.send_modify(|counter| {
+            my_flush_request = *counter + 1;
+            *counter = my_flush_request;
+        });

-        Ok(())
+        loop {
+            // The inner block scopes the `rx.borrow()` guard so that it is
+            // released before the `rx.changed().await` below.
+            {
+                let (last_result_counter, last_result) = &*rx.borrow();
+                if *last_result_counter >= my_flush_request {
+                    if let Err(_err) = last_result {
+                        // We already logged the original error in
+                        // flush_loop. We cannot propagate it to the caller
+                        // here, because it might not be Cloneable
+                        anyhow::bail!(
+                            "Could not flush frozen layer. Request id: {}",
+                            my_flush_request
+                        );
+                    } else {
+                        return Ok(());
+                    }
+                }
+            }
+            trace!("waiting for flush to complete");
+            rx.changed().await?;
+            trace!("done")
+        }
+    }
+
+    /// Bumps the counter on `layer_flush_start_tx` and returns without
+    /// waiting for any flush result.
+    fn flush_frozen_layers(&self) {
+        self.layer_flush_start_tx.send_modify(|val| *val += 1);
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
+    #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))]
+    async fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -1542,6 +1654,10 @@ impl Timeline {
                    lsn,
                )?;

+                fail_point!("image-layer-writer-fail-before-finish", |_| {
+                    anyhow::bail!("failpoint image-layer-writer-fail-before-finish");
+                });
+
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
@@ -1836,6 +1952,11 @@ impl Timeline {
                    },
                )?);
            }
+
+            fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
+            });
+
            writer.as_mut().unwrap().put_value(key, lsn, value)?;
            prev_key = Some(key);
        }
@@ -2235,13 +2356,10 @@ impl Timeline {

                let last_rec_lsn = data.records.last().unwrap().0;

-                let img = self.walredo_mgr.request_redo(
-                    key,
-                    request_lsn,
-                    base_img,
-                    data.records,
-                    self.pg_version,
-                )?;
+                let img = self
+                    .walredo_mgr
+                    .request_redo(key, request_lsn, base_img, data.records, self.pg_version)
+                    .context("Failed to reconstruct a page image:")?;

                if img.len() == page_cache::PAGE_SZ {
                    let cache = page_cache::get();
@@ -2316,7 +2434,6 @@ impl<'a> TimelineWriter<'a> {
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
    pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
-        page_image_cache::remove(key, self.tenant_id, self.timeline_id);
        self.tl.put_value(key, lsn, value)
    }

--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -82,6 +82,7 @@ pub struct TenantConf {
    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
    /// to avoid eager reconnects.
    pub max_lsn_wal_lag: NonZeroU64,
+    pub trace_read_requests: bool,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -105,6 +106,7 @@ pub struct TenantConfOpt {
    #[serde(with = "humantime_serde")]
    pub lagging_wal_timeout: Option<Duration>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
 }

 impl TenantConfOpt {
@@ -138,6 +140,9 @@ impl TenantConfOpt {
                .lagging_wal_timeout
                .unwrap_or(global_conf.lagging_wal_timeout),
            max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
+            trace_read_requests: self
+                .trace_read_requests
+                .unwrap_or(global_conf.trace_read_requests),
        }
    }

@@ -207,10 +212,10 @@ impl TenantConf {
                .expect("cannot parse default walreceiver lagging wal timeout"),
            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                .expect("cannot parse default max walreceiver Lsn wal lag"),
+            trace_read_requests: false,
        }
    }

-    #[cfg(test)]
    pub fn dummy_conf() -> Self {
        TenantConf {
            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
@@ -232,6 +237,7 @@ impl TenantConf {
            .unwrap(),
            max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                .unwrap(),
+            trace_read_requests: false,
        }
    }
 }
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -241,7 +241,7 @@ pub async fn shutdown_all_tenants() {
        let tenant_id = tenant.tenant_id();
        debug!("shutdown tenant {tenant_id}");

-        if let Err(err) = tenant.checkpoint() {
+        if let Err(err) = tenant.checkpoint().await {
            error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
        }
    }
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant_tasks.rs
@@ -71,9 +71,7 @@ async fn compaction_loop(tenant_id: TenantId) {
            let mut sleep_duration = tenant.get_compaction_period();
            if let Err(e) = tenant.compaction_iteration() {
                sleep_duration = wait_duration;
-                error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
-                #[cfg(feature = "testing")]
-                std::process::abort();
+                error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
            }

            // Sleep
@@ -119,12 +117,10 @@ async fn gc_loop(tenant_id: TenantId) {
            let gc_horizon = tenant.get_gc_horizon();
            let mut sleep_duration = gc_period;
            if gc_horizon > 0 {
-                if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
+                if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
                {
                    sleep_duration = wait_duration;
-                    error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
-                    #[cfg(feature = "testing")]
-                    std::process::abort();
+                    error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
                }
            }

--- a/pageserver/src/trace.rs
+++ b/pageserver/src/trace.rs
@@ -0,0 +1,36 @@
+use bytes::Bytes;
+use std::{
+    fs::{create_dir_all, File},
+    io::{BufWriter, Write},
+    path::PathBuf,
+};
+
+/// Appends raw byte messages to a trace file through a buffered writer.
+///
+/// Buffered data is flushed when the tracer is dropped; call
+/// [`Tracer::flush`] to force it out earlier.
+pub struct Tracer {
+    writer: BufWriter<File>,
+}
+
+impl Drop for Tracer {
+    fn drop(&mut self) {
+        if std::thread::panicking() {
+            // A second panic while the thread is already unwinding aborts the
+            // whole process, so on this path the flush is best-effort and its
+            // error is discarded.
+            let _ = self.writer.flush();
+        } else {
+            self.flush();
+        }
+    }
+}
+
+impl Tracer {
+    /// Creates (or truncates) the trace file at `path`, creating any missing
+    /// parent directories first.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `path` has no parent component, or if the directory or the
+    /// file cannot be created.
+    pub fn new(path: PathBuf) -> Self {
+        let parent = path.parent().expect("failed to parse parent path");
+        create_dir_all(parent).expect("failed to create trace dir");
+
+        let file = File::create(path).expect("failed to create trace file");
+        Tracer {
+            writer: BufWriter::new(file),
+        }
+    }
+
+    /// Appends `msg` verbatim to the buffered writer; no delimiter or framing
+    /// is added.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the write fails.
+    pub fn trace(&mut self, msg: &Bytes) {
+        self.writer.write_all(msg).expect("failed to write trace");
+    }
+
+    /// Flushes buffered data to the underlying file.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the flush fails.
+    pub fn flush(&mut self) {
+        self.writer.flush().expect("failed to flush trace file");
+    }
+}
--- a/Show More
+++ b/Show More