WIP

test_forward_compatibility: fix path to pg_distrib_dir (#2826)
Set correct `pg_distrib_dir` in `pageserver.toml` and in neon_local `config`.

`test_forward_compatibility` shows flakiness during `neon_local pg start`, so hopefully the patch will help.

```
2022-11-15 16:07:34.091 GMT [13338] LOG: starting with zenith basebackup at LSN 0/A6A9310, prev 0/0
2022-11-15 16:07:34.091 GMT [13338] FATAL: cannot start in read-write mode from this base backup
2022-11-15 16:07:34.091 GMT [13337] LOG: startup process (PID 13338) exited with exit code 1
```
2026-07-21 21:10:38 +00:00 · 2022-11-16 11:40:39 -05:00 · 2022-11-16 15:14:36 +00:00 · 2022-11-16 15:10:36 +00:00 · 2022-11-16 15:50:49 +02:00 · 2022-11-16 16:54:55 +04:00
178 changed files with 8609 additions and 2457 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -55,6 +55,22 @@ runs:
        name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon

+    - name: Download Neon binaries for the previous release
+      if: inputs.build_type != 'remote'
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
+        path: /tmp/neon-previous
+        prefix: latest
+
+    - name: Download compatibility snapshot for Postgres 14
+      if: inputs.build_type != 'remote'
+      uses: ./.github/actions/download
+      with:
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
+        path: /tmp/compatibility_snapshot_pg14
+        prefix: latest
+
    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
      uses: actions/checkout@v3
@@ -73,22 +89,18 @@ runs:
      shell: bash -euxo pipefail {0}
      run: ./scripts/pysync

-    - name: Download compatibility snapshot for Postgres 14
-      uses: ./.github/actions/download
-      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
-        path: /tmp/compatibility_snapshot_pg14
-        prefix: latest
-
    - name: Run pytest
      env:
        NEON_BIN: /tmp/neon/bin
+        COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin
+        COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
        TEST_OUTPUT: /tmp/test_output
        BUILD_TYPE: ${{ inputs.build_type }}
        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
-        ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
+        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
+        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
      shell: bash -euxo pipefail {0}
      run: |
        # PLATFORM will be embedded in the perf test report
@@ -111,7 +123,12 @@ runs:
          exit 1
        fi
        if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
+          # -n4 uses four processes to run tests via pytest-xdist
          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
+
+          # --dist=loadgroup points tests marked with @pytest.mark.xdist_group
+          # to the same worker to make @pytest.mark.order work with xdist
+          EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS"
        fi

        if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
@@ -146,9 +163,9 @@ runs:
        # --verbose prints name of each test (helpful when there are
        # multiple tests in one file)
        # -rA prints summary in the end
-        # -n4 uses four processes to run tests via pytest-xdist
        # -s is not used to prevent pytest from capturing output, because tests are running
        # in parallel and logs are mixed between different tests
+        #
        mkdir -p $TEST_OUTPUT/allure/results
        "${cov_prefix[@]}" ./scripts/pytest \
          --junitxml=$TEST_OUTPUT/junit.xml \
@@ -168,12 +185,12 @@ runs:
      uses: ./.github/actions/upload
      with:
        name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
-        # The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
-        path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
+        # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
+        path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
        prefix: latest

    - name: Create Allure report
-      if: always()
+      if: success() || failure()
      uses: ./.github/actions/allure-report
      with:
        action: store
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -0,0 +1,35 @@
+storage:
+  vars:
+    bucket_name: neon-prod-storage-ap-southeast-1
+    bucket_region: ap-southeast-1
+    console_mgmt_base_url: http://console-release.local
+    etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: ap-southeast-1
+    ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
+    console_region_id: aws-ap-southeast-1
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-064de8ea28bdb495b
+        pageserver-1.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-0b180defcaeeb6b93
+
+    safekeepers:
+      hosts:
+        safekeeper-0.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-0d6f1dc5161eef894
+        safekeeper-1.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-0e338adda8eb2d19f
+        safekeeper-2.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-04fb63634e4679eb9
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -0,0 +1,35 @@
+storage:
+  vars:
+    bucket_name: neon-prod-storage-eu-central-1
+    bucket_region: eu-central-1
+    console_mgmt_base_url: http://console-release.local
+    etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: eu-central-1
+    ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
+    console_region_id: aws-eu-central-1
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.eu-central-1.aws.neon.tech:
+          ansible_host:  i-0cd8d316ecbb715be
+        pageserver-1.eu-central-1.aws.neon.tech:
+          ansible_host:  i-090044ed3d383fef0
+
+    safekeepers:
+      hosts:
+        safekeeper-0.eu-central-1.aws.neon.tech:
+          ansible_host:  i-0b238612d2318a050
+        safekeeper-1.eu-central-1.aws.neon.tech:
+          ansible_host:  i-07b9c45e5c2637cd4
+        safekeeper-2.eu-central-1.aws.neon.tech:
+          ansible_host:  i-020257302c3c93d88
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ b/.github/ansible/prod.us-east-2.hosts.yaml
@@ -0,0 +1,36 @@
+storage:
+  vars:
+    bucket_name: neon-prod-storage-us-east-2
+    bucket_region: us-east-2
+    console_mgmt_base_url: http://console-release.local
+    etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: us-east-2
+    ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
+    console_region_id: aws-us-east-2
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.us-east-2.aws.neon.tech:
+          ansible_host:  i-062227ba7f119eb8c
+        pageserver-1.us-east-2.aws.neon.tech:
+          ansible_host:  i-0b3ec0afab5968938
+
+    safekeepers:
+      hosts:
+        safekeeper-0.us-east-2.aws.neon.tech:
+          ansible_host:  i-0e94224750c57d346
+        safekeeper-1.us-east-2.aws.neon.tech:
+          ansible_host:  i-06d113fb73bfddeb0
+        safekeeper-2.us-east-2.aws.neon.tech:
+          ansible_host:  i-09f66c8e04afff2e8
+          
--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -22,6 +22,10 @@ storage:
          console_region_id: aws-us-west-2
        zenith-1-ps-3:
          console_region_id: aws-us-west-2
+        zenith-1-ps-4:
+          console_region_id: aws-us-west-2
+        zenith-1-ps-5:
+          console_region_id: aws-us-west-2

    safekeepers:
      hosts:
--- a/.github/ansible/ssm_config
+++ b/.github/ansible/ssm_config
@@ -1,3 +1,2 @@
 ansible_connection: aws_ssm
-ansible_aws_ssm_bucket_name: neon-dev-bucket
 ansible_python_interpreter: /usr/bin/python3
--- a/.github/ansible/staging.hosts.yaml
+++ b/.github/ansible/staging.hosts.yaml
@@ -3,7 +3,7 @@ storage:
    bucket_name: zenith-staging-storage-us-east-1
    bucket_region: us-east-1
    console_mgmt_base_url: http://console-staging.local
-    etcd_endpoints: zenith-us-stage-etcd.local:2379
+    etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      remote_storage:
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -14,6 +14,7 @@ storage:
    hostname_suffix: ""
    remote_user: ssm-user
    ansible_aws_ssm_region: us-east-2
+    ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
    console_region_id: aws-us-east-2

  children:
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-release.local/management/api/v2"
+  domain: "*.ap-southeast-1.aws.neon.tech"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: prod
+  zenith_region: ap-southeast-1
+  zenith_region_slug: ap-southeast-1
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-release.local/management/api/v2"
+  domain: "*.eu-central-1.aws.neon.tech"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: prod
+  zenith_region: eu-central-1
+  zenith_region_slug: eu-central-1
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-release.local/management/api/v2"
+  domain: "*.us-east-2.aws.neon.tech"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: prod
+  zenith_region: us-east-2
+  zenith_region_slug: us-east-2
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -164,7 +164,7 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
      PLATFORM: ${{ matrix.platform }}

-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
      options: --init
@@ -265,7 +265,7 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

    - name: Create Allure report
-      if: always()
+      if: success() || failure()
      uses: ./.github/actions/allure-report
      with:
        action: generate
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -18,8 +18,8 @@ env:

 jobs:
  tag:
-    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
+    runs-on: [ self-hosted, dev, x64 ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
      build-tag: ${{steps.build-tag.outputs.tag}}

@@ -46,7 +46,7 @@ jobs:
        id: build-tag

  build-neon:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -127,8 +127,8 @@ jobs:
            target/
          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
          key: |
-            v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
-            v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
+            v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
+            v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-

      - name: Cache postgres v14 build
        id: cache_pg_14
@@ -236,7 +236,7 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -268,34 +268,8 @@ jobs:
        if: matrix.build_type == 'debug'
        uses: ./.github/actions/save-coverage-data

-  upload-latest-artifacts:
-    runs-on: dev
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
-      options: --init
-    needs: [ regress-tests ]
-    if: github.ref_name == 'main'
-    steps:
-      - name: Copy Neon artifact to the latest directory
-        shell: bash -euxo pipefail {0}
-        env:
-          BUCKET: neon-github-public-dev
-          PREFIX: artifacts/${{ github.run_id }}
-        run: |
-          for build_type in debug release; do
-            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
-
-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
-            if [ -z "${S3_KEY}" ]; then
-              echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
-              exit 1
-            fi
-
-            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
-          done
-
  benchmarks:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -326,12 +300,12 @@ jobs:
      # while coverage is currently collected for the debug ones

  merge-allure-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    needs: [ regress-tests, benchmarks ]
-    if: always()
+    if: success() || failure()
    strategy:
      fail-fast: false
      matrix:
@@ -364,7 +338,7 @@ jobs:
          DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json

  coverage-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -389,7 +363,7 @@ jobs:
            !~/.cargo/registry/src
            ~/.cargo/git/
            target/
-          key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
+          key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}

      - name: Get Neon artifact
        uses: ./.github/actions/download
@@ -441,15 +415,19 @@ jobs:
        shell: bash -euxo pipefail {0}

  trigger-e2e-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ build-neon ]
+    needs: [ push-docker-hub, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
+          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
+          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
+          # to place a job run status update later.
          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}

          REMOTE_REPO="${{ github.repository_owner }}/cloud"
@@ -475,12 +453,14 @@ jobs:
              \"inputs\": {
                \"ci_job_name\": \"neon-cloud-e2e\",
                \"commit_hash\": \"$COMMIT_SHA\",
-                \"remote_repo\": \"${{ github.repository }}\"
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
              }
            }"

  neon-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -498,7 +478,7 @@ jobs:
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}

  compute-tools-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -512,28 +492,8 @@ jobs:
      - name: Kaniko build compute tools
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}

-  compute-node-image:
-    runs-on: dev
-    container: gcr.io/kaniko-project/executor:v1.9.0-debug
-    needs: [ tag ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-        # compute-node uses postgres 14, which is default now
-        # cloud repo depends on this image name, thus duplicating it
-        # remove compute-node when cloud repo is updated
-      - name: Kaniko build compute node with extensions v14 (compatibility)
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}
-
  compute-node-image-v14:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -549,9 +509,8 @@ jobs:
      - name: Kaniko build compute node with extensions v14
        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}

-
  compute-node-image-v15:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -567,18 +526,58 @@ jobs:
      - name: Kaniko build compute node with extensions v15
        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}

+  test-images:
+    needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    runs-on: [ self-hosted, dev, x64 ]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      # The `neondatabase/neon` image contains multiple binaries; all of them pass the same version input to the same version formatting library.
+      # Pick pageserver, as it is currently the only binary that prints extra "version" features in its version string, which we can verify.
+      # A regular pageserver version string looks like:
+      #   Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
+      # Bad versions might look like:
+      #   Neon page server git-env:local failpoints: true, features: ["testing"]
+      # Ensure that we don't have bad versions.
+      - name: Verify image versions
+        shell: bash # NOTE(review): GitHub appears to run an explicit "bash" shell with -eo pipefail, so -e is likely still on; confirm intent
+        run: |
+          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+
+          echo "Pageserver version string: $pageserver_version"
+
+          if [[ "$pageserver_version" == *'git-env:local'* ]]; then  # substring match: catches the marker even in multi-line output
+            echo "Pageserver version should not be the default Dockerfile one"
+            exit 1
+          fi
+
+          if [[ "$pageserver_version" == *'"testing"'* ]]; then  # substring match: catches the marker even in multi-line output
+            echo "Pageserver version should have no testing feature enabled"
+            exit 1
+          fi
+
+      - name: Verify docker-compose example
+        run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
+
+      - name: Print logs and clean up
+        if: always()
+        run: |
+          docker compose -f ./docker-compose/docker-compose.yml logs || true  # best effort: a failed `logs` must not abort the step before `down`
+          docker compose -f ./docker-compose/docker-compose.yml down
+
  promote-images:
-    runs-on: dev
-    needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    runs-on: [ self-hosted, dev, x64 ]
+    needs: [ tag, test-images ]
    if: github.event_name != 'workflow_dispatch'
    container: amazon/aws-cli
    strategy:
      fail-fast: false
      matrix:
-        # compute-node uses postgres 14, which is default now
-        # cloud repo depends on this image name, thus duplicating it
-        # remove compute-node when cloud repo is updated
-        name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
+        name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]

    steps:
      - name: Promote image to latest
@@ -587,7 +586,7 @@ jobs:
          aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"

  push-docker-hub:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ promote-images, tag ]
    container: golang:1.19-bullseye

@@ -608,9 +607,6 @@ jobs:
      - name: Pull compute tools image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools

-      - name: Pull compute node image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node
-
      - name: Pull compute node v14 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14

@@ -625,11 +621,10 @@ jobs:
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'
        run: |
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -643,9 +638,6 @@ jobs:
      - name: Push compute tools image to Docker Hub
        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}

-      - name: Push compute node image to Docker Hub
-        run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
-
      - name: Push compute node v14 image to Docker Hub
        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}

@@ -662,7 +654,6 @@ jobs:
        run: |
          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest

@@ -745,7 +736,7 @@ jobs:
          rm -f neon_install.tar.gz .neon_current_version

  deploy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
@@ -756,9 +747,9 @@ jobs:
    defaults:
      run:
        shell: bash
-    env:
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+    strategy:
+      matrix:
+        target_region: [ us-east-2 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -781,11 +772,51 @@ jobs:
          fi

          ansible-galaxy collection install sivel.toiletwater
-          ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
+          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
+          rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-prod-new:
+    runs-on: prod
+    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
+    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
+    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    if: |
+      (github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Redeploy
+        run: |
+          export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
+          cd "$(pwd)/.github/ansible"
+
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            ./get_binaries.sh
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            RELEASE=true ./get_binaries.sh
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+
+          ansible-galaxy collection install sivel.toiletwater
+          ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
          rm -f neon_install.tar.gz .neon_current_version

  deploy-proxy:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -827,7 +858,7 @@ jobs:
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

  deploy-proxy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -837,6 +868,11 @@ jobs:
    defaults:
      run:
        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  us-east-2
+            target_cluster: dev-us-east-2-beta
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -847,15 +883,52 @@ jobs:
      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
-          aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}

      - name: Re-deploy proxy
        run: |
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

-  promote-compatibility-test-snapshot:
-    runs-on: dev
+  deploy-proxy-prod-new:
+    runs-on: prod
+    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
+    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    if: |
+      (github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  us-east-2
+            target_cluster: prod-us-east-2-delta
+          - target_region: eu-central-1
+            target_cluster: prod-eu-central-1-gamma
+          - target_region: ap-southeast-1
+            target_cluster: prod-ap-southeast-1-epsilon
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
+
+      - name: Re-deploy proxy
+        run: |
+          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+
+  promote-compatibility-data:
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -868,9 +941,24 @@ jobs:
          BUCKET: neon-github-public-dev
          PREFIX: artifacts/latest
        run: |
+          # Update compatibility snapshot for the release
          for build_type in debug release; do
            OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
            NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst

            time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
          done
+
+          # Update Neon artifact for the release (reuse already uploaded artifact)
+          for build_type in debug release; do
+            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
+            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
+
+            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+            if [ -z "${S3_KEY}" ]; then
+              # Report the failure on stderr (`>&2`); `2>&1` would only point stderr at stdout.
+              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
+              exit 1
+            fi
+
+            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
+          done
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -106,7 +106,7 @@ jobs:
            !~/.cargo/registry/src
            ~/.cargo/git
            target
-          key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
+          key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust

      - name: Run cargo clippy
        run: ./run_clippy.sh
@@ -115,7 +115,7 @@ jobs:
        run: cargo build --locked --all --all-targets

  check-rust-dependencies:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,7 @@
 [submodule "vendor/postgres-v14"]
 	path = vendor/postgres-v14
 	url = https://github.com/neondatabase/postgres.git
-	branch = main
+	branch = REL_14_STABLE_neon
 [submodule "vendor/postgres-v15"]
 	path = vendor/postgres-v15
 	url = https://github.com/neondatabase/postgres.git
--- a/10
+++ b/10
@@ -0,0 +1,10 @@
+/compute_tools/ @neondatabase/control-plane
+/control_plane/ @neondatabase/compute @neondatabase/storage
+/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
+/libs/postgres_ffi/ @neondatabase/compute 
+/libs/remote_storage/ @neondatabase/storage 
+/libs/safekeeper_api/ @neondatabase/safekeepers  
+/pageserver/ @neondatabase/compute @neondatabase/storage 
+/pgxn/ @neondatabase/compute
+/proxy/ @neondatabase/control-plane 
+/safekeeper/ @neondatabase/safekeepers
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -317,12 +317,6 @@ dependencies = [
 "generic-array",
 ]

-[[package]]
-name = "boxfnonce"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
-
 [[package]]
 name = "bstr"
 version = "1.0.1"
@@ -600,6 +594,7 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
+ "url",
 "utils",
 "workspace_hack",
 ]
@@ -849,16 +844,6 @@ dependencies = [
 "syn",
 ]

-[[package]]
-name = "daemonize"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
-dependencies = [
- "boxfnonce",
- "libc",
-]
-
 [[package]]
 name = "darling"
 version = "0.14.1"
@@ -2140,7 +2125,6 @@ dependencies = [
 "crc32c",
 "criterion",
 "crossbeam-utils",
- "daemonize",
 "etcd_broker",
 "fail",
 "futures",
@@ -2161,6 +2145,7 @@ dependencies = [
 "postgres-types",
 "postgres_ffi",
 "pprof",
+ "pq_proto",
 "rand",
 "regex",
 "remote_storage",
@@ -2173,6 +2158,7 @@ dependencies = [
 "svg_fmt",
 "tar",
 "tempfile",
+ "tenant_size_model",
 "thiserror",
 "tokio",
 "tokio-postgres",
@@ -2190,6 +2176,7 @@ name = "pageserver_api"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "byteorder",
 "bytes",
 "const_format",
 "postgres_ffi",
@@ -2452,6 +2439,21 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"

+[[package]]
+name = "pq_proto"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bytes",
+ "pin-project-lite",
+ "postgres-protocol",
+ "rand",
+ "serde",
+ "tokio",
+ "tracing",
+ "workspace_hack",
+]
+
 [[package]]
 name = "prettyplease"
 version = "0.1.21"
@@ -2584,6 +2586,7 @@ dependencies = [
 "once_cell",
 "parking_lot 0.12.1",
 "pin-project-lite",
+ "pq_proto",
 "rand",
 "rcgen",
 "reqwest",
@@ -3087,7 +3090,6 @@ dependencies = [
 "clap 4.0.15",
 "const_format",
 "crc32c",
- "daemonize",
 "etcd_broker",
 "fs2",
 "git-version",
@@ -3095,11 +3097,13 @@ dependencies = [
 "humantime",
 "hyper",
 "metrics",
+ "nix 0.25.0",
 "once_cell",
 "parking_lot 0.12.1",
 "postgres",
 "postgres-protocol",
 "postgres_ffi",
+ "pq_proto",
 "regex",
 "remote_storage",
 "safekeeper_api",
@@ -3548,6 +3552,13 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "tenant_size_model"
+version = "0.1.0"
+dependencies = [
+ "workspace_hack",
+]
+
 [[package]]
 name = "termcolor"
 version = "1.1.3"
@@ -4053,9 +4064,7 @@ dependencies = [
 "metrics",
 "nix 0.25.0",
 "once_cell",
- "pin-project-lite",
- "postgres",
- "postgres-protocol",
+ "pq_proto",
 "rand",
 "routerify",
 "rustls",
@@ -4380,6 +4389,9 @@ dependencies = [
 "crossbeam-utils",
 "either",
 "fail",
+ "futures-channel",
+ "futures-task",
+ "futures-util",
 "hashbrown",
 "indexmap",
 "libc",
@@ -4393,6 +4405,7 @@ dependencies = [
 "rand",
 "regex",
 "regex-syntax",
+ "reqwest",
 "scopeguard",
 "serde",
 "stable_deref_trait",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,6 +25,10 @@ members = [
 # Besides, debug info should not affect the performance.
 debug = true

+# Disable debug symbols for all packages except this one to decrease binary size
+[profile.release.package."*"]
+debug = false
+
 [profile.release-line-debug]
 inherits = "release"
 debug = 1 # true = 2 = all symbols, 1 = line only
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -13,7 +13,7 @@ ARG TAG=pinned
 FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
-    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
+    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev

 #########################################################################################
 #
@@ -24,7 +24,7 @@ RUN apt update &&  \
 FROM build-deps AS pg-build
 COPY vendor/postgres-v14 postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
    # Install headers
--- a/Dockerfile.compute-node-v15
+++ b/Dockerfile.compute-node-v15
@@ -13,7 +13,7 @@ ARG TAG=pinned
 FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
-    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
+    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev

 #########################################################################################
 #
@@ -24,7 +24,7 @@ RUN apt update &&  \
 FROM build-deps AS pg-build
 COPY vendor/postgres-v15 postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
    # Install headers
--- a/Dockerfile.compute-node.legacy
+++ b/Dockerfile.compute-node.legacy
@@ -1,88 +0,0 @@
-#
-# Legacy version of the Dockerfile for the compute node.
-# Used by e2e CI. Building Dockerfile.compute-node will take
-# unreasonable ammount of time without v2 runners.
-#
-# TODO: remove once cloud repo CI is moved to v2 runners.
-#
-
-
-# Allow specifiyng different compute-tools tag and image repo, so we are
-# able to use different images
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-ARG IMAGE=compute-tools
-ARG TAG=latest
-
-#
-# Image with pre-built tools
-#
-FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
-# Only to get ready compute_ctl binary as deppendency
-
-#
-# Image with Postgres build deps
-#
-FROM debian:bullseye-slim AS build-deps
-
-RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-                                          libcurl4-openssl-dev libossp-uuid-dev
-
-#
-# Image with built Postgres
-#
-FROM build-deps AS pg-build
-
-# Add user postgres
-RUN adduser postgres
-RUN mkdir /pg && chown postgres:postgres /pg
-
-# Copy source files
-# version 14 is default for now
-COPY ./vendor/postgres-v14 /pg/
-COPY ./pgxn /pg/
-
-# Build and install Postgres locally
-RUN mkdir /pg/compute_build && cd /pg/compute_build && \
-    ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
-    # Install main binaries and contribs
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
-    # Install headers
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
-
-# Install neon contrib
-RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
-
-USER postgres
-WORKDIR /pg
-
-#
-# Final compute node image to be exported
-#
-FROM debian:bullseye-slim
-
-# libreadline-dev is required to run psql
-RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
-
-# Add user postgres
-RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
-    echo "postgres:test_console_pass" | chpasswd && \
-    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
-    chown -R postgres:postgres /var/db/postgres && \
-    chmod 0750 /var/db/postgres/compute
-
-# Copy ready Postgres binaries
-COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
-
-# Copy binaries from compute-tools
-COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
-
-# XXX: temporary symlink for compatibility with old control-plane
-RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
-
-# Add postgres shared objects to the search path
-RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
-
-USER postgres
-
-ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/10
+++ b/10
@@ -151,6 +151,11 @@ neon-pg-ext-v14: postgres-v14
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+	+@echo "Compiling neon_walredo v14"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
+	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
 	+@echo "Compiling neon_test_utils" v14
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
@@ -163,6 +168,11 @@ neon-pg-ext-v15: postgres-v15
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+	+@echo "Compiling neon_walredo v15"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
+	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
 	+@echo "Compiling neon_test_utils" v15
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
--- a/README.md
+++ b/README.md
@@ -223,10 +223,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
 ```sh
 git clone --recursive https://github.com/neondatabase/neon.git

-# either:
 CARGO_BUILD_FLAGS="--features=testing" make
-# or:
-make debug

 ./scripts/pytest
 ```
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -65,7 +65,7 @@ impl GenericOption {
            let name = match self.name.as_str() {
                "safekeepers" => "neon.safekeepers",
                "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
-                "wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
+                "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
                it => it,
            };

--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -4,20 +4,21 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
+anyhow = "1.0"
 clap = "4.0"
 comfy-table = "6.1"
 git-version = "0.3.5"
-tar = "0.4.38"
+nix = "0.25"
+once_cell = "1.13.0"
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+regex = "1"
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_with = "2.0"
-toml = "0.5"
-once_cell = "1.13.0"
-regex = "1"
-anyhow = "1.0"
+tar = "0.4.38"
 thiserror = "1"
-nix = "0.25"
-reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+toml = "0.5"
+url = "2.2.2"

 # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
 # instead, so that recompile times are better.
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -0,0 +1,264 @@
+//! Spawns and kills background processes that are needed by Neon CLI.
+//! Applies common set-up such as log and pid files (if needed) to every process.
+//!
+//! Neon CLI does not run in background, so it needs to store the information about
+//! spawned processes, which it does in this module.
+//! We do that by storing the pid of the process in the "${process_name}.pid" file.
+//! The pid file can be created by the process itself
+//! (Neon storage binaries do that and also ensure that a lock is taken onto that file)
+//! or we create such file after starting the process
+//! (non-Neon binaries don't necessarily follow our pidfile conventions).
+//! The pid stored in the file is later used to stop the service.
+//!
+//! See [`lock_file`] module for more info.
+
+use std::ffi::OsStr;
+use std::io::Write;
+use std::path::Path;
+use std::process::{Child, Command};
+use std::time::Duration;
+use std::{fs, io, thread};
+
+use anyhow::{anyhow, bail, Context, Result};
+use nix::errno::Errno;
+use nix::sys::signal::{kill, Signal};
+use nix::unistd::Pid;
+
+use utils::lock_file;
+
+const RETRIES: u32 = 15;
+const RETRY_TIMEOUT_MILLIS: u64 = 500;
+
+/// Argument to `start_process`: tells it whether to create the pidfile itself
+/// or to expect the spawned process to create it.
+pub enum InitialPidFile<'t> {
+    /// Create a pidfile at the given path, so that future CLI invocations can manipulate the process.
+    Create(&'t Path),
+    /// The process creates the pidfile at the given path itself; `start_process` waits for it to appear.
+    Expect(&'t Path),
+}
+
+/// Start a background child process using the parameters given.
+///
+/// The child's stdout and stderr are appended to `{process_name}.log` inside `datadir`, and its
+/// environment is prepared by `fill_rust_env_vars` and `fill_aws_secrets_vars`.
+/// Depending on `initial_pid_file`, the pid file is either written here or expected to be
+/// written by the child. `process_status_check` is then polled, up to `RETRIES` times with
+/// `RETRY_TIMEOUT_MILLIS` between attempts, until it reports that the process is up.
+///
+/// Returns the spawned [`Child`] on success. The child is killed (best effort) if its pid file
+/// cannot be created or if the status check returns an error; it is left running when the
+/// retries run out.
+pub fn start_process<F, S: AsRef<OsStr>>(
+    process_name: &str,
+    datadir: &Path,
+    command: &Path,
+    args: &[S],
+    initial_pid_file: InitialPidFile,
+    process_status_check: F,
+) -> anyhow::Result<Child>
+where
+    F: Fn() -> anyhow::Result<bool>,
+{
+    let log_path = datadir.join(format!("{process_name}.log"));
+    let process_log_file = fs::OpenOptions::new()
+        .create(true)
+        .write(true)
+        .append(true)
+        .open(&log_path)
+        .with_context(|| {
+            format!("Could not open {process_name} log file {log_path:?} for writing")
+        })?;
+    let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
+        format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
+    })?;
+
+    let mut command = Command::new(command);
+    let background_command = command
+        .stdout(process_log_file)
+        .stderr(same_file_for_stderr)
+        .args(args);
+    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
+
+    let mut spawned_process = filled_cmd.spawn().with_context(|| {
+        format!("Could not spawn {process_name}, see console output and log files for details.")
+    })?;
+    let pid = spawned_process.id();
+    let pid = Pid::from_raw(
+        i32::try_from(pid)
+            .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
+    );
+
+    let pid_file_to_check = match initial_pid_file {
+        InitialPidFile::Create(target_pid_file_path) => {
+            let pid_file_error =
+                match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
+                    // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
+                    // as this CLI invocation exits, so it's a bit useless, but doesn't do any harm either.
+                    lock_file::LockCreationResult::Created { .. } => None,
+                    lock_file::LockCreationResult::AlreadyLocked { .. } => Some(anyhow!(
+                        "Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process"
+                    )),
+                    lock_file::LockCreationResult::CreationFailed(e) => Some(e.context(format!(
+                        "Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
+                    ))),
+                };
+            if let Some(e) = pid_file_error {
+                // Without a pid file, later CLI invocations cannot find this child to stop it,
+                // so do not leave it running.
+                kill_failed_child(process_name, &mut spawned_process);
+                return Err(e);
+            }
+            None
+        }
+        InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
+    };
+
+    for retries in 0..RETRIES {
+        match process_started(pid, pid_file_to_check, &process_status_check) {
+            Ok(true) => {
+                println!("\n{process_name} started, pid: {pid}");
+                return Ok(spawned_process);
+            }
+            Ok(false) => {
+                if retries < 5 {
+                    print!(".");
+                    io::stdout().flush().unwrap();
+                } else {
+                    if retries == 5 {
+                        println!() // put a line break after dots for second message
+                    }
+                    println!("{process_name} has not started yet, retrying ({retries})...");
+                }
+                thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
+            }
+            Err(e) => {
+                println!("{process_name} failed to start: {e:#}");
+                kill_failed_child(process_name, &mut spawned_process);
+                return Err(e);
+            }
+        }
+    }
+    anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
+}
+
+/// Best-effort kill of a child that did not start properly; a failure to kill is only reported.
+fn kill_failed_child(process_name: &str, child: &mut Child) {
+    if let Err(e) = child.kill() {
+        println!("Could not stop {process_name} subprocess: {e:#}")
+    }
+}
+
+/// Stop the process whose pid is recorded in `pid_file`.
+///
+/// Sends `SIGQUIT` when `immediate` is set and `SIGTERM` otherwise, then polls once a second,
+/// up to `RETRIES` times, until the process is gone, and finally removes the pid file.
+/// Also returns `Ok` when there is no pid file or no process with the recorded pid.
+pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
+    if !pid_file.exists() {
+        println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
+        return Ok(());
+    }
+    let pid = read_pidfile(pid_file)?;
+
+    let signal = if immediate {
+        print!("Stopping {process_name} with pid {pid} immediately..");
+        Signal::SIGQUIT
+    } else {
+        print!("Stopping {process_name} with pid {pid} gracefully..");
+        Signal::SIGTERM
+    };
+    io::stdout().flush().unwrap();
+
+    if let Err(e) = kill(pid, signal) {
+        if e != Errno::ESRCH {
+            anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}");
+        }
+        // Nothing to stop: the pid file is stale.
+        println!(
+            "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
+        );
+        return Ok(());
+    }
+
+    // Wait until process is gone
+    for _ in 0..RETRIES {
+        let has_stopped = match process_has_stopped(pid) {
+            Ok(has_stopped) => has_stopped,
+            Err(e) => {
+                println!("{process_name} with pid {pid} failed to stop: {e:#}");
+                return Err(e);
+            }
+        };
+        if !has_stopped {
+            print!(".");
+            io::stdout().flush().unwrap();
+            thread::sleep(Duration::from_secs(1));
+            continue;
+        }
+
+        println!("\n{process_name} stopped");
+        match fs::remove_file(pid_file) {
+            Err(e) if e.kind() != io::ErrorKind::NotFound => {
+                eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
+            }
+            // Removed, or already gone: nothing to report.
+            _ => {}
+        }
+        return Ok(());
+    }
+
+    anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
+}
+
+/// Clear the command's environment, set `RUST_BACKTRACE=1`, and forward `LLVM_PROFILE_FILE`
+/// and `RUST_LOG` from the current environment when they are set.
+fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
+    cmd.env_clear();
+    cmd.env("RUST_BACKTRACE", "1");
+
+    if let Some(profile_file) = std::env::var_os("LLVM_PROFILE_FILE") {
+        cmd.env("LLVM_PROFILE_FILE", profile_file);
+    }
+
+    // `std::env::var` also fails on non-Unicode values, in which case nothing is forwarded.
+    if let Ok(log_filter) = std::env::var("RUST_LOG") {
+        cmd.env("RUST_LOG", log_filter);
+    }
+
+    cmd
+}
+
+/// Forward the AWS credential variables that are readable from the current environment
+/// to the command; variables that are not set are skipped.
+fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
+    let forwarded = [
+        "AWS_ACCESS_KEY_ID",
+        "AWS_SECRET_ACCESS_KEY",
+        "AWS_SESSION_TOKEN",
+    ]
+    .into_iter()
+    .filter_map(|env_key| Some((env_key, std::env::var(env_key).ok()?)));
+
+    for (env_key, value) in forwarded {
+        cmd = cmd.env(env_key, value);
+    }
+    cmd
+}
+
+/// Decide whether the spawned process counts as started.
+///
+/// Returns `Ok(true)` only when `status_check` reports success and, if a pid file path was
+/// given, that file exists and holds `pid`. Returns `Ok(false)` when the check says "not yet",
+/// the pid file is still missing, or it holds a different pid. A failing status check or an
+/// unreadable pid file is an error.
+fn process_started<F>(
+    pid: Pid,
+    pid_file_to_check: Option<&Path>,
+    status_check: &F,
+) -> anyhow::Result<bool>
+where
+    F: Fn() -> anyhow::Result<bool>,
+{
+    let is_up = match status_check() {
+        Ok(is_up) => is_up,
+        Err(e) => anyhow::bail!("process failed to start: {e}"),
+    };
+    if !is_up {
+        return Ok(false);
+    }
+
+    let pid_file_path = match pid_file_to_check {
+        Some(path) => path,
+        // No pid file to wait for: the status check alone decides.
+        None => return Ok(true),
+    };
+    if !pid_file_path.exists() {
+        return Ok(false);
+    }
+    Ok(read_pidfile(pid_file_path)? == pid)
+}
+
+/// Read a PID file
+///
+/// We expect a file that contains a single positive integer. Surrounding whitespace
+/// (such as a trailing newline) is ignored.
+///
+/// Fails if the file cannot be read, does not parse as an integer, or holds a value below 1.
+fn read_pidfile(pidfile: &Path) -> Result<Pid> {
+    let pid_str = fs::read_to_string(pidfile)
+        .with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
+    let pid: i32 = pid_str
+        .trim()
+        .parse()
+        .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
+    if pid < 1 {
+        bail!("pidfile {pidfile:?} contained bad value '{pid}'");
+    }
+    Ok(Pid::from_raw(pid))
+}
+
+/// Check whether the process with the given pid is gone.
+///
+/// Probes the pid with `kill(pid, None)`: `ESRCH` means there is no such process (`Ok(true)`),
+/// success means it still exists (`Ok(false)`), and any other errno is returned as an error.
+fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
+    if let Err(err) = kill(pid, None) {
+        if err != Errno::ESRCH {
+            anyhow::bail!("Failed to send signal to process with pid {pid}: {err}");
+        }
+        // Process not found, we're done
+        return Ok(true);
+    }
+    // Process exists, keep waiting
+    Ok(false)
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use control_plane::compute::ComputeControlPlane;
 use control_plane::local_env::{EtcdBroker, LocalEnv};
+use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
-use control_plane::storage::PageServerNode;
 use control_plane::{etcd, local_env};
 use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -12,15 +12,14 @@ use std::time::Duration;

 use anyhow::{Context, Result};
 use utils::{
-    connstring::connection_host_port,
    id::{TenantId, TimelineId},
    lsn::Lsn,
    postgres_backend::AuthType,
 };

 use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
+use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;
-use crate::storage::PageServerNode;

 //
 // ComputeControlPlane
@@ -300,7 +299,8 @@ impl PostgresNode {

        // Configure the node to fetch pages from pageserver
        let pageserver_connstr = {
-            let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
+            let config = &self.pageserver.pg_connection_config;
+            let (host, port) = (config.host(), config.port());

            // Set up authentication
            //
--- a/control_plane/src/connection.rs
+++ b/control_plane/src/connection.rs
@@ -0,0 +1,57 @@
+use url::Url;
+
+/// Connection settings for a Postgres-protocol endpoint, stored as a URL.
+#[derive(Debug)]
+pub struct PgConnectionConfig {
+    url: Url,
+}
+
+impl PgConnectionConfig {
+    /// Host part of the URL. Panics if the URL has no host.
+    pub fn host(&self) -> &str {
+        let host = self.url.host_str();
+        host.expect("BUG: no host")
+    }
+
+    /// Port of the URL. Panics if the URL has no explicit port.
+    pub fn port(&self) -> u16 {
+        let port = self.url.port();
+        port.expect("BUG: no port")
+    }
+
+    /// Return a `<host>:<port>` string.
+    pub fn raw_address(&self) -> String {
+        let (host, port) = (self.host(), self.port());
+        format!("{}:{}", host, port)
+    }
+
+    /// Connect using postgres protocol with TLS disabled.
+    pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
+        let connection_url = self.url.as_str();
+        postgres::Client::connect(connection_url, postgres::NoTls)
+    }
+}
+
+impl std::str::FromStr for PgConnectionConfig {
+    type Err = anyhow::Error;
+
+    /// Parse a `postgres://` or `postgresql://` connection URL.
+    ///
+    /// Fails if the string is not a valid URL, uses another scheme, cannot be a base
+    /// (e.g. `postgres:bar`) or has no host. A missing port is filled in with 5432.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let mut url: Url = s.parse()?;
+
+        match url.scheme() {
+            "postgres" | "postgresql" => {}
+            other => anyhow::bail!("invalid scheme: {other}"),
+        }
+
+        // E.g. `postgres:bar`. Such URLs never have a host, so this must be checked
+        // before the host check below; otherwise this branch could never be reached.
+        if url.cannot_be_a_base() {
+            anyhow::bail!("URL cannot be a base");
+        }
+
+        // It's not a valid connection url if host is unavailable.
+        if url.host().is_none() {
+            anyhow::bail!(url::ParseError::EmptyHost);
+        }
+
+        // Set the default PG port if it's missing.
+        if url.port().is_none() {
+            url.set_port(Some(5432))
+                .expect("BUG: couldn't set the default port");
+        }
+
+        Ok(Self { url })
+    }
+}
--- a/control_plane/src/etcd.rs
+++ b/control_plane/src/etcd.rs
@@ -1,99 +1,75 @@
-use std::{
-    fs,
-    path::PathBuf,
-    process::{Command, Stdio},
-};
+use std::{fs, path::PathBuf};

 use anyhow::Context;
-use nix::{
-    sys::signal::{kill, Signal},
-    unistd::Pid,
-};

-use crate::{local_env, read_pidfile};
+use crate::{background_process, local_env};

 pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+    // Launches etcd through `background_process::start_process`, with its data
+    // directory under `<base_data_dir>/etcd` and a pid file created for it.
+    // The closure passed as the last argument reports whether any broker
+    // endpoint answered an HTTP request to its `health` path.
    let etcd_broker = &env.etcd_broker;
    println!(
-        "Starting etcd broker using {}",
-        etcd_broker.etcd_binary_path.display()
+        "Starting etcd broker using {:?}",
+        etcd_broker.etcd_binary_path
    );

    let etcd_data_dir = env.base_data_dir.join("etcd");
-    fs::create_dir_all(&etcd_data_dir).with_context(|| {
-        format!(
-            "Failed to create etcd data dir: {}",
-            etcd_data_dir.display()
-        )
-    })?;
+    fs::create_dir_all(&etcd_data_dir)
+        .with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;

-    let etcd_stdout_file =
-        fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
-            format!(
-                "Failed to create etcd stout file in directory {}",
-                etcd_data_dir.display()
-            )
-        })?;
-    let etcd_stderr_file =
-        fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
-            format!(
-                "Failed to create etcd stderr file in directory {}",
-                etcd_data_dir.display()
-            )
-        })?;
    let client_urls = etcd_broker.comma_separated_endpoints();
+    let args = [
+        format!("--data-dir={}", etcd_data_dir.display()),
+        format!("--listen-client-urls={client_urls}"),
+        format!("--advertise-client-urls={client_urls}"),
+        // Set --quota-backend-bytes to keep the etcd virtual memory
+        // size smaller. Our test etcd clusters are very small.
+        // See https://github.com/etcd-io/etcd/issues/7910
+        "--quota-backend-bytes=100000000".to_string(),
+        // etcd doesn't compact (vacuum) with default settings,
+        // enable it to prevent space exhaustion.
+        "--auto-compaction-mode=revision".to_string(),
+        "--auto-compaction-retention=1".to_string(),
+    ];

-    let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
-        .args(&[
-            format!("--data-dir={}", etcd_data_dir.display()),
-            format!("--listen-client-urls={client_urls}"),
-            format!("--advertise-client-urls={client_urls}"),
-            // Set --quota-backend-bytes to keep the etcd virtual memory
-            // size smaller. Our test etcd clusters are very small.
-            // See https://github.com/etcd-io/etcd/issues/7910
-            "--quota-backend-bytes=100000000".to_string(),
-            // etcd doesn't compact (vacuum) with default settings,
-            // enable it to prevent space exhaustion.
-            "--auto-compaction-mode=revision".to_string(),
-            "--auto-compaction-retention=1".to_string(),
-        ])
-        .stdout(Stdio::from(etcd_stdout_file))
-        .stderr(Stdio::from(etcd_stderr_file))
-        .spawn()
-        .context("Failed to spawn etcd subprocess")?;
-    let pid = etcd_process.id();
+    let pid_file_path = etcd_pid_file_path(env);

-    let etcd_pid_file_path = etcd_pid_file_path(env);
-    fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
-        format!(
-            "Failed to create etcd pid file at {}",
-            etcd_pid_file_path.display()
-        )
-    })?;
+    let client = reqwest::blocking::Client::new();
+
+    background_process::start_process(
+        "etcd",
+        &etcd_data_dir,
+        &etcd_broker.etcd_binary_path,
+        &args,
+        background_process::InitialPidFile::Create(&pid_file_path),
+        || {
+            for broker_endpoint in &etcd_broker.broker_endpoints {
+                let request = broker_endpoint
+                    .join("health")
+                    .with_context(|| {
+                        format!(
+                            "Failed to append /health path to broker endpoint {}",
+                            broker_endpoint
+                        )
+                    })
+                    .and_then(|url| {
+                        client.get(&url.to_string()).build().with_context(|| {
+                            format!("Failed to construct request to etcd endpoint {url}")
+                        })
+                    })?;
+                // Only transport-level success is checked: the response
+                // status code is not inspected here.
+                if client.execute(request).is_ok() {
+                    return Ok(true);
+                }
+            }
+
+            Ok(false)
+        },
+    )
+    .context("Failed to spawn etcd subprocess")?;

    Ok(())
 }

 pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
-    let etcd_path = &env.etcd_broker.etcd_binary_path;
-    println!("Stopping etcd broker at {}", etcd_path.display());
-
-    let etcd_pid_file_path = etcd_pid_file_path(env);
-    let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
-        format!(
-            "Failed to read etcd pid file at {}",
-            etcd_pid_file_path.display()
-        )
-    })?);
-
-    kill(pid, Signal::SIGTERM).with_context(|| {
-        format!(
-            "Failed to stop etcd with pid {pid} at {}",
-            etcd_pid_file_path.display()
-        )
-    })?;
-
-    Ok(())
+    // Delegates to the shared helper with the etcd pid file path. The first
+    // argument is `true`; other callers in this crate pass their `immediate`
+    // flag in that position.
+    background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
 }

 fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -6,59 +6,12 @@
 // Intended to be used in integration tests and in CLI tools for
 // local installations.
 //
-use anyhow::{anyhow, bail, Context, Result};
-use std::fs;
-use std::path::Path;
-use std::process::Command;

+mod background_process;
 pub mod compute;
+pub mod connection;
 pub mod etcd;
 pub mod local_env;
+pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
-pub mod storage;
-
-/// Read a PID file
-///
-/// We expect a file that contains a single integer.
-/// We return an i32 for compatibility with libc and nix.
-pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
-    let pid_str = fs::read_to_string(pidfile)
-        .with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
-    let pid: i32 = pid_str
-        .parse()
-        .map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
-    if pid < 1 {
-        bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
-    }
-    Ok(pid)
-}
-
-fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
-    let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
-
-    let var = "LLVM_PROFILE_FILE";
-    if let Some(val) = std::env::var_os(var) {
-        cmd.env(var, val);
-    }
-
-    const RUST_LOG_KEY: &str = "RUST_LOG";
-    if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
-        cmd.env(RUST_LOG_KEY, rust_log_value)
-    } else {
-        cmd
-    }
-}
-
-fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
-    for env_key in [
-        "AWS_ACCESS_KEY_ID",
-        "AWS_SECRET_ACCESS_KEY",
-        "AWS_SESSION_TOKEN",
-    ] {
-        if let Ok(value) = std::env::var(env_key) {
-            cmd = cmd.env(env_key, value);
-        }
-    }
-    cmd
-}
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -226,12 +226,12 @@ impl LocalEnv {
        }
    }

-    pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
-        Ok(self.neon_distrib_dir.join("pageserver"))
+    /// Path of the `pageserver` entry inside `neon_distrib_dir`.
+    /// Pure path construction: nothing is checked on disk.
+    pub fn pageserver_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("pageserver")
    }

-    pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
-        Ok(self.neon_distrib_dir.join("safekeeper"))
+    /// Path of the `safekeeper` entry inside `neon_distrib_dir`.
+    /// Pure path construction: nothing is checked on disk.
+    pub fn safekeeper_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("safekeeper")
    }

    pub fn pg_data_dirs_path(&self) -> PathBuf {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,33 +1,27 @@
 use std::collections::HashMap;
-use std::fs::File;
+use std::fs::{self, File};
 use std::io::{BufReader, Write};
 use std::num::NonZeroU64;
 use std::path::{Path, PathBuf};
-use std::process::Command;
-use std::time::Duration;
-use std::{io, result, thread};
+use std::process::Child;
+use std::{io, result};

+use crate::connection::PgConnectionConfig;
 use anyhow::{bail, Context};
-use nix::errno::Errno;
-use nix::sys::signal::{kill, Signal};
-use nix::unistd::Pid;
 use pageserver_api::models::{
    TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
 };
-use postgres::{Config, NoTls};
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::{
-    connstring::connection_address,
    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
    lsn::Lsn,
    postgres_backend::AuthType,
 };

-use crate::local_env::LocalEnv;
-use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
+use crate::{background_process, local_env::LocalEnv};

 #[derive(Error, Debug)]
 pub enum PageserverHttpError {
@@ -75,7 +69,7 @@ impl ResponseErrorMessageExt for Response {
 //
 #[derive(Debug)]
 pub struct PageServerNode {
-    pub pg_connection_config: Config,
+    pub pg_connection_config: PgConnectionConfig,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
@@ -101,7 +95,7 @@ impl PageServerNode {
    }

    /// Construct libpq connection string for connecting to the pageserver.
-    fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
+    fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
        format!("postgresql://no_user:{password}@{listen_addr}/no_db")
            .parse()
            .unwrap()
@@ -161,7 +155,15 @@ impl PageServerNode {
            init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
        }

-        self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
+        let mut pageserver_process = self
+            .start_node(&init_config_overrides, &self.env.base_data_dir, true)
+            .with_context(|| {
+                format!(
+                    "Failed to start a process for pageserver {}",
+                    self.env.pageserver.id,
+                )
+            })?;
+
        let init_result = self
            .try_init_timeline(create_tenant, initial_timeline_id, pg_version)
            .context("Failed to create initial tenant and timeline for pageserver");
@@ -171,7 +173,29 @@ impl PageServerNode {
            }
            Err(e) => eprintln!("{e:#}"),
        }
-        self.stop(false)?;
+        match pageserver_process.kill() {
+            Err(e) => {
+                eprintln!(
+                    "Failed to stop pageserver {} process with pid {}: {e:#}",
+                    self.env.pageserver.id,
+                    pageserver_process.id(),
+                )
+            }
+            Ok(()) => {
+                println!(
+                    "Stopped pageserver {} process with pid {}",
+                    self.env.pageserver.id,
+                    pageserver_process.id(),
+                );
+                // cleanup after pageserver startup, since we do not call regular `stop_process` during init
+                let pid_file = self.pid_file();
+                if let Err(e) = fs::remove_file(&pid_file) {
+                    if e.kind() != io::ErrorKind::NotFound {
+                        eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
+                    }
+                }
+            }
+        }
        init_result
    }

@@ -196,11 +220,14 @@ impl PageServerNode {
        self.env.pageserver_data_dir()
    }

-    pub fn pid_file(&self) -> PathBuf {
+    /// The pid file is created by the pageserver process, with its pid stored inside.
+    /// Other pageservers cannot lock the same file and overwrite it for as long as the current
+    /// pageserver runs. (Unless someone removes the file manually; never do that!)
+    ///
+    /// The returned path is `pageserver.pid` inside `repo_path()`.
+    fn pid_file(&self) -> PathBuf {
        self.repo_path().join("pageserver.pid")
    }

-    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+    /// Start the pageserver with `repo_path()` as its data directory.
+    ///
+    /// Forwards `config_overrides` to `start_node` and passes `false` as the
+    /// last argument; returns the `Child` handle that `start_node` produces.
+    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
        self.start_node(config_overrides, &self.repo_path(), false)
    }

@@ -209,10 +236,10 @@ impl PageServerNode {
        config_overrides: &[&str],
        datadir: &Path,
        update_config: bool,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<Child> {
        println!(
            "Starting pageserver at '{}' in '{}'",
-            connection_address(&self.pg_connection_config),
+            self.pg_connection_config.raw_address(),
            datadir.display()
        );
        io::stdout().flush()?;
@@ -220,10 +247,7 @@ impl PageServerNode {
        let mut args = vec![
            "-D",
            datadir.to_str().with_context(|| {
-                format!(
-                    "Datadir path '{}' cannot be represented as a unicode string",
-                    datadir.display()
-                )
+                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
            })?,
        ];

@@ -235,48 +259,18 @@ impl PageServerNode {
            args.extend(["-c", config_override]);
        }

-        let mut cmd = Command::new(self.env.pageserver_bin()?);
-        let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
-        filled_cmd = fill_aws_secrets_vars(filled_cmd);
-
-        if !filled_cmd.status()?.success() {
-            bail!(
-                "Pageserver failed to start. See console output and '{}' for details.",
-                datadir.join("pageserver.log").display()
-            );
-        }
-
-        // It takes a while for the page server to start up. Wait until it is
-        // open for business.
-        const RETRIES: i8 = 15;
-        for retries in 1..RETRIES {
-            match self.check_status() {
-                Ok(()) => {
-                    println!("\nPageserver started");
-                    return Ok(());
-                }
-                Err(err) => {
-                    match err {
-                        PageserverHttpError::Transport(err) => {
-                            if err.is_connect() && retries < 5 {
-                                print!(".");
-                                io::stdout().flush().unwrap();
-                            } else {
-                                if retries == 5 {
-                                    println!() // put a line break after dots for second message
-                                }
-                                println!("Pageserver not responding yet, err {err} retrying ({retries})...");
-                            }
-                        }
-                        PageserverHttpError::Response(msg) => {
-                            bail!("pageserver failed to start: {msg} ")
-                        }
-                    }
-                    thread::sleep(Duration::from_secs(1));
-                }
-            }
-        }
-        bail!("pageserver failed to start in {RETRIES} seconds");
+        background_process::start_process(
+            "pageserver",
+            datadir,
+            &self.env.pageserver_bin(),
+            &args,
+            background_process::InitialPidFile::Expect(&self.pid_file()),
+            || match self.check_status() {
+                Ok(()) => Ok(true),
+                Err(PageserverHttpError::Transport(_)) => Ok(false),
+                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+            },
+        )
    }

    ///
@@ -288,69 +282,18 @@ impl PageServerNode {
    /// If the server is not running, returns success
    ///
    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        let pid_file = self.pid_file();
-        if !pid_file.exists() {
-            println!("Pageserver is already stopped");
-            return Ok(());
-        }
-        let pid = Pid::from_raw(read_pidfile(&pid_file)?);
-
-        let sig = if immediate {
-            print!("Stopping pageserver immediately..");
-            Signal::SIGQUIT
-        } else {
-            print!("Stopping pageserver gracefully..");
-            Signal::SIGTERM
-        };
-        io::stdout().flush().unwrap();
-        match kill(pid, sig) {
-            Ok(_) => (),
-            Err(Errno::ESRCH) => {
-                println!("Pageserver with pid {pid} does not exist, but a PID file was found");
-                return Ok(());
-            }
-            Err(err) => bail!(
-                "Failed to send signal to pageserver with pid {pid}: {}",
-                err.desc()
-            ),
-        }
-
-        // Wait until process is gone
-        for i in 0..600 {
-            let signal = None; // Send no signal, just get the error code
-            match kill(pid, signal) {
-                Ok(_) => (), // Process exists, keep waiting
-                Err(Errno::ESRCH) => {
-                    // Process not found, we're done
-                    println!("done!");
-                    return Ok(());
-                }
-                Err(err) => bail!(
-                    "Failed to send signal to pageserver with pid {}: {}",
-                    pid,
-                    err.desc()
-                ),
-            };
-
-            if i % 10 == 0 {
-                print!(".");
-                io::stdout().flush().unwrap();
-            }
-            thread::sleep(Duration::from_millis(100));
-        }
-
-        bail!("Failed to stop pageserver with pid {pid}");
+        // Delegates to the shared `background_process::stop_process` helper,
+        // passing on `immediate`, a display name and this node's pid file path.
+        background_process::stop_process(immediate, "pageserver", &self.pid_file())
    }

    pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
+        // Opens a new non-TLS connection for this one query. Both the connect
+        // and the query are unwrapped, so a failure in either panics.
-        let mut client = self.pg_connection_config.connect(NoTls).unwrap();
+        let mut client = self.pg_connection_config.connect_no_tls().unwrap();

        println!("Pageserver query: '{sql}'");
        client.simple_query(sql).unwrap()
    }

    pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
+        // Returns a new non-TLS client; connection errors go back to the caller.
-        self.pg_connection_config.connect(NoTls)
+        self.pg_connection_config.connect_no_tls()
    }

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
@@ -419,6 +362,11 @@ impl PageServerNode {
                .map(|x| x.parse::<NonZeroU64>())
                .transpose()
                .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+            trace_read_requests: settings
+                .remove("trace_read_requests")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'trace_read_requests' as bool")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -481,6 +429,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<NonZeroU64>())
                    .transpose()
                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+                trace_read_requests: settings
+                    .get("trace_read_requests")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'trace_read_requests' as bool")?,
            })
            .send()?
            .error_from_body()?;
@@ -549,7 +502,7 @@ impl PageServerNode {
        pg_wal: Option<(Lsn, PathBuf)>,
        pg_version: u32,
    ) -> anyhow::Result<()> {
-        let mut client = self.pg_connection_config.connect(NoTls).unwrap();
+        let mut client = self.pg_connection_config.connect_no_tls().unwrap();

        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,23 +1,21 @@
 use std::io::Write;
 use std::path::PathBuf;
-use std::process::Command;
+use std::process::Child;
 use std::sync::Arc;
-use std::time::Duration;
-use std::{io, result, thread};
+use std::{io, result};

-use anyhow::bail;
-use nix::errno::Errno;
-use nix::sys::signal::{kill, Signal};
-use nix::unistd::Pid;
-use postgres::Config;
+use anyhow::Context;
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
-use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
+use utils::{http::error::HttpErrorBody, id::NodeId};

-use crate::local_env::{LocalEnv, SafekeeperConf};
-use crate::storage::PageServerNode;
-use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
+use crate::connection::PgConnectionConfig;
+use crate::pageserver::PageServerNode;
+use crate::{
+    background_process,
+    local_env::{LocalEnv, SafekeeperConf},
+};

 #[derive(Error, Debug)]
 pub enum SafekeeperHttpError {
@@ -63,7 +61,7 @@ pub struct SafekeeperNode {

    pub conf: SafekeeperConf,

-    pub pg_connection_config: Config,
+    pub pg_connection_config: PgConnectionConfig,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
@@ -87,15 +85,15 @@ impl SafekeeperNode {
    }

    /// Construct libpq connection string for connecting to this safekeeper.
+    ///
+    /// The URL always targets `127.0.0.1` on `port`, with placeholder user and
+    /// database names. Panics if the formatted URL fails to parse.
-    fn safekeeper_connection_config(port: u16) -> Config {
+    fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
        // TODO safekeeper authentication not implemented yet
-        format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
+        format!("postgresql://no_user@127.0.0.1:{port}/no_db")
            .parse()
            .unwrap()
    }

    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
+        // The name handed to `safekeeper_data_dir` is "sk" followed by the node id.
-        env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
+        env.safekeeper_data_dir(&format!("sk{sk_id}"))
    }

    pub fn datadir_path(&self) -> PathBuf {
@@ -106,91 +104,78 @@ impl SafekeeperNode {
        self.datadir_path().join("safekeeper.pid")
    }

-    pub fn start(&self) -> anyhow::Result<()> {
+    /// Start this safekeeper through `background_process::start_process`.
+    ///
+    /// Builds the command line from `self.conf` and the etcd broker settings,
+    /// then hands it over together with the expected pid file and a closure
+    /// that maps `check_status()` to a started / not-yet-started answer.
+    /// Returns an error if the data directory or key path is not valid
+    /// unicode, or if `start_process` fails.
+    pub fn start(&self) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
-            connection_address(&self.pg_connection_config),
+            self.pg_connection_config.raw_address(),
            self.datadir_path().display()
        );
        io::stdout().flush().unwrap();

        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
+        let id = self.id;
+        let datadir = self.datadir_path();

-        let mut cmd = Command::new(self.env.safekeeper_bin()?);
-        fill_rust_env_vars(
-            cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
-                .args(&["--id", self.id.to_string().as_ref()])
-                .args(&["--listen-pg", &listen_pg])
-                .args(&["--listen-http", &listen_http])
-                .arg("--daemonize"),
-        );
+        let id_string = id.to_string();
+        let mut args = vec![
+            "-D",
+            datadir.to_str().with_context(|| {
+                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
+            })?,
+            "--id",
+            &id_string,
+            "--listen-pg",
+            &listen_pg,
+            "--listen-http",
+            &listen_http,
+        ];
        if !self.conf.sync {
-            cmd.arg("--no-sync");
+            args.push("--no-sync");
        }

        let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
        if !comma_separated_endpoints.is_empty() {
-            cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
+            args.extend(["--broker-endpoints", &comma_separated_endpoints]);
        }
        if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
-            cmd.args(&["--broker-etcd-prefix", prefix]);
+            args.extend(["--broker-etcd-prefix", prefix]);
        }
+
+        // `args` holds `&str`s, so the formatted number has to live in this
+        // scope; an `Option<String>` does that without a dummy initial value.
+        let backup_threads = self.conf.backup_threads.map(|threads| threads.to_string());
-        if let Some(threads) = self.conf.backup_threads {
-            cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
+        if let Some(threads) = backup_threads.as_deref() {
+            args.extend(["--backup-threads", threads]);
        }
+
        if let Some(ref remote_storage) = self.conf.remote_storage {
-            cmd.args(&["--remote-storage", remote_storage]);
+            args.extend(["--remote-storage", remote_storage]);
        }
+
+        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
-            cmd.arg("--auth-validation-public-key-path");
-            // PathBuf is better be passed as is, not via `String`.
-            cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
+            args.extend([
+                "--auth-validation-public-key-path",
+                key_path.to_str().with_context(|| {
+                    format!("Key path {key_path:?} cannot be represented as a unicode string")
+                })?,
+            ]);
        }

-        fill_aws_secrets_vars(&mut cmd);
-
-        if !cmd.status()?.success() {
-            bail!(
-                "Safekeeper failed to start. See '{}' for details.",
-                self.datadir_path().join("safekeeper.log").display()
-            );
-        }
-
-        // It takes a while for the safekeeper to start up. Wait until it is
-        // open for business.
-        const RETRIES: i8 = 15;
-        for retries in 1..RETRIES {
-            match self.check_status() {
-                Ok(_) => {
-                    println!("\nSafekeeper started");
-                    return Ok(());
-                }
-                Err(err) => {
-                    match err {
-                        SafekeeperHttpError::Transport(err) => {
-                            if err.is_connect() && retries < 5 {
-                                print!(".");
-                                io::stdout().flush().unwrap();
-                            } else {
-                                if retries == 5 {
-                                    println!() // put a line break after dots for second message
-                                }
-                                println!(
-                                    "Safekeeper not responding yet, err {} retrying ({})...",
-                                    err, retries
-                                );
-                            }
-                        }
-                        SafekeeperHttpError::Response(msg) => {
-                            bail!("safekeeper failed to start: {} ", msg)
-                        }
-                    }
-                    thread::sleep(Duration::from_secs(1));
-                }
-            }
-        }
-        bail!("safekeeper failed to start in {} seconds", RETRIES);
+        background_process::start_process(
+            &format!("safekeeper {id}"),
+            &datadir,
+            &self.env.safekeeper_bin(),
+            &args,
+            background_process::InitialPidFile::Expect(&self.pid_file()),
+            // Transport errors mean "not up yet"; any other error is fatal.
+            || match self.check_status() {
+                Ok(()) => Ok(true),
+                Err(SafekeeperHttpError::Transport(_)) => Ok(false),
+                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+            },
+        )
    }

    ///
@@ -202,63 +187,11 @@ impl SafekeeperNode {
    /// If the server is not running, returns success
    ///
    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        let pid_file = self.pid_file();
-        if !pid_file.exists() {
-            println!("Safekeeper {} is already stopped", self.id);
-            return Ok(());
-        }
-        let pid = read_pidfile(&pid_file)?;
-        let pid = Pid::from_raw(pid);
-
-        let sig = if immediate {
-            print!("Stopping safekeeper {} immediately..", self.id);
-            Signal::SIGQUIT
-        } else {
-            print!("Stopping safekeeper {} gracefully..", self.id);
-            Signal::SIGTERM
-        };
-        io::stdout().flush().unwrap();
-        match kill(pid, sig) {
-            Ok(_) => (),
-            Err(Errno::ESRCH) => {
-                println!(
-                    "Safekeeper with pid {} does not exist, but a PID file was found",
-                    pid
-                );
-                return Ok(());
-            }
-            Err(err) => bail!(
-                "Failed to send signal to safekeeper with pid {}: {}",
-                pid,
-                err.desc()
-            ),
-        }
-
-        // Wait until process is gone
-        for i in 0..600 {
-            let signal = None; // Send no signal, just get the error code
-            match kill(pid, signal) {
-                Ok(_) => (), // Process exists, keep waiting
-                Err(Errno::ESRCH) => {
-                    // Process not found, we're done
-                    println!("done!");
-                    return Ok(());
-                }
-                Err(err) => bail!(
-                    "Failed to send signal to pageserver with pid {}: {}",
-                    pid,
-                    err.desc()
-                ),
-            };
-
-            if i % 10 == 0 {
-                print!(".");
-                io::stdout().flush().unwrap();
-            }
-            thread::sleep(Duration::from_millis(100));
-        }
-
-        bail!("Failed to stop safekeeper with pid {}", pid);
+        // Delegates to the shared `background_process::stop_process` helper,
+        // passing on `immediate`, a display name that includes this node's id,
+        // and the pid file path.
+        background_process::stop_process(
+            immediate,
+            &format!("safekeeper {}", self.id),
+            &self.pid_file(),
+        )
    }

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -0,0 +1,13 @@
+# Wrapper image: the selected compute-node image plus curl, jq and netcat.
+# REPOSITORY, COMPUTE_IMAGE and TAG together select the base image.
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG COMPUTE_IMAGE=compute-node-v14
+ARG TAG=latest
+
+FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
+
+# Switch to root only for the package installation below.
+USER root
+RUN apt-get update &&       \
+    apt-get install -y curl \
+                       jq   \
+                       netcat
+
+# Everything after the installation runs as the postgres user.
+USER postgres
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -2,6 +2,7 @@ version: '3'

 services:
  etcd:
+    restart: always
    image: quay.io/coreos/etcd:v3.5.4
    ports:
      - 2379:2379
@@ -9,7 +10,7 @@ services:
    environment:
      # This significantly speeds up etcd, and we don't need data persistence there anyway.
      ETCD_UNSAFE_NO_FSYNC: "1"
-    command: 
+    command:
      - "etcd"
      - "--auto-compaction-mode=revision"
      - "--auto-compaction-retention=1"
@@ -24,6 +25,7 @@ services:
      - "--quota-backend-bytes=134217728" # 128 MB

  minio:
+    restart: always
    image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
    ports:
      - 9000:9000
@@ -41,7 +43,7 @@ services:
    entrypoint:
      - "/bin/sh"
      - "-c"
-    command: 
+    command:
      - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
             echo 'Waiting to start minio...' && sleep 1;
         done;
@@ -51,7 +53,8 @@ services:
      - minio

  pageserver:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - BROKER_ENDPOINT='http://etcd:2379'
      - AWS_ACCESS_KEY_ID=minio
@@ -77,7 +80,8 @@ services:
      - minio_create_buckets

  safekeeper1:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
      - SAFEKEEPER_ID=1
@@ -106,7 +110,8 @@ services:
      - minio_create_buckets

  safekeeper2:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
      - SAFEKEEPER_ID=2
@@ -135,7 +140,8 @@ services:
      - minio_create_buckets

  safekeeper3:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
      - SAFEKEEPER_ID=3
@@ -164,18 +170,21 @@ services:
      - minio_create_buckets

  compute:
+    restart: always
    build:
-      context: ./image/compute
+      context: ./compute_wrapper/
      args:
-        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
+        - TAG=${TAG:-latest}
        - http_proxy=$http_proxy
        - https_proxy=$https_proxy
    environment:
      - PG_VERSION=${PG_VERSION:-14}
      #- RUST_BACKTRACE=1
+    # Mount the test files directly, for faster editing cycle.
    volumes:
-      - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
-      - ./compute/shell/:/shell/
+      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute_wrapper/shell/:/shell/
    ports:
      - 55433:55433 # pg protocol handler
      - 3080:3080 # http endpoints
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# A basic test to ensure Docker images are built correctly.
+# Builds a wrapper around the compute, starts all services and runs a simple SQL query.
+# Repeats the process for all currently supported Postgres versions.
+
+# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file.
+# Their defaults point at the DockerHub `neondatabase/neon:latest` image;
+# override them to verify custom image builds (e.g. pre-published ones).
+
+# XXX: Currently does not work on M1 Macs: only x86_64 Docker images are built, and the M1 Docker emulation layer has no seccomp support.
+
+set -eux -o pipefail
+
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
+
+COMPUTE_CONTAINER_NAME=docker-compose-compute-1
+SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
+
+cleanup() {
+    echo "show container information"
+    docker ps
+    docker compose -f $COMPOSE_FILE logs
+    echo "stop containers..."
+    docker compose -f $COMPOSE_FILE down
+}
+
+echo "clean up containers if exists"
+cleanup
+
+for pg_version in 14 15; do
+    echo "start containers (pg_version=$pg_version)."
+    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
+
+    echo "wait until the compute is ready. timeout after 60s. "
+    cnt=0
+    while sleep 1; do
+        # check timeout
+        cnt=`expr $cnt + 1`
+        if [ $cnt -gt 60 ]; then
+            echo "timeout before the compute is ready."
+            cleanup
+            exit 1
+        fi
+
+        # check if the compute is ready
+        set +o pipefail
+        result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
+        set -o pipefail
+        if [ $result -eq 1 ]; then
+            echo "OK. The compute is ready to connect."
+            echo "execute simple queries."
+            docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
+            cleanup
+            break
+        fi
+    done
+done
--- a/docker-compose/image/compute/Dockerfile
+++ b/docker-compose/image/compute/Dockerfile
@@ -1,10 +0,0 @@
-ARG COMPUTE_IMAGE=compute-node-v14:latest
-FROM neondatabase/${COMPUTE_IMAGE}
-
-USER root
-RUN apt-get update &&       \
-    apt-get install -y curl \
-                       jq   \
-                       netcat
-
-USER postgres
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -37,7 +37,7 @@

 - [Source view](./sourcetree.md)
  - [docker.md](./docker.md) — Docker images and building pipeline.
-  - [Error handling and logging]()
+  - [Error handling and logging](./error-handling.md)
  - [Testing]()
    - [Unit testing]()
    - [Integration testing]()
--- a/docs/error-handling.md
+++ b/docs/error-handling.md
@@ -0,0 +1,198 @@
+# Error handling and logging
+
+## Logging errors
+
+The principle is that errors are logged when they are handled. If you
+just propagate an error to the caller in a function, you don't need to
+log it; the caller will. But if you consume an error in a function,
+you *must* log it (if it needs to be logged at all).
+
+For example:
+
+```rust
+fn read_motd_file() -> std::io::Result<String> {
+    let mut f = File::open("/etc/motd")?;
+    let mut result = String::new();
+    f.read_to_string(&mut result)?;
+    Ok(result)
+}
+```
+
+Opening or reading the file could fail, but there is no need to log
+the error here. The function merely propagates the error to the
+caller, and it is up to the caller to log the error or propagate it
+further, if the failure is not expected. But if, for example, it is
+normal that the "/etc/motd" file doesn't exist, the caller can choose
+to silently ignore the error, or log it as an INFO or DEBUG level
+message:
+
+```rust
+fn get_message_of_the_day() -> String {
+    // Get the motd from /etc/motd, or return the default proverb
+    match read_motd_file() {
+        Ok(motd) => motd,
+        Err(err)  => {
+            // It's normal that /etc/motd doesn't exist, but if we fail to
+            // read it for some other reason, that's unexpected. The message
+            // of the day isn't very important though, so we just WARN and
+            // continue with the default in any case.
+            if err.kind() != std::io::ErrorKind::NotFound {
+                 tracing::warn!("could not read \"/etc/motd\": {err:?}");
+            }
+            "An old error is always more popular than a new truth. - German proverb".to_string()
+        }
+    }
+}
+```
+
+## Error types
+
+We use the `anyhow` crate widely. It contains many convenient macros
+like `bail!` and `ensure!` to construct and return errors, and to
+propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
+
+A downside of `anyhow::Error` is that the caller cannot distinguish
+between different error cases. Most errors are propagated all the way
+to the mgmt API handler function, or the main loop that handles a
+connection with the compute node, and they are all handled the same
+way: the error is logged and returned to the client as an HTTP or
+libpq error.
+
+But in some cases, we need to distinguish between errors and handle
+them differently. For example, attaching a tenant to the pageserver
+could fail either because the tenant has already been attached, or
+because we could not load its metadata from cloud storage. The first
+case is more or less expected. The console sends the Attach request to
+the pageserver, and the pageserver completes the operation, but the
+network connection might be lost before the console receives the
+response. The console will retry the operation in that case, but the
+tenant has already been attached. It is important that the pageserver
+responds with the HTTP 403 Already Exists error in that case, rather
+than a generic HTTP 500 Internal Server Error.
+
+If you need to distinguish between different kinds of errors, create a
+new `Error` type. The `thiserror` crate is useful for that. But in
+most cases `anyhow::Error` is good enough.
+
+## Panics
+
+Depending on where a panic happens, it can cause the whole pageserver
+or safekeeper to restart, or just a single tenant. In either case,
+that is pretty bad and causes an outage. Avoid panics. Never use
+`unwrap()` or other calls that might panic, to verify inputs from the
+network or from disk.
+
+It is acceptable to use functions that might panic, like `unwrap()`, if
+it is obvious that it cannot panic. For example, if you have just
+checked that a variable is not None, it is OK to call `unwrap()` on it,
+but it is still preferable to use `expect("reason")` instead to explain
+why the function cannot fail.
+
+`assert!` and `panic!` are reserved for checking clear invariants and
+very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
+or `bail!` instead.
+
+## Error levels
+
+`tracing::Level` doesn't provide very clear guidelines on what the
+different levels mean, or when to use which level. Here is how we use
+them:
+
+### Error
+
+Examples:
+- could not open file "foobar"
+- invalid tenant id
+
+Errors are not expected to happen during normal operation. Incorrect
+inputs from client can cause ERRORs. For example, if a client tries to
+call a mgmt API that doesn't exist, or if a compute node passes
+an LSN that has already been garbage collected away.
+
+These should *not* happen during normal operations. "Normal
+operations" is not a very precise concept. But for example, disk
+errors are not expected to happen when the system is working, so those
+count as Errors. However, if a TCP connection to a compute node is
+lost, that is not considered an Error, because it doesn't affect the
+pageserver's or safekeeper's operation in any way, and happens fairly
+frequently when compute nodes are shut down, or are killed abruptly
+because of errors in the compute.
+
+**Errors are monitored, and always need human investigation to determine
+the cause.**
+
+Whether something should be logged at ERROR, WARNING or INFO level can
+depend on the callers and clients. For example, it might be unexpected
+and a sign of a serious issue if the console calls the
+"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
+would be appropriate in that case. But if the console routinely calls
+the API after deleting a timeline, to check if the deletion has
+completed, then it would be totally normal and an INFO or DEBUG level
+message would be more appropriate. If a message is logged as an ERROR,
+but it in fact happens frequently in production and never requires any
+action, it should probably be demoted to an INFO level message.
+
+### Warn
+
+Examples:
+- could not remove temporary file "foobar.temp"
+- unrecognized file "foobar" in timeline directory
+
+Warnings are similar to Errors, in that they should not happen
+when the system is operating normally. The difference between Error and
+Warning is that an Error means that the operation failed, whereas Warning
+means that something unexpected happened, but the operation continued anyway.
+For example, if deleting a file fails because the file already didn't exist,
+it should be logged as Warning.
+
+> **Note:** The python regression tests, under `test_regress`, check the
+> pageserver log after each test for any ERROR and WARN lines. If there are
+> any ERRORs or WARNs that have not been explicitly listed in the test as
+> allowed, the test is marked as failed. This is to catch unexpected errors
+> e.g. in background operations, that don't cause immediate misbehaviour in
+> the tested functionality.
+
+### Info
+
+Info level is used to log useful information when the system is
+operating normally. Info level is appropriate e.g. for logging state
+changes, background operations, and network connections.
+
+Examples:
+- "system is shutting down"
+- "tenant was created"
+- "retrying S3 upload"
+
+### Debug & Trace
+
+Debug and Trace level messages are not printed to the log in our normal
+production configuration, but could be enabled for a specific server or
+tenant, to aid debugging. (Although we don't actually have that
+capability as of this writing).
+
+## Context
+
+We use logging "spans" to hold context information about the current
+operation. Almost every operation happens on a particular tenant and
+timeline, so we enter a span with the "tenant_id" and "timeline_id"
+very early when processing an incoming API request, for example. All
+background operations should also run in a span containing at least
+those two fields, and any other parameters or information that might
+be useful when debugging an error that might happen when performing
+the operation.
+
+TODO: Spans are not captured in the Error when it is created, but when
+the error is logged. It would be more useful to capture them at Error
+creation. We should consider using `tracing_error::SpanTrace` to do
+that.
+
+## Error message style
+
+PostgreSQL has a style guide for writing error messages:
+
+https://www.postgresql.org/docs/current/error-style-guide.html
+
+Follow that guide when writing error messages in the PostgreSQL
+extension. We don't follow it strictly in the pageserver and
+safekeeper, but the advice in the PostgreSQL style guide is generally
+good, and you can't go wrong by following it.
--- a/docs/rfcs/020-pageserver-s3-coordination.md
+++ b/docs/rfcs/020-pageserver-s3-coordination.md
@@ -0,0 +1,246 @@
+# Coordinating access of multiple pageservers to the same s3 data
+
+## Motivation
+
+There are some blind spots around coordinating access of multiple pageservers
+to the same s3 data. Currently this is applicable only to tenant relocation
+case, but in the future we'll need to solve similar problems for
+replica/standby pageservers.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+Pageserver
+
+## The problem
+
+### Relocation
+
+During relocation both pageservers can write to s3. This should be ok for all
+data except the `index_part.json`. For index part it causes problems during
+compaction/gc because they remove files from index/s3.
+
+Imagine this case:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant S3
+    participant PS2
+
+    PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
+    PS2->>S3: Attach called, sees L1, L2
+    PS1->>S3: Compaction comes <br/> Removes L1, adds L3
+    note over S3: Index now L2, L3
+    PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
+    note over S3: Index now L1, L2, L4
+```
+
+At this point it is not possible to restore from the index: it contains L1, which
+is no longer available in s3, and doesn't contain L3 added by compaction on the
+first pageserver. So if any of the pageservers restarts, initial sync will fail
+(or in the on-demand world it will fail a bit later, during a page request from
+the missing layer).
+
+### Standby pageserver
+
+Another related case is standby pageserver. In this case second pageserver can
+be used as a replica to scale reads and serve as a failover target in case
+first one fails.
+
+In this mode second pageserver needs to have the same picture of s3 files to
+be able to load layers on-demand. To accomplish that second pageserver
+cannot run gc/compaction jobs. Instead it needs to receive updates for index
+contents. (There is no need to run walreceiver on the second pageserver then).
+
+## Observations
+
+- If both pageservers ingest wal then their layer set diverges, because layer
+  file generation is not deterministic
+- If one of the pageservers does not ingest wal (and just picks up layer
+  updates) then it lags behind and cannot really answer queries in the same
+  pace as the primary one
+- Can compaction help make layers deterministic? E g we do not upload level
+  zero layers and construction of higher levels should be deterministic.
+  This way we can guarantee that layer creation by timeout won't mess things up.
+  This way one pageserver uploads data and second one can just ingest it.
+  But we still need some form of election
+
+## Solutions
+
+### Manual orchestration
+
+One possible solution for relocation case is to orchestrate background jobs
+from outside. The oracle who runs migration can turn off background jobs on
+PS1 before migration and then run migration -> enable them on PS2. The problem
+comes if migration fails. In this case in order to resume background jobs
+oracle needs to guarantee that PS2 doesn't run background jobs and if it doesn't
+respond then PS1 is stuck unable to run compaction/gc. This cannot be solved
+without human ensuring that no upload from PS2 can happen. In order to be able
+to resolve this automatically CAS is required on S3 side so pageserver can
+avoid overwriting index part if it is no longer the leading one
+
+Note that flag that disables background jobs needs to be persistent, because
+otherwise pageserver restart will clean it
+
+### Avoid index_part.json
+
+Index part consists of two parts, list of layers and metadata. List of layers
+can be easily obtained by `ListObjects` S3 API method. But what to do with
+metadata? Create metadata instance for each checkpoint and add some counter
+to the file name?
+
+Back to potentially long s3 ls.
+
+### Coordination based approach
+
+Do it like safekeepers choose a leader for WAL upload. Ping each other and decide
+based on some heuristics e g smallest node id. During relocation PS1 sends
+"resign" ping message so others can start election without waiting for a timeout.
+
+This still leaves metadata question open and non deterministic layers are a
+problem as well
+
+### Avoid metadata file
+
+One way to eliminate metadata file is to store it in layer files under some
+special key. This may resonate with intention to keep all relation sizes in
+some special segment to avoid initial download during size calculation.
+Maybe with that we can even store pre calculated value.
+
+As a downside each checkpoint gets 512 bytes larger.
+
+If we entirely avoid metadata file this opens up many approaches
+
+* * *
+
+During discussion it seems that we converged on the approach consisting of:
+
+- index files stored per pageserver in the same timeline directory. With that
+  index file name starts to look like: `<pageserver_node_id>_index_part.json`.
+  In such set up there are no concurrent overwrites of index file by different
+  pageservers.
+- For replica pageservers the solution would be for primary to broadcast index
+  changes to any followers with an ability to check index files in s3 and
+  restore the full state. To properly merge changes with index files we can use
+  a counter that is persisted in an index file, is incremented on every change
+  to it and passed along with broadcasted change. This way we can determine
+  whether we need to apply change to the index state or not.
+- Responsibility for running background jobs is assigned externally. Pageserver
+  keeps locally persistent flag for each tenant that indicates whether this
+  pageserver is considered as primary one or not. TODO what happens if we
+  crash and cannot start for some extended period of time? Control plane can
+  assign ownership to some other pageserver. Pageserver needs some way to check
+  if it's still the blessed one. Maybe by explicit request to control plane on
+  start.
+
+Requirement for deterministic layer generation was considered overly strict
+because of two reasons:
+
+- It can limit possible optimizations e g when pageserver wants to reshuffle
+  some data locally and doesn't want to coordinate this
+- The deterministic algorithm itself can change so during deployments for some
+  time there will be two different versions running at the same time which can
+  cause non determinism
+
+### External elections
+
+The above case with lost state in this schema with externally managed
+leadership is represented like this:
+
+Note that here we keep objects list in the index file.
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant CP as Control Plane
+    participant S3
+    participant PS2
+
+    note over PS1,PS2: PS1 starts up and still a leader
+    PS1->>CP: Am I still the leader for Tenant X?
+    activate CP
+    CP->>PS1: Yes
+    deactivate CP
+    PS1->>S3: Fetch PS1 index.
+    note over PS1: Continue operations, start background jobs
+    note over PS1,PS2: PS1 starts up and is not a leader anymore
+    PS1->>CP: Am I still the leader for Tenant X?
+    CP->>PS1: No
+    PS1->>PS2: Subscribe to index changes
+    PS1->>S3: Fetch PS1 and PS2 indexes
+    note over PS1: Combine index file to include layers <br> from both indexes to be able <br> to see newer files from leader (PS2)
+    note over PS1: Continue operations, do not start background jobs
+```
+
+### Internal elections
+
+To manage leadership internally we can use broker to exchange pings so nodes
+can decide on the leader roles. In case multiple pageservers are active leader
+is the one with lowest node id.
+
+Operations with internally managed elections:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant S3
+
+    note over PS1: Starts up
+    note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
+    PS1->>S3: Fetch indexes from s3
+    alt there is a leader
+        note over PS1: do not start background jobs, <br> continue applying index updates
+    else there is no leader
+        note over PS1: start background jobs, <br> broadcast index changes
+    end
+
+    note over PS1,S3: Then the picture is similar to external elections <br> the difference is that follower can become a leader <br> if there are no pings after some timeout new leader gets elected
+```
+
+### Eviction
+
+When two pageservers operate on a tenant for extended period of time follower
+doesn't perform write operations in s3. When layer is evicted follower relies
+on updates from primary to get info about layers it needs to cover range for
+evicted layer.
+
+Note that it won't match evicted layer exactly, so layers will overlap and
+lookup code needs to correctly handle that.
+
+### Relocation flow
+
+Actions become:
+
+- Attach tenant to new pageserver
+- New pageserver becomes follower since previous one is still leading
+- New pageserver starts replicating from safekeepers but does not upload layers
+- Detach is called on the old one
+- New pageserver becomes leader after it realizes that old one disappeared
+
+### Index File
+
+Using `s3 ls` on startup simplifies things, but we still need metadata, so we
+need to fetch index files anyway. If they contain list of files we can combine
+them and avoid costly `s3 ls`
+
+### Remaining issues
+
+- More than one remote consistent lsn for safekeepers to know
+
+Anything else?
+
+### Proposed solution
+
+To recap. At the meeting we converged on the approach with external elections but I
+think it will be overall harder to manage and will introduce a dependency on
+control plane for pageserver. Using separate index files for each pageserver
+consisting of log of operations and a metadata snapshot should be enough.
+
+### What we need to get there?
+
+- Change index file structure to contain log of changes instead of just the
+  file list
+- Implement pinging/elections for pageservers
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -52,6 +52,10 @@ PostgreSQL extension that implements storage manager API and network communicati

 PostgreSQL extension that contains functions needed for testing and debugging.

+`/pgxn/neon_walredo`:
+
+Library to run Postgres as a "WAL redo process" in the pageserver.
+
 `/safekeeper`:

 The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -9,6 +9,7 @@ serde_with = "2.0"
 const_format = "0.2.21"
 anyhow = { version = "1.0", features = ["backtrace"] }
 bytes = "1.0.1"
+byteorder = "1.4.3"

 utils = { path = "../utils" }
 postgres_ffi = { path = "../postgres_ffi" }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,5 +1,6 @@
 use std::num::NonZeroU64;

+use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::{
@@ -9,7 +10,7 @@ use utils::{

 use crate::reltag::RelTag;
 use anyhow::bail;
-use bytes::{Buf, BufMut, Bytes, BytesMut};
+use bytes::{BufMut, Bytes, BytesMut};

 /// A state of a tenant in pageserver's memory.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -72,6 +73,7 @@ pub struct TenantCreateRequest {
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
 }

 #[serde_as]
@@ -111,6 +113,7 @@ pub struct TenantConfigRequest {
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
 }

 impl TenantConfigRequest {
@@ -129,6 +132,7 @@ impl TenantConfigRequest {
            walreceiver_connect_timeout: None,
            lagging_wal_timeout: None,
            max_lsn_wal_lag: None,
+            trace_read_requests: None,
        }
    }
 }
@@ -225,6 +229,7 @@ pub struct TimelineGcRequest {
 }

 // Wrapped in libpq CopyData
+#[derive(PartialEq, Eq)]
 pub enum PagestreamFeMessage {
    Exists(PagestreamExistsRequest),
    Nblocks(PagestreamNblocksRequest),
@@ -241,21 +246,21 @@ pub enum PagestreamBeMessage {
    DbSize(PagestreamDbSizeResponse),
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamExistsRequest {
    pub latest: bool,
    pub lsn: Lsn,
    pub rel: RelTag,
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamNblocksRequest {
    pub latest: bool,
    pub lsn: Lsn,
    pub rel: RelTag,
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetPageRequest {
    pub latest: bool,
    pub lsn: Lsn,
@@ -263,7 +268,7 @@ pub struct PagestreamGetPageRequest {
    pub blkno: u32,
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamDbSizeRequest {
    pub latest: bool,
    pub lsn: Lsn,
@@ -296,52 +301,98 @@ pub struct PagestreamDbSizeResponse {
 }

 impl PagestreamFeMessage {
-    pub fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
+    pub fn serialize(&self) -> Bytes {
+        let mut bytes = BytesMut::new();
+
+        match self {
+            Self::Exists(req) => {
+                bytes.put_u8(0);
+                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u64(req.lsn.0);
+                bytes.put_u32(req.rel.spcnode);
+                bytes.put_u32(req.rel.dbnode);
+                bytes.put_u32(req.rel.relnode);
+                bytes.put_u8(req.rel.forknum);
+            }
+
+            Self::Nblocks(req) => {
+                bytes.put_u8(1);
+                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u64(req.lsn.0);
+                bytes.put_u32(req.rel.spcnode);
+                bytes.put_u32(req.rel.dbnode);
+                bytes.put_u32(req.rel.relnode);
+                bytes.put_u8(req.rel.forknum);
+            }
+
+            Self::GetPage(req) => {
+                bytes.put_u8(2);
+                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u64(req.lsn.0);
+                bytes.put_u32(req.rel.spcnode);
+                bytes.put_u32(req.rel.dbnode);
+                bytes.put_u32(req.rel.relnode);
+                bytes.put_u8(req.rel.forknum);
+                bytes.put_u32(req.blkno);
+            }
+
+            Self::DbSize(req) => {
+                bytes.put_u8(3);
+                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u64(req.lsn.0);
+                bytes.put_u32(req.dbnode);
+            }
+        }
+
+        bytes.into()
+    }
+
+    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
        // TODO these gets can fail

        // these correspond to the NeonMessageTag enum in pagestore_client.h
        //
        // TODO: consider using protobuf or serde bincode for less error prone
        // serialization.
-        let msg_tag = body.get_u8();
+        let msg_tag = body.read_u8()?;
        match msg_tag {
            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
+                    spcnode: body.read_u32::<BigEndian>()?,
+                    dbnode: body.read_u32::<BigEndian>()?,
+                    relnode: body.read_u32::<BigEndian>()?,
+                    forknum: body.read_u8()?,
                },
            })),
            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
+                    spcnode: body.read_u32::<BigEndian>()?,
+                    dbnode: body.read_u32::<BigEndian>()?,
+                    relnode: body.read_u32::<BigEndian>()?,
+                    forknum: body.read_u8()?,
                },
            })),
            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
-                    spcnode: body.get_u32(),
-                    dbnode: body.get_u32(),
-                    relnode: body.get_u32(),
-                    forknum: body.get_u8(),
+                    spcnode: body.read_u32::<BigEndian>()?,
+                    dbnode: body.read_u32::<BigEndian>()?,
+                    relnode: body.read_u32::<BigEndian>()?,
+                    forknum: body.read_u8()?,
                },
-                blkno: body.get_u32(),
+                blkno: body.read_u32::<BigEndian>()?,
            })),
            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                latest: body.get_u8() != 0,
-                lsn: Lsn::from(body.get_u64()),
-                dbnode: body.get_u32(),
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+                dbnode: body.read_u32::<BigEndian>()?,
            })),
-            _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
+            _ => bail!("unknown smgr message tag: {:?}", msg_tag),
        }
    }
 }
@@ -380,3 +431,58 @@ impl PagestreamBeMessage {
        bytes.into()
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use bytes::Buf;
+
+    use super::*;
+
+    #[test]
+    fn test_pagestream() {
+        // Test serialization/deserialization of PagestreamFeMessage
+        let messages = vec![
+            PagestreamFeMessage::Exists(PagestreamExistsRequest {
+                latest: true,
+                lsn: Lsn(4),
+                rel: RelTag {
+                    forknum: 1,
+                    spcnode: 2,
+                    dbnode: 3,
+                    relnode: 4,
+                },
+            }),
+            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
+                latest: false,
+                lsn: Lsn(4),
+                rel: RelTag {
+                    forknum: 1,
+                    spcnode: 2,
+                    dbnode: 3,
+                    relnode: 4,
+                },
+            }),
+            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                latest: true,
+                lsn: Lsn(4),
+                rel: RelTag {
+                    forknum: 1,
+                    spcnode: 2,
+                    dbnode: 3,
+                    relnode: 4,
+                },
+                blkno: 7,
+            }),
+            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                latest: true,
+                lsn: Lsn(4),
+                dbnode: 7,
+            }),
+        ];
+        for msg in messages {
+            let bytes = msg.serialize();
+            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
+            assert!(msg == reconstructed);
+        }
+    }
+}
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "pq_proto"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+anyhow = "1.0"
+bytes = "1.0.1"
+pin-project-lite = "0.2.7"
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+rand = "0.8.3"
+serde = { version = "1.0", features = ["derive"] }
+tokio = { version = "1.17", features = ["macros"] }
+tracing = "0.1"
+
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/utils/src/pq_proto.rs
+++ b/libs/utils/src/pq_proto.rs
@@ -2,7 +2,9 @@
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.

-use crate::sync::{AsyncishRead, SyncFuture};
+// Tools for calling certain async methods in sync contexts.
+pub mod sync;
+
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use postgres_protocol::PG_EPOCH;
@@ -16,6 +18,7 @@ use std::{
    str,
    time::{Duration, SystemTime},
 };
+use sync::{AsyncishRead, SyncFuture};
 use tokio::io::AsyncReadExt;
 use tracing::{trace, warn};

@@ -198,7 +201,7 @@ impl FeMessage {
    ///
    /// ```
    /// # use std::io;
-    /// # use utils::pq_proto::FeMessage;
+    /// # use pq_proto::FeMessage;
    /// #
    /// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
    /// #     Ok(())
@@ -302,6 +305,7 @@ impl FeStartupPacket {
                Err(e) => return Err(e.into()),
            };

+            #[allow(clippy::manual_range_contains)]
            if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
                bail!("invalid message length");
            }
--- a/libs/pq_proto/src/sync.rs
+++ b/libs/pq_proto/src/sync.rs
@@ -29,7 +29,7 @@ impl<S, T: Future> SyncFuture<S, T> {
    /// Example:
    ///
    /// ```
-    /// # use utils::sync::SyncFuture;
+    /// # use pq_proto::sync::SyncFuture;
    /// # use std::future::Future;
    /// # use tokio::io::AsyncReadExt;
    /// #
--- a/libs/tenant_size_model/.gitignore
+++ b/libs/tenant_size_model/.gitignore
@@ -0,0 +1,3 @@
+*.dot
+*.png
+*.svg
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "tenant_size_model"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[dependencies]
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/tenant_size_model/Makefile
+++ b/libs/tenant_size_model/Makefile
@@ -0,0 +1,13 @@
+all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png
+
+../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs
+	cargo build --bin tenant_size_model
+
+%.svg: %.dot
+	dot -Tsvg $< > $@
+
+%.png: %.dot
+	dot -Tpng $< > $@
+
+%.dot: ../../target/debug/tenant_size_model
+	../../target/debug/tenant_size_model $* > $@
--- a/libs/tenant_size_model/README.md
+++ b/libs/tenant_size_model/README.md
@@ -0,0 +1,7 @@
+# Logical size + WAL pricing
+
+This is a simulator to calculate the tenant size in different scenarios,
+using the "Logical size + WAL" method. Makefile produces diagrams used in a
+private presentation:
+
+https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -0,0 +1,349 @@
+use std::borrow::Cow;
+use std::collections::HashMap;
+
+/// Pricing model or history size builder.
+///
+/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
+/// type.
+pub struct Storage<K: 'static> {
+    segments: Vec<Segment>,
+
+    /// Mapping from the branch name to the index of a segment describing its latest state.
+    branches: HashMap<K, usize>,
+}
+
+/// Snapshot of a branch.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct Segment {
+    /// Previous segment index into [`Storage::segments`], if any.
+    parent: Option<usize>,
+
+    /// Description of how we got to this state.
+    ///
+    /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
+    /// modifying a branch directly.
+    pub op: Cow<'static, str>,
+
+    /// LSN before this state
+    start_lsn: u64,
+
+    /// LSN at this state
+    pub end_lsn: u64,
+
+    /// Logical size before this state
+    start_size: u64,
+
+    /// Logical size at this state
+    pub end_size: u64,
+
+    /// Indices to [`Storage::segments`]
+    ///
+    /// FIXME: this could be an Option<usize>
+    children_after: Vec<usize>,
+
+    /// Determined by `retention_period` given to [`Storage::calculate`]
+    pub needed: bool,
+}
+
+//
+//
+//
+//
+//                 *-g--*---D--->
+//                /
+//               /
+//              /                 *---b----*-B--->
+//             /                 /
+//            /                 /
+//      -----*--e---*-----f----* C
+//           E                  \
+//                               \
+//                                *--a---*---A-->
+//
+// If A and B need to be retained, is it cheaper to store
+// snapshot at C+a+b, or snapshots at A and B ?
+//
+// If D also needs to be retained, which is cheaper:
+//
+// 1. E+g+e+f+a+b
+// 2. D+C+a+b
+// 3. D+A+B
+
+/// [`Segment`] which has had its size calculated.
+pub struct SegmentSize {
+    pub seg_id: usize,
+
+    pub method: SegmentMethod,
+
+    this_size: u64,
+
+    pub children: Vec<SegmentSize>,
+}
+
+impl SegmentSize {
+    /// Size charged to this node plus the recursive totals of all of its children.
+    fn total(&self) -> u64 {
+        self.this_size + self.children.iter().map(Self::total).sum::<u64>()
+    }
+
+    /// Recursive total of the children, with this node's own size added only when its
+    /// method is `SnapshotAfter`.
+    pub fn total_children(&self) -> u64 {
+        let own_size = if self.method == SnapshotAfter { self.this_size } else { 0 };
+        own_size + self.children.iter().map(Self::total).sum::<u64>()
+    }
+}
+
+/// Different methods to retain history from a particular state
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum SegmentMethod {
+    SnapshotAfter,
+    Wal,
+    WalNeeded,
+    Skipped,
+}
+
+use SegmentMethod::*;
+
+impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
+    /// Creates a new storage with the given default branch name.
+    pub fn new(initial_branch: K) -> Storage<K> {
+        let init_segment = Segment {
+            op: "".into(),
+            needed: false,
+            parent: None,
+            start_lsn: 0,
+            end_lsn: 0,
+            start_size: 0,
+            end_size: 0,
+            children_after: Vec::new(),
+        };
+
+        Storage {
+            segments: vec![init_segment],
+            branches: HashMap::from([(initial_branch, 0)]),
+        }
+    }
+
+    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
+    /// Panics if `branch` is unknown or if the logical size would become negative.
+    pub fn modify_branch<Q: ?Sized>(
+        &mut self,
+        branch: &Q,
+        op: Cow<'static, str>,
+        lsn_bytes: u64,
+        size_bytes: i64,
+    ) where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        let branch_head = self.branches.get_mut(branch).expect("unknown branch");
+        let lastseg_id = *branch_head;
+        let newseg_id = self.segments.len();
+        let lastseg = &mut self.segments[lastseg_id];
+        let newseg = Segment {
+            op,
+            parent: Some(lastseg_id),
+            start_lsn: lastseg.end_lsn,
+            end_lsn: lastseg.end_lsn + lsn_bytes,
+            start_size: lastseg.end_size,
+            end_size: u64::try_from(lastseg.end_size as i64 + size_bytes).expect("negative size"),
+            children_after: Vec::new(),
+            needed: false,
+        };
+        lastseg.children_after.push(newseg_id);
+        self.segments.push(newseg);
+        *branch_head = newseg_id;
+    }
+
+    pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
+    where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
+    }
+
+    pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
+    where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        self.modify_branch(branch, "update".into(), bytes, 0i64);
+    }
+
+    pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
+    where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
+    }
+
+    /// Creates branch `name` starting at the latest segment of `parent`.
+    /// Panics if the parent branch cannot be found.
+    pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K)
+    where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        // Find the right segment
+        let branchseg_id = *self
+            .branches
+            .get(parent)
+            .expect("should had found the parent by key");
+
+        // Create branch name for it
+        self.branches.insert(name, branchseg_id);
+    }
+
+    pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
+        // Phase 1: Mark all the segments that need to be retained
+        for (_branch, &last_seg_id) in self.branches.iter() {
+            let last_seg = &self.segments[last_seg_id];
+            let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
+            let mut seg_id = last_seg_id;
+            loop {
+                let seg = &mut self.segments[seg_id];
+                if seg.end_lsn < cutoff_lsn {
+                    break;
+                }
+                seg.needed = true;
+                if let Some(prev_seg_id) = seg.parent {
+                    seg_id = prev_seg_id;
+                } else {
+                    break;
+                }
+            }
+        }
+
+        // Phase 2: For each oldest segment in a chain that needs to be retained,
+        // calculate if we should store snapshot or WAL
+        self.size_from_snapshot_later(0)
+    }
+
+    fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
+        let seg = &self.segments[seg_id];
+
+        let this_size = seg.end_lsn - seg.start_lsn;
+
+        let mut children = Vec::new();
+
+        // try both ways
+        for &child_id in seg.children_after.iter() {
+            // try each child both ways
+            let child = &self.segments[child_id];
+            let p1 = self.size_from_wal(child_id);
+
+            let p = if !child.needed {
+                let p2 = self.size_from_snapshot_later(child_id);
+                if p1.total() < p2.total() {
+                    p1
+                } else {
+                    p2
+                }
+            } else {
+                p1
+            };
+            children.push(p);
+        }
+        SegmentSize {
+            seg_id,
+            method: if seg.needed { WalNeeded } else { Wal },
+            this_size,
+            children,
+        }
+    }
+
+    /// Computes the size of the subtree rooted at `seg_id`.
+    ///
+    /// A needed segment is charged its `start_size` and each child is sized from WAL (a child that
+    /// is not needed gets the cheaper of both methods). Otherwise the cheaper applicable option of
+    /// skipping this segment and snapshotting after it is returned.
+    fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
+        // If this is needed, then it's time to do the snapshot and continue
+        // with wal method.
+        let seg = &self.segments[seg_id];
+        if seg.needed {
+            let mut children = Vec::new();
+
+            for &child_id in seg.children_after.iter() {
+                // try each child both ways
+                let child = &self.segments[child_id];
+                let p1 = self.size_from_wal(child_id);
+
+                let p = if !child.needed {
+                    let p2 = self.size_from_snapshot_later(child_id);
+                    if p1.total() < p2.total() {
+                        p1
+                    } else {
+                        p2
+                    }
+                } else {
+                    p1
+                };
+                children.push(p);
+            }
+            SegmentSize {
+                seg_id,
+                method: WalNeeded,
+                this_size: seg.start_size,
+                children,
+            }
+        } else {
+            // If any of the direct children are "needed", need to be able to reconstruct here
+            let children_needed = seg
+                .children_after
+                .iter()
+                .any(|&child| self.segments[child].needed);
+
+            let method1 = if !children_needed {
+                let mut children = Vec::new();
+                for child in seg.children_after.iter() {
+                    children.push(self.size_from_snapshot_later(*child));
+                }
+                Some(SegmentSize {
+                    seg_id,
+                    method: Skipped,
+                    this_size: 0,
+                    children,
+                })
+            } else {
+                None
+            };
+
+            // If this is a junction, consider snapshotting here
+            let method2 = if children_needed || seg.children_after.len() >= 2 {
+                let mut children = Vec::new();
+                for child in seg.children_after.iter() {
+                    children.push(self.size_from_wal(*child));
+                }
+                Some(SegmentSize {
+                    seg_id,
+                    method: SnapshotAfter,
+                    this_size: seg.end_size,
+                    children,
+                })
+            } else {
+                None
+            };
+
+            match (method1, method2) {
+                (None, None) => unreachable!("either skip or snapshot always applies"),
+                (Some(method), None) => method,
+                (None, Some(method)) => method,
+                (Some(method1), Some(method2)) => {
+                    if method1.total() < method2.total() {
+                        method1
+                    } else {
+                        method2
+                    }
+                }
+            }
+        }
+    }
+
+    pub fn into_segments(self) -> Vec<Segment> {
+        self.segments
+    }
+}
--- a/libs/tenant_size_model/src/main.rs
+++ b/libs/tenant_size_model/src/main.rs
@@ -0,0 +1,268 @@
+//! Tenant size model testing ground.
+//!
+//! Has a number of scenarios and a `main` for invoking these by number; it calculates the history
+//! size and outputs a graphviz graph. The Makefile in this directory shows how to use graphviz to
+//! turn the scenarios into pngs.
+
+use tenant_size_model::{Segment, SegmentSize, Storage};
+
+// Main branch only. Some updates on it.
+fn scenario_1() -> (Vec<Segment>, SegmentSize) {
+    // Create main branch
+    let mut storage = Storage::new("main");
+
+    // Bulk load 5 GB of data to it
+    storage.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    let size = storage.calculate(1000);
+
+    (storage.into_segments(), size)
+}
+
+// Main branch with some updates, plus a child branch with one update of its own.
+fn scenario_2() -> (Vec<Segment>, SegmentSize) {
+    // Create main branch
+    let mut storage = Storage::new("main");
+
+    // Bulk load 5 GB of data to it
+    storage.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    // Branch
+    storage.branch("main", "child");
+    storage.update("child", 1_000);
+
+    // More updates on parent
+    storage.update("main", 1_000);
+
+    let size = storage.calculate(1000);
+
+    (storage.into_segments(), size)
+}
+
+// Like 2, but more updates on main
+fn scenario_3() -> (Vec<Segment>, SegmentSize) {
+    // Create main branch
+    let mut storage = Storage::new("main");
+
+    // Bulk load 5 GB of data to it
+    storage.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    // Branch
+    storage.branch("main", "child");
+    storage.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    let size = storage.calculate(1000);
+
+    (storage.into_segments(), size)
+}
+
+// Diverged branches
+fn scenario_4() -> (Vec<Segment>, SegmentSize) {
+    // Create main branch
+    let mut storage = Storage::new("main");
+
+    // Bulk load 5 GB of data to it
+    storage.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        storage.update("main", 1_000);
+    }
+
+    // Branch
+    storage.branch("main", "child");
+    storage.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..8 {
+        storage.update("main", 1_000);
+    }
+
+    let size = storage.calculate(1000);
+
+    (storage.into_segments(), size)
+}
+
+fn scenario_5() -> (Vec<Segment>, SegmentSize) {
+    let mut storage = Storage::new("a");
+    storage.insert("a", 5000);
+    storage.branch("a", "b");
+    storage.update("b", 4000);
+    storage.update("a", 2000);
+    storage.branch("a", "c");
+    storage.insert("c", 4000);
+    storage.insert("a", 2000);
+
+    let size = storage.calculate(5000);
+
+    (storage.into_segments(), size)
+}
+
+fn scenario_6() -> (Vec<Segment>, SegmentSize) {
+    use std::borrow::Cow;
+
+    const NO_OP: Cow<'static, str> = Cow::Borrowed("");
+
+    let branches = [
+        Some(0x7ff1edab8182025f15ae33482edb590a_u128),
+        Some(0xb1719e044db05401a05a2ed588a3ad3f),
+        Some(0xb68d6691c895ad0a70809470020929ef),
+    ];
+
+    // compared to other scenarios, this one uses bytes instead of kB
+
+    let mut storage = Storage::new(None);
+
+    storage.branch(&None, branches[0]); // at 0
+    storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
+    storage.branch(&branches[0], branches[1]); // at 108951064
+    storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
+    storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
+    storage.branch(&branches[0], branches[2]); // at 283415424
+    storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
+    storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
+
+    let size = storage.calculate(100_000);
+
+    (storage.into_segments(), size)
+}
+
+/// Runs the scenario selected by the first command-line argument (scenario "1" when absent)
+/// and prints its graphviz tree; an unrecognized scenario is reported and exits with status 1.
+fn main() {
+    let args: Vec<String> = std::env::args().collect();
+    let scenario = args.get(1).map(String::as_str).unwrap_or("1");
+    let (segments, size) = match scenario {
+        "1" => scenario_1(),
+        "2" => scenario_2(),
+        "3" => scenario_3(),
+        "4" => scenario_4(),
+        "5" => scenario_5(),
+        "6" => scenario_6(),
+        unknown => {
+            eprintln!("invalid scenario {}", unknown);
+            std::process::exit(1);
+        }
+    };
+
+    graphviz_tree(&segments, &size);
+}
+
+fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
+    use tenant_size_model::SegmentMethod::*;
+
+    let seg_id = node.seg_id;
+    let seg = segments.get(seg_id).unwrap();
+    let lsn = seg.end_lsn;
+    let size = seg.end_size;
+    let method = node.method;
+
+    println!("  {{");
+    println!("    node [width=0.1 height=0.1 shape=oval]");
+
+    let tenant_size = node.total_children();
+
+    let penwidth = if seg.needed { 6 } else { 3 };
+    let x = match method {
+        SnapshotAfter =>
+            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
+        Wal =>
+            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
+        WalNeeded =>
+            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
+        Skipped =>
+            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
+    };
+
+    println!("    \"seg{seg_id}\" [{x}]");
+    println!("  }}");
+
+    // Recurse. Much of the data is actually on the edge
+    for child in node.children.iter() {
+        let child_id = child.seg_id;
+        graphviz_recurse(segments, child);
+
+        let edge_color = match child.method {
+            SnapshotAfter => "gray",
+            Wal => "black",
+            WalNeeded => "black",
+            Skipped => "gray",
+        };
+
+        println!("  {{");
+        println!("    edge [] ");
+        print!("    \"seg{seg_id}\" -> \"seg{child_id}\" [");
+        print!("color={edge_color}");
+        if child.method == WalNeeded {
+            print!(" penwidth=6");
+        }
+        if child.method == Wal {
+            print!(" penwidth=3");
+        }
+
+        let next = segments.get(child_id).unwrap();
+
+        if next.op.is_empty() {
+            print!(
+                " label=\"{} / {}\"",
+                next.end_lsn - seg.end_lsn,
+                (next.end_size as i128 - seg.end_size as i128)
+            );
+        } else {
+            print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
+        }
+        println!("]");
+        println!("  }}");
+    }
+}
+
+fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
+    println!("digraph G {{");
+    println!("  fontname=\"Helvetica,Arial,sans-serif\"");
+    println!("  node [fontname=\"Helvetica,Arial,sans-serif\"]");
+    println!("  edge [fontname=\"Helvetica,Arial,sans-serif\"]");
+    println!("  graph [center=1 rankdir=LR]");
+    println!("  edge [dir=none]");
+
+    graphviz_recurse(segments, tree);
+
+    println!("}}");
+}
+
+#[test]
+fn scenarios_return_same_size() {
+    type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
+    let truths: &[(u32, ScenarioFn, _)] = &[
+        (line!(), scenario_1, 8000),
+        (line!(), scenario_2, 9000),
+        (line!(), scenario_3, 13000),
+        (line!(), scenario_4, 16000),
+        (line!(), scenario_5, 17000),
+        (line!(), scenario_6, 333_792_000),
+    ];
+
+    for (line, scenario, expected) in truths {
+        let (_, size) = scenario();
+        assert_eq!(*expected, size.total_children(), "scenario on line {line}");
+    }
+}
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -9,9 +9,6 @@ anyhow = "1.0"
 bincode = "1.3"
 bytes = "1.0.1"
 hyper = { version = "0.14.7", features = ["full"] }
-pin-project-lite = "0.2.7"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 routerify = "3"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
@@ -33,8 +30,8 @@ once_cell = "1.13.0"
 strum = "0.24"
 strum_macros = "0.24"

-
 metrics = { path = "../metrics" }
+pq_proto = { path = "../pq_proto" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [dev-dependencies]
--- a/libs/utils/src/connstring.rs
+++ b/libs/utils/src/connstring.rs
@@ -1,52 +0,0 @@
-use postgres::Config;
-
-pub fn connection_host_port(config: &Config) -> (String, u16) {
-    assert_eq!(
-        config.get_hosts().len(),
-        1,
-        "only one pair of host and port is supported in connection string"
-    );
-    assert_eq!(
-        config.get_ports().len(),
-        1,
-        "only one pair of host and port is supported in connection string"
-    );
-    let host = match &config.get_hosts()[0] {
-        postgres::config::Host::Tcp(host) => host.as_ref(),
-        postgres::config::Host::Unix(host) => host.to_str().unwrap(),
-    };
-    (host.to_owned(), config.get_ports()[0])
-}
-
-pub fn connection_address(config: &Config) -> String {
-    let (host, port) = connection_host_port(config);
-    format!("{}:{}", host, port)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_connection_host_port() {
-        let config: Config = "postgresql://no_user@localhost:64000/no_db"
-            .parse()
-            .unwrap();
-        assert_eq!(
-            connection_host_port(&config),
-            ("localhost".to_owned(), 64000)
-        );
-    }
-
-    #[test]
-    #[should_panic(expected = "only one pair of host and port is supported in connection string")]
-    fn test_connection_host_port_multiple_ports() {
-        let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
-            .parse()
-            .unwrap();
-        assert_eq!(
-            connection_host_port(&config),
-            ("localhost".to_owned(), 64000)
-        );
-    }
-}
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -204,6 +204,17 @@ pub struct TenantId(Id);

 id_newtype!(TenantId);

+/// Neon Connection Id identifies long-lived connections (for example a pagestream
+/// connection with the page_service). It is used for better logging and tracing.
+///
+/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
+/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
+/// See [`Id`] for alternative ways to serialize it.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
+pub struct ConnectionId(Id);
+
+id_newtype!(ConnectionId);
+
 // A pair uniquely identifying Neon instance.
 #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct TenantTimelineId {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,8 +1,6 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.

-#![allow(clippy::manual_range_contains)]
-
 /// `Lsn` type implements common tasks on Log Sequence Numbers
 pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
@@ -17,10 +15,6 @@ pub mod vec_map;
 pub mod bin_ser;
 pub mod postgres_backend;
 pub mod postgres_backend_async;
-pub mod pq_proto;
-
-// dealing with connstring parsing and handy access to it's parts
-pub mod connstring;

 // helper functions for creating and fsyncing
 pub mod crashsafe;
@@ -39,13 +33,12 @@ pub mod sock_split;
 // common log initialisation routine
 pub mod logging;

+pub mod lock_file;
+
 // Misc
 pub mod accum;
 pub mod shutdown;

-// Tools for calling certain async methods in sync contexts
-pub mod sync;
-
 // Utility for binding TcpListeners with proper socket options.
 pub mod tcp_listener;

--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -0,0 +1,81 @@
+//! A module to create and read lock files. A lock file ensures that only one
+//! process is running at a time, in a particular directory.
+//!
+//! File locking is done using [`fcntl::flock`], which means that holding the
+//! lock on file only prevents acquiring another lock on it; all other
+//! operations are still possible on files. Other processes can still open, read,
+//! write, or remove the file, for example.
+//! If the file is removed while a process is holding a lock on it,
+//! the process that holds the lock does not get any error or notification.
+//! Furthermore, you can create a new file with the same name and lock the new file,
+//! while the old process is still running.
+//! Deleting the lock file while the locking process is still running is a bad idea!
+
+use std::{fs, os::unix::prelude::AsRawFd, path::Path};
+
+use anyhow::Context;
+use nix::fcntl;
+
+use crate::crashsafe;
+
+pub enum LockCreationResult {
+    Created {
+        new_lock_contents: String,
+        file: fs::File,
+    },
+    AlreadyLocked {
+        existing_lock_contents: String,
+    },
+    CreationFailed(anyhow::Error),
+}
+
+/// Creates a lock file in the path given and writes the given contents into the file.
+/// Note: The lock is automatically released when the file is closed. You might want to use Box::leak to make sure it lives until the end of the program.
+pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
+    use std::io::Write; // contents are written via the locked handle, not by reopening the path
+    let lock_file = match fs::OpenOptions::new()
+        .create(true) // O_CREAT
+        .write(true)
+        .open(lock_file_path)
+        .context("Failed to open lock file")
+    {
+        Ok(file) => file,
+        Err(e) => return LockCreationResult::CreationFailed(e),
+    };
+    match fcntl::flock(
+        lock_file.as_raw_fd(),
+        fcntl::FlockArg::LockExclusiveNonblock,
+    ) {
+        Ok(()) => {
+            match lock_file
+                .set_len(0)
+                .context("Failed to truncate lockfile")
+                .and_then(|()| {
+                    (&lock_file).write_all(contents.as_bytes()).with_context(|| {
+                        format!("Failed to write '{contents}' contents into lockfile")
+                    })
+                })
+                .and_then(|()| {
+                    crashsafe::fsync_file_and_parent(lock_file_path)
+                        .context("Failed to fsync lockfile")
+                }) {
+                Ok(()) => LockCreationResult::Created {
+                    new_lock_contents: contents,
+                    file: lock_file,
+                },
+                Err(e) => LockCreationResult::CreationFailed(e),
+            }
+        }
+        Err(nix::errno::Errno::EAGAIN) => {
+            match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
+                Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
+                    existing_lock_contents,
+                },
+                Err(e) => LockCreationResult::CreationFailed(e),
+            }
+        }
+        Err(e) => {
+            LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
+        }
+    }
+}
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,10 +1,6 @@
-use std::{
-    fs::{File, OpenOptions},
-    path::Path,
-    str::FromStr,
-};
+use std::str::FromStr;

-use anyhow::{Context, Result};
+use anyhow::Context;
 use strum_macros::{EnumString, EnumVariantNames};

 #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
@@ -25,19 +21,8 @@ impl LogFormat {
        })
    }
 }
-pub fn init(
-    log_filename: impl AsRef<Path>,
-    daemonize: bool,
-    log_format: LogFormat,
-) -> Result<File> {
-    // Don't open the same file for output multiple times;
-    // the different fds could overwrite each other's output.
-    let log_file = OpenOptions::new()
-        .create(true)
-        .append(true)
-        .open(&log_filename)
-        .with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;

+pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
    let default_filter_str = "info";

    // We fall back to printing all spans at info-level or above if
@@ -45,50 +30,16 @@ pub fn init(
    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));

-    let x: File = log_file.try_clone().unwrap();
    let base_logger = tracing_subscriber::fmt()
        .with_env_filter(env_filter)
        .with_target(false)
        .with_ansi(false)
-        .with_writer(move || -> Box<dyn std::io::Write> {
-            // we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
-            // if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
-            // for example to be in line with docker log command which expects logs comimg from stdout
-            if daemonize {
-                Box::new(x.try_clone().unwrap())
-            } else {
-                Box::new(std::io::stdout())
-            }
-        });
+        .with_writer(std::io::stdout);

    match log_format {
        LogFormat::Json => base_logger.json().init(),
        LogFormat::Plain => base_logger.init(),
    }

-    Ok(log_file)
-}
-
-// #[cfg(test)]
-// Due to global logger, can't run tests in same process.
-// So until there's a non-global one, the tests are in ../tests/ as separate files.
-#[macro_export(local_inner_macros)]
-macro_rules! test_init_file_logger {
-    ($log_level:expr, $log_format:expr) => {{
-        use std::str::FromStr;
-        std::env::set_var("RUST_LOG", $log_level);
-
-        let tmp_dir = tempfile::TempDir::new().unwrap();
-        let log_file_path = tmp_dir.path().join("logfile");
-
-        let log_format = $crate::logging::LogFormat::from_str($log_format).unwrap();
-        let _log_file = $crate::logging::init(&log_file_path, true, log_format).unwrap();
-
-        let log_file = std::fs::OpenOptions::new()
-            .read(true)
-            .open(&log_file_path)
-            .unwrap();
-
-        log_file
-    }};
+    Ok(())
 }
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -13,7 +13,7 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Serialize, Deserialize)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
 #[serde(transparent)]
 pub struct Lsn(pub u64);

@@ -138,7 +138,7 @@ impl FromStr for Lsn {
    ///
    /// If the input string is missing the '/' character, then use `Lsn::from_hex`
    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut splitter = s.split('/');
+        let mut splitter = s.trim().split('/');
        if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
        {
            let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
@@ -270,6 +270,11 @@ mod tests {
        );
        assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
        assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
+
+        let expected_lsn = Lsn(0x3C490F8);
+        assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
+        assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
+        assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
    }

    #[test]
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -3,10 +3,10 @@
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.

-use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use crate::sock_split::{BidiStream, ReadStream, WriteStream};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
+use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 use std::fmt;
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -4,9 +4,9 @@
 //! is rather narrow, but we can extend it once required.

 use crate::postgres_backend::AuthType;
-use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use anyhow::{bail, Context, Result};
 use bytes::{Bytes, BytesMut};
+use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use std::future::Future;
 use std::net::SocketAddr;
--- a/libs/utils/tests/logger_json_test.rs
+++ b/libs/utils/tests/logger_json_test.rs
@@ -1,36 +0,0 @@
-// This could be in ../src/logging.rs but since the logger is global, these
-// can't be run in threads of the same process
-use std::fs::File;
-use std::io::{BufRead, BufReader, Lines};
-use tracing::*;
-use utils::test_init_file_logger;
-
-fn read_lines(file: File) -> Lines<BufReader<File>> {
-    BufReader::new(file).lines()
-}
-
-#[test]
-fn test_json_format_has_message_and_custom_field() {
-    std::env::set_var("RUST_LOG", "info");
-
-    let log_file = test_init_file_logger!("info", "json");
-
-    let custom_field: &str = "hi";
-    trace!(custom = %custom_field, "test log message");
-    debug!(custom = %custom_field, "test log message");
-    info!(custom = %custom_field, "test log message");
-    warn!(custom = %custom_field, "test log message");
-    error!(custom = %custom_field, "test log message");
-
-    let lines = read_lines(log_file);
-    for line in lines {
-        let content = line.unwrap();
-        let json_object = serde_json::from_str::<serde_json::Value>(&content).unwrap();
-
-        assert_eq!(json_object["fields"]["custom"], "hi");
-        assert_eq!(json_object["fields"]["message"], "test log message");
-
-        assert_ne!(json_object["level"], "TRACE");
-        assert_ne!(json_object["level"], "DEBUG");
-    }
-}
--- a/libs/utils/tests/logger_plain_test.rs
+++ b/libs/utils/tests/logger_plain_test.rs
@@ -1,36 +0,0 @@
-// This could be in ../src/logging.rs but since the logger is global, these
-// can't be run in threads of the same process
-use std::fs::File;
-use std::io::{BufRead, BufReader, Lines};
-use tracing::*;
-use utils::test_init_file_logger;
-
-fn read_lines(file: File) -> Lines<BufReader<File>> {
-    BufReader::new(file).lines()
-}
-
-#[test]
-fn test_plain_format_has_message_and_custom_field() {
-    std::env::set_var("RUST_LOG", "warn");
-
-    let log_file = test_init_file_logger!("warn", "plain");
-
-    let custom_field: &str = "hi";
-    trace!(custom = %custom_field, "test log message");
-    debug!(custom = %custom_field, "test log message");
-    info!(custom = %custom_field, "test log message");
-    warn!(custom = %custom_field, "test log message");
-    error!(custom = %custom_field, "test log message");
-
-    let lines = read_lines(log_file);
-    for line in lines {
-        let content = line.unwrap();
-        serde_json::from_str::<serde_json::Value>(&content).unwrap_err();
-        assert!(content.contains("custom=hi"));
-        assert!(content.contains("test log message"));
-
-        assert!(!content.contains("TRACE"));
-        assert!(!content.contains("DEBUG"));
-        assert!(!content.contains("INFO"));
-    }
-}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,62 +12,61 @@ testing = ["fail/failpoints"]
 profiling = ["pprof"]

 [dependencies]
+amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
+anyhow = { version = "1.0", features = ["backtrace"] }
 async-stream = "0.3"
 async-trait = "0.1"
-chrono = "0.4.19"
-rand = "0.8.3"
-regex = "1.4.5"
-bytes = "1.0.1"
 byteorder = "1.4.3"
+bytes = "1.0.1"
+chrono = "0.4.19"
+clap = { version = "4.0", features = ["string"] }
+close_fds = "0.3.2"
+const_format = "0.2.21"
+crc32c = "0.6.0"
+crossbeam-utils = "0.8.5"
+fail = "0.5.0"
 futures = "0.3.13"
+git-version = "0.3.5"
 hex = "0.4.3"
+humantime = "2.1.0"
+humantime-serde = "1.1.1"
 hyper = "0.14"
 itertools = "0.10.3"
-clap = { version = "4.0", features = ["string"] }
-daemonize = "0.4.1"
-tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+nix = "0.25"
+num-traits = "0.2.15"
+once_cell = "1.13.0"
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-anyhow = { version = "1.0", features = ["backtrace"] }
-crc32c = "0.6.0"
-thiserror = "1.0"
-tar = "0.4.33"
-humantime = "2.1.0"
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
+rand = "0.8.3"
+regex = "1.4.5"
+rstar = "0.9.3"
+scopeguard = "1.1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
-humantime-serde = "1.1.1"
-
-pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
-
-toml_edit = { version = "0.14", features = ["easy"] }
-scopeguard = "1.1.0"
-const_format = "0.2.21"
-tracing = "0.1.36"
 signal-hook = "0.3.10"
+svg_fmt = "0.4.1"
+tar = "0.4.33"
+thiserror = "1.0"
+tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
+toml_edit = { version = "0.14", features = ["easy"] }
+tracing = "0.1.36"
 url = "2"
-nix = "0.25"
-once_cell = "1.13.0"
-crossbeam-utils = "0.8.5"
-fail = "0.5.0"
-git-version = "0.3.5"
-rstar = "0.9.3"
-num-traits = "0.2.15"
-amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
+walkdir = "2.3.2"

-pageserver_api = { path = "../libs/pageserver_api" }
-postgres_ffi = { path = "../libs/postgres_ffi" }
 etcd_broker = { path = "../libs/etcd_broker" }
 metrics = { path = "../libs/metrics" }
-utils = { path = "../libs/utils" }
+pageserver_api = { path = "../libs/pageserver_api" }
+postgres_ffi = { path = "../libs/postgres_ffi" }
+pq_proto = { path = "../libs/pq_proto" }
 remote_storage = { path = "../libs/remote_storage" }
+tenant_size_model = { path = "../libs/tenant_size_model" }
+utils = { path = "../libs/utils" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
-close_fds = "0.3.2"
-walkdir = "2.3.2"
-svg_fmt = "0.4.1"

 [dev-dependencies]
 criterion = "0.4"
@@ -77,3 +76,7 @@ tempfile = "3.2"
 [[bench]]
 name = "bench_layer_map"
 harness = false
+
+[[bench]]
+name = "bench_walredo"
+harness = false
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -1,17 +1,14 @@
 //! Main entry point for the Page Server executable.

-use remote_storage::GenericRemoteStorage;
 use std::{env, ops::ControlFlow, path::Path, str::FromStr};
+
+use anyhow::{anyhow, Context};
+use clap::{Arg, ArgAction, Command};
+use fail::FailScenario;
+use nix::unistd::Pid;
 use tracing::*;

-use anyhow::{anyhow, bail, Context, Result};
-
-use clap::{Arg, ArgAction, Command};
-use daemonize::Daemonize;
-
-use fail::FailScenario;
 use metrics::set_build_info_metric;
-
 use pageserver::{
    config::{defaults::*, PageServerConf},
    http, page_cache, page_service, profiling, task_mgr,
@@ -19,20 +16,22 @@ use pageserver::{
    task_mgr::{
        BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
    },
-    tenant_mgr, virtual_file, LOG_FILE_NAME,
+    tenant_mgr, virtual_file,
 };
+use remote_storage::GenericRemoteStorage;
 use utils::{
    auth::JwtAuth,
-    logging,
+    lock_file, logging,
    postgres_backend::AuthType,
    project_git_version,
-    shutdown::exit_now,
    signals::{self, Signal},
    tcp_listener,
 };

 project_git_version!(GIT_VERSION);

+const PID_FILE_NAME: &str = "pageserver.pid";
+
 const FEATURES: &[&str] = &[
    #[cfg(feature = "testing")]
    "testing",
@@ -65,6 +64,7 @@ fn main() -> anyhow::Result<()> {
    let workdir = workdir
        .canonicalize()
        .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
+
    let cfg_file_path = workdir.join("pageserver.toml");

    // Set CWD to workdir for non-daemon modes
@@ -75,8 +75,6 @@ fn main() -> anyhow::Result<()> {
        )
    })?;

-    let daemonize = arg_matches.get_flag("daemonize");
-
    let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
        ControlFlow::Continue(conf) => conf,
        ControlFlow::Break(()) => {
@@ -102,7 +100,7 @@ fn main() -> anyhow::Result<()> {
    virtual_file::init(conf.max_file_descriptors);
    page_cache::init(conf.page_cache_size);

-    start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
+    start_pageserver(conf).context("Failed to start pageserver")?;

    scenario.teardown();
    Ok(())
@@ -197,12 +195,34 @@ fn initialize_config(
    })
 }

-fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
-    // Initialize logger
-    let log_file = logging::init(LOG_FILE_NAME, daemonize, conf.log_format)?;
-
+fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
+    logging::init(conf.log_format)?;
    info!("version: {}", version());

+    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
+    let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
+        lock_file::LockCreationResult::Created {
+            new_lock_contents,
+            file,
+        } => {
+            info!("Created lock file at {lock_file_path:?} with contents {new_lock_contents}");
+            file // handed back to the caller, which keeps it alive for the rest of the process
+        }
+        lock_file::LockCreationResult::AlreadyLocked {
+            existing_lock_contents, // what the current lock holder wrote into the file (its PID)
+        } => anyhow::bail!(
+            "Could not lock pid file; pageserver is already running in {:?} with PID {}",
+            conf.workdir,
+            existing_lock_contents
+        ),
+        lock_file::LockCreationResult::CreationFailed(e) => {
+            return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
+        }
+    };
+    // Ensure that the lock file stays held even if the main thread of the process panics:
+    // the lock must be released only when the current process is gone.
+    let _ = Box::leak(Box::new(lock_file));
+
    // TODO: Check that it looks like a valid repository before going further

    // bind sockets before daemonizing so we report errors early and do not return until we are listening
@@ -218,33 +238,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
    );
    let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;

-    // NB: Don't spawn any threads before daemonizing!
-    if daemonize {
-        info!("daemonizing...");
-
-        // There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
-        // that we will see any accidental manual fprintf's or backtraces.
-        let stdout = log_file
-            .try_clone()
-            .with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
-        let stderr = log_file;
-
-        let daemonize = Daemonize::new()
-            .pid_file("pageserver.pid")
-            .working_directory(".")
-            .stdout(stdout)
-            .stderr(stderr);
-
-        // XXX: The parent process should exit abruptly right after
-        // it has spawned a child to prevent coverage machinery from
-        // dumping stats into a `profraw` file now owned by the child.
-        // Otherwise, the coverage data will be damaged.
-        match daemonize.exit_action(|| exit_now(0)).start() {
-            Ok(_) => info!("Success, daemonized"),
-            Err(err) => bail!("{err}. could not daemonize. bailing."),
-        }
-    }
-
    let signals = signals::install_shutdown_handlers()?;

    // start profiler (if enabled)
@@ -347,14 +340,6 @@ fn cli() -> Command {
    Command::new("Neon page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(version())
-        .arg(
-
-            Arg::new("daemonize")
-                .short('d')
-                .long("daemonize")
-                .action(ArgAction::SetTrue)
-                .help("Run in the background"),
-        )
        .arg(
            Arg::new("init")
                .long("init")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -8,7 +8,9 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use remote_storage::RemoteStorageConfig;
 use std::env;
 use utils::crashsafe::path_with_suffix_extension;
+use utils::id::ConnectionId;

+use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
 use std::time::Duration;
@@ -48,6 +50,9 @@ pub mod defaults {

    pub const DEFAULT_LOG_FORMAT: &str = "plain";

+    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
+        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
+
    ///
    /// Default built-in configuration file.
    ///
@@ -67,6 +72,9 @@ pub mod defaults {
 #initial_superuser_name = '{DEFAULT_SUPERUSER}'

 #log_format = '{DEFAULT_LOG_FORMAT}'
+
+#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -132,6 +140,9 @@ pub struct PageServerConf {
    pub broker_endpoints: Vec<Url>,

    pub log_format: LogFormat,
+
+    /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
+    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
 }

 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -200,6 +211,8 @@ struct PageServerConfigBuilder {
    broker_endpoints: BuilderValue<Vec<Url>>,

    log_format: BuilderValue<LogFormat>,
+
+    concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
 }

 impl Default for PageServerConfigBuilder {
@@ -228,6 +241,8 @@ impl Default for PageServerConfigBuilder {
            broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
            broker_endpoints: Set(Vec::new()),
            log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
+
+            concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
        }
    }
 }
@@ -304,6 +319,10 @@ impl PageServerConfigBuilder {
        self.log_format = BuilderValue::Set(log_format)
    }

+    pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
+        self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let broker_endpoints = self
            .broker_endpoints
@@ -349,6 +368,11 @@ impl PageServerConfigBuilder {
                .broker_etcd_prefix
                .ok_or(anyhow!("missing broker_etcd_prefix"))?,
            log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
+            concurrent_tenant_size_logical_size_queries: self
+                .concurrent_tenant_size_logical_size_queries
+                .ok_or(anyhow!(
+                    "missing concurrent_tenant_size_logical_size_queries"
+                ))?,
        })
    }
 }
@@ -391,6 +415,22 @@ impl PageServerConf {
        )
    }

+    pub fn traces_path(&self) -> PathBuf {
+        self.workdir.join("traces")
+    }
+
+    pub fn trace_path(
+        &self,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        connection_id: &ConnectionId,
+    ) -> PathBuf {
+        self.traces_path()
+            .join(tenant_id.to_string())
+            .join(timeline_id.to_string())
+            .join(connection_id.to_string())
+    }
+
    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
    pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
@@ -476,6 +516,12 @@ impl PageServerConf {
                "log_format" => builder.log_format(
                    LogFormat::from_config(&parse_toml_string(key, item)?)?
                ),
+                "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
+                    let input = parse_toml_string(key, item)?; // the value is given as a TOML string
+                    let permits = input.parse::<usize>().with_context(|| format!("expected a number of initial permits, not {input:?}"))?;
+                    let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
+                    ConfigurableSemaphore::new(permits)
+                }),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -568,8 +614,9 @@ impl PageServerConf {
        PathBuf::from(format!("../tmp_check/test_{test_name}"))
    }

-    #[cfg(test)]
    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
+        let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
+
        PageServerConf {
            id: NodeId(0),
            wait_lsn_timeout: Duration::from_secs(60),
@@ -580,7 +627,7 @@ impl PageServerConf {
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
            superuser: "cloud_admin".to_string(),
            workdir: repo_dir,
-            pg_distrib_dir: PathBuf::new(),
+            pg_distrib_dir,
            auth_type: AuthType::Trust,
            auth_validation_public_key_path: None,
            remote_storage_config: None,
@@ -589,6 +636,7 @@ impl PageServerConf {
            broker_endpoints: Vec::new(),
            broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
            log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
+            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
        }
    }
 }
@@ -654,6 +702,58 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
        .collect()
 }

+/// Configurable semaphore permits setting.
+///
+/// Does not allow the number of permits to be zero, because at runtime a semaphore created with
+/// zero permits cannot be distinguished from an exhausted one, leaving any feature that uses it to
+/// await forever (or until new permits are added).
+#[derive(Debug, Clone)]
+pub struct ConfigurableSemaphore {
+    initial_permits: NonZeroUsize,
+    inner: std::sync::Arc<tokio::sync::Semaphore>,
+}
+
+impl ConfigurableSemaphore {
+    pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
+        Some(x) => x,
+        None => panic!("const unwrap is not yet stable"),
+    };
+
+    /// Initialize using a non-zero number of permits.
+    ///
+    /// Requires non-zero initial permits, because using permits == 0 is a crude way to disable a
+    /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any future using the semaphore
+    /// would behave like [`futures::future::pending`], just waiting until new permits are added.
+    pub fn new(initial_permits: NonZeroUsize) -> Self {
+        ConfigurableSemaphore {
+            initial_permits,
+            inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
+        }
+    }
+}
+
+impl Default for ConfigurableSemaphore {
+    fn default() -> Self {
+        Self::new(Self::DEFAULT_INITIAL)
+    }
+}
+
+impl PartialEq for ConfigurableSemaphore {
+    fn eq(&self, other: &Self) -> bool {
+        // the number of permits can be increased at runtime, so we cannot really fulfill the
+        // PartialEq value equality otherwise
+        self.initial_permits == other.initial_permits
+    }
+}
+
+impl Eq for ConfigurableSemaphore {}
+
+impl ConfigurableSemaphore {
+    pub fn inner(&self) -> &std::sync::Arc<tokio::sync::Semaphore> {
+        &self.inner
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::{
@@ -725,6 +825,7 @@ log_format = 'json'
                    .expect("Failed to parse a valid broker endpoint URL")],
                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
                log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
+                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -770,6 +871,7 @@ log_format = 'json'
                    .expect("Failed to parse a valid broker endpoint URL")],
                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
                log_format: LogFormat::Json,
+                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -354,6 +354,54 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

+  /v1/tenant/{tenant_id}/size:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: |
+        Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: object
+                required:
+                  - id
+                  - size
+                properties:
+                  id:
+                    type: string
+                    format: hex
+                  size:
+                    type: integer
+                    description: |
+                      Size metric in bytes.
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}/timeline/:
    parameters:
      - name: tenant_id
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -227,13 +227,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,

    let state = get_state(&request);

-    let timelines = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
+    let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
        let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
        Ok(tenant.list_timelines())
-    })
-    .await
-    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
+    })?;

    let mut response_data = Vec::with_capacity(timelines.len());
    for timeline in timelines {
@@ -523,9 +520,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
    check_permission(&request, Some(tenant_id))?;

    // if tenant is in progress of downloading it can be absent in global tenant map
-    let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
-        .await
-        .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
+    let tenant = tenant_mgr::get_tenant(tenant_id, false);

    let state = get_state(&request);
    let remote_index = &state.remote_index;
@@ -571,6 +566,44 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
    )
 }

+async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?;
+
+    // this can be a long operation; it is currently not backed by any request coalescing or similar
+    let inputs = tenant
+        .gather_size_inputs()
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
+
+    /// Private response type with the additional "unstable" `inputs` field.
+    ///
+    /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
+    /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
+    #[serde_with::serde_as]
+    #[derive(serde::Serialize)]
+    struct TenantHistorySize {
+        #[serde_as(as = "serde_with::DisplayFromStr")]
+        id: TenantId,
+        /// Size is a mixture of WAL and logical size, so the unit is bytes.
+        size: u64,
+        inputs: crate::tenant::size::ModelInputs,
+    }
+
+    json_response(
+        StatusCode::OK,
+        TenantHistorySize {
+            id: tenant_id,
+            size,
+            inputs,
+        },
+    )
+}
+
 // Helper function to standardize the error messages we produce on bad durations
 //
 // Intended to be used with anyhow's `with_context`, e.g.:
@@ -585,6 +618,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    check_permission(&request, None)?;

    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    tracing::debug!("tenant create: {:?}", request_data.trace_read_requests);
    let remote_index = get_state(&request).remote_index.clone();

    let mut tenant_conf = TenantConfOpt::default();
@@ -626,6 +660,9 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
    }
+    if let Some(trace_read_requests) = request_data.trace_read_requests {
+        tenant_conf.trace_read_requests = Some(trace_read_requests);
+    }

    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -713,6 +750,9 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
    }
+    if let Some(trace_read_requests) = request_data.trace_read_requests {
+        tenant_conf.trace_read_requests = Some(trace_read_requests);
+    }

    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -792,14 +832,14 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
    let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

-    let _span_guard =
-        info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());

    // Use tenant's pitr setting
    let pitr = tenant.get_pitr_interval();
    let result = tenant
        .gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
+        .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
+        .await
        // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
        // better once the types support it.
        .map_err(ApiError::InternalServerError)?;
@@ -835,6 +875,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
        .map_err(ApiError::NotFound)?;
    timeline
        .checkpoint(CheckpointConfig::Forced)
+        .await
        .map_err(ApiError::InternalServerError)?;

    json_response(StatusCode::OK, ())
@@ -898,6 +939,7 @@ pub fn make_router(
        .get("/v1/tenant", tenant_list_handler)
        .post("/v1/tenant", tenant_create_handler)
        .get("/v1/tenant/:tenant_id", tenant_status)
+        .get("/v1/tenant/:tenant_id/size", tenant_size_handler)
        .put("/v1/tenant/config", tenant_config_handler)
        .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
        .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -15,6 +15,7 @@ pub mod tenant;
 pub mod tenant_config;
 pub mod tenant_mgr;
 pub mod tenant_tasks;
+pub mod trace;
 pub mod virtual_file;
 pub mod walingest;
 pub mod walreceiver;
@@ -43,8 +44,6 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
 pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

-pub const LOG_FILE_NAME: &str = "pageserver.log";
-
 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 /// Config for the Repository checkpointer
@@ -81,7 +80,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {

    // There should be nothing left, but let's be sure
    task_mgr::shutdown_tasks(None, None, None).await;
-
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -31,6 +31,7 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
    "compact",
    "create images",
    "init logical size",
+    "logical size",
    "load layer map",
    "gc",
 ];
@@ -365,6 +366,7 @@ pub struct TimelineMetrics {
    pub compact_time_histo: Histogram,
    pub create_images_time_histo: Histogram,
    pub init_logical_size_histo: Histogram,
+    pub logical_size_histo: Histogram,
    pub load_layer_map_histo: Histogram,
    pub last_record_gauge: IntGauge,
    pub wait_lsn_time_histo: Histogram,
@@ -397,6 +399,9 @@ impl TimelineMetrics {
        let init_logical_size_histo = STORAGE_TIME
            .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
            .unwrap();
+        let logical_size_histo = STORAGE_TIME
+            .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id])
+            .unwrap();
        let load_layer_map_histo = STORAGE_TIME
            .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
            .unwrap();
@@ -428,6 +433,7 @@ impl TimelineMetrics {
            compact_time_histo,
            create_images_time_histo,
            init_logical_size_histo,
+            logical_size_histo,
            load_layer_map_histo,
            last_record_gauge,
            wait_lsn_time_histo,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,6 +10,7 @@
 //

 use anyhow::{bail, ensure, Context, Result};
+use bytes::Buf;
 use bytes::Bytes;
 use futures::{Stream, StreamExt};
 use pageserver_api::models::{
@@ -18,21 +19,23 @@ use pageserver_api::models::{
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
    PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
+use pq_proto::{BeMessage, FeMessage, RowDescriptor};
 use std::io;
 use std::net::TcpListener;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
+use tokio::pin;
 use tokio_util::io::StreamReader;
 use tokio_util::io::SyncIoBridge;
 use tracing::*;
+use utils::id::ConnectionId;
 use utils::{
    auth::{self, Claims, JwtAuth, Scope},
    id::{TenantId, TimelineId},
    lsn::Lsn,
    postgres_backend::AuthType,
    postgres_backend_async::{self, PostgresBackend},
-    pq_proto::{BeMessage, FeMessage, RowDescriptor},
    simple_rcu::RcuReadGuard,
 };

@@ -45,6 +48,7 @@ use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::Timeline;
 use crate::tenant_mgr;
+use crate::trace::Tracer;
 use crate::CheckpointConfig;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -72,6 +76,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                        FeMessage::CopyData(bytes) => bytes,
                        FeMessage::CopyDone => { break },
                        FeMessage::Sync => continue,
+                        FeMessage::Terminate => {
+                            let msg = format!("client terminated connection with Terminate message during COPY");
+                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                            break;
+                        }
                        m => {
                            let msg = format!("unexpected message {:?}", m);
                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
@@ -83,10 +93,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                    yield copy_data_bytes;
                }
                Ok(None) => {
-                    let msg = "client closed connection";
+                    let msg = "client closed connection during COPY";
                    pgb.write_message(&BeMessage::ErrorResponse(msg))?;
                    pgb.flush().await?;
-                    Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                }
                Err(e) => {
                    Err(io::Error::new(io::ErrorKind::Other, e))?;
@@ -267,6 +277,18 @@ impl PageServerHandler {
        //       so there is no need to reset the association
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

+        // Make request tracer if needed
+        let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
+        let mut tracer = if tenant.get_trace_read_requests() {
+            let connection_id = ConnectionId::generate();
+            let path = tenant
+                .conf
+                .trace_path(&tenant_id, &timeline_id, &connection_id);
+            Some(Tracer::new(path))
+        } else {
+            None
+        };
+
        // Check that the timeline exists
        let timeline = get_local_timeline(tenant_id, timeline_id)?;

@@ -299,7 +321,12 @@ impl PageServerHandler {

            trace!("query: {copy_data_bytes:?}");

-            let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
+            // Trace request if needed
+            if let Some(t) = tracer.as_mut() {
+                t.trace(&copy_data_bytes)
+            }
+
+            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;

            let response = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
@@ -366,14 +393,12 @@ impl PageServerHandler {
        pgb.write_message(&BeMessage::CopyInResponse)?;
        pgb.flush().await?;

-        // import_basebackup_from_tar() is not async, mainly because the Tar crate
-        // it uses is not async. So we need to jump through some hoops:
-        // - convert the input from client connection to a synchronous Read
-        // - use block_in_place()
-        let mut copyin_stream = Box::pin(copyin_stream(pgb));
-        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-        tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
-        timeline.initialize()?;
+        let copyin_stream = copyin_stream(pgb);
+        pin!(copyin_stream);
+
+        timeline
+            .import_basebackup_from_tar(&mut copyin_stream, base_lsn)
+            .await?;

        // Drain the rest of the Copy data
        let mut bytes_after_tar = 0;
@@ -438,7 +463,7 @@ impl PageServerHandler {
        // We only want to persist the data, and it doesn't matter if it's in the
        // shape of deltas or images.
        info!("flushing layers");
-        timeline.checkpoint(CheckpointConfig::Flush)?;
+        timeline.checkpoint(CheckpointConfig::Flush).await?;

        info!("done");
        Ok(())
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,8 +12,12 @@
 //!

 use anyhow::{bail, Context};
+use bytes::Bytes;
+use futures::Stream;
 use pageserver_api::models::TimelineState;
 use tokio::sync::watch;
+use tokio_util::io::StreamReader;
+use tokio_util::io::SyncIoBridge;
 use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;

@@ -29,6 +33,7 @@ use std::io::Write;
 use std::ops::Bound::Included;
 use std::path::Path;
 use std::path::PathBuf;
+use std::pin::Pin;
 use std::process::Command;
 use std::process::Stdio;
 use std::sync::Arc;
@@ -72,6 +77,8 @@ pub mod storage_layer;

 mod timeline;

+pub mod size;
+
 use storage_layer::Layer;

 pub use timeline::Timeline;
@@ -120,6 +127,9 @@ pub struct Tenant {

    /// Makes every timeline to backup their files to remote storage.
    upload_layers: bool,
+
+    /// Cached logical sizes, updated on each [`Tenant::gather_size_inputs`].
+    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
 }

 /// A timeline with some of its files on disk, being initialized.
@@ -132,7 +142,7 @@ pub struct Tenant {
 pub struct UninitializedTimeline<'t> {
    owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
-    raw_timeline: Option<(Timeline, TimelineUninitMark)>,
+    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
 }

 /// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
@@ -164,7 +174,6 @@ impl UninitializedTimeline<'_> {
        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
        })?;
-        let new_timeline = Arc::new(new_timeline);

        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
        // TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least
@@ -192,6 +201,9 @@ impl UninitializedTimeline<'_> {
                })?;
                new_timeline.set_state(TimelineState::Active);
                v.insert(Arc::clone(&new_timeline));
+
+                new_timeline.maybe_spawn_flush_loop();
+
                new_timeline.launch_wal_receiver();
            }
        }
@@ -200,20 +212,28 @@ impl UninitializedTimeline<'_> {
    }

    /// Prepares timeline data by loading it from the basebackup archive.
-    pub fn import_basebackup_from_tar(
-        &self,
-        reader: impl std::io::Read,
+    pub async fn import_basebackup_from_tar(
+        self,
+        mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
        base_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<Arc<Timeline>> {
        let raw_timeline = self.raw_timeline()?;
-        import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context(
-            || {
-                format!(
-                    "Failed to import basebackup for timeline {}/{}",
-                    self.owning_tenant.tenant_id, self.timeline_id
-                )
-            },
-        )?;
+
+        // import_datadir::import_basebackup_from_tar() is not async, mainly because the Tar crate
+        // it uses is not async. So we need to jump through some hoops:
+        // - convert the input from client connection to a synchronous Read
+        // - use block_in_place()
+        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
+
+        tokio::task::block_in_place(|| {
+            import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
+                .context("Failed to import basebackup")
+        })?;
+
+        // The flush loop needs to be spawned in order for checkpoint to be able to flush.
+        // We want to run a proper checkpoint before we mark the timeline as available to the
+        // outside world, so spawn the flush loop manually here rather than in initialize_with_lock.
+        raw_timeline.maybe_spawn_flush_loop();

        fail::fail_point!("before-checkpoint-new-timeline", |_| {
            bail!("failpoint before-checkpoint-new-timeline");
@@ -221,16 +241,15 @@ impl UninitializedTimeline<'_> {

        raw_timeline
            .checkpoint(CheckpointConfig::Flush)
-            .with_context(|| {
-                format!(
-                    "Failed to checkpoint after basebackup import for timeline {}/{}",
-                    self.owning_tenant.tenant_id, self.timeline_id
-                )
-            })?;
-        Ok(())
+            .await
+            .context("Failed to checkpoint after basebackup import")?;
+
+        let timeline = self.initialize()?;
+
+        Ok(timeline)
    }

-    fn raw_timeline(&self) -> anyhow::Result<&Timeline> {
+    fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
        Ok(&self
            .raw_timeline
            .as_ref()
@@ -465,7 +484,7 @@ impl Tenant {

                self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
            }
-            None => self.bootstrap_timeline(new_timeline_id, pg_version)?,
+            None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
        };

        // Have added new timeline into the tenant, now its background tasks are needed.
@@ -483,7 +502,7 @@ impl Tenant {
    /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
    /// to make tests more deterministic.
    /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
-    pub fn gc_iteration(
+    pub async fn gc_iteration(
        &self,
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
@@ -499,11 +518,13 @@ impl Tenant {
            .map(|x| x.to_string())
            .unwrap_or_else(|| "-".to_string());

-        STORAGE_TIME
-            .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
-            .observe_closure_duration(|| {
-                self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
-            })
+        {
+            let _timer = STORAGE_TIME
+                .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
+                .start_timer();
+            self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
+                .await
+        }
    }

    /// Perform one compaction iteration.
@@ -523,7 +544,6 @@ impl Tenant {
        let timelines = self.timelines.lock().unwrap();
        let timelines_to_compact = timelines
            .iter()
-            .filter(|(_, timeline)| timeline.is_active())
            .map(|(timeline_id, timeline)| (*timeline_id, timeline.clone()))
            .collect::<Vec<_>>();
        drop(timelines);
@@ -540,23 +560,24 @@ impl Tenant {
    ///
    /// Used at graceful shutdown.
    ///
-    pub fn checkpoint(&self) -> anyhow::Result<()> {
+    pub async fn checkpoint(&self) -> anyhow::Result<()> {
        // Scan through the hashmap and collect a list of all the timelines,
        // while holding the lock. Then drop the lock and actually perform the
        // checkpoints. We don't want to block everything else while the
        // checkpoint runs.
-        let timelines = self.timelines.lock().unwrap();
-        let timelines_to_checkpoint = timelines
-            .iter()
-            .map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline)))
-            .collect::<Vec<_>>();
-        drop(timelines);
+        let timelines_to_checkpoint = {
+            let timelines = self.timelines.lock().unwrap();
+            timelines
+                .iter()
+                .map(|(id, timeline)| (*id, Arc::clone(timeline)))
+                .collect::<Vec<_>>()
+        };

-        for (timeline_id, timeline) in &timelines_to_checkpoint {
-            let _entered =
-                info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id)
-                    .entered();
-            timeline.checkpoint(CheckpointConfig::Flush)?;
+        for (id, timeline) in &timelines_to_checkpoint {
+            timeline
+                .checkpoint(CheckpointConfig::Flush)
+                .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id))
+                .await?;
        }

        Ok(())
@@ -785,6 +806,13 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

+    pub fn get_trace_read_requests(&self) -> bool {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .trace_read_requests
+            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
+    }
+
    pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
        self.tenant_conf.write().unwrap().update(&new_tenant_conf);
    }
@@ -835,6 +863,7 @@ impl Tenant {
            remote_index,
            upload_layers,
            state,
+            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
        }
    }

@@ -956,8 +985,9 @@ impl Tenant {
    //                 +-----baz-------->
    //
    //
-    // 1. Grab 'gc_cs' mutex to prevent new timelines from being created
-    // 2. Scan all timelines, and on each timeline, make note of the
+    // 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's
+    //    `gc_infos` are being refreshed
+    // 2. Scan collected timelines, and on each timeline, make note of the
    //    all the points where other timelines have been branched off.
    //    We will refrain from removing page versions at those LSNs.
    // 3. For each timeline, scan all layer files on the timeline.
@@ -968,7 +998,7 @@ impl Tenant {
    // - if a relation has a non-incremental persistent layer on a child branch, then we
    //   don't need to keep that in the parent anymore. But currently
    //   we do.
-    fn gc_iteration_internal(
+    async fn gc_iteration_internal(
        &self,
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
@@ -978,6 +1008,68 @@ impl Tenant {
        let mut totals: GcResult = Default::default();
        let now = Instant::now();

+        let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
+
+        // Perform GC for each timeline.
+        //
+        // Note that we don't hold the GC lock here because we don't want
+        // to delay the branch creation task, which requires the GC lock.
+        // A timeline GC iteration can be slow because it may need to wait for
+        // compaction (both require `layer_removal_cs` lock),
+        // but the GC iteration can run concurrently with branch creation.
+        //
+        // See comments in [`Tenant::branch_timeline`] for more information
+        // about why branch creation task can run concurrently with timeline's GC iteration.
+        for timeline in gc_timelines {
+            if task_mgr::is_shutdown_requested() {
+                // We were requested to shut down. Stop and return with the progress we
+                // made.
+                break;
+            }
+
+            // If requested, force flush all in-memory layers to disk first,
+            // so that they too can be garbage collected. That's
+            // used in tests, so we want as deterministic results as possible.
+            if checkpoint_before_gc {
+                timeline.checkpoint(CheckpointConfig::Forced).await?;
+                info!(
+                    "timeline {} checkpoint_before_gc done",
+                    timeline.timeline_id
+                );
+            }
+
+            let result = timeline.gc()?;
+            totals += result;
+        }
+
+        totals.elapsed = now.elapsed();
+        Ok(totals)
+    }
+
+    /// Refreshes the Timeline::gc_info for all timelines, returning the
+    /// vector of timelines which have [`Timeline::get_last_record_lsn`] past
+    /// [`Tenant::get_gc_horizon`].
+    ///
+    /// This is usually executed as part of periodic gc, but can now be triggered more often.
+    pub fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
+        // since this method can now be called at different rates than the configured gc loop, it
+        // might be that these configuration values get applied faster than what it was previously,
+        // since these were only read from the gc task.
+        let horizon = self.get_gc_horizon();
+        let pitr = self.get_pitr_interval();
+
+        // refresh all timelines
+        let target_timeline_id = None;
+
+        self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+    }
+
+    fn refresh_gc_info_internal(
+        &self,
+        target_timeline_id: Option<TimelineId>,
+        horizon: u64,
+        pitr: Duration,
+    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
        // grab mutex to prevent new timelines from being created here.
        let gc_cs = self.gc_cs.lock().unwrap();

@@ -995,11 +1087,7 @@ impl Tenant {

            timelines
                .iter()
-                .filter(|(_, timeline)| timeline.is_active())
                .map(|(timeline_id, timeline_entry)| {
-                    // This is unresolved question for now, how to do gc in presence of remote timelines
-                    // especially when this is combined with branching.
-                    // Somewhat related: https://github.com/neondatabase/neon/issues/999
                    if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
                        // If target_timeline is specified, we only need to know branchpoints of its children
                        if let Some(timeline_id) = target_timeline_id {
@@ -1053,41 +1141,7 @@ impl Tenant {
            }
        }
        drop(gc_cs);
-
-        // Perform GC for each timeline.
-        //
-        // Note that we don't hold the GC lock here because we don't want
-        // to delay the branch creation task, which requires the GC lock.
-        // A timeline GC iteration can be slow because it may need to wait for
-        // compaction (both require `layer_removal_cs` lock),
-        // but the GC iteration can run concurrently with branch creation.
-        //
-        // See comments in [`Tenant::branch_timeline`] for more information
-        // about why branch creation task can run concurrently with timeline's GC iteration.
-        for timeline in gc_timelines {
-            if task_mgr::is_shutdown_requested() {
-                // We were requested to shut down. Stop and return with the progress we
-                // made.
-                break;
-            }
-
-            // If requested, force flush all in-memory layers to disk first,
-            // so that they too can be garbage collected. That's
-            // used in tests, so we want as deterministic results as possible.
-            if checkpoint_before_gc {
-                timeline.checkpoint(CheckpointConfig::Forced)?;
-                info!(
-                    "timeline {} checkpoint_before_gc done",
-                    timeline.timeline_id
-                );
-            }
-
-            let result = timeline.gc()?;
-            totals += result;
-        }
-
-        totals.elapsed = now.elapsed();
-        Ok(totals)
+        Ok(gc_timelines)
    }

    /// Branch an existing timeline
@@ -1191,14 +1245,15 @@ impl Tenant {

    /// - run initdb to init temporary instance and get bootstrap data
    /// - after initialization complete, remove the temp dir.
-    fn bootstrap_timeline(
+    async fn bootstrap_timeline(
        &self,
        timeline_id: TimelineId,
        pg_version: u32,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let timelines = self.timelines.lock().unwrap();
-        let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?;
-        drop(timelines);
+        let timeline_uninit_mark = {
+            let timelines = self.timelines.lock().unwrap();
+            self.create_timeline_uninit_mark(timeline_id, &timelines)?
+        };
        // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
        // temporary directory for basebackup files for the given timeline.
        let initdb_path = path_with_suffix_extension(
@@ -1248,25 +1303,35 @@ impl Tenant {

        let tenant_id = raw_timeline.owning_tenant.tenant_id;
        let unfinished_timeline = raw_timeline.raw_timeline()?;
-        import_datadir::import_timeline_from_postgres_datadir(
-            unfinished_timeline,
-            pgdata_path,
-            pgdata_lsn,
-        )
+
+        tokio::task::block_in_place(|| {
+            import_datadir::import_timeline_from_postgres_datadir(
+                unfinished_timeline,
+                pgdata_path,
+                pgdata_lsn,
+            )
+        })
        .with_context(|| {
            format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
        })?;

+        // The flush loop needs to be spawned in order for checkpoint to be able to flush.
+        // We want to run a proper checkpoint before we mark the timeline as available to the
+        // outside world, so spawn the flush loop manually here rather than in initialize_with_lock.
+        unfinished_timeline.maybe_spawn_flush_loop();
+
        fail::fail_point!("before-checkpoint-new-timeline", |_| {
            anyhow::bail!("failpoint before-checkpoint-new-timeline");
        });
+
        unfinished_timeline
-            .checkpoint(CheckpointConfig::Forced)
+            .checkpoint(CheckpointConfig::Forced).await
            .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;

-        let mut timelines = self.timelines.lock().unwrap();
-        let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?;
-        drop(timelines);
+        let timeline = {
+            let mut timelines = self.timelines.lock().unwrap();
+            raw_timeline.initialize_with_lock(&mut timelines, false)?
+        };

        info!(
            "created root timeline {} timeline.lsn {}",
@@ -1306,7 +1371,7 @@ impl Tenant {
                Ok(UninitializedTimeline {
                    owning_tenant: self,
                    timeline_id: new_timeline_id,
-                    raw_timeline: Some((new_timeline, uninit_mark)),
+                    raw_timeline: Some((Arc::new(new_timeline), uninit_mark)),
                })
            }
            Err(e) => {
@@ -1425,7 +1490,7 @@ impl Tenant {
            let timeline = UninitializedTimeline {
                owning_tenant: self,
                timeline_id,
-                raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
+                raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
            };
            match timeline.initialize_with_lock(&mut timelines_accessor, true) {
                Ok(initialized_timeline) => {
@@ -1446,6 +1511,25 @@ impl Tenant {

        Ok(())
    }
+
+    /// Gathers inputs from all of the timelines to produce a sizing model input.
+    ///
+    /// Future is cancellation safe. Only one calculation can be running at once per tenant.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
+    pub async fn gather_size_inputs(&self) -> anyhow::Result<size::ModelInputs> {
+        let logical_sizes_at_once = self
+            .conf
+            .concurrent_tenant_size_logical_size_queries
+            .inner();
+
+        // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
+        // are for testing/experimenting, we tolerate this.
+        //
+        // See issue #2748, condensed out of the initial PR review, for more details.
+        let mut shared_cache = self.cached_logical_sizes.lock().await;
+
+        size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await
+    }
 }

 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
@@ -1589,6 +1673,7 @@ pub mod harness {
                walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
                lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
                max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
+                trace_read_requests: Some(tenant_conf.trace_read_requests),
            }
        }
    }
@@ -1860,7 +1945,7 @@ mod tests {
        Ok(())
    }

-    fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
+    async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
        let mut lsn = start_lsn;
        #[allow(non_snake_case)]
        {
@@ -1881,7 +1966,7 @@ mod tests {
            writer.finish_write(lsn);
            lsn += 0x10;
        }
-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        {
            let writer = tline.writer();
            writer.put(
@@ -1898,24 +1983,26 @@ mod tests {
            )?;
            writer.finish_write(lsn);
        }
-        tline.checkpoint(CheckpointConfig::Forced)
+        tline.checkpoint(CheckpointConfig::Forced).await
    }

-    #[test]
-    fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
        let tenant =
            TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
                .load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
        // FIXME: this doesn't actually remove any layer currently, given how the checkpointing
        // and compaction works. But it does set the 'cutoff' point so that the cross check
        // below should fail.
-        tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
+        tenant
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .await?;

        // try to branch at lsn 25, should fail because we already garbage collected the data
        match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
@@ -1960,14 +2047,14 @@ mod tests {
    /*
    // FIXME: This currently fails to error out. Calling GC doesn't currently
    // remove the old value, we'd need to work a little harder
-    #[test]
-    fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
        let repo =
            RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
            .load();

        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
        let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
@@ -1980,43 +2067,47 @@ mod tests {
    }
     */

-    #[test]
-    fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
        let tenant =
            TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
-        tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
+        tenant
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .await?;
        assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());

        Ok(())
    }
-    #[test]
-    fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
        let tenant =
            TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
            .initialize()?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");

-        make_some_layers(newtline.as_ref(), Lsn(0x60))?;
+        make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;

        // run gc on parent
-        tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
+        tenant
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .await?;

        // Check that the data is still accessible on the branch.
        assert_eq!(
@@ -2027,8 +2118,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn timeline_load() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn timeline_load() -> anyhow::Result<()> {
        const TEST_NAME: &str = "timeline_load";
        let harness = TenantHarness::create(TEST_NAME)?;
        {
@@ -2036,8 +2127,8 @@ mod tests {
            let tline = tenant
                .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
                .initialize()?;
-            make_some_layers(tline.as_ref(), Lsn(0x8000))?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
        }

        let tenant = harness.load();
@@ -2048,8 +2139,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn timeline_load_with_ancestor() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
        const TEST_NAME: &str = "timeline_load_with_ancestor";
        let harness = TenantHarness::create(TEST_NAME)?;
        // create two timelines
@@ -2059,8 +2150,8 @@ mod tests {
                .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
                .initialize()?;

-            make_some_layers(tline.as_ref(), Lsn(0x20))?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;

            tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;

@@ -2068,8 +2159,8 @@ mod tests {
                .get_timeline(NEW_TIMELINE_ID, true)
                .expect("Should have a local timeline");

-            make_some_layers(newtline.as_ref(), Lsn(0x60))?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
        }

        // check that both of them are initially unloaded
@@ -2129,8 +2220,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn test_images() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_images() -> anyhow::Result<()> {
        let tenant = TenantHarness::create("test_images")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2141,7 +2232,7 @@ mod tests {
        writer.finish_write(Lsn(0x10));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        tline.compact()?;

        let writer = tline.writer();
@@ -2149,7 +2240,7 @@ mod tests {
        writer.finish_write(Lsn(0x20));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        tline.compact()?;

        let writer = tline.writer();
@@ -2157,7 +2248,7 @@ mod tests {
        writer.finish_write(Lsn(0x30));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        tline.compact()?;

        let writer = tline.writer();
@@ -2165,7 +2256,7 @@ mod tests {
        writer.finish_write(Lsn(0x40));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced)?;
+        tline.checkpoint(CheckpointConfig::Forced).await?;
        tline.compact()?;

        assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
@@ -2181,8 +2272,8 @@ mod tests {
    // Insert 1000 key-value pairs with increasing keys, checkpoint,
    // repeat 50 times.
    //
-    #[test]
-    fn test_bulk_insert() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_bulk_insert() -> anyhow::Result<()> {
        let tenant = TenantHarness::create("test_bulk_insert")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2215,7 +2306,7 @@ mod tests {
            let cutoff = tline.get_last_record_lsn();

            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
            tline.compact()?;
            tline.gc()?;
        }
@@ -2223,8 +2314,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn test_random_updates() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_random_updates() -> anyhow::Result<()> {
        let tenant = TenantHarness::create("test_random_updates")?.load();
        let tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2287,7 +2378,7 @@ mod tests {
            println!("checkpointing {}", lsn);
            let cutoff = tline.get_last_record_lsn();
            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
            tline.compact()?;
            tline.gc()?;
        }
@@ -2295,8 +2386,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn test_traverse_branches() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn test_traverse_branches() -> anyhow::Result<()> {
        let tenant = TenantHarness::create("test_traverse_branches")?.load();
        let mut tline = tenant
            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2368,7 +2459,7 @@ mod tests {
            println!("checkpointing {}", lsn);
            let cutoff = tline.get_last_record_lsn();
            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
+            tline.checkpoint(CheckpointConfig::Forced).await?;
            tline.compact()?;
            tline.gc()?;
        }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -74,6 +74,7 @@ where
        };

        dstbuf.clear();
+        dstbuf.reserve(len);

        // Read the payload
        let mut remain = len;
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/delta_layer.rs
@@ -260,8 +260,9 @@ impl Layer for DeltaLayer {

            // Ok, 'offsets' now contains the offsets of all the entries we need to read
            let mut cursor = file.block_cursor();
+            let mut buf = Vec::new();
            for (entry_lsn, pos) in offsets {
-                let buf = cursor.read_blob(pos).with_context(|| {
+                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
                    format!(
                        "Failed to read blob from virtual file {}",
                        file.file.path.display()
@@ -610,9 +611,9 @@ impl DeltaLayer {
 ///
 /// 3. Call `finish`.
 ///
-pub struct DeltaLayerWriter {
+struct DeltaLayerWriterInner {
    conf: &'static PageServerConf,
-    path: PathBuf,
+    pub path: PathBuf,
    timeline_id: TimelineId,
    tenant_id: TenantId,

@@ -624,17 +625,17 @@ pub struct DeltaLayerWriter {
    blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
 }

-impl DeltaLayerWriter {
+impl DeltaLayerWriterInner {
    ///
    /// Start building a new delta layer.
    ///
-    pub fn new(
+    fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
        key_start: Key,
        lsn_range: Range<Lsn>,
-    ) -> Result<DeltaLayerWriter> {
+    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename. We don't know
        // the end key yet, so we cannot form the final filename yet. We will
        // rename it when we're done.
@@ -653,7 +654,7 @@ impl DeltaLayerWriter {
        let block_buf = BlockBuf::new();
        let tree_builder = DiskBtreeBuilder::new(block_buf);

-        Ok(DeltaLayerWriter {
+        Ok(Self {
            conf,
            path,
            timeline_id,
@@ -670,17 +671,17 @@ impl DeltaLayerWriter {
    ///
    /// The values must be appended in key, lsn order.
    ///
-    pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
+    fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
        self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
    }

-    pub fn put_value_bytes(
+    fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: &[u8],
        will_init: bool,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
        assert!(self.lsn_range.start <= lsn);

        let off = self.blob_writer.write_blob(val)?;
@@ -693,14 +694,14 @@ impl DeltaLayerWriter {
        Ok(())
    }

-    pub fn size(&self) -> u64 {
+    fn size(&self) -> u64 {
        self.blob_writer.size() + self.tree.borrow_writer().size()
    }

    ///
    /// Finish writing the delta layer.
    ///
-    pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+    fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -768,6 +769,102 @@ impl DeltaLayerWriter {
    }
 }

+/// A builder object for constructing a new delta layer.
+///
+/// Usage:
+///
+/// 1. Create the writer by calling `DeltaLayerWriter::new(...)`.
+///
+/// 2. Write the contents by calling `put_value` (or `put_value_bytes`) for
+///    every page version to store in the layer, in key, lsn order.
+///
+/// 3. Call `finish`.
+///
+/// # Note
+///
+/// As described in <https://github.com/neondatabase/neon/issues/2650>, the
+/// writer can be dropped before `finish` is called, which leaves a stray
+/// temporary file behind; enough of those can fill up the file system.
+/// This wrapper holds the real writer, `DeltaLayerWriterInner`, in an `Option`
+/// and implements `Drop` to remove the temporary file when `finish` never ran.
+/// `Drop` cannot live on `DeltaLayerWriterInner` itself because `finish` moves
+/// fields out of it, which Rust forbids for types that implement `Drop`.
+///
+#[must_use]
+pub struct DeltaLayerWriter {
+    inner: Option<DeltaLayerWriterInner>,
+}
+
+impl DeltaLayerWriter {
+    ///
+    /// Start building a new delta layer.
+    ///
+    pub fn new(
+        conf: &'static PageServerConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        key_start: Key,
+        lsn_range: Range<Lsn>,
+    ) -> anyhow::Result<Self> {
+        let inner =
+            DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)?;
+        Ok(Self { inner: Some(inner) })
+    }
+
+    /// Mutable access to the wrapped writer; it is only taken out by `finish`
+    /// and `drop`, both of which end the writer's life, so it is always present.
+    fn writer_mut(&mut self) -> &mut DeltaLayerWriterInner {
+        self.inner.as_mut().unwrap()
+    }
+
+    ///
+    /// Append a key-value pair to the file.
+    ///
+    /// The values must be appended in key, lsn order.
+    ///
+    pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
+        self.writer_mut().put_value(key, lsn, val)
+    }
+
+    ///
+    /// Append an already-serialized value; `will_init` is passed on unchanged.
+    ///
+    pub fn put_value_bytes(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        val: &[u8],
+        will_init: bool,
+    ) -> anyhow::Result<()> {
+        self.writer_mut().put_value_bytes(key, lsn, val, will_init)
+    }
+
+    pub fn size(&self) -> u64 {
+        self.inner.as_ref().unwrap().size()
+    }
+
+    ///
+    /// Finish writing the delta layer.
+    ///
+    pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+        self.inner.take().unwrap().finish(key_end)
+    }
+}
+
+impl Drop for DeltaLayerWriter {
+    fn drop(&mut self) {
+        // `finish` takes `inner`; if it is still set, the temporary file must be removed.
+        if let Some(inner) = self.inner.take() {
+            match inner.blob_writer.into_inner().into_inner() {
+                Ok(vfile) => vfile.remove(),
+                Err(err) => warn!(
+                    "error while flushing buffer of delta layer temporary file: {err}"
+                ),
+            }
+        }
+    }
+}
+
 ///
 /// Iterator over all key-value pairse stored in a delta layer
 ///
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/image_layer.rs
@@ -411,7 +411,7 @@ impl ImageLayer {
 ///
 /// 3. Call `finish`.
 ///
-pub struct ImageLayerWriter {
+struct ImageLayerWriterInner {
    conf: &'static PageServerConf,
    path: PathBuf,
    timeline_id: TimelineId,
@@ -423,14 +423,17 @@ pub struct ImageLayerWriter {
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 }

-impl ImageLayerWriter {
-    pub fn new(
+impl ImageLayerWriterInner {
+    ///
+    /// Start building a new image layer.
+    ///
+    fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
-    ) -> anyhow::Result<ImageLayerWriter> {
+    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename.
        // We'll atomically rename it to the final name when we're done.
        let path = ImageLayer::temp_path_for(
@@ -455,7 +458,7 @@ impl ImageLayerWriter {
        let block_buf = BlockBuf::new();
        let tree_builder = DiskBtreeBuilder::new(block_buf);

-        let writer = ImageLayerWriter {
+        let writer = Self {
            conf,
            path,
            timeline_id,
@@ -474,7 +477,7 @@ impl ImageLayerWriter {
    ///
    /// The page versions must be appended in blknum order.
    ///
-    pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
+    fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
        let off = self.blob_writer.write_blob(img)?;

@@ -485,7 +488,10 @@ impl ImageLayerWriter {
        Ok(())
    }

-    pub fn finish(self) -> anyhow::Result<ImageLayer> {
+    ///
+    /// Finish writing the image layer.
+    ///
+    fn finish(self) -> anyhow::Result<ImageLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -552,3 +558,76 @@ impl ImageLayerWriter {
        Ok(layer)
    }
 }
+
+/// A builder object for constructing a new image layer.
+///
+/// Usage:
+///
+/// 1. Create the writer by calling `ImageLayerWriter::new(...)`.
+///
+/// 2. Write the contents by calling `put_image` for every key-value
+///    pair in the key range.
+///
+/// 3. Call `finish`.
+///
+/// # Note
+///
+/// As described in <https://github.com/neondatabase/neon/issues/2650>, the
+/// writer can be dropped before `finish` is called, which leaves a stray
+/// temporary file behind; enough of those can fill up the file system.
+/// This wrapper holds the real writer, `ImageLayerWriterInner`, in an `Option`
+/// and implements `Drop` to remove the temporary file when `finish` never ran.
+/// `Drop` cannot live on `ImageLayerWriterInner` itself because `finish` moves
+/// fields out of it, which Rust forbids for types that implement `Drop`.
+///
+#[must_use]
+pub struct ImageLayerWriter {
+    inner: Option<ImageLayerWriterInner>,
+}
+
+impl ImageLayerWriter {
+    ///
+    /// Start building a new image layer.
+    ///
+    pub fn new(
+        conf: &'static PageServerConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        key_range: &Range<Key>,
+        lsn: Lsn,
+    ) -> anyhow::Result<ImageLayerWriter> {
+        let inner = ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn)?;
+        Ok(Self { inner: Some(inner) })
+    }
+
+    ///
+    /// Write next value to the file.
+    ///
+    /// The page versions must be appended in blknum order.
+    ///
+    pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
+        let writer = self.inner.as_mut().unwrap();
+        writer.put_image(key, img)
+    }
+
+    ///
+    /// Finish writing the image layer.
+    ///
+    pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
+        let writer = self.inner.take().unwrap();
+        writer.finish()
+    }
+}
+
+impl Drop for ImageLayerWriter {
+    ///
+    /// Remove the temporary file if the writer is dropped without `finish`.
+    ///
+    fn drop(&mut self) {
+        // `finish` takes `inner`, so it is only still present on an unfinished writer.
+        if let Some(unfinished) = self.inner.take() {
+            let file = unfinished.blob_writer.into_inner();
+            file.remove();
+        }
+    }
+}
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -0,0 +1,475 @@
+use std::cmp;
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+
+use anyhow::Context;
+use tokio::sync::Semaphore;
+
+use super::Tenant;
+use utils::id::TimelineId;
+use utils::lsn::Lsn;
+
+use tracing::*;
+
+/// Inputs to the tenant sizing model, as collected by `gather_inputs`.
+///
+/// Implements [`serde::Serialize`] only so the inputs can be moved between execution
+/// environments and shown to developers; it is not meant to be a stable public API.
+#[serde_with::serde_as]
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+pub struct ModelInputs {
+    updates: Vec<Update>,
+    retention_period: u64,
+    #[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
+    timeline_inputs: HashMap<TimelineId, TimelineInputs>,
+}
+
+/// The per-timeline LSNs that went into the inputs. These are only helpful in the serialized
+/// form of [`ModelInputs`] returned from the HTTP api, where they explain the inputs.
+#[serde_with::serde_as]
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+struct TimelineInputs {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    last_record: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    latest_gc_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    horizon_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pitr_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    next_gc_cutoff: Lsn,
+}
+
+/// Gathers the inputs for the tenant sizing model.
+///
+/// Tenant size does not consider the latest state, only the state up to next_gc_cutoff, which is
+/// refreshed on demand at the start of this calculation, separately from
+/// [`Timeline::latest_gc_cutoff`]. `logical_size_cache` entries unused by this run are dropped.
+///
+/// For timelines in general:
+///
+/// ```ignore
+/// 0-----|---------|----|------------| · · · · · |·> lsn
+///   initdb_lsn  branchpoints*  next_gc_cutoff  latest
+/// ```
+///
+/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
+/// tenant size will be zero.
+pub(super) async fn gather_inputs(
+    tenant: &Tenant,
+    limit: &Arc<Semaphore>,
+    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+) -> anyhow::Result<ModelInputs> {
+    // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
+    // our advantage with `?` error handling.
+    let mut joinset = tokio::task::JoinSet::new();
+
+    let timelines = tenant
+        .refresh_gc_info()
+        .context("Failed to refresh gc_info before gathering inputs")?;
+
+    if timelines.is_empty() {
+        // All timelines are below tenant's gc_horizon; alternative would be to use
+        // Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
+        // missing GcInfo::retain_lsns or having obsolete values for cutoff's.
+        return Ok(ModelInputs {
+            updates: vec![],
+            retention_period: 0,
+            timeline_inputs: HashMap::new(),
+        });
+    }
+
+    // record the used/inserted cache keys here, so stale entries can be pruned and the cache
+    // does not leak. after the initial run the cache should be quite stable, but live timelines
+    // will eventually require new lsns to be inspected.
+    let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new();
+
+    let mut updates = Vec::new();
+
+    // record the per timeline values used to determine `retention_period`
+    let mut timeline_inputs = HashMap::with_capacity(timelines.len());
+
+    // used to determine the `retention_period` for the size model
+    let mut max_cutoff_distance = None;
+
+    // this will probably conflict with on-demand downloaded layers, or at least force them all
+    // to be downloaded
+    for timeline in timelines {
+        let last_record_lsn = timeline.get_last_record_lsn();
+
+        let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
+            // there's a race between the update (holding tenant.gc_lock) and this read but it
+            // might not be an issue, because it's not for Timeline::gc
+            let gc_info = timeline.gc_info.read().unwrap();
+
+            // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
+            // new gc run, which we have no control over. however differently from `Timeline::gc`
+            // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
+            // actually removing files.
+            let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
+
+            // the minimum where we should find the next_gc_cutoff for our calculations.
+            //
+            // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
+            // want to query any logical size before initdb_lsn.
+            let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
+
+            let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
+                Some((next_gc_cutoff, LsnKind::GcCutOff))
+            } else {
+                None
+            };
+
+            // this assumes there are no other lsns than the branchpoints
+            let lsns = gc_info
+                .retain_lsns
+                .iter()
+                .inspect(|&&lsn| {
+                    trace!(
+                        timeline_id=%timeline.timeline_id,
+                        "retained lsn: {lsn:?}, is_before_ancestor_lsn={}",
+                        lsn < timeline.get_ancestor_lsn()
+                    )
+                })
+                .filter(|&&lsn| lsn > timeline.get_ancestor_lsn())
+                .copied()
+                .map(|lsn| (lsn, LsnKind::BranchPoint))
+                .chain(maybe_cutoff)
+                .collect::<Vec<_>>();
+
+            (
+                lsns,
+                gc_info.horizon_cutoff,
+                gc_info.pitr_cutoff,
+                next_gc_cutoff,
+            )
+        };
+
+        // update this to have a retention_period later for the tenant_size_model
+        // tenant_size_model compares this to the last segment's start_lsn
+        if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) {
+            match max_cutoff_distance.as_mut() {
+                Some(max) => {
+                    *max = std::cmp::max(*max, cutoff_distance);
+                }
+                _ => {
+                    max_cutoff_distance = Some(cutoff_distance);
+                }
+            }
+        }
+
+        // all timelines branch from something, because it might be impossible to pinpoint
+        // which is the tenant_size_model's "default" branch.
+        updates.push(Update {
+            lsn: timeline.get_ancestor_lsn(),
+            command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
+            timeline_id: timeline.timeline_id,
+        });
+
+        for (lsn, _kind) in &interesting_lsns {
+            if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
+                updates.push(Update {
+                    lsn: *lsn,
+                    timeline_id: timeline.timeline_id,
+                    command: Command::Update(*size),
+                });
+
+                needed_cache.insert((timeline.timeline_id, *lsn));
+            } else {
+                let timeline = Arc::clone(&timeline);
+                let parallel_size_calcs = Arc::clone(limit);
+                joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn));
+            }
+        }
+
+        timeline_inputs.insert(
+            timeline.timeline_id,
+            TimelineInputs {
+                last_record: last_record_lsn,
+                // this is not used above, because it might not have updated recently enough
+                latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
+                horizon_cutoff,
+                pitr_cutoff,
+                next_gc_cutoff,
+            },
+        );
+    }
+
+    let mut have_any_error = false;
+
+    while let Some(res) = joinset.join_next().await {
+        // each of these comes as Result<Result<_, JoinError>, JoinError>
+        // because of spawn + spawn_blocking
+        let res = res.and_then(|inner| inner);
+        match res {
+            Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
+                debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
+
+                logical_size_cache.insert((timeline.timeline_id, lsn), size);
+                needed_cache.insert((timeline.timeline_id, lsn));
+
+                updates.push(Update {
+                    lsn,
+                    timeline_id: timeline.timeline_id,
+                    command: Command::Update(size),
+                });
+            }
+            Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
+                warn!(
+                    timeline_id=%timeline.timeline_id,
+                    "failed to calculate logical size at {lsn}: {error:#}"
+                );
+                have_any_error = true;
+            }
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures, nor should be");
+            }
+            Err(join_error) => {
+                // cannot really do anything, as this panic is likely a bug
+                error!("logical size query panicked: {join_error:#}");
+                have_any_error = true;
+            }
+        }
+    }
+
+    // prune any keys not needed anymore; we record every used key and added key.
+    logical_size_cache.retain(|key, _| needed_cache.contains(key));
+
+    if have_any_error {
+        // we cannot complete this round, because we are missing data.
+        // we have however cached all we were able to request calculation on.
+        anyhow::bail!("failed to calculate some logical_sizes");
+    }
+
+    // the data gathered to updates is per lsn, regardless of the branch, so we can use it to
+    // our advantage, not requiring a sorted container or graph walk.
+    //
+    // for branch points, which come as multiple updates at the same LSN, the Command::Update
+    // is needed before the Command::BranchFrom of the branch created at that LSN. this is
+    // handled by the variant order in `Command`.
+    updates.sort_unstable();
+
+    let retention_period = match max_cutoff_distance {
+        Some(max) => max.0,
+        None => {
+            anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0")
+        }
+    };
+
+    Ok(ModelInputs {
+        updates,
+        retention_period,
+        timeline_inputs,
+    })
+}
+
+impl ModelInputs {
+    pub fn calculate(&self) -> anyhow::Result<u64> {
+        // Option<TimelineId> is used for "naming" the branches because it is assumed to be
+        // impossible to always determine a single main branch.
+        let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
+
+        // the size model works in relative LSNs and sizes; track the last absolute state per
+        // timeline here so that its current implementation does not need to be modified.
+        let mut last_state: HashMap<TimelineId, (Lsn, u64)> = HashMap::new();
+
+        for update in &self.updates {
+            let Update {
+                lsn,
+                command: op,
+                timeline_id,
+            } = update;
+            match op {
+                Command::Update(sz) => {
+                    let latest = last_state.get_mut(timeline_id).ok_or_else(|| {
+                        anyhow::anyhow!(
+                        "ordering-mismatch: there must had been a previous state for {timeline_id}"
+                    )
+                    })?;
+
+                    let lsn_bytes = {
+                        let Lsn(now) = lsn;
+                        let Lsn(prev) = latest.0;
+                        debug_assert!(prev <= *now, "self.updates should had been sorted");
+                        now - prev
+                    };
+
+                    let size_diff =
+                        i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| {
+                            format!("size difference i64 overflow for {timeline_id}")
+                        })?;
+
+                    storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff);
+                    *latest = (*lsn, *sz);
+                }
+                Command::BranchFrom(parent) => {
+                    storage.branch(parent, Some(*timeline_id));
+
+                    let size = parent
+                        .as_ref()
+                        .and_then(|id| last_state.get(id))
+                        .map(|x| x.1)
+                        .unwrap_or(0);
+                    last_state.insert(*timeline_id, (*lsn, size));
+                }
+            }
+        }
+
+        Ok(storage.calculate(self.retention_period).total_children())
+    }
+}
+
+/// Single size model update.
+///
+/// Sizing model works with relative increments over latest branch state.
+/// Updates are absolute, so additional state needs to be tracked when applying.
+#[serde_with::serde_as]
+#[derive(
+    Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
+)]
+struct Update {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    lsn: utils::lsn::Lsn,
+    command: Command,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    timeline_id: TimelineId,
+}
+
+#[serde_with::serde_as]
+#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "snake_case")]
+enum Command {
+    Update(u64),
+    BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
+}
+
+impl std::fmt::Debug for Command {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3
+        // linebreaks
+        match self {
+            Self::Update(arg0) => write!(f, "Update({arg0})"),
+            Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+enum LsnKind {
+    BranchPoint,
+    GcCutOff,
+}
+
+/// Newtype around the tuple carrying a timeline, an LSN and the logical size result at that LSN.
+struct TimelineAtLsnSizeResult(
+    Arc<crate::tenant::Timeline>,
+    utils::lsn::Lsn,
+    anyhow::Result<u64>,
+);
+
+#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
+async fn calculate_logical_size(
+    limit: Arc<tokio::sync::Semaphore>,
+    timeline: Arc<crate::tenant::Timeline>,
+    lsn: utils::lsn::Lsn,
+) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
+    let permit = tokio::sync::Semaphore::acquire_owned(limit)
+        .await
+        .expect("global semaphore should not had been closed");
+
+    tokio::task::spawn_blocking(move || {
+        let _permit = permit;
+        let size_res = timeline.calculate_logical_size(lsn);
+        TimelineAtLsnSizeResult(timeline, lsn, size_res)
+    })
+    .await
+}
+
+#[test]
+fn updates_sort() {
+    use std::str::FromStr;
+    use utils::id::TimelineId;
+    use utils::lsn::Lsn;
+
+    let ids = [
+        TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(),
+        TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(),
+        TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(),
+    ];
+
+    // try through all permutations
+    let ids = [
+        [&ids[0], &ids[1], &ids[2]],
+        [&ids[0], &ids[2], &ids[1]],
+        [&ids[1], &ids[0], &ids[2]],
+        [&ids[1], &ids[2], &ids[0]],
+        [&ids[2], &ids[0], &ids[1]],
+        [&ids[2], &ids[1], &ids[0]],
+    ];
+
+    for ids in ids {
+        // apply a fixture which uses a permutation of ids
+        let commands = [
+            Update {
+                lsn: Lsn(0),
+                command: Command::BranchFrom(None),
+                timeline_id: *ids[0],
+            },
+            Update {
+                lsn: Lsn::from_str("0/67E7618").unwrap(),
+                command: Command::Update(43696128),
+                timeline_id: *ids[0],
+            },
+            Update {
+                lsn: Lsn::from_str("0/67E7618").unwrap(),
+                command: Command::BranchFrom(Some(*ids[0])),
+                timeline_id: *ids[1],
+            },
+            Update {
+                lsn: Lsn::from_str("0/76BE4F0").unwrap(),
+                command: Command::Update(41844736),
+                timeline_id: *ids[1],
+            },
+            Update {
+                lsn: Lsn::from_str("0/10E49380").unwrap(),
+                command: Command::Update(42164224),
+                timeline_id: *ids[0],
+            },
+            Update {
+                lsn: Lsn::from_str("0/10E49380").unwrap(),
+                command: Command::BranchFrom(Some(*ids[0])),
+                timeline_id: *ids[2],
+            },
+            Update {
+                lsn: Lsn::from_str("0/11D74910").unwrap(),
+                command: Command::Update(42172416),
+                timeline_id: *ids[2],
+            },
+            Update {
+                lsn: Lsn::from_str("0/12051E98").unwrap(),
+                command: Command::Update(42196992),
+                timeline_id: *ids[0],
+            },
+        ];
+
+        let mut sorted = commands;
+
+        // these must sort in the same order, regardless of how the ids sort
+        // which is why the timeline_id is the last field
+        sorted.sort_unstable();
+
+        assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted);
+    }
+}
+
+#[test]
+fn verify_size_for_multiple_branches() {
+    // this input was generated by the integration test test_tenant_size_with_multiple_branches;
+    // pinning it here keeps the LSNs stable
+    let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
+
+    let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
+
+    assert_eq!(inputs.calculate().unwrap(), 36_409_872);
+}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -16,7 +16,7 @@ use std::fs;
 use std::ops::{Deref, Range};
 use std::path::PathBuf;
 use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
-use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
+use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::{Duration, Instant, SystemTime};

 use crate::tenant::{
@@ -121,8 +121,16 @@ pub struct Timeline {
    /// to avoid deadlock.
    write_lock: Mutex<()>,

-    /// Used to ensure that there is only task performing flushing at a time
-    layer_flush_lock: Mutex<()>,
+    /// Used to avoid multiple `flush_loop` tasks running
+    flush_loop_started: Mutex<bool>,
+
+    /// layer_flush_start_tx can be used to wake up the layer-flushing task.
+    /// The value is a counter, incremented every time a new flush cycle is requested.
+    /// The flush cycle counter is sent back on the layer_flush_done channel when
+    /// the flush finishes. You can use that to wait for the flush to finish.
+    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
+    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
+    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,

    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -272,6 +280,11 @@ impl LogicalSize {
        self.size_added_after_initial
            .fetch_add(delta, AtomicOrdering::SeqCst);
    }
+
+    /// Returns the initialized (already calculated) value, if any.
+    fn initialized_size(&self) -> Option<u64> {
+        self.initial_logical_size.get().copied()
+    }
 }

 pub struct WalReceiverInfo {
@@ -461,15 +474,16 @@ impl Timeline {
    ///
    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
    /// know anything about them here in the repository.
-    pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
+    #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
+    pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
        match cconf {
            CheckpointConfig::Flush => {
                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers(true)
+                self.flush_frozen_layers_and_wait().await
            }
            CheckpointConfig::Forced => {
                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers(true)?;
+                self.flush_frozen_layers_and_wait().await?;
                self.compact()
            }
        }
@@ -619,24 +633,8 @@ impl Timeline {
                self.last_freeze_at.store(last_lsn);
                *(self.last_freeze_ts.write().unwrap()) = Instant::now();

-                // Launch a task to flush the frozen layer to disk, unless
-                // a task was already running. (If the task was running
-                // at the time that we froze the layer, it must've seen the
-                // the layer we just froze before it exited; see comments
-                // in flush_frozen_layers())
-                if let Ok(guard) = self.layer_flush_lock.try_lock() {
-                    drop(guard);
-                    let self_clone = Arc::clone(self);
-                    task_mgr::spawn(
-                        task_mgr::BACKGROUND_RUNTIME.handle(),
-                        task_mgr::TaskKind::LayerFlushTask,
-                        Some(self.tenant_id),
-                        Some(self.timeline_id),
-                        "layer flush task",
-                        false,
-                        async move { self_clone.flush_frozen_layers(false) },
-                    );
-                }
+                // Wake up the layer flusher
+                self.flush_frozen_layers();
            }
        }
        Ok(())
@@ -727,6 +725,9 @@ impl Timeline {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
        let (state, _) = watch::channel(TimelineState::Suspended);

+        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
+        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
+
        let mut result = Timeline {
            conf,
            tenant_conf,
@@ -754,8 +755,12 @@ impl Timeline {

            upload_layers: AtomicBool::new(upload_layers),

+            flush_loop_started: Mutex::new(false),
+
+            layer_flush_start_tx,
+            layer_flush_done_tx,
+
            write_lock: Mutex::new(()),
-            layer_flush_lock: Mutex::new(()),
            layer_removal_cs: Mutex::new(()),

            gc_info: RwLock::new(GcInfo {
@@ -788,6 +793,33 @@ impl Timeline {
        result
    }

+    pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
+        let mut flush_loop_started = self.flush_loop_started.lock().unwrap();
+        if *flush_loop_started {
+            info!(
+                "skipping attempt to start flush_loop twice {}/{}",
+                self.tenant_id, self.timeline_id
+            );
+            return;
+        }
+
+        let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
+        let self_clone = Arc::clone(self);
+        info!("spawning flush loop");
+        task_mgr::spawn(
+                    task_mgr::BACKGROUND_RUNTIME.handle(),
+                    task_mgr::TaskKind::LayerFlushTask,
+                    Some(self.tenant_id),
+                    Some(self.timeline_id),
+                    "layer flush task",
+                    false,
+                    async move { self_clone.flush_loop(layer_flush_start_rx).await; Ok(()) }
+                    .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
+                );
+
+        *flush_loop_started = true;
+    }
+
    pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
        if !is_etcd_client_initialized() {
            if cfg!(test) {
@@ -979,9 +1011,26 @@ impl Timeline {
    /// Calculate the logical size of the database at the latest LSN.
    ///
    /// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
-    fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
-        info!("Calculating logical size for timeline {}", self.timeline_id);
-        let timer = self.metrics.init_logical_size_histo.start_timer();
+    pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
+        info!(
+            "Calculating logical size for timeline {} at {}",
+            self.timeline_id, up_to_lsn
+        );
+        let timer = if up_to_lsn == self.initdb_lsn {
+            if let Some(size) = self.current_logical_size.initialized_size() {
+                if size != 0 {
+                    // non-zero size means that the size has already been calculated by this method
+                    // after startup. if the logical size is for a new timeline without layers the
+                    // size will be zero, and we cannot use that, or this caching strategy until
+                    // pageserver restart.
+                    return Ok(size);
+                }
+            }
+
+            self.metrics.init_logical_size_histo.start_timer()
+        } else {
+            self.metrics.logical_size_histo.start_timer()
+        };
        let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?;
        debug!("calculated logical size: {logical_size}");
        timer.stop_and_record();
@@ -1267,53 +1316,94 @@ impl Timeline {
        drop(layers);
    }

-    /// Flush all frozen layers to disk.
-    ///
-    /// Only one task at a time can be doing layer-flushing for a
-    /// given timeline. If 'wait' is true, and another task is
-    /// currently doing the flushing, this function will wait for it
-    /// to finish. If 'wait' is false, this function will return
-    /// immediately instead.
-    fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> {
-        let flush_lock_guard = if wait {
-            self.layer_flush_lock.lock().unwrap()
-        } else {
-            match self.layer_flush_lock.try_lock() {
-                Ok(guard) => guard,
-                Err(TryLockError::WouldBlock) => return Ok(()),
-                Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
-            }
-        };
-
-        let timer = self.metrics.flush_time_histo.start_timer();
-
+    /// Layer flusher task's main loop.
+    async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>) {
+        info!("started flush loop");
        loop {
-            let layers = self.layers.read().unwrap();
-            if let Some(frozen_layer) = layers.frozen_layers.front() {
-                let frozen_layer = Arc::clone(frozen_layer);
-                drop(layers); // to allow concurrent reads and writes
-                self.flush_frozen_layer(frozen_layer)?;
-            } else {
-                // Drop the 'layer_flush_lock' *before* 'layers'. That
-                // way, if you freeze a layer, and then call
-                // flush_frozen_layers(false), it is guaranteed that
-                // if another thread was busy flushing layers and the
-                // call therefore returns immediately, the other
-                // thread will have seen the newly-frozen layer and
-                // will flush that too (assuming no errors).
-                drop(flush_lock_guard);
-                drop(layers);
-                break;
+            tokio::select! {
+                _ = task_mgr::shutdown_watcher() => {
+                    info!("shutting down layer flush task");
+                    break;
+                },
+                _ = layer_flush_start_rx.changed() => {}
            }
+
+            trace!("waking up");
+            let timer = self.metrics.flush_time_histo.start_timer();
+            let flush_counter = *layer_flush_start_rx.borrow();
+            let result = loop {
+                let layer_to_flush = {
+                    let layers = self.layers.read().unwrap();
+                    layers.frozen_layers.front().cloned()
+                    // drop 'layers' lock to allow concurrent reads and writes
+                };
+                if let Some(layer_to_flush) = layer_to_flush {
+                    if let Err(err) = self.flush_frozen_layer(layer_to_flush).await {
+                        error!("could not flush frozen layer: {err:?}");
+                        break Err(err);
+                    }
+                    continue;
+                } else {
+                    break Ok(());
+                }
+            };
+            // Notify any listeners that we're done
+            let _ = self
+                .layer_flush_done_tx
+                .send_replace((flush_counter, result));
+
+            timer.stop_and_record();
+        }
+    }
+
+    async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
+        let mut rx = self.layer_flush_done_tx.subscribe();
+
+        // Increment the flush cycle counter and wake up the flush task.
+        // Remember the new value, so that when we listen for the flush
+        // to finish, we know when the flush that we initiated has
+        // finished, instead of some other flush that was started earlier.
+        let mut my_flush_request = 0;
+
+        if !&*self.flush_loop_started.lock().unwrap() {
+            anyhow::bail!("cannot flush frozen layers when flush_loop is not running")
        }

-        timer.stop_and_record();
+        self.layer_flush_start_tx.send_modify(|counter| {
+            my_flush_request = *counter + 1;
+            *counter = my_flush_request;
+        });

-        Ok(())
+        loop {
+            {
+                let (last_result_counter, last_result) = &*rx.borrow();
+                if *last_result_counter >= my_flush_request {
+                    if let Err(_err) = last_result {
+                        // We already logged the original error in
+                        // flush_loop. We cannot propagate it to the caller
+                        // here, because it might not be Cloneable
+                        anyhow::bail!(
+                            "Could not flush frozen layer. Request id: {}",
+                            my_flush_request
+                        );
+                    } else {
+                        return Ok(());
+                    }
+                }
+            }
+            trace!("waiting for flush to complete");
+            rx.changed().await?;
+            trace!("done")
+        }
+    }
+
+    fn flush_frozen_layers(&self) {
+        self.layer_flush_start_tx.send_modify(|val| *val += 1);
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
+    #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))]
+    async fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -1541,6 +1631,10 @@ impl Timeline {
                    lsn,
                )?;

+                fail_point!("image-layer-writer-fail-before-finish", |_| {
+                    anyhow::bail!("failpoint image-layer-writer-fail-before-finish");
+                });
+
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
@@ -1835,6 +1929,11 @@ impl Timeline {
                    },
                )?);
            }
+
+            fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
+            });
+
            writer.as_mut().unwrap().put_value(key, lsn, value)?;
            prev_key = Some(key);
        }
@@ -2234,13 +2333,10 @@ impl Timeline {

                let last_rec_lsn = data.records.last().unwrap().0;

-                let img = self.walredo_mgr.request_redo(
-                    key,
-                    request_lsn,
-                    base_img,
-                    data.records,
-                    self.pg_version,
-                )?;
+                let img = self
+                    .walredo_mgr
+                    .request_redo(key, request_lsn, base_img, data.records, self.pg_version)
+                    .context("Failed to reconstruct a page image:")?;

                if img.len() == page_cache::PAGE_SZ {
                    let cache = page_cache::get();
--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -82,6 +82,7 @@ pub struct TenantConf {
    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
    /// to avoid eager reconnects.
    pub max_lsn_wal_lag: NonZeroU64,
+    pub trace_read_requests: bool,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -105,6 +106,7 @@ pub struct TenantConfOpt {
    #[serde(with = "humantime_serde")]
    pub lagging_wal_timeout: Option<Duration>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
 }

 impl TenantConfOpt {
@@ -138,6 +140,9 @@ impl TenantConfOpt {
                .lagging_wal_timeout
                .unwrap_or(global_conf.lagging_wal_timeout),
            max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
+            trace_read_requests: self
+                .trace_read_requests
+                .unwrap_or(global_conf.trace_read_requests),
        }
    }

@@ -207,10 +212,10 @@ impl TenantConf {
                .expect("cannot parse default walreceiver lagging wal timeout"),
            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                .expect("cannot parse default max walreceiver Lsn wal lag"),
+            trace_read_requests: false,
        }
    }

-    #[cfg(test)]
    pub fn dummy_conf() -> Self {
        TenantConf {
            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
@@ -232,6 +237,7 @@ impl TenantConf {
            .unwrap(),
            max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                .unwrap(),
+            trace_read_requests: false,
        }
    }
 }
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -241,7 +241,7 @@ pub async fn shutdown_all_tenants() {
        let tenant_id = tenant.tenant_id();
        debug!("shutdown tenant {tenant_id}");

-        if let Err(err) = tenant.checkpoint() {
+        if let Err(err) = tenant.checkpoint().await {
            error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
        }
    }
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant_tasks.rs
@@ -72,8 +72,6 @@ async fn compaction_loop(tenant_id: TenantId) {
            if let Err(e) = tenant.compaction_iteration() {
                sleep_duration = wait_duration;
                error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
-                #[cfg(feature = "testing")]
-                std::process::abort();
            }

            // Sleep
@@ -119,12 +117,10 @@ async fn gc_loop(tenant_id: TenantId) {
            let gc_horizon = tenant.get_gc_horizon();
            let mut sleep_duration = gc_period;
            if gc_horizon > 0 {
-                if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
+                if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
                {
                    sleep_duration = wait_duration;
                    error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
-                    #[cfg(feature = "testing")]
-                    std::process::abort();
                }
            }

--- a/pageserver/src/trace.rs
+++ b/pageserver/src/trace.rs
@@ -0,0 +1,36 @@
+use bytes::Bytes;
+use std::{
+    fs::{create_dir_all, File},
+    io::{BufWriter, Write},
+    path::PathBuf,
+};
+
+pub struct Tracer {
+    writer: BufWriter<File>,
+}
+
+impl Drop for Tracer {
+    fn drop(&mut self) {
+        let _ = self.writer.flush(); // best-effort; a panic here during unwinding would abort
+    }
+}
+
+impl Tracer {
+    pub fn new(path: PathBuf) -> Self {
+        let parent = path.parent().expect("failed to parse parent path");
+        create_dir_all(parent).expect("failed to create trace dir");
+
+        let file = File::create(path).expect("failed to create trace file");
+        Tracer {
+            writer: BufWriter::new(file),
+        }
+    }
+
+    pub fn trace(&mut self, msg: &Bytes) {
+        self.writer.write_all(msg).expect("failed to write trace");
+    }
+
+    pub fn flush(&mut self) {
+        self.writer.flush().expect("failed to flush trace file");
+    }
+}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -319,6 +319,12 @@ impl VirtualFile {

        Ok(result)
    }
+
+    pub fn remove(self) {
+        let path = self.path.clone();
+        drop(self);
+        std::fs::remove_file(path).expect("failed to remove the virtual file");
+    }
 }

 impl Drop for VirtualFile {
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -155,22 +155,19 @@ impl<E: Clone> TaskHandle<E> {

    /// Aborts current task, waiting for it to finish.
    pub async fn shutdown(self) {
-        match self.join_handle {
-            Some(jh) => {
-                self.cancellation.send(()).ok();
-                match jh.await {
-                    Ok(Ok(())) => debug!("Shutdown success"),
-                    Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
-                    Err(join_error) => {
-                        if join_error.is_cancelled() {
-                            error!("Shutdown task was cancelled");
-                        } else {
-                            error!("Shutdown task join error: {join_error}")
-                        }
+        if let Some(jh) = self.join_handle {
+            self.cancellation.send(()).ok();
+            match jh.await {
+                Ok(Ok(())) => debug!("Shutdown success"),
+                Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
+                Err(join_error) => {
+                    if join_error.is_cancelled() {
+                        error!("Shutdown task was cancelled");
+                    } else {
+                        error!("Shutdown task join error: {join_error}")
                    }
                }
            }
-            None => {}
        }
    }
 }
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -93,7 +93,7 @@ pub fn spawn_connection_manager_task(
            }
        }
        .instrument(
-            info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
+            info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
        ),
    );
 }
@@ -836,15 +836,20 @@ fn wal_stream_connection_string(
    listen_pg_addr_str: &str,
 ) -> anyhow::Result<String> {
    let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db");
-    let me_conf = sk_connstr
-        .parse::<postgres::config::Config>()
-        .with_context(|| {
-            format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one")
-        })?;
-    let (host, port) = utils::connstring::connection_host_port(&me_conf);
-    Ok(format!(
-        "host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
-    ))
+    sk_connstr
+        .parse()
+        .context("bad url")
+        .and_then(|url: url::Url| {
+            let host = url.host_str().context("host is missing")?;
+            let port = url.port().unwrap_or(5432); // default PG port
+
+            Ok(format!(
+                "host={host} \
+                 port={port} \
+                 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
+            ))
+        })
+        .with_context(|| format!("Failed to parse pageserver connection URL '{sk_connstr}'"))
 }

 #[cfg(test)]
@@ -892,7 +897,7 @@ mod tests {
                        peer_horizon_lsn: None,
                        local_start_lsn: None,

-                        safekeeper_connstr: Some("no commit_lsn".to_string()),
+                        safekeeper_connstr: Some("no_commit_lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -909,7 +914,7 @@ mod tests {
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
                        local_start_lsn: None,
-                        safekeeper_connstr: Some("no commit_lsn".to_string()),
+                        safekeeper_connstr: Some("no_commit_lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -1005,7 +1010,7 @@ mod tests {
                        peer_horizon_lsn: None,
                        local_start_lsn: None,

-                        safekeeper_connstr: Some("not advanced Lsn".to_string()),
+                        safekeeper_connstr: Some("not_advanced_lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -1023,7 +1028,7 @@ mod tests {
                        peer_horizon_lsn: None,
                        local_start_lsn: None,

-                        safekeeper_connstr: Some("not enough advanced Lsn".to_string()),
+                        safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -1093,7 +1098,7 @@ mod tests {
                        peer_horizon_lsn: None,
                        local_start_lsn: None,

-                        safekeeper_connstr: Some("smaller commit_lsn".to_string()),
+                        safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -1283,7 +1288,7 @@ mod tests {
                        peer_horizon_lsn: None,
                        local_start_lsn: None,

-                        safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()),
+                        safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -1307,7 +1312,7 @@ mod tests {
        );
        assert!(over_threshcurrent_candidate
            .wal_source_connstr
-            .contains("advanced by Lsn safekeeper"));
+            .contains("advanced_by_lsn_safekeeper"));

        Ok(())
    }
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -31,8 +31,8 @@ use crate::{
    walrecord::DecodedWALRecord,
 };
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use utils::id::TenantTimelineId;
-use utils::{lsn::Lsn, pq_proto::ReplicationFeedback};
+use pq_proto::ReplicationFeedback;
+use utils::{id::TenantTimelineId, lsn::Lsn};

 /// Status of the connection.
 #[derive(Debug, Clone)]
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -10,7 +10,7 @@
 //! process. Then we get the page image back. Communication with the
 //! postgres process happens via stdin/stdout
 //!
-//! See src/backend/tcop/zenith_wal_redo.c for the other side of
+//! See pgxn/neon_walredo/walredoproc.c for the other side of
 //! this communication.
 //!
 //! The Postgres process is assumed to be secure against malicious WAL
@@ -22,10 +22,10 @@ use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
 use serde::Serialize;
-use std::fs;
 use std::fs::OpenOptions;
 use std::io::prelude::*;
 use std::io::{Error, ErrorKind};
+use std::ops::{Deref, DerefMut};
 use std::os::unix::io::AsRawFd;
 use std::os::unix::prelude::CommandExt;
 use std::path::PathBuf;
@@ -34,6 +34,7 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
 use std::sync::Mutex;
 use std::time::Duration;
 use std::time::Instant;
+use std::{fs, io};
 use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
@@ -44,6 +45,7 @@ use crate::metrics::{
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
+use crate::task_mgr::BACKGROUND_RUNTIME;
 use crate::walrecord::NeonWalRecord;
 use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -208,6 +210,16 @@ impl PostgresRedoManager {
        }
    }

+    /// Launch process pre-emptively. Should not be needed except for benchmarking.
+    pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> {
+        let inner = self.process.get_mut().unwrap();
+        if inner.is_none() {
+            let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
+            *inner = Some(p);
+        }
+        Ok(())
+    }
+
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
@@ -229,7 +241,7 @@ impl PostgresRedoManager {

        // launch the WAL redo process on first use
        if process_guard.is_none() {
-            let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?;
+            let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
            *process_guard = Some(p);
        }
        let process = process_guard.as_mut().unwrap();
@@ -579,7 +591,8 @@ impl<C: CommandExt> CloseFileDescriptors for C {
 /// Handle to the Postgres WAL redo process
 ///
 struct PostgresRedoProcess {
-    child: Child,
+    tenant_id: TenantId,
+    child: NoLeakChild,
    stdin: ChildStdin,
    stdout: ChildStdout,
    stderr: ChildStderr,
@@ -589,16 +602,17 @@ impl PostgresRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
+    #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
    fn launch(
        conf: &PageServerConf,
-        tenant_id: &TenantId,
+        tenant_id: TenantId,
        pg_version: u32,
    ) -> Result<PostgresRedoProcess, Error> {
        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
        // just create one with constant name. That fails if you try to launch more than
        // one WAL redo manager concurrently.
        let datadir = path_with_suffix_extension(
-            conf.tenant_path(tenant_id).join("wal-redo-datadir"),
+            conf.tenant_path(&tenant_id).join("wal-redo-datadir"),
            TEMP_FILE_SUFFIX,
        );

@@ -644,18 +658,16 @@ impl PostgresRedoProcess {
                ),
            ));
        } else {
-            // Limit shared cache for wal-redo-postres
+            // Limit shared cache for wal-redo-postgres
            let mut config = OpenOptions::new()
                .append(true)
                .open(PathBuf::from(&datadir).join("postgresql.conf"))?;
            config.write_all(b"shared_buffers=128kB\n")?;
            config.write_all(b"fsync=off\n")?;
-            config.write_all(b"shared_preload_libraries=neon\n")?;
-            config.write_all(b"neon.wal_redo=on\n")?;
        }

        // Start postgres itself
-        let mut child = Command::new(pg_bin_dir_path.join("postgres"))
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
            .arg("--wal-redo")
            .stdin(Stdio::piped())
            .stderr(Stdio::piped())
@@ -664,20 +676,17 @@ impl PostgresRedoProcess {
            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("PGDATA", &datadir)
-            // The redo process is not trusted, so it runs in seccomp mode
-            // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't
-            // inherit any file descriptors from the pageserver that would allow
-            // an attacker to do bad things.
+            // The redo process is not trusted, and runs in seccomp mode that
+            // doesn't allow it to open any files. We have to also make sure it
+            // doesn't inherit any file descriptors from the pageserver, that
+            // would allow an attacker to read any files that happen to be open
+            // in the pageserver.
            //
            // The Rust standard library makes sure to mark any file descriptors with
            // as close-on-exec by default, but that's not enough, since we use
            // libraries that directly call libc open without setting that flag.
-            //
-            // One example is the pidfile of the daemonize library, which doesn't
-            // currently mark file descriptors as close-on-exec. Either way, we
-            // want to be on the safe side and prevent accidental regression.
            .close_fds()
-            .spawn()
+            .spawn_no_leak_child()
            .map_err(|e| {
                Error::new(
                    e.kind(),
@@ -685,20 +694,33 @@ impl PostgresRedoProcess {
                )
            })?;

-        info!(
-            "launched WAL redo postgres process on {}",
-            datadir.display()
-        );
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait();
+        });

        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();

-        set_nonblock(stdin.as_raw_fd())?;
-        set_nonblock(stdout.as_raw_fd())?;
-        set_nonblock(stderr.as_raw_fd())?;
+        macro_rules! set_nonblock_or_log_err {
+            ($file:ident) => {{
+                let res = set_nonblock($file.as_raw_fd());
+                if let Err(e) = &res {
+                    error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
+                }
+                res
+            }};
+        }
+        set_nonblock_or_log_err!(stdin)?;
+        set_nonblock_or_log_err!(stdout)?;
+        set_nonblock_or_log_err!(stderr)?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);

        Ok(PostgresRedoProcess {
+            tenant_id,
            child,
            stdin,
            stdout,
@@ -706,18 +728,16 @@ impl PostgresRedoProcess {
        })
    }

-    fn kill(mut self) {
-        let _ = self.child.kill();
-        if let Ok(exit_status) = self.child.wait() {
-            error!("wal-redo-postgres exited with code {}", exit_status);
-        }
-        drop(self);
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
+    fn kill(self) {
+        self.child.kill_and_wait();
    }

    //
    // Apply given WAL records ('records') over an old page image. Returns
    // new page image.
    //
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
    fn apply_wal_records(
        &mut self,
        tag: BufferTag,
@@ -730,7 +750,11 @@ impl PostgresRedoProcess {
        // This could be problematic if there are millions of records to replay,
        // but in practice the number of records is usually so small that it doesn't
        // matter, and it's better to keep this code simple.
-        let mut writebuf: Vec<u8> = Vec::new();
+        //
+        // Most requests start with a before-image of BLCKSZ bytes, followed
+        // by some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
        build_begin_redo_for_block_msg(tag, &mut writebuf);
        if let Some(img) = base_img {
            build_push_page_msg(tag, &img, &mut writebuf);
@@ -843,8 +867,101 @@ impl PostgresRedoProcess {
    }
 }

+/// Wrapper type around `std::process::Child` which guarantees that the child
+/// will be killed and waited-for by this process before being dropped.
+struct NoLeakChild {
+    child: Option<Child>,
+}
+
+impl Deref for NoLeakChild {
+    type Target = Child;
+
+    fn deref(&self) -> &Self::Target {
+        self.child.as_ref().expect("must not use from drop")
+    }
+}
+
+impl DerefMut for NoLeakChild {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.child.as_mut().expect("must not use from drop")
+    }
+}
+
+impl NoLeakChild {
+    fn spawn(command: &mut Command) -> io::Result<Self> {
+        let child = command.spawn()?;
+        Ok(NoLeakChild { child: Some(child) })
+    }
+
+    fn kill_and_wait(mut self) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return,
+        };
+        Self::kill_and_wait_impl(child);
+    }
+
+    #[instrument(skip_all, fields(pid=child.id()))]
+    fn kill_and_wait_impl(mut child: Child) {
+        let res = child.kill();
+        if let Err(e) = res {
+            // This branch is very unlikely because:
+            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
+            // - This is the only place that calls .kill()
+            // - We consume `self`, so, .kill() can't be called twice.
+            // - If the process exited by itself or was killed by someone else,
+            //   .kill() will still succeed because we haven't wait()'ed yet.
+            //
+            // So, if we arrive here, we have really no idea what happened,
+            // whether the PID stored in self.child is still valid, etc.
+            // If this function were fallible, we'd return an error, but
+            // since it isn't, all we can do is log an error and proceed
+            // with the wait().
+            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
+        }
+
+        match child.wait() {
+            Ok(exit_status) => {
+                // log at error level since .kill() is something we only do on errors ATM
+                error!(exit_status = %exit_status, "wait successful");
+            }
+            Err(e) => {
+                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
+            }
+        }
+    }
+}
+
+impl Drop for NoLeakChild {
+    fn drop(&mut self) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return,
+        };
+        // Offload the kill+wait of the child process into the background.
+        // If someone stops the runtime, we'll leak the child process.
+        // We can ignore that case because we only stop the runtime on pageserver exit.
+        BACKGROUND_RUNTIME.spawn(async move {
+            tokio::task::spawn_blocking(move || {
+                Self::kill_and_wait_impl(child);
+            })
+            .await
+        });
+    }
+}
+
+trait NoLeakChildCommandExt {
+    fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild>;
+}
+
+impl NoLeakChildCommandExt for Command {
+    fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild> {
+        NoLeakChild::spawn(self)
+    }
+}
+
 // Functions for constructing messages to send to the postgres WAL redo
-// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
+// process. See pgxn/neon_walredo/walredoproc.c for
 // explanation of the protocol.

 fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,7 +4,6 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
-	inmem_smgr.o \
 	libpagestore.o \
 	libpqwalproposer.o \
 	pagestore_smgr.o \
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -40,8 +40,22 @@
 bool		connected = false;
 PGconn	   *pageserver_conn = NULL;

+/*
+ * WaitEventSet containing:
+ * - WL_SOCKET_READABLE on pageserver_conn,
+ * - WL_LATCH_SET on MyLatch, and
+ * - WL_EXIT_ON_PM_DEATH.
+ */
+WaitEventSet *pageserver_conn_wes = NULL;
+
 char	   *page_server_connstring_raw;

+int			n_unflushed_requests = 0;
+int			flush_every_n_requests = 8;
+int			readahead_buffer_size = 128;
+
+static void pageserver_flush(void);
+
 static void
 pageserver_connect()
 {
@@ -58,6 +72,7 @@ pageserver_connect()

 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;
+
 		ereport(ERROR,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
 				 errmsg(NEON_TAG "could not establish connection to pageserver"),
@@ -73,22 +88,26 @@ pageserver_connect()
 		neon_log(ERROR, "could not send pagestream command to pageserver");
 	}

+	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
+			  MyLatch, NULL);
+	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+			  NULL, NULL);
+	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
+
 	while (PQisBusy(pageserver_conn))
 	{
 		int			wc;
+		WaitEvent	event;

 		/* Sleep until there's something to do */
-		wc = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_SOCKET_READABLE |
-							   WL_EXIT_ON_PM_DEATH,
-							   PQsocket(pageserver_conn),
-							   -1L, PG_WAIT_EXTENSION);
+		wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();

 		/* Data available in socket? */
-		if (wc & WL_SOCKET_READABLE)
+		if (event.events & WL_SOCKET_READABLE)
 		{
 			if (!PQconsumeInput(pageserver_conn))
 			{
@@ -96,6 +115,7 @@ pageserver_connect()

 				PQfinish(pageserver_conn);
 				pageserver_conn = NULL;
+				FreeWaitEventSet(pageserver_conn_wes);

 				neon_log(ERROR, "could not complete handshake with pageserver: %s",
 						 msg);
@@ -112,33 +132,30 @@ pageserver_connect()
 * A wrapper around PQgetCopyData that checks for interrupts while sleeping.
 */
 static int
-call_PQgetCopyData(PGconn *conn, char **buffer)
+call_PQgetCopyData(char **buffer)
 {
 	int			ret;

 retry:
-	ret = PQgetCopyData(conn, buffer, 1 /* async */ );
+	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );

 	if (ret == 0)
 	{
 		int			wc;
+		WaitEvent	event;

 		/* Sleep until there's something to do */
-		wc = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_SOCKET_READABLE |
-							   WL_EXIT_ON_PM_DEATH,
-							   PQsocket(conn),
-							   -1L, PG_WAIT_EXTENSION);
+		wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();

 		/* Data available in socket? */
-		if (wc & WL_SOCKET_READABLE)
+		if (event.events & WL_SOCKET_READABLE)
 		{
-			if (!PQconsumeInput(conn))
+			if (!PQconsumeInput(pageserver_conn))
 				neon_log(ERROR, "could not get response from pageserver: %s",
-						 PQerrorMessage(conn));
+						 PQerrorMessage(pageserver_conn));
 		}

 		goto retry;
@@ -164,7 +181,11 @@ pageserver_disconnect(void)
 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;
 		connected = false;
+
+		prefetch_on_ps_disconnect();
 	}
+	if (pageserver_conn_wes != NULL)
+		FreeWaitEventSet(pageserver_conn_wes);
 }

 static void
@@ -174,11 +195,7 @@ pageserver_send(NeonRequest * request)

 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
-	{
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		connected = false;
-	}
+		pageserver_disconnect();

 	if (!connected)
 		pageserver_connect();
@@ -202,6 +219,11 @@ pageserver_send(NeonRequest * request)
 	}
 	pfree(req_buff.data);

+	n_unflushed_requests++;
+
+	if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests)
+		pageserver_flush();
+
 	if (message_level_is_interesting(PageStoreTrace))
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);
@@ -220,7 +242,7 @@ pageserver_receive(void)
 	PG_TRY();
 	{
 		/* read response */
-		resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data);
+		resp_buff.len = call_PQgetCopyData(&resp_buff.data);
 		resp_buff.cursor = 0;

 		if (resp_buff.len < 0)
@@ -255,25 +277,21 @@ pageserver_receive(void)
 static void
 pageserver_flush(void)
 {
-	if (PQflush(pageserver_conn))
+	if (!connected)
+	{
+		neon_log(WARNING, "Tried to flush while disconnected");
+	}
+	else if (PQflush(pageserver_conn))
 	{
 		char	   *msg = PQerrorMessage(pageserver_conn);

 		pageserver_disconnect();
 		neon_log(ERROR, "failed to flush page requests: %s", msg);
 	}
-}
-
-static NeonResponse *
-pageserver_call(NeonRequest * request)
-{
-	pageserver_send(request);
-	pageserver_flush();
-	return pageserver_receive();
+	n_unflushed_requests = 0;
 }

 page_server_api api = {
-	.request = pageserver_call,
 	.send = pageserver_send,
 	.flush = pageserver_flush,
 	.receive = pageserver_receive
@@ -419,15 +437,6 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);

-	DefineCustomBoolVariable("neon.wal_redo",
-							 "start in wal-redo mode",
-							 NULL,
-							 &wal_redo,
-							 false,
-							 PGC_POSTMASTER,
-							 0,
-							 NULL, NULL, NULL);
-
 	DefineCustomIntVariable("neon.max_cluster_size",
 							"cluster size limit",
 							NULL,
@@ -436,6 +445,27 @@ pg_init_libpagestore(void)
 							PGC_SIGHUP,
 							GUC_UNIT_MB,
 							NULL, NULL, NULL);
+	DefineCustomIntVariable("neon.flush_output_after",
+							"Flush the output buffer after every N unflushed requests",
+							NULL,
+							&flush_every_n_requests,
+							8, -1, INT_MAX,
+							PGC_USERSET,
+							0,	/* no flags required */
+							NULL, NULL, NULL);
+	DefineCustomIntVariable("neon.readahead_buffer_size",
+							"number of prefetches to buffer",
+							"This buffer is used to store prefetched data; so "
+							"it is important that this buffer is at least as "
+							"large as the configured value of all tablespaces' "
+							"effective_io_concurrency and maintenance_io_concurrency, "
+							"your sessions' values of these, and the value for "
+							"seqscan_prefetch_buffers.",
+							&readahead_buffer_size,
+							128, 16, 1024,
+							PGC_USERSET,
+							0,	/* no flags required */
+							NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);

 	relsize_hash_init();

@@ -452,13 +482,7 @@ pg_init_libpagestore(void)
 	neon_timeline_walproposer = neon_timeline;
 	neon_tenant_walproposer = neon_tenant;

-	if (wal_redo)
-	{
-		neon_log(PageStoreTrace, "set inmem_smgr hook");
-		smgr_hook = smgr_inmem;
-		smgr_init_hook = smgr_init_inmem;
-	}
-	else if (page_server_connstring && page_server_connstring[0])
+	if (page_server_connstring && page_server_connstring[0])
 	{
 		neon_log(PageStoreTrace, "set neon_smgr hook");
 		smgr_hook = smgr_neon;
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -115,6 +115,8 @@ typedef struct
 	char		page[FLEXIBLE_ARRAY_MEMBER];
 }			NeonGetPageResponse;

+#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))
+
 typedef struct
 {
 	NeonMessageTag tag;
@@ -138,15 +140,20 @@ extern char *nm_to_string(NeonMessage * msg);

 typedef struct
 {
-	NeonResponse *(*request) (NeonRequest * request);
 	void		(*send) (NeonRequest * request);
 	NeonResponse *(*receive) (void);
 	void		(*flush) (void);
 }			page_server_api;

+extern void prefetch_on_ps_disconnect(void);
+
 extern page_server_api * page_server;

 extern char *page_server_connstring;
+extern int flush_every_n_requests;
+extern int readahead_buffer_size;
+extern bool seqscan_prefetch_enabled;
+extern int seqscan_prefetch_distance;
 extern char *neon_timeline;
 extern char *neon_tenant;
 extern bool wal_redo;
@@ -154,10 +161,7 @@ extern int32 max_cluster_size;

 extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode);
 extern void smgr_init_neon(void);
-
-extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
-extern void smgr_init_inmem(void);
-extern void smgr_shutdown_inmem(void);
+extern void readahead_buffer_resize(int newsize, void *extra);

 /* Neon storage manager functionality */

@@ -171,7 +175,6 @@ extern void neon_extend(SMgrRelation reln, ForkNumber forknum,
 						BlockNumber blocknum, char *buffer, bool skipFsync);
 extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum);
-extern void neon_reset_prefetch(SMgrRelation reln);
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);

@@ -188,29 +191,6 @@ extern void neon_truncate(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber nblocks);
 extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);

-/* neon wal-redo storage manager functionality */
-
-extern void inmem_init(void);
-extern void inmem_open(SMgrRelation reln);
-extern void inmem_close(SMgrRelation reln, ForkNumber forknum);
-extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
-extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
-extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
-extern void inmem_extend(SMgrRelation reln, ForkNumber forknum,
-						 BlockNumber blocknum, char *buffer, bool skipFsync);
-extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
-						   BlockNumber blocknum);
-extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-					   char *buffer);
-extern void inmem_write(SMgrRelation reln, ForkNumber forknum,
-						BlockNumber blocknum, char *buffer, bool skipFsync);
-extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
-							BlockNumber blocknum, BlockNumber nblocks);
-extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
-extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
-						   BlockNumber nblocks);
-extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
-
 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
 extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -49,22 +49,20 @@
 #include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "access/xlog_internal.h"
-#include "catalog/pg_class.h"
-#include "pagestore_client.h"
-#include "pagestore_client.h"
-#include "storage/smgr.h"
 #include "access/xlogdefs.h"
+#include "catalog/pg_class.h"
+#include "common/hashfn.h"
+#include "pagestore_client.h"
 #include "postmaster/interrupt.h"
+#include "postmaster/autovacuum.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/relfilenode.h"
 #include "storage/buf_internals.h"
+#include "storage/smgr.h"
 #include "storage/md.h"
-#include "fmgr.h"
-#include "miscadmin.h"
 #include "pgstat.h"
-#include "catalog/pg_tablespace_d.h"
-#include "postmaster/autovacuum.h"
+

 #if PG_VERSION_NUM >= 150000
 #include "access/xlogutils.h"
@@ -99,7 +97,6 @@ char	   *page_server_connstring;
 /*with substituted password*/
 char	   *neon_timeline;
 char	   *neon_tenant;
-bool		wal_redo = false;
 int32		max_cluster_size;

 /* unlogged relation build states */
@@ -114,48 +111,646 @@ typedef enum
 static SMgrRelation unlogged_build_rel = NULL;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

-
 /*
 * Prefetch implementation:
+ * 
 * Prefetch is performed locally by each backend.
- * There can be up to MAX_PREFETCH_REQUESTS registered using smgr_prefetch
- * before smgr_read. All this requests are appended to primary smgr_read request.
- * It is assumed that pages will be requested in prefetch order.
- * Reading of prefetch responses is delayed until them are actually needed (smgr_read).
- * It make it possible to parallelize processing and receiving of prefetched pages.
- * In case of prefetch miss or any other SMGR request other than smgr_read,
- * all prefetch responses has to be consumed.
+ *
+ * There can be up to readahead_buffer_size active IO requests registered at
+ * any time. Requests using smgr_prefetch are sent to the pageserver, but we
+ * don't wait on the response. Requests using smgr_read are either read from
+ * the buffer, or (if that's not possible) we wait on the response to arrive -
+ * this also will allow us to receive other prefetched pages. 
+ * Each request is immediately written to the output buffer of the pageserver
+ * connection, but may not be flushed if smgr_prefetch is used: the connection
+ * is flushed on manual flush, or every neon.flush_output_after unflushed
+ * requests; so a written request is not necessarily sent right away.
+ *
+ * Once we have received a response, this value will be stored in the response
+ * buffer, indexed in a hash table. This allows us to retain our buffered
+ * prefetch responses even when we have cache misses.
+ *
+ * Reading of prefetch responses is delayed until they are actually needed
+ * (smgr_read). In case of prefetch miss or any other SMGR request other than
+ * smgr_read, all prefetch responses in the pipeline will need to be read from
+ * the connection; the responses are stored for later use.
+ *
+ * NOTE: The current implementation of the prefetch system implements a ring
+ * buffer of up to readahead_buffer_size requests. If there are more _read and
+ * _prefetch requests between the initial _prefetch and the _read of a buffer,
+ * the prefetch request will have been dropped from this prefetch buffer, and
+ * your prefetch was wasted.
 */

-#define MAX_PREFETCH_REQUESTS 128
+/*
+ * State machine:
+ *        
+ * not in hash : in hash
+ *             :
+ * UNUSED ------> REQUESTED --> RECEIVED
+ *   ^         :      |            |
+ *   |         :      v            |
+ *   |         : TAG_REMAINS       |
+ *   |         :      |            |
+ *   +----------------+------------+
+ *             :
+ */
+typedef enum PrefetchStatus {
+	PRFS_UNUSED = 0,	/* unused slot */
+	PRFS_REQUESTED,		/* request was written to the sendbuffer to PS, but not
+						 * necessarily flushed.
+						 * all fields except response valid */
+	PRFS_RECEIVED,		/* all fields valid */
+	PRFS_TAG_REMAINS,	/* only buftag and my_ring_index are still valid */
+} PrefetchStatus;

-BufferTag	prefetch_requests[MAX_PREFETCH_REQUESTS];
-BufferTag	prefetch_responses[MAX_PREFETCH_REQUESTS];
-int			n_prefetch_requests;
-int			n_prefetch_responses;
-int			n_prefetched_buffers;
-int			n_prefetch_hits;
-int			n_prefetch_misses;
-XLogRecPtr	prefetch_lsn;
+typedef struct PrefetchRequest {
+	BufferTag	buftag; /* must be first entry in the struct */
+	XLogRecPtr	effective_request_lsn;
+	NeonResponse *response; /* may be null */
+	PrefetchStatus status;
+	uint64		my_ring_index;
+} PrefetchRequest;

+/* prefetch buffer lookup hash table */
+
+typedef struct PrfHashEntry {
+	PrefetchRequest *slot;
+	uint32 status;
+	uint32 hash;
+} PrfHashEntry;
+
+#define SH_PREFIX			prfh
+#define SH_ELEMENT_TYPE		PrfHashEntry
+#define SH_KEY_TYPE			PrefetchRequest *
+#define SH_KEY				slot
+#define SH_STORE_HASH
+#define SH_GET_HASH(tb, a)	((a)->hash)
+#define SH_HASH_KEY(tb, key) hash_bytes( \
+	((const unsigned char *) &(key)->buftag), \
+	sizeof(BufferTag) \
+)
+
+#define SH_EQUAL(tb, a, b)	(BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag))
+#define SH_SCOPE			static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+/*
+ * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
+ * It maintains a (ring) buffer of in-flight requests and responses.
+ * 
+ * We maintain several indexes into the ring buffer:
+ * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
+ * 
+ * ring_unused points to the first unused slot of the buffer
+ * ring_flush is the next request to flush; ring_receive the next to receive
+ * ring_last is the oldest received entry in the buffer
+ * 
+ * Apart from being an entry in the ring buffer of prefetch requests, each
+ * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
+ */
+typedef struct PrefetchState {
+	MemoryContext bufctx; /* context for prf_buffer[].response allocations */
+	MemoryContext errctx; /* current context while prefetch_read() receives */
+	MemoryContext hashctx; /* context handed to prfh_create() for prf_hash */
+
+	/* buffer indexes */
+	uint64	ring_unused;		/* first unused slot */
+	uint64	ring_flush;			/* next request to flush */
+	uint64	ring_receive;		/* next slot that is to receive a response */
+	uint64	ring_last;			/* min slot with a response value */
+
+	/* metrics / statistics  */
+	int		n_responses_buffered;	/* count of PS responses received and still held in slots */
+	int		n_requests_inflight;	/* count of PS requests considered in flight */
+	int		n_unused;				/* count of buffers < unused, > last, that are also unused */
+
+	/* the buffers */
+	prfh_hash *prf_hash;		/* lookup of slots by buftag */
+	PrefetchRequest prf_buffer[]; /* prefetch buffers */
+} PrefetchState;
+
+PrefetchState *MyPState;
+
+#define GetPrfSlot(ring_index) ( \
+	( \
+		AssertMacro((ring_index) < MyPState->ring_unused && \
+					(ring_index) >= MyPState->ring_last), \
+		&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+	) \
+)
+
+int			n_prefetch_hits = 0;
+int			n_prefetch_misses = 0;
+int			n_prefetch_missed_caches = 0;
+int			n_prefetch_dupes = 0;
+
+XLogRecPtr	prefetch_lsn = 0;
+
+static void consume_prefetch_responses(void);
+static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
+static void prefetch_read(PrefetchRequest *slot);
+static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
+static void prefetch_wait_for(uint64 ring_index);
+static void prefetch_cleanup(void);
+static inline void prefetch_set_unused(uint64 ring_index);
+
+static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode,
+									   ForkNumber forknum, BlockNumber blkno);
+
+/*
+ * readahead_buffer_resize() - rebuild MyPState for a ring of `newsize` slots.
+ *
+ * The signature matches a GUC int assign hook (presumably for the readahead
+ * buffer size setting - confirm at the registration site); `extra` is unused.
+ * The new state is sized by `newsize`, not by readahead_buffer_size: every
+ * index and counter set up below is based on `newsize`, so allocating by any
+ * other value would overrun prf_buffer[] whenever the ring grows.
+ */
+void
+readahead_buffer_resize(int newsize, void *extra)
+{
+	uint64		end;
+	uint64		nfree = newsize;
+	PrefetchState *newPState;
+	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) +
+		sizeof(PrefetchRequest) * newsize;
+
+	/* don't try to re-initialize if we haven't initialized yet */
+	if (MyPState == NULL)
+		return;
+
+	/*
+	 * Make sure that we don't lose track of active prefetch requests by
+	 * ensuring we have received all but the last n requests (n = newsize).
+	 */
+	if (MyPState->n_requests_inflight > newsize)
+		prefetch_wait_for(MyPState->ring_unused - newsize);
+
+	/* construct the new PrefetchState, and copy over the memory contexts */
+	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
+
+	newPState->bufctx = MyPState->bufctx;
+	newPState->errctx = MyPState->errctx;
+	newPState->hashctx = MyPState->hashctx;
+	newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL);
+	newPState->n_unused = newsize;
+	newPState->n_requests_inflight = 0;
+	newPState->n_responses_buffered = 0;
+	newPState->ring_last = newsize;
+	newPState->ring_unused = newsize;
+	newPState->ring_receive = newsize;
+	newPState->ring_flush = newsize;
+
+	/*
+	 * Copy over the prefetches, newest first, filling the new array from its
+	 * end: a single pass keeps the most recent entries and compacts them.
+	 */
+	for (end = MyPState->ring_unused - 1;
+		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
+		 end -= 1)
+	{
+		PrefetchRequest *slot = GetPrfSlot(end);
+		PrefetchRequest *newslot;
+		bool	found;
+
+		if (slot->status == PRFS_UNUSED)
+			continue;
+
+		nfree -= 1;
+
+		newslot = &newPState->prf_buffer[nfree];
+		*newslot = *slot;
+		newslot->my_ring_index = nfree;
+		prfh_insert(newPState->prf_hash, newslot, &found);
+		Assert(!found);
+		switch (newslot->status)
+		{
+			case PRFS_UNUSED:
+				pg_unreachable();
+			case PRFS_REQUESTED:
+				newPState->n_requests_inflight += 1;
+				newPState->ring_receive -= 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_RECEIVED:
+				newPState->n_responses_buffered += 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_TAG_REMAINS:
+				newPState->ring_last -= 1;
+				break;
+		}
+		newPState->n_unused -= 1;
+	}
+
+	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
+		prefetch_set_unused(end);	/* entries that did not fit are dropped */
+
+	prfh_destroy(MyPState->prf_hash);
+	pfree(MyPState);
+	MyPState = newPState;
+}
+
+
+
+/*
+ * Make sure that there are no responses still in the buffer.
+ *
+ * NOTE: this function may indirectly update MyPState->prf_hash; which
+ * invalidates any active pointers into the hash table.
+ */
 static void
 consume_prefetch_responses(void)
 {
-	for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++)
-	{
-		NeonResponse *resp = page_server->receive();
+	if (MyPState->ring_receive < MyPState->ring_unused)
+		prefetch_wait_for(MyPState->ring_unused - 1);
+}

-		pfree(resp);
+static void
+prefetch_cleanup(void)
+{
+	uint64	ring_index;
+	PrefetchRequest *slot;
+
+	while (MyPState->ring_last < MyPState->ring_receive) {
+		ring_index = MyPState->ring_last;
+		slot = GetPrfSlot(ring_index);
+
+		if (slot->status == PRFS_UNUSED)
+			MyPState->ring_last += 1;
+		else
+			break;
 	}
-	n_prefetched_buffers = 0;
-	n_prefetch_responses = 0;
+}
+
+/*
+ * Wait for slot of ring_index to have received its response.
+ * If ring_index is not flushed yet, all pending requests are flushed first.
+ * 
+ * NOTE: this function may indirectly update MyPState->prf_hash; which
+ * invalidates any active pointers into the hash table.
+ */
+static void
+prefetch_wait_for(uint64 ring_index)
+{
+	PrefetchRequest *entry;
+
+	if (MyPState->ring_flush <= ring_index &&
+		MyPState->ring_unused > MyPState->ring_flush)
+	{
+		page_server->flush();
+		MyPState->ring_flush = MyPState->ring_unused;	/* all queued requests are now flushed */
+	}
+
+	Assert(MyPState->ring_unused > ring_index);
+
+	while (MyPState->ring_receive <= ring_index)
+	{
+		entry = GetPrfSlot(MyPState->ring_receive);
+
+		Assert(entry->status == PRFS_REQUESTED);
+		prefetch_read(entry);	/* advances ring_receive by one */
+	}
+}
+
+/*
+ * Read the response of a prefetch request into its slot.
+ * 
+ * The caller is responsible for making sure that the request for this buffer
+ * was flushed to the PageServer.
+ *
+ * NOTE: this function may indirectly update MyPState->prf_hash; which
+ * invalidates any active pointers into the hash table.
+ */
+static void
+prefetch_read(PrefetchRequest *slot)
+{
+	NeonResponse *response;
+	MemoryContext old;
+
+	Assert(slot->status == PRFS_REQUESTED);
+	Assert(slot->response == NULL);
+	Assert(slot->my_ring_index == MyPState->ring_receive);	/* must be the next slot to receive */
+
+	old = MemoryContextSwitchTo(MyPState->errctx);	/* receive() runs with errctx current */
+	response = (NeonResponse *) page_server->receive();
+	MemoryContextSwitchTo(old);
+	
+	/* update prefetch state */
+	MyPState->n_responses_buffered += 1;
+	MyPState->n_requests_inflight -= 1;
+	MyPState->ring_receive += 1;
+
+	/* update slot state */
+	slot->status = PRFS_RECEIVED;
+	slot->response = response;	/* released later by prefetch_set_unused() */
+}
+
+/*
+ * Disconnect hook - drop the prefetch requests that still await a response
+ * 
+ * If we don't remove the failed prefetches, we'd be serving incorrect
+ * data to the smgr.  Slots below ring_receive are not touched here.
+ */
+void
+prefetch_on_ps_disconnect(void)
+{
+	MyPState->ring_flush = MyPState->ring_unused;	/* mark every queued request as flushed */
+	while (MyPState->ring_receive < MyPState->ring_unused)
+	{
+		PrefetchRequest *slot;
+		uint64 ring_index = MyPState->ring_receive;
+
+		slot = GetPrfSlot(ring_index);
+
+		Assert(slot->status == PRFS_REQUESTED);
+		Assert(slot->my_ring_index == ring_index);
+
+		/* clean up the request */
+		slot->status = PRFS_TAG_REMAINS;	/* prefetch_set_unused() does not accept REQUESTED */
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+		prefetch_set_unused(ring_index);	/* removes the slot from prf_hash and clears it */
+	}
+}
+
+/*
+ * prefetch_set_unused() - clear a received prefetch slot
+ *
+ * Indexes below ring_last are ignored (those slots are already unused);
+ * otherwise the slot may not be in the PRFS_REQUESTED state.
+ *
+ * NOTE: this function will update MyPState->prf_hash; which invalidates any
+ * active pointers into the hash table.
+ */
+static inline void
+prefetch_set_unused(uint64 ring_index)
+{
+	PrefetchRequest *slot;
+
+	if (ring_index < MyPState->ring_last)
+		return; /* Should already be unused */
+
+	Assert(MyPState->ring_unused > ring_index);
+
+	slot = GetPrfSlot(ring_index);	/* only now: the macro asserts the range checked above */
+	if (slot->status == PRFS_UNUSED)
+		return;
+
+	Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);
+
+	if (slot->status == PRFS_RECEIVED)
+	{
+		pfree(slot->response);
+		slot->response = NULL;
+
+		MyPState->n_responses_buffered -= 1;
+		MyPState->n_unused += 1;
+	}
+	else
+	{
+		Assert(slot->response == NULL);
+	}
+
+	prfh_delete(MyPState->prf_hash, slot);
+
+	/* clear all fields */
+	MemSet(slot, 0, sizeof(PrefetchRequest));
+	slot->status = PRFS_UNUSED;
+
+	/* run cleanup if we're holding back ring_last */
+	if (MyPState->ring_last == ring_index)
+		prefetch_cleanup();
+}
+
+/*
+ * Send the getPage request for `slot` (unflushed) and index it in prf_hash.
+ */
+static void
+prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
+{
+	bool found;
+	NeonGetPageRequest request = {
+		.req.tag = T_NeonGetPageRequest,
+		.req.latest = false,
+		.req.lsn = 0,
+		.rnode = slot->buftag.rnode,
+		.forknum = slot->buftag.forkNum,
+		.blkno = slot->buftag.blockNum,
+	};
+
+	if (force_lsn && force_latest)	/* caller dictates both the LSN and the latest flag */
+	{
+		request.req.lsn = *force_lsn;
+		request.req.latest = *force_latest;
+		slot->effective_request_lsn = *force_lsn;
+	}
+	else
+	{
+		XLogRecPtr lsn = neon_get_request_lsn(
+			&request.req.latest,
+			slot->buftag.rnode,
+			slot->buftag.forkNum,
+			slot->buftag.blockNum
+		);
+		/*
+		 * Note: effective_request_lsn may exceed the requested LSN; that is OK:
+		 * 
+		 * We know there are no changes between the actual requested LSN and
+		 * the value of effective_request_lsn: If there were, the page would
+		 * have been in cache and evicted between those LSN values, which
+		 * then would have had to result in a larger request LSN for this page.
+		 * 
+		 * A concurrent backend may load, modify and then evict the page, but
+		 * the LSN of that eviction cannot be smaller than the current WAL
+		 * insert/redo pointer, which is already larger than this prefetch_lsn.
+		 * So in any case, that would invalidate this cache.
+		 * 
+		 * The best LSN to use for effective_request_lsn would be
+		 * XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
+		 */
+		request.req.lsn = lsn;
+		prefetch_lsn = Max(prefetch_lsn, lsn);
+		slot->effective_request_lsn = prefetch_lsn;
+	}
+
+	Assert(slot->response == NULL);
+	Assert(slot->my_ring_index == MyPState->ring_unused);
+	page_server->send((NeonRequest *) &request);
+
+	/* update prefetch state */
+	MyPState->n_requests_inflight += 1;
+	MyPState->n_unused -= 1;
+	MyPState->ring_unused += 1;
+
+	/* update slot state */
+	slot->status = PRFS_REQUESTED;
+	prfh_insert(MyPState->prf_hash, slot, &found);
+	Assert(!found);
+}
+
+/*
+ * prefetch_register_buffer() - register and prefetch buffer
+ *
+ * Register that we may want the contents of BufferTag in the near future.
+ * 
+ * If force_latest and force_lsn are not NULL, those values are sent to the
+ * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
+ * to fill in these values manually.
+ *
+ * NOTE: this function may indirectly update MyPState->prf_hash; which
+ * invalidates any active pointers into the hash table.
+ */
+
+static uint64
+prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
+{
+	uint64	ring_index;
+	PrefetchRequest req;
+	PrefetchRequest *slot;
+	PrfHashEntry *entry;
+
+	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
+	req.buftag = tag;
+	
+	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
+
+	if (entry != NULL)
+	{
+		slot = entry->slot;
+		ring_index = slot->my_ring_index;
+		Assert(slot == GetPrfSlot(ring_index));
+
+		Assert(slot->status != PRFS_UNUSED);
+		Assert(MyPState->ring_last <= ring_index &&
+			   ring_index < MyPState->ring_unused);
+		Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
+
+		/*
+		 * If we want a specific lsn, we do not accept requests that were made
+		 * with a potentially different LSN.
+		 */
+		if (force_latest && force_lsn)
+		{
+			/* if we want the latest version, any effective_request_lsn >= request lsn is OK */
+			if (*force_latest)
+			{
+				if (*force_lsn > slot->effective_request_lsn)
+				{
+					prefetch_wait_for(ring_index);
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+				}
+			}
+			/* if we don't want the latest version, only accept requests with the exact same LSN */
+			else
+			{
+				if (*force_lsn != slot->effective_request_lsn)
+				{
+					prefetch_wait_for(ring_index);
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+				}
+			}
+		}
+
+		/*
+		 * We received a prefetch for a page that was recently read and
+		 * removed from the buffers. Remove that request from the buffers.
+		 */
+		else if (slot->status == PRFS_TAG_REMAINS)
+		{
+			prefetch_set_unused(ring_index);
+			entry = NULL;
+		}
+		else
+		{
+			/* The buffered request is good enough, return that index */
+			n_prefetch_dupes++;
+			return ring_index;
+		}
+	}
+
+	/*
+	 * If the prefetch queue is full, we need to make room by clearing the
+	 * oldest slot. If the oldest slot holds a buffer that was already
+	 * received, we can just throw it away; we fetched the page unnecessarily
+	 * in that case. If the oldest slot holds a request that we haven't
+	 * received a response for yet, we have to wait for the response to that
+	 * before we can continue. We might not have even flushed the request to
+	 * the pageserver yet, it might be just sitting in the output buffer. In
+	 * that case, we flush it and wait for the response. (We could decide not
+	 * to send it, but it's hard to abort when the request is already in the
+	 * output buffer, and 'not sending' a prefetch request kind of goes
+	 * against the principles of prefetching)
+	 */
+	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
+	{
+		uint64 cleanup_index = MyPState->ring_last;
+		slot = GetPrfSlot(cleanup_index);
+
+		Assert(slot->status != PRFS_UNUSED);
+
+		/* We have the slot for ring_last, so that must still be in progress */
+		switch (slot->status)
+		{
+			case PRFS_REQUESTED:
+				Assert(MyPState->ring_receive == cleanup_index);
+				prefetch_wait_for(cleanup_index);
+				prefetch_set_unused(cleanup_index);
+				break;
+			case PRFS_RECEIVED:
+			case PRFS_TAG_REMAINS:
+				prefetch_set_unused(cleanup_index);
+				break;
+			default:
+				pg_unreachable();
+		}
+	}
+
+	/*
+	 * The next buffer pointed to by `ring_unused` is now definitely empty,
+	 * so we can insert the new request to it.
+	 */
+	ring_index = MyPState->ring_unused;
+	slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
+
+	Assert(MyPState->ring_last <= ring_index);
+
+	Assert(slot->status == PRFS_UNUSED);
+
+	/*
+	 * We must update the slot data before insertion, because the hash
+	 * function reads the buffer tag from the slot.
+	 */
+	slot->buftag = tag;
+	slot->my_ring_index = ring_index;
+
+	prefetch_do_request(slot, force_latest, force_lsn);
+	Assert(slot->status == PRFS_REQUESTED);
+	Assert(MyPState->ring_last <= ring_index &&
+		   ring_index < MyPState->ring_unused);
+
+	if (flush_every_n_requests > 0 &&
+		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
+	{
+		page_server->flush();
+		MyPState->ring_flush = MyPState->ring_unused;
+	}
+
+	return ring_index;
 }

 static NeonResponse *
 page_server_request(void const *req)
 {
+	page_server->send((NeonRequest *) req);
+	page_server->flush();
+	MyPState->ring_flush = MyPState->ring_unused;
 	consume_prefetch_responses();
-	return page_server->request((NeonRequest *) req);
+	return page_server->receive();
 }


@@ -269,12 +864,15 @@ nm_unpack_response(StringInfo s)

 		case T_NeonGetPageResponse:
 			{
-				NeonGetPageResponse *msg_resp = palloc0(offsetof(NeonGetPageResponse, page) + BLCKSZ);
+				NeonGetPageResponse *msg_resp;

+				msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
 				msg_resp->tag = tag;
 				/* XXX:	should be varlena */
 				memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
 				pq_getmsgend(s);
+				
+				Assert(msg_resp->tag == T_NeonGetPageResponse);

 				resp = (NeonResponse *) msg_resp;
 				break;
@@ -618,7 +1216,33 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 void
 neon_init(void)
 {
-	/* noop */
+	Size prfs_size;
+
+	if (MyPState != NULL)
+		return;
+
+	prfs_size = offsetof(PrefetchState, prf_buffer) + (
+		sizeof(PrefetchRequest) * readahead_buffer_size
+	);
+
+	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
+	
+	MyPState->n_unused = readahead_buffer_size;
+
+	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
+										 "NeonSMGR/prefetch",
+										 SLAB_DEFAULT_BLOCK_SIZE * 17,
+										 PS_GETPAGERESPONSE_SIZE);
+	MyPState->errctx = AllocSetContextCreate(TopMemoryContext, 
+											 "NeonSMGR/errors",
+											 ALLOCSET_DEFAULT_SIZES);
+	MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
+											  "NeonSMGR/prefetch",
+											  ALLOCSET_DEFAULT_SIZES);
+
+	MyPState->prf_hash = prfh_create(MyPState->hashctx,
+									 readahead_buffer_size, NULL);
+
 #ifdef DEBUG_COMPARE_LOCAL
 	mdinit();
 #endif
@@ -1005,27 +1629,17 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
 }


-/*
- *	neon_reset_prefetch() -- reoe all previously rgistered prefeth requests
- */
-void
-neon_reset_prefetch(SMgrRelation reln)
-{
-	n_prefetch_requests = 0;
-}
-
 /*
 *	neon_prefetch() -- Initiate asynchronous read of the specified block of a relation
 */
 bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
+	uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
+
 	switch (reln->smgr_relpersistence)
 	{
-		case 0:
-			/* probably shouldn't happen, but ignore it */
-			break;
-
+		case 0: /* probably shouldn't happen, but ignore it */
 		case RELPERSISTENCE_PERMANENT:
 			break;

@@ -1037,14 +1651,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (n_prefetch_requests < MAX_PREFETCH_REQUESTS)
-	{
-		prefetch_requests[n_prefetch_requests].rnode = reln->smgr_rnode.node;
-		prefetch_requests[n_prefetch_requests].forkNum = forknum;
-		prefetch_requests[n_prefetch_requests].blockNum = blocknum;
-		n_prefetch_requests += 1;
-		return true;
-	}
+	BufferTag tag = (BufferTag) {
+		.rnode = reln->smgr_rnode.node,
+		.forkNum = forknum,
+		.blockNum = blocknum
+	};
+
+	ring_index = prefetch_register_buffer(tag, NULL, NULL);
+
+	Assert(ring_index < MyPState->ring_unused &&
+		   MyPState->ring_last <= ring_index);
+
 	return false;
 }

@@ -1095,81 +1712,70 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, char *buffer)
 {
 	NeonResponse *resp;
-	int			i;
+	BufferTag	buftag;
+	uint64		ring_index;
+	PrfHashEntry *entry;
+	PrefetchRequest *slot;
+
+	buftag = (BufferTag) {
+		.rnode = rnode,
+		.forkNum = forkNum,
+		.blockNum = blkno,
+	};

 	/*
-	 * Try to find prefetched page. It is assumed that pages will be requested
-	 * in the same order as them are prefetched, but some other backend may
-	 * load page in shared buffers, so some prefetch responses should be
-	 * skipped.
+	 * Try to find prefetched page in the list of received pages.
 	 */
-	for (i = n_prefetched_buffers; i < n_prefetch_responses; i++)
-	{
-		resp = page_server->receive();
-		if (resp->tag == T_NeonGetPageResponse &&
-			RelFileNodeEquals(prefetch_responses[i].rnode, rnode) &&
-			prefetch_responses[i].forkNum == forkNum &&
-			prefetch_responses[i].blockNum == blkno)
-		{
-			char	   *page = ((NeonGetPageResponse *) resp)->page;
+	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);

+	if (entry != NULL)
+	{
+		slot = entry->slot;
+		if (slot->effective_request_lsn >= request_lsn)
+		{
+			ring_index = slot->my_ring_index;
+			n_prefetch_hits += 1;
+		}
+		else /* the current prefetch LSN is not large enough, so drop the prefetch */
+		{
 			/*
-			 * Check if prefetched page is still relevant. If it is updated by
-			 * some other backend, then it should not be requested from smgr
-			 * unless it is evicted from shared buffers. In the last case
-			 * last_evicted_lsn should be updated and request_lsn should be
-			 * greater than prefetch_lsn. Maximum with page LSN is used
-			 * because page returned by page server may have LSN either
-			 * greater either smaller than requested.
+			 * We can't drop cache for not-yet-received requested items. It is
+			 * unlikely this happens, but it can happen if prefetch distance is
+			 * large enough and a backend didn't consume all prefetch requests.
 			 */
-			if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn)
+			if (slot->status == PRFS_REQUESTED)
 			{
-				n_prefetched_buffers = i + 1;
-				n_prefetch_hits += 1;
-				n_prefetch_requests = 0;
-				memcpy(buffer, page, BLCKSZ);
-				pfree(resp);
-				return;
+				prefetch_wait_for(slot->my_ring_index);
 			}
+			/* drop caches */
+			prefetch_set_unused(slot->my_ring_index);
+			n_prefetch_missed_caches += 1;
+			/* make it look like a prefetch cache miss */
+			entry = NULL;
 		}
-		pfree(resp);
 	}
-	n_prefetched_buffers = 0;
-	n_prefetch_responses = 0;
-	n_prefetch_misses += 1;
-	{
-		NeonGetPageRequest request = {
-			.req.tag = T_NeonGetPageRequest,
-			.req.latest = request_latest,
-			.req.lsn = request_lsn,
-			.rnode = rnode,
-			.forknum = forkNum,
-			.blkno = blkno
-		};

-		if (n_prefetch_requests > 0)
-		{
-			/* Combine all prefetch requests with primary request */
-			page_server->send((NeonRequest *) & request);
-			for (i = 0; i < n_prefetch_requests; i++)
-			{
-				request.rnode = prefetch_requests[i].rnode;
-				request.forknum = prefetch_requests[i].forkNum;
-				request.blkno = prefetch_requests[i].blockNum;
-				prefetch_responses[i] = prefetch_requests[i];
-				page_server->send((NeonRequest *) & request);
-			}
-			page_server->flush();
-			n_prefetch_responses = n_prefetch_requests;
-			n_prefetch_requests = 0;
-			prefetch_lsn = request_lsn;
-			resp = page_server->receive();
-		}
-		else
-		{
-			resp = page_server->request((NeonRequest *) & request);
-		}
+	if (entry == NULL)
+	{
+		n_prefetch_misses += 1;
+
+		ring_index = prefetch_register_buffer(buftag, &request_latest,
+											  &request_lsn);
+		slot = GetPrfSlot(ring_index);
 	}
+
+	Assert(slot->my_ring_index == ring_index);
+	Assert(MyPState->ring_last <= ring_index &&
+		   MyPState->ring_unused > ring_index);
+	Assert(slot->status != PRFS_UNUSED);
+	Assert(GetPrfSlot(ring_index) == slot);
+
+	prefetch_wait_for(ring_index);
+
+	Assert(slot->status == PRFS_RECEIVED);
+
+	resp = slot->response;
+
 	switch (resp->tag)
 	{
 		case T_NeonGetPageResponse:
@@ -1189,12 +1795,13 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
-
 		default:
 			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}

-	pfree(resp);
+	/* buffer was used, clean up for later reuse */
+	prefetch_set_unused(ring_index);
+	prefetch_cleanup();
 }

 /*
@@ -1816,7 +2423,6 @@ static const struct f_smgr neon_smgr =
 	.smgr_unlink = neon_unlink,
 	.smgr_extend = neon_extend,
 	.smgr_prefetch = neon_prefetch,
-	.smgr_reset_prefetch = neon_reset_prefetch,
 	.smgr_read = neon_read,
 	.smgr_write = neon_write,
 	.smgr_writeback = neon_writeback,
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -43,6 +43,7 @@
 #if PG_VERSION_NUM >= 150000
 #include "access/xlogrecovery.h"
 #endif
+#include "storage/fd.h"
 #include "storage/latch.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -69,11 +70,12 @@
 #include "neon.h"
 #include "walproposer.h"
 #include "walproposer_utils.h"
-#include "replication/walpropshim.h"
+
+static bool syncSafekeepers = false;

 char	   *wal_acceptors_list;
 int			wal_acceptor_reconnect_timeout;
-int			wal_acceptor_connect_timeout;
+int			wal_acceptor_connection_timeout;
 bool		am_wal_proposer;

 char	   *neon_timeline_walproposer = NULL;
@@ -117,8 +119,8 @@ static TimestampTz last_reconnect_attempt;
 static WalproposerShmemState * walprop_shared;

 /* Prototypes for private functions */
-static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId);
-static void WalProposerStartImpl(void);
+static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId);
+static void WalProposerStart(void);
 static void WalProposerLoop(void);
 static void InitEventSet(void);
 static void UpdateEventSet(Safekeeper *sk, uint32 events);
@@ -186,9 +188,56 @@ pg_init_walproposer(void)
 	ProcessInterruptsCallback = backpressure_throttling_impl;

 	WalProposerRegister();
+}

-	WalProposerInit = &WalProposerInitImpl;
-	WalProposerStart = &WalProposerStartImpl;
+/*
+ * Entry point for `postgres --sync-safekeepers`.
+ */
+void
+WalProposerSync(int argc, char *argv[])
+{
+	struct stat stat_buf;
+
+	syncSafekeepers = true;	/* argc and argv are not used in this function */
+#if PG_VERSION_NUM < 150000
+	ThisTimeLineID = 1;
+#endif
+
+	/*
+	 * Initialize postmaster_alive_fds as WaitEventSet checks them.
+	 *
+	 * Copied from InitPostmasterDeathWatchHandle()
+	 */
+	if (pipe(postmaster_alive_fds) < 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+					errmsg_internal("could not create pipe to monitor postmaster death: %m")));
+	if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1)
+		ereport(FATAL,
+				(errcode_for_socket_access(),
+					errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m")));
+
+	ChangeToDataDir();
+
+	/* Create pg_wal directory, if it doesn't exist */
+	if (stat(XLOGDIR, &stat_buf) != 0)
+	{
+		ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR)));
+		if (MakePGDirectory(XLOGDIR) < 0)
+		{
+			ereport(ERROR,
+					(errcode_for_file_access(),
+						errmsg("could not create directory \"%s\": %m",
+							   XLOGDIR)));
+			exit(1);	/* NOTE(review): presumably never reached, ereport(ERROR) should not return - confirm */
+		}
+	}
+
+	WalProposerInit(0, 0);	/* flushRecPtr = 0, systemId = 0 */
+
+	BackgroundWorkerUnblockSignals();
+
+	WalProposerStart();
 }

 static void
@@ -217,9 +266,9 @@ nwp_register_gucs(void)

 	DefineCustomIntVariable(
 							"neon.safekeeper_connect_timeout",
-							"Timeout after which give up connection attempt to safekeeper.",
+							"Timeout for connection establishement and it's maintenance against safekeeper",
 							NULL,
-							&wal_acceptor_connect_timeout,
+							&wal_acceptor_connection_timeout,
 							5000, 0, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_MS,
@@ -368,7 +417,9 @@ WalProposerPoll(void)
 			ResetLatch(MyLatch);
 			break;
 		}
-		if (rc == 0)			/* timeout expired: poll state */
+
+		now = GetCurrentTimestamp();
+		if (rc == 0 || TimeToReconnect(now) <= 0)			/* timeout expired: poll state */
 		{
 			TimestampTz now;

@@ -389,13 +440,11 @@ WalProposerPoll(void)
 			{
 				Safekeeper *sk = &safekeeper[i];

-				if ((sk->state == SS_CONNECTING_WRITE ||
-					 sk->state == SS_CONNECTING_READ) &&
-					TimestampDifferenceExceeds(sk->startedConnAt, now,
-											   wal_acceptor_connect_timeout))
+				if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
+											   wal_acceptor_connection_timeout))
 				{
-					elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms",
-						 sk->host, sk->port, wal_acceptor_connect_timeout);
+					elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
+						 sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
 					ShutdownConnection(sk);
 				}
 			}
@@ -429,7 +478,7 @@ WalProposerRegister(void)
 }

 static void
-WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
+WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
 {
 	char	   *host;
 	char	   *sep;
@@ -508,7 +557,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
 }

 static void
-WalProposerStartImpl(void)
+WalProposerStart(void)
 {

 	/* Initiate connections to all safekeeper nodes */
@@ -711,7 +760,7 @@ ResetConnection(Safekeeper *sk)
 	elog(LOG, "connecting with node %s:%s", sk->host, sk->port);

 	sk->state = SS_CONNECTING_WRITE;
-	sk->startedConnAt = GetCurrentTimestamp();
+	sk->latestMsgReceivedAt = GetCurrentTimestamp();

 	sock = walprop_socket(sk->conn);
 	sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
@@ -869,7 +918,7 @@ HandleConnectionEvent(Safekeeper *sk)
 		case WP_CONN_POLLING_OK:
 			elog(LOG, "connected with node %s:%s", sk->host,
 				 sk->port);
-
+			sk->latestMsgReceivedAt = GetCurrentTimestamp();
 			/*
 			 * We have to pick some event to update event set. We'll
 			 * eventually need the socket to be readable, so we go with that.
@@ -2255,7 +2304,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
 		ResetConnection(sk);
 		return false;
 	}
-
+	sk->latestMsgReceivedAt = GetCurrentTimestamp();
 	switch (tag)
 	{
 		case 'g':
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -30,7 +30,7 @@

 extern char *wal_acceptors_list;
 extern int	wal_acceptor_reconnect_timeout;
-extern int	wal_acceptor_connect_timeout;
+extern int	wal_acceptor_connection_timeout;
 extern bool am_wal_proposer;

 struct WalProposerConn;			/* Defined in libpqwalproposer */
@@ -371,7 +371,7 @@ typedef struct Safekeeper
 	int			eventPos;		/* position in wait event set. Equal to -1 if*
 								 * no event */
 	SafekeeperState state;		/* safekeeper state machine state */
-	TimestampTz startedConnAt;	/* when connection attempt started */
+	TimestampTz latestMsgReceivedAt;	/* when latest msg was received; also set when a connection attempt starts or completes */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
 	AppendResponse appendResponse;	/* feedback for master */
--- a/pgxn/neon_walredo/Makefile
+++ b/pgxn/neon_walredo/Makefile
@@ -0,0 +1,22 @@
+# pgxn/neon_walredo/Makefile
+
+MODULE_big = neon_walredo
+OBJS = \
+	$(WIN32RES) \
+	inmem_smgr.o \
+	walredoproc.o \
+
+# This really should be guarded by $(with_libseccomp), but I couldn't
+# make that work with pgxs. So we always compile it, but its contents
+# are wrapped in #ifdef HAVE_LIBSECCOMP instead.
+OBJS += seccomp.o
+
+PGFILEDESC = "neon_walredo - helper process that runs in Neon pageserver"
+
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+
+ifeq ($(with_libseccomp),yes)
+SHLIB_LINK += -lseccomp
+endif
--- a/pgxn/neon_walredo/inmem_smgr.c
+++ b/pgxn/neon_walredo/inmem_smgr.c
@@ -3,9 +3,8 @@
 * inmem_smgr.c
 *
 * This is an implementation of the SMGR interface, used in the WAL redo
- * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent
- * storage, the pages that are written out are kept in a small number of
- * in-memory buffers.
+ * process. It has no persistent storage, the pages that are written out
+ * are kept in a small number of in-memory buffers.
 *
 * Normally, replaying a WAL record only needs to access a handful of
 * buffers, which fit in the normal buffer cache, so this is just for
@@ -15,15 +14,11 @@
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * IDENTIFICATION
- *	  contrib/neon/inmem_smgr.c
- *
 *-------------------------------------------------------------------------
 */
 #include "postgres.h"

 #include "access/xlog.h"
-#include "pagestore_client.h"
 #include "storage/block.h"
 #include "storage/buf_internals.h"
 #include "storage/relfilenode.h"
@@ -33,6 +28,8 @@
 #include "access/xlogutils.h"
 #endif

+#include "inmem_smgr.h"
+
 /* Size of the in-memory smgr */
 #define MAX_PAGES 64

@@ -59,10 +56,34 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
 	return -1;
 }

+
+/* Forward declarations of the file-local in-memory (WAL redo) storage manager callbacks */
+static void inmem_init(void);
+static void inmem_open(SMgrRelation reln);
+static void inmem_close(SMgrRelation reln, ForkNumber forknum);
+static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
+static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
+static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
+						 BlockNumber blocknum, char *buffer, bool skipFsync);
+static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber blocknum);
+static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+					   char *buffer);
+static void inmem_write(SMgrRelation reln, ForkNumber forknum,
+						BlockNumber blocknum, char *buffer, bool skipFsync);
+static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
+							BlockNumber blocknum, BlockNumber nblocks);
+static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
+static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber nblocks);
+static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
+
+
 /*
 *	inmem_init() -- Initialize private state
 */
-void
+static void
 inmem_init(void)
 {
 	used_pages = 0;
@@ -71,7 +92,7 @@ inmem_init(void)
 /*
 *	inmem_exists() -- Does the physical file exist?
 */
-bool
+static bool
 inmem_exists(SMgrRelation reln, ForkNumber forknum)
 {
 	for (int i = 0; i < used_pages; i++)
@@ -90,7 +111,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum)
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
-void
+static void
 inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 {
 }
@@ -98,7 +119,7 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 /*
 *	inmem_unlink() -- Unlink a relation.
 */
-void
+static void
 inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
 {
 }
@@ -112,7 +133,7 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
-void
+static void
 inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			 char *buffer, bool skipFsync)
 {
@@ -123,7 +144,7 @@ inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 /*
 *  inmem_open() -- Initialize newly-opened relation.
 */
-void
+static void
 inmem_open(SMgrRelation reln)
 {
 }
@@ -131,7 +152,7 @@ inmem_open(SMgrRelation reln)
 /*
 *	inmem_close() -- Close the specified relation, if it isn't closed already.
 */
-void
+static void
 inmem_close(SMgrRelation reln, ForkNumber forknum)
 {
 }
@@ -139,7 +160,7 @@ inmem_close(SMgrRelation reln, ForkNumber forknum)
 /*
 *	inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
 */
-bool
+static bool
 inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
 	return true;
@@ -148,7 +169,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 /*
 * inmem_writeback() -- Tell the kernel to write pages back to storage.
 */
-void
+static void
 inmem_writeback(SMgrRelation reln, ForkNumber forknum,
 				BlockNumber blocknum, BlockNumber nblocks)
 {
@@ -157,7 +178,7 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
 /*
 *	inmem_read() -- Read the specified block from a relation.
 */
-void
+static void
 inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		   char *buffer)
 {
@@ -177,7 +198,7 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use mdextend().
 */
-void
+static void
 inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			char *buffer, bool skipFsync)
 {
@@ -224,7 +245,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 /*
 *	inmem_nblocks() -- Get the number of blocks stored in a relation.
 */
-BlockNumber
+static BlockNumber
 inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
 {
 	/*
@@ -243,7 +264,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
 /*
 *	inmem_truncate() -- Truncate relation to specified number of blocks.
 */
-void
+static void
 inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 {
 }
@@ -251,7 +272,7 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 /*
 *	inmem_immedsync() -- Immediately sync a relation to stable storage.
 */
-void
+static void
 inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
 {
 }
--- a/pgxn/neon_walredo/inmem_smgr.h
+++ b/pgxn/neon_walredo/inmem_smgr.h
@@ -0,0 +1,17 @@
+/*-------------------------------------------------------------------------
+ *
+ * inmem_smgr.h
+ *	  Public declarations for the in-memory SMGR used by the WAL redo process.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef INMEM_SMGR_H
+#define INMEM_SMGR_H
+
+extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
+extern void smgr_init_inmem(void);
+
+#endif /* INMEM_SMGR_H */
--- a/pgxn/neon_walredo/neon_seccomp.h
+++ b/pgxn/neon_walredo/neon_seccomp.h
@@ -0,0 +1,22 @@
+#ifndef NEON_SECCOMP_H
+#define NEON_SECCOMP_H
+
+#include <seccomp.h>
+/* One seccomp rule: a syscall number paired with the libseccomp action for it. */
+typedef struct {
+    int    psr_syscall; /* syscall number */
+    uint32 psr_action;  /* libseccomp action, e.g. SCMP_ACT_ALLOW */
+} PgSeccompRule;
+/* Builds a PgSeccompRule compound literal; `syscall` goes through SCMP_SYS(). */
+#define PG_SCMP(syscall, action)                \
+    (PgSeccompRule) {                           \
+        .psr_syscall = SCMP_SYS(syscall),       \
+        .psr_action = (action),                 \
+    }
+/* Shorthand for a PG_SCMP rule whose action is SCMP_ACT_ALLOW. */
+#define PG_SCMP_ALLOW(syscall) \
+    PG_SCMP(syscall, SCMP_ACT_ALLOW)
+/* NOTE(review): presumably installs `count` rules from `syscalls`; defined elsewhere -- confirm. */
+extern void seccomp_load_rules(PgSeccompRule *syscalls, int count);
+
+#endif /* NEON_SECCOMP_H */
--- a/Show More
+++ b/Show More