persistent_range_query: add layer_map_test

persistent_range_query: add stress test
persistent_range_query: more refs
2026-03-05 17:30:38 +00:00 · 2022-11-24 04:47:19 +02:00 · 2022-11-24 03:50:18 +02:00 · 2022-11-24 03:45:02 +02:00 · 2022-11-24 02:31:48 +02:00 · 2022-11-24 02:11:06 +02:00
118 changed files with 4340 additions and 1481 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -55,6 +55,22 @@ runs:
        name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon

+    - name: Download Neon binaries for the previous release
+      if: inputs.build_type != 'remote'
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
+        path: /tmp/neon-previous
+        prefix: latest
+
+    - name: Download compatibility snapshot for Postgres 14
+      if: inputs.build_type != 'remote'
+      uses: ./.github/actions/download
+      with:
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
+        path: /tmp/compatibility_snapshot_pg14
+        prefix: latest
+
    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
      uses: actions/checkout@v3
@@ -73,23 +89,18 @@ runs:
      shell: bash -euxo pipefail {0}
      run: ./scripts/pysync

-    - name: Download compatibility snapshot for Postgres 14
-      if: inputs.build_type != 'remote'
-      uses: ./.github/actions/download
-      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
-        path: /tmp/compatibility_snapshot_pg14
-        prefix: latest
-
    - name: Run pytest
      env:
        NEON_BIN: /tmp/neon/bin
+        COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin
+        COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
        TEST_OUTPUT: /tmp/test_output
        BUILD_TYPE: ${{ inputs.build_type }}
        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
-        ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
+        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: ${{ contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') }}
+        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: ${{ contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') }}
      shell: bash -euxo pipefail {0}
      run: |
        # PLATFORM will be embedded in the perf test report
@@ -112,7 +123,12 @@ runs:
          exit 1
        fi
        if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
+          # -n4 uses four processes to run tests via pytest-xdist
          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
+
+          # --dist=loadgroup points tests marked with @pytest.mark.xdist_group
+          # to the same worker to make @pytest.mark.order work with xdist
+          EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS"
        fi

        if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
@@ -147,9 +163,9 @@ runs:
        # --verbose prints name of each test (helpful when there are
        # multiple tests in one file)
        # -rA prints summary in the end
-        # -n4 uses four processes to run tests via pytest-xdist
        # -s is not used to prevent pytest from capturing output, because tests are running
        # in parallel and logs are mixed between different tests
+        #
        mkdir -p $TEST_OUTPUT/allure/results
        "${cov_prefix[@]}" ./scripts/pytest \
          --junitxml=$TEST_OUTPUT/junit.xml \
@@ -169,12 +185,12 @@ runs:
      uses: ./.github/actions/upload
      with:
        name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
-        # The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
-        path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
+        # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
+        path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
        prefix: latest

    - name: Create Allure report
-      if: always()
+      if: success() || failure()
      uses: ./.github/actions/allure-report
      with:
        action: store
--- a/.github/ansible/.gitignore
+++ b/.github/ansible/.gitignore
@@ -1,5 +1,3 @@
-zenith_install.tar.gz
-.zenith_current_version
 neon_install.tar.gz
 .neon_current_version

--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -22,6 +22,10 @@ storage:
          console_region_id: aws-us-west-2
        zenith-1-ps-3:
          console_region_id: aws-us-west-2
+        zenith-1-ps-4:
+          console_region_id: aws-us-west-2
+        zenith-1-ps-5:
+          console_region_id: aws-us-west-2

    safekeepers:
      hosts:
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -0,0 +1,33 @@
+storage:
+  vars:
+    bucket_name: neon-dev-storage-eu-west-1
+    bucket_region: eu-west-1
+    console_mgmt_base_url: http://console-staging.local
+    etcd_endpoints: etcd-0.eu-west-1.aws.neon.build:2379
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: eu-west-1
+    ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
+    console_region_id: aws-eu-west-1
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.eu-west-1.aws.neon.build:
+          ansible_host: i-01d496c5041c7f34c
+
+    safekeepers:
+      hosts:
+        safekeeper-0.eu-west-1.aws.neon.build:
+          ansible_host: i-05226ef85722831bf
+        safekeeper-1.eu-west-1.aws.neon.build:
+          ansible_host: i-06969ee1bf2958bfc
+        safekeeper-2.eu-west-1.aws.neon.build:
+          ansible_host: i-087892e9625984a0b
--- a/.github/ansible/staging.hosts.yaml
+++ b/.github/ansible/staging.hosts.yaml
@@ -3,7 +3,7 @@ storage:
    bucket_name: zenith-staging-storage-us-east-1
    bucket_region: us-east-1
    console_mgmt_base_url: http://console-staging.local
-    etcd_endpoints: zenith-us-stage-etcd.local:2379
+    etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      remote_storage:
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -22,6 +22,8 @@ storage:
      hosts:
        pageserver-0.us-east-2.aws.neon.build:
          ansible_host: i-0c3e70929edb5d691
+        pageserver-1.us-east-2.aws.neon.build:
+          ansible_host: i-0565a8b4008aa3f40

    safekeepers:
      hosts:
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-staging.local/management/api/v2"
+  domain: "*.eu-west-1.aws.neon.build"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: dev
+  zenith_region: eu-west-1
+  zenith_region_slug: eu-west-1
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -144,7 +144,9 @@ jobs:
        # neon-captest-new: Run pgbench in a freshly created project
        # neon-captest-reuse: Same, but reusing existing project
        # neon-captest-prefetch: Same, with prefetching enabled (new project)
-        platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ]
+        # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
+        # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
+        platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
        db_size: [ 10gb ]
        include:
          - platform: neon-captest-new
@@ -164,7 +166,7 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
      PLATFORM: ${{ matrix.platform }}

-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
      options: --init
@@ -207,8 +209,11 @@ jobs:
          rds-aurora)
            CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
            ;;
+          rds-postgres)
+            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
+            ;;
          *)
-            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'"
+            echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac
@@ -265,7 +270,7 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

    - name: Create Allure report
-      if: always()
+      if: success() || failure()
      uses: ./.github/actions/allure-report
      with:
        action: generate
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -18,8 +18,8 @@ env:

 jobs:
  tag:
-    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
+    runs-on: [ self-hosted, dev, x64 ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
      build-tag: ${{steps.build-tag.outputs.tag}}

@@ -46,7 +46,7 @@ jobs:
        id: build-tag

  build-neon:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -236,7 +236,7 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -268,34 +268,8 @@ jobs:
        if: matrix.build_type == 'debug'
        uses: ./.github/actions/save-coverage-data

-  upload-latest-artifacts:
-    runs-on: dev
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
-      options: --init
-    needs: [ regress-tests ]
-    if: github.ref_name == 'main'
-    steps:
-      - name: Copy Neon artifact to the latest directory
-        shell: bash -euxo pipefail {0}
-        env:
-          BUCKET: neon-github-public-dev
-          PREFIX: artifacts/${{ github.run_id }}
-        run: |
-          for build_type in debug release; do
-            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
-
-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
-            if [ -z "${S3_KEY}" ]; then
-              echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
-              exit 1
-            fi
-
-            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
-          done
-
  benchmarks:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -326,12 +300,12 @@ jobs:
      # while coverage is currently collected for the debug ones

  merge-allure-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    needs: [ regress-tests, benchmarks ]
-    if: always()
+    if: success() || failure()
    strategy:
      fail-fast: false
      matrix:
@@ -364,7 +338,7 @@ jobs:
          DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json

  coverage-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -441,15 +415,19 @@ jobs:
        shell: bash -euxo pipefail {0}

  trigger-e2e-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ build-neon ]
+    needs: [ push-docker-hub, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
+          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
+          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
+          # to place a job run status update later.
          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}

          REMOTE_REPO="${{ github.repository_owner }}/cloud"
@@ -475,12 +453,14 @@ jobs:
              \"inputs\": {
                \"ci_job_name\": \"neon-cloud-e2e\",
                \"commit_hash\": \"$COMMIT_SHA\",
-                \"remote_repo\": \"${{ github.repository }}\"
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
              }
            }"

  neon-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -498,7 +478,7 @@ jobs:
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}

  compute-tools-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -512,28 +492,8 @@ jobs:
      - name: Kaniko build compute tools
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}

-  compute-node-image:
-    runs-on: dev
-    container: gcr.io/kaniko-project/executor:v1.9.0-debug
-    needs: [ tag ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-        # compute-node uses postgres 14, which is default now
-        # cloud repo depends on this image name, thus duplicating it
-        # remove compute-node when cloud repo is updated
-      - name: Kaniko build compute node with extensions v14 (compatibility)
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}
-
  compute-node-image-v14:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -549,9 +509,8 @@ jobs:
      - name: Kaniko build compute node with extensions v14
        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}

-
  compute-node-image-v15:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -567,18 +526,58 @@ jobs:
      - name: Kaniko build compute node with extensions v15
        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}

+  test-images:
+    needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    runs-on: [ self-hosted, dev, x64 ]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
+      # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
+      # Regular pageserver version string looks like
+      #   Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
+      # Bad versions might loop like:
+      #   Neon page server git-env:local failpoints: true, features: ["testing"]
+      # Ensure that we don't have bad versions.
+      - name: Verify image versions
+        shell: bash # ensure no set -e for better error messages
+        run: |
+          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+
+          echo "Pageserver version string: $pageserver_version"
+
+          if echo "$pageserver_version" | grep -q 'git-env:local' ; then
+            echo "Pageserver version should not be the default Dockerfile one"
+            exit 1
+          fi
+
+          if echo "$pageserver_version" | grep -q '"testing"' ; then
+            echo "Pageserver version should have no testing feature enabled"
+            exit 1
+          fi
+
+      - name: Verify docker-compose example
+        run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
+
+      - name: Print logs and clean up
+        if: always()
+        run: |
+          docker compose -f ./docker-compose/docker-compose.yml logs || true
+          docker compose -f ./docker-compose/docker-compose.yml down
+
  promote-images:
-    runs-on: dev
-    needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    runs-on: [ self-hosted, dev, x64 ]
+    needs: [ tag, test-images ]
    if: github.event_name != 'workflow_dispatch'
    container: amazon/aws-cli
    strategy:
      fail-fast: false
      matrix:
-        # compute-node uses postgres 14, which is default now
-        # cloud repo depends on this image name, thus duplicating it
-        # remove compute-node when cloud repo is updated
-        name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
+        name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]

    steps:
      - name: Promote image to latest
@@ -587,7 +586,7 @@ jobs:
          aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"

  push-docker-hub:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ promote-images, tag ]
    container: golang:1.19-bullseye

@@ -608,9 +607,6 @@ jobs:
      - name: Pull compute tools image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools

-      - name: Pull compute node image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node
-
      - name: Pull compute node v14 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14

@@ -627,7 +623,6 @@ jobs:
        run: |
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest

@@ -643,9 +638,6 @@ jobs:
      - name: Push compute tools image to Docker Hub
        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}

-      - name: Push compute node image to Docker Hub
-        run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
-
      - name: Push compute node v14 image to Docker Hub
        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}

@@ -662,7 +654,6 @@ jobs:
        run: |
          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest

@@ -745,7 +736,7 @@ jobs:
          rm -f neon_install.tar.gz .neon_current_version

  deploy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
@@ -770,7 +761,6 @@ jobs:
        run: |
          export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          cd "$(pwd)/.github/ansible"
-
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            ./get_binaries.sh
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
@@ -779,6 +769,38 @@ jobs:
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            exit 1
          fi
+          ansible-galaxy collection install sivel.toiletwater
+          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
+          rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-pr-test-new:
+    runs-on: [ self-hosted, dev, x64 ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
+    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
+    needs: [ push-docker-hub, tag, regress-tests ]
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') && 
+      github.event_name != 'workflow_dispatch'
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        target_region: [ eu-west-1 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Redeploy
+        run: |
+          export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
+          cd "$(pwd)/.github/ansible"
+
+          ./get_binaries.sh

          ansible-galaxy collection install sivel.toiletwater
          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
@@ -789,7 +811,7 @@ jobs:
    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    needs: [ push-docker-hub, tag, regress-tests ]
    if: |
      (github.ref_name == 'release') &&
      github.event_name != 'workflow_dispatch'
@@ -825,7 +847,7 @@ jobs:
          rm -f neon_install.tar.gz .neon_current_version

  deploy-proxy:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -867,10 +889,10 @@ jobs:
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

  deploy-proxy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    needs: [ push-docker-hub, tag, regress-tests ]
    if: |
      (github.ref_name == 'main') &&
      github.event_name != 'workflow_dispatch'
@@ -882,6 +904,8 @@ jobs:
        include:
          - target_region:  us-east-2
            target_cluster: dev-us-east-2-beta
+          - target_region:  eu-west-1
+            target_cluster: dev-eu-west-1-zeta
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -903,7 +927,7 @@ jobs:
    runs-on: prod
    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    needs: [ push-docker-hub, tag, regress-tests ]
    if: |
      (github.ref_name == 'release') &&
      github.event_name != 'workflow_dispatch'
@@ -936,8 +960,8 @@ jobs:
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

-  promote-compatibility-test-snapshot:
-    runs-on: dev
+  promote-compatibility-data:
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -950,9 +974,24 @@ jobs:
          BUCKET: neon-github-public-dev
          PREFIX: artifacts/latest
        run: |
+          # Update compatibility snapshot for the release
          for build_type in debug release; do
            OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
            NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst

            time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
          done
+
+          # Update Neon artifact for the release (reuse already uploaded artifact)
+          for build_type in debug release; do
+            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
+            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
+
+            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+            if [ -z "${S3_KEY}" ]; then
+              echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
+              exit 1
+            fi
+
+            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
+          done
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -115,7 +115,7 @@ jobs:
        run: cargo build --locked --all --all-targets

  check-rust-dependencies:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,7 @@
 [submodule "vendor/postgres-v14"]
 	path = vendor/postgres-v14
 	url = https://github.com/neondatabase/postgres.git
-	branch = main
+	branch = REL_14_STABLE_neon
 [submodule "vendor/postgres-v15"]
 	path = vendor/postgres-v15
 	url = https://github.com/neondatabase/postgres.git
--- a/11
+++ b/11
@@ -0,0 +1,11 @@
+/compute_tools/ @neondatabase/control-plane
+/control_plane/ @neondatabase/compute @neondatabase/storage
+/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
+/libs/postgres_ffi/ @neondatabase/compute 
+/libs/remote_storage/ @neondatabase/storage 
+/libs/safekeeper_api/ @neondatabase/safekeepers  
+/pageserver/ @neondatabase/compute @neondatabase/storage 
+/pgxn/ @neondatabase/compute
+/proxy/ @neondatabase/control-plane 
+/safekeeper/ @neondatabase/safekeepers
+/vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2255,6 +2255,14 @@ version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"

+[[package]]
+name = "persistent_range_query"
+version = "0.1.0"
+dependencies = [
+ "rand",
+ "workspace_hack",
+]
+
 [[package]]
 name = "petgraph"
 version = "0.6.2"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,6 +25,10 @@ members = [
 # Besides, debug info should not affect the performance.
 debug = true

+# disable debug symbols for all packages except this one to decrease binaries size
+[profile.release.package."*"]
+debug = false
+
 [profile.release-line-debug]
 inherits = "release"
 debug = 1 # true = 2 = all symbols, 1 = line only
--- a/Dockerfile.compute-node.legacy
+++ b/Dockerfile.compute-node.legacy
@@ -1,88 +0,0 @@
-#
-# Legacy version of the Dockerfile for the compute node.
-# Used by e2e CI. Building Dockerfile.compute-node will take
-# unreasonable ammount of time without v2 runners.
-#
-# TODO: remove once cloud repo CI is moved to v2 runners.
-#
-
-
-# Allow specifiyng different compute-tools tag and image repo, so we are
-# able to use different images
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-ARG IMAGE=compute-tools
-ARG TAG=latest
-
-#
-# Image with pre-built tools
-#
-FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
-# Only to get ready compute_ctl binary as deppendency
-
-#
-# Image with Postgres build deps
-#
-FROM debian:bullseye-slim AS build-deps
-
-RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
-                                          libcurl4-openssl-dev libossp-uuid-dev
-
-#
-# Image with built Postgres
-#
-FROM build-deps AS pg-build
-
-# Add user postgres
-RUN adduser postgres
-RUN mkdir /pg && chown postgres:postgres /pg
-
-# Copy source files
-# version 14 is default for now
-COPY ./vendor/postgres-v14 /pg/
-COPY ./pgxn /pg/
-
-# Build and install Postgres locally
-RUN mkdir /pg/compute_build && cd /pg/compute_build && \
-    ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
-    # Install main binaries and contribs
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
-    # Install headers
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
-
-# Install neon contrib
-RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
-
-USER postgres
-WORKDIR /pg
-
-#
-# Final compute node image to be exported
-#
-FROM debian:bullseye-slim
-
-# libreadline-dev is required to run psql
-RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
-
-# Add user postgres
-RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
-    echo "postgres:test_console_pass" | chpasswd && \
-    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
-    chown -R postgres:postgres /var/db/postgres && \
-    chmod 0750 /var/db/postgres/compute
-
-# Copy ready Postgres binaries
-COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
-
-# Copy binaries from compute-tools
-COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
-
-# XXX: temporary symlink for compatibility with old control-plane
-RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
-
-# Add postgres shared objects to the search path
-RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
-
-USER postgres
-
-ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/28
+++ b/28
@@ -20,18 +20,18 @@ else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif

-# Seccomp BPF is only available for Linux
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
+	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
-endif
-
-# macOS with brew-installed openssl requires explicit paths
-# It can be configured with OPENSSL_PREFIX variable
-UNAME_S := $(shell uname -s)
-ifeq ($(UNAME_S),Darwin)
-    OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-    PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+else ifeq ($(UNAME_S),Darwin)
+	# macOS with brew-installed openssl requires explicit paths
+	# It can be configured with OPENSSL_PREFIX variable
+	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
+	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
+	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
 endif

 # Use -C option so that when PostgreSQL "make install" installs the
@@ -73,7 +73,8 @@ $(POSTGRES_INSTALL_DIR)/build/v14/config.status:
 	+@echo "Configuring Postgres v14 build"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
 	(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
-	$(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \
+	env PATH="$(EXTRA_PATH_OVERRIDES)$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
+		CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
 		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)

@@ -81,7 +82,8 @@ $(POSTGRES_INSTALL_DIR)/build/v15/config.status:
 	+@echo "Configuring Postgres v15 build"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
 	(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
-	$(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \
+	env PATH="$(EXTRA_PATH_OVERRIDES)$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
+		CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
 		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)

@@ -111,6 +113,8 @@ postgres-v14: postgres-v14-configure \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
 	+@echo "Compiling libpq v14"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
+	+@echo "Compiling pg_prewarm v14"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
 	+@echo "Compiling pg_buffercache v14"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect v14"
@@ -123,6 +127,8 @@ postgres-v15: postgres-v15-configure \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
 	+@echo "Compiling libpq v15"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
+	+@echo "Compiling pg_prewarm v15"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
 	+@echo "Compiling pg_buffercache v15"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect v15"
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf etcd openssl
+brew install protobuf etcd openssl flex bison
 ```

 2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -125,24 +125,23 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 # Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
 > ./target/debug/neon_local init
-Starting pageserver at '127.0.0.1:64000' in '.neon'
-
-Pageserver started
-Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7
-Stopping pageserver gracefully...done!
+Starting pageserver at '127.0.0.1:64000' in '.neon'.
+pageserver started, pid: 2545906
+Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
+Stopped pageserver 1 process with pid 2545906

 # start pageserver and safekeeper
 > ./target/debug/neon_local start
-Starting etcd broker using /usr/bin/etcd
-Starting pageserver at '127.0.0.1:64000' in '.neon'
-
-Pageserver started
-Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
-Safekeeper started
+Starting etcd broker using "/usr/bin/etcd"
+etcd started, pid: 2545996
+Starting pageserver at '127.0.0.1:64000' in '.neon'.
+pageserver started, pid: 2546005
+Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
+safekeeper 1 started, pid: 2546041

 # start postgres compute node
 > ./target/debug/neon_local pg start main
-Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
+Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
 Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

--- a/cli-v2-story.md
+++ b/cli-v2-story.md
@@ -1,188 +0,0 @@
-Create a new Zenith repository in the current directory:
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init
-    The files belonging to this database system will be owned by user "heikki".
-    This user must also own the server process.
-    
-    The database cluster will be initialized with locale "en_GB.UTF-8".
-    The default database encoding has accordingly been set to "UTF8".
-    The default text search configuration will be set to "english".
-    
-    Data page checksums are disabled.
-    
-    creating directory tmp ... ok
-    creating subdirectories ... ok
-    selecting dynamic shared memory implementation ... posix
-    selecting default max_connections ... 100
-    selecting default shared_buffers ... 128MB
-    selecting default time zone ... Europe/Helsinki
-    creating configuration files ... ok
-    running bootstrap script ... ok
-    performing post-bootstrap initialization ... ok
-    syncing data to disk ... ok
-    
-    initdb: warning: enabling "trust" authentication for local connections
-    You can change this by editing pg_hba.conf or using the option -A, or
-    --auth-local and --auth-host, the next time you run initdb.
-    new zenith repository was created in .zenith
-
-Initially, there is only one branch:
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
-      main
-
-Start a local Postgres instance on the branch:
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main
-    Creating data directory from snapshot at 0/15FFB08...
-    waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG:  starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
-    2021-04-13 09:27:43.920 EEST [984664] LOG:  listening on IPv6 address "::1", port 5432
-    2021-04-13 09:27:43.920 EEST [984664] LOG:  listening on IPv4 address "127.0.0.1", port 5432
-    2021-04-13 09:27:43.927 EEST [984664] LOG:  listening on Unix socket "/tmp/.s.PGSQL.5432"
-    2021-04-13 09:27:43.939 EEST [984665] LOG:  database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
-    2021-04-13 09:27:43.939 EEST [984665] LOG:  creating missing WAL directory "pg_wal/archive_status"
-    2021-04-13 09:27:44.189 EEST [984665] LOG:  database system was not properly shut down; automatic recovery in progress
-    2021-04-13 09:27:44.195 EEST [984665] LOG:  invalid record length at 0/15FFB80: wanted 24, got 0
-    2021-04-13 09:27:44.195 EEST [984665] LOG:  redo is not required
-    2021-04-13 09:27:44.225 EEST [984664] LOG:  database system is ready to accept connections
-     done
-    server started
-
-Run some commands against it:
-
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);" 
-    CREATE TABLE
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');" 
-    INSERT 0 1
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-    (1 row)
-
-Create a new branch called 'experimental'. We create it from the
-current end of the 'main' branch, but you could specify a different
-LSN as the start point instead.
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main
-    branching at end of WAL: 0/161F478
-    
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch 
-      experimental
-      main
-
-Start another Postgres instance off the 'experimental' branch:
-
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
-    Creating data directory from snapshot at 0/15FFB08...
-    waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG:  starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
-    2021-04-13 09:28:41.875 EEST [984766] LOG:  listening on IPv6 address "::1", port 5433
-    2021-04-13 09:28:41.875 EEST [984766] LOG:  listening on IPv4 address "127.0.0.1", port 5433
-    2021-04-13 09:28:41.883 EEST [984766] LOG:  listening on Unix socket "/tmp/.s.PGSQL.5433"
-    2021-04-13 09:28:41.896 EEST [984767] LOG:  database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
-    2021-04-13 09:28:42.265 EEST [984767] LOG:  database system was not properly shut down; automatic recovery in progress
-    2021-04-13 09:28:42.269 EEST [984767] LOG:  redo starts at 0/15FFB80
-    2021-04-13 09:28:42.272 EEST [984767] LOG:  invalid record length at 0/161F4B0: wanted 24, got 0
-    2021-04-13 09:28:42.272 EEST [984767] LOG:  redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
-    2021-04-13 09:28:42.321 EEST [984766] LOG:  database system is ready to accept connections
-     done
-    server started
-
-Insert some a row on the 'experimental' branch:
-
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-    (1 row)
-    
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')" 
-    INSERT 0 1
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-     inserted on experimental
-    (2 rows)
-    
-See that the other Postgres instance is still running on 'main' branch on port 5432:
-
-
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-    (1 row)
-
-
-
-
-Everything is stored in the .zenith directory:
-
-    ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/
-    total 12
-    drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs
-    drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs
-    drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines
-
-The 'datadirs' directory contains the datadirs of the running instances:
-
-    ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/
-    total 8
-    drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e
-    drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76
-    ~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/
-    total 124
-    drwxr-xr-x 5 heikki heikki  4096 Apr 13 09:27 base
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 global
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_commit_ts
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_dynshmem
-    -rw------- 1 heikki heikki  4760 Apr 13 09:27 pg_hba.conf
-    -rw------- 1 heikki heikki  1636 Apr 13 09:27 pg_ident.conf
-    drwxr-xr-x 4 heikki heikki  4096 Apr 13 09:32 pg_logical
-    drwxr-xr-x 4 heikki heikki  4096 Apr 13 09:27 pg_multixact
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_notify
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_replslot
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_serial
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_snapshots
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_stat
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:34 pg_stat_tmp
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_subtrans
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_tblspc
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_twophase
-    -rw------- 1 heikki heikki     3 Apr 13 09:27 PG_VERSION
-    lrwxrwxrwx 1 heikki heikki    52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal
-    drwxr-xr-x 2 heikki heikki  4096 Apr 13 09:27 pg_xact
-    -rw------- 1 heikki heikki    88 Apr 13 09:27 postgresql.auto.conf
-    -rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf
-    -rw------- 1 heikki heikki    96 Apr 13 09:27 postmaster.opts
-    -rw------- 1 heikki heikki   149 Apr 13 09:27 postmaster.pid
-
-Note how 'pg_wal' is just a symlink to the 'timelines' directory. The
-datadir is ephemeral, you can delete it at any time, and it can be reconstructed
-from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull
-the repository, the 'datadirs' are not included. (They are like git working trees)
-
-    ~/git-sandbox/zenith (cli-v2)$ killall -9 postgres
-    ~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/*
-    ~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
-    Creating data directory from snapshot at 0/15FFB08...
-    waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG:  starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
-    2021-04-13 09:37:05.477 EEST [985340] LOG:  listening on IPv6 address "::1", port 5433
-    2021-04-13 09:37:05.477 EEST [985340] LOG:  listening on IPv4 address "127.0.0.1", port 5433
-    2021-04-13 09:37:05.487 EEST [985340] LOG:  listening on Unix socket "/tmp/.s.PGSQL.5433"
-    2021-04-13 09:37:05.498 EEST [985341] LOG:  database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
-    2021-04-13 09:37:05.808 EEST [985341] LOG:  database system was not properly shut down; automatic recovery in progress
-    2021-04-13 09:37:05.813 EEST [985341] LOG:  redo starts at 0/15FFB80
-    2021-04-13 09:37:05.815 EEST [985341] LOG:  invalid record length at 0/161F770: wanted 24, got 0
-    2021-04-13 09:37:05.815 EEST [985341] LOG:  redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
-    2021-04-13 09:37:05.866 EEST [985340] LOG:  database system is ready to accept connections
-     done
-    server started
-    ~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo" 
-                  t              
-    -----------------------------
-     inserted on the main branch
-     inserted on experimental
-    (2 rows)
-
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -65,7 +65,7 @@ impl GenericOption {
            let name = match self.name.as_str() {
                "safekeepers" => "neon.safekeepers",
                "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
-                "wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
+                "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
                it => it,
            };

--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -26,8 +26,18 @@ use nix::unistd::Pid;

 use utils::lock_file;

-const RETRIES: u32 = 15;
-const RETRY_TIMEOUT_MILLIS: u64 = 500;
+// These constants control the loop used to poll for process start / stop.
+//
+// The loop waits for at most 10 seconds, polling every 100 ms.
+// Once a second, it prints a dot ("."), to give the user an indication that
+// it's waiting. If the process hasn't started/stopped after 5 seconds,
+// it prints a notice that it's taking long, but keeps waiting.
+//
+const RETRY_UNTIL_SECS: u64 = 10;
+const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
+const RETRY_INTERVAL_MILLIS: u64 = 100;
+const DOT_EVERY_RETRIES: u64 = 10;
+const NOTICE_AFTER_RETRIES: u64 = 50;

 /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
 /// it itself.
@@ -107,16 +117,16 @@ where
                return Ok(spawned_process);
            }
            Ok(false) => {
-                if retries < 5 {
+                if retries == NOTICE_AFTER_RETRIES {
+                    // The process is taking a long time to start up. Keep waiting, but
+                    // print a message
+                    print!("\n{process_name} has not started yet, continuing to wait");
+                }
+                if retries % DOT_EVERY_RETRIES == 0 {
                    print!(".");
                    io::stdout().flush().unwrap();
-                } else {
-                    if retries == 5 {
-                        println!() // put a line break after dots for second message
-                    }
-                    println!("{process_name} has not started yet, retrying ({retries})...");
                }
-                thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
                println!("{process_name} failed to start: {e:#}");
@@ -127,7 +137,8 @@ where
            }
        }
    }
-    anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
+    println!();
+    anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -158,7 +169,7 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
    }

    // Wait until process is gone
-    for _ in 0..RETRIES {
+    for retries in 0..RETRIES {
        match process_has_stopped(pid) {
            Ok(true) => {
                println!("\n{process_name} stopped");
@@ -170,9 +181,16 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
                return Ok(());
            }
            Ok(false) => {
-                print!(".");
-                io::stdout().flush().unwrap();
-                thread::sleep(Duration::from_secs(1))
+                if retries == NOTICE_AFTER_RETRIES {
+                    // The process is taking a long time to shut down. Keep waiting, but
+                    // print a message
+                    print!("\n{process_name} has not stopped yet, continuing to wait");
+                }
+                if retries % DOT_EVERY_RETRIES == 0 {
+                    print!(".");
+                    io::stdout().flush().unwrap();
+                }
+                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
                println!("{process_name} with pid {pid} failed to stop: {e:#}");
@@ -180,24 +198,21 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
            }
        }
    }
-
-    anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
+    println!();
+    anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
 }

 fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");

-    let var = "LLVM_PROFILE_FILE";
-    if let Some(val) = std::env::var_os(var) {
-        filled_cmd = filled_cmd.env(var, val);
+    // Pass through these environment variables to the command
+    for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
+        if let Some(val) = std::env::var_os(var) {
+            filled_cmd = filled_cmd.env(var, val);
+        }
    }

-    const RUST_LOG_KEY: &str = "RUST_LOG";
-    if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
-        filled_cmd.env(RUST_LOG_KEY, rust_log_value)
-    } else {
-        filled_cmd
-    }
+    filled_cmd
 }

 fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -343,7 +343,7 @@ impl PostgresNode {
        //   To be able to restore database in case of pageserver node crash, safekeeper should not
        //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
        //   (if they are not able to upload WAL to S3).
-        conf.append("max_replication_write_lag", "500MB");
+        conf.append("max_replication_write_lag", "15MB");
        conf.append("max_replication_flush_lag", "10GB");

        if !self.env.safekeepers.is_empty() {
--- a/control_plane/src/etcd.rs
+++ b/control_plane/src/etcd.rs
@@ -6,7 +6,7 @@ use crate::{background_process, local_env};

 pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    let etcd_broker = &env.etcd_broker;
-    println!(
+    print!(
        "Starting etcd broker using {:?}",
        etcd_broker.etcd_binary_path
    );
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -237,7 +237,7 @@ impl PageServerNode {
        datadir: &Path,
        update_config: bool,
    ) -> anyhow::Result<Child> {
-        println!(
+        print!(
            "Starting pageserver at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
            datadir.display()
@@ -362,6 +362,11 @@ impl PageServerNode {
                .map(|x| x.parse::<NonZeroU64>())
                .transpose()
                .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+            trace_read_requests: settings
+                .remove("trace_read_requests")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'trace_read_requests' as bool")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -424,6 +429,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<NonZeroU64>())
                    .transpose()
                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+                trace_read_requests: settings
+                    .get("trace_read_requests")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'trace_read_requests' as bool")?,
            })
            .send()?
            .error_from_body()?;
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -0,0 +1,13 @@
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG COMPUTE_IMAGE=compute-node-v14
+ARG TAG=latest
+
+FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
+
+USER root
+RUN apt-get update &&       \
+    apt-get install -y curl \
+                       jq   \
+                       netcat
+
+USER postgres
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -2,6 +2,7 @@ version: '3'

 services:
  etcd:
+    restart: always
    image: quay.io/coreos/etcd:v3.5.4
    ports:
      - 2379:2379
@@ -9,7 +10,7 @@ services:
    environment:
      # This signifficantly speeds up etcd and we anyway don't data persistency there.
      ETCD_UNSAFE_NO_FSYNC: "1"
-    command: 
+    command:
      - "etcd"
      - "--auto-compaction-mode=revision"
      - "--auto-compaction-retention=1"
@@ -24,6 +25,7 @@ services:
      - "--quota-backend-bytes=134217728" # 128 MB

  minio:
+    restart: always
    image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
    ports:
      - 9000:9000
@@ -41,7 +43,7 @@ services:
    entrypoint:
      - "/bin/sh"
      - "-c"
-    command: 
+    command:
      - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
             echo 'Waiting to start minio...' && sleep 1;
         done;
@@ -51,7 +53,8 @@ services:
      - minio

  pageserver:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - BROKER_ENDPOINT='http://etcd:2379'
      - AWS_ACCESS_KEY_ID=minio
@@ -77,7 +80,8 @@ services:
      - minio_create_buckets

  safekeeper1:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
      - SAFEKEEPER_ID=1
@@ -106,7 +110,8 @@ services:
      - minio_create_buckets

  safekeeper2:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
      - SAFEKEEPER_ID=2
@@ -135,7 +140,8 @@ services:
      - minio_create_buckets

  safekeeper3:
-    image: neondatabase/neon:${TAG:-latest}
+    restart: always
+    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
      - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
      - SAFEKEEPER_ID=3
@@ -164,18 +170,21 @@ services:
      - minio_create_buckets

  compute:
+    restart: always
    build:
-      context: ./image/compute
+      context: ./compute_wrapper/
      args:
-        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
+        - TAG=${TAG:-latest}
        - http_proxy=$http_proxy
        - https_proxy=$https_proxy
    environment:
      - PG_VERSION=${PG_VERSION:-14}
      #- RUST_BACKTRACE=1
+    # Mount the test files directly, for faster editing cycle.
    volumes:
-      - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
-      - ./compute/shell/:/shell/
+      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute_wrapper/shell/:/shell/
    ports:
      - 55433:55433 # pg protocol handler
      - 3080:3080 # http endpoints
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# A basic test to ensure Docker images are built correctly.
+# Builds a wrapper around the compute, starts all services and runs a simple SQL query.
+# Repeats the process for all currently supported Postgres versions.
+
+# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file.
+# Their defaults point at the DockerHub `neondatabase/neon:latest` image; override them
+# to verify custom image builds (e.g. pre-published ones).
+
+# XXX: Currently does not work on M1 Macs: only x86_64 Docker images are built, and the M1 Docker emulation layer has no seccomp support.
+
+set -eux -o pipefail
+
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
+
+COMPUTE_CONTAINER_NAME=docker-compose-compute-1
+SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
+
+cleanup() { # Print container status and compose logs, then tear the compose stack down.
+    echo "show container information"
+    docker ps
+    docker compose -f "$COMPOSE_FILE" logs
+    echo "stop containers..."
+    docker compose -f "$COMPOSE_FILE" down
+}
+
+echo "clean up containers if exists"
+cleanup
+
+for pg_version in 14 15; do
+    echo "start containers (pg_version=$pg_version)."
+    PG_VERSION=$pg_version docker compose -f "$COMPOSE_FILE" up --build -d
+
+    echo "wait until the compute is ready. timeout after 60s. "
+    cnt=0
+    while sleep 1; do
+        # give up after 60 polls (one poll per `sleep 1`)
+        cnt=$((cnt + 1))
+        if [ "$cnt" -gt 60 ]; then
+            echo "timeout before the compute is ready."
+            cleanup
+            exit 1
+        fi
+
+        # check if the compute is ready (pipefail off: grep exits 1 on no match, which would abort under set -e)
+        set +o pipefail
+        result=$(docker compose -f "$COMPOSE_FILE" logs "compute_is_ready" | grep "accepting connections" | wc -l)
+        set -o pipefail
+        if [ $result -eq 1 ]; then
+            echo "OK. The compute is ready to connect."
+            echo "execute simple queries."
+            docker exec "$COMPUTE_CONTAINER_NAME" /bin/bash -c "psql $PSQL_OPTION"
+            cleanup
+            break
+        fi
+    done
+done
--- a/docker-compose/image/compute/Dockerfile
+++ b/docker-compose/image/compute/Dockerfile
@@ -1,10 +0,0 @@
-ARG COMPUTE_IMAGE=compute-node-v14:latest
-FROM neondatabase/${COMPUTE_IMAGE}
-
-USER root
-RUN apt-get update &&       \
-    apt-get install -y curl \
-                       jq   \
-                       netcat
-
-USER postgres
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -37,7 +37,7 @@

 - [Source view](./sourcetree.md)
  - [docker.md](./docker.md) — Docker images and building pipeline.
-  - [Error handling and logging]()
+  - [Error handling and logging](./error-handling.md)
  - [Testing]()
    - [Unit testing]()
    - [Integration testing]()
--- a/docs/error-handling.md
+++ b/docs/error-handling.md
@@ -0,0 +1,198 @@
+# Error handling and logging
+
+## Logging errors
+
+The principle is that errors are logged when they are handled. If you
+just propagate an error to the caller in a function, you don't need to
+log it; the caller will. But if you consume an error in a function,
+you *must* log it (if it needs to be logged at all).
+
+For example:
+
+```rust
+fn read_motd_file() -> std::io::Result<String> {
+    let mut f = File::open("/etc/motd")?;
+    let mut result = String::new();
+    f.read_to_string(&mut result)?;
+    Ok(result)
+}
+```
+
+Opening or reading the file could fail, but there is no need to log
+the error here. The function merely propagates the error to the
+caller, and it is up to the caller to log the error or propagate it
+further, if the failure is not expected. But if, for example, it is
+normal that the "/etc/motd" file doesn't exist, the caller can choose
+to silently ignore the error, or log it as an INFO or DEBUG level
+message:
+
+```rust
+fn get_message_of_the_day() -> String {
+    // Get the motd from /etc/motd, or return the default proverb
+    match read_motd_file() {
+        Ok(motd) => motd,
+        Err(err)  => {
+            // It's normal that /etc/motd doesn't exist, but if we fail to
+            // read it for some other reason, that's unexpected. The message
+            // of the day isn't very important though, so we just WARN and
+            // continue with the default in any case.
+            if err.kind() != std::io::ErrorKind::NotFound {
+                 tracing::warn!("could not read \"/etc/motd\": {err:?}");
+            }
+            "An old error is always more popular than a new truth. - German proverb".to_string()
+        }
+    }
+}
+```
+
+## Error types
+
+We use the `anyhow` crate widely. It contains many convenient macros
+like `bail!` and `ensure!` to construct and return errors, and to
+propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
+
+A downside of `anyhow::Error` is that the caller cannot distinguish
+between different error cases. Most errors are propagated all the way
+to the mgmt API handler function, or the main loop that handles a
+connection with the compute node, and they are all handled the same
+way: the error is logged and returned to the client as an HTTP or
+libpq error.
+
+But in some cases, we need to distinguish between errors and handle
+them differently. For example, attaching a tenant to the pageserver
+could fail either because the tenant has already been attached, or
+because we could not load its metadata from cloud storage. The first
+case is more or less expected. The console sends the Attach request to
+the pageserver, and the pageserver completes the operation, but the
+network connection might be lost before the console receives the
+response. The console will retry the operation in that case, but the
+tenant has already been attached. It is important that the pageserver
+responds with the HTTP 403 Already Exists error in that case, rather
+than a generic HTTP 500 Internal Server Error.
+
+If you need to distinguish between different kinds of errors, create a
+new `Error` type. The `thiserror` crate is useful for that. But in
+most cases `anyhow::Error` is good enough.
+
+## Panics
+
+Depending on where a panic happens, it can cause the whole pageserver
+or safekeeper to restart, or just a single tenant. In either case,
+that is pretty bad and causes an outage. Avoid panics. Never use
+`unwrap()` or other calls that might panic, to verify inputs from the
+network or from disk.
+
+It is acceptable to use functions that might panic, like `unwrap()`, if
+it is obvious that it cannot panic. For example, if you have just
+checked that a variable is not None, it is OK to call `unwrap()` on it,
+but it is still preferable to use `expect("reason")` instead to explain
+why the function cannot fail.
+
+`assert!` and `panic!` are reserved for checking clear invariants and
+very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
+or `bail!` instead.
+
+## Error levels
+
+`tracing::Level` doesn't provide very clear guidelines on what the
+different levels mean, or when to use which level. Here is how we use
+them:
+
+### Error
+
+Examples:
+- could not open file "foobar"
+- invalid tenant id
+
+Errors are not expected to happen during normal operation. Incorrect
+inputs from client can cause ERRORs. For example, if a client tries to
+call a mgmt API that doesn't exist, or if a compute node passes
+an LSN that has already been garbage collected away.
+
+These should *not* happen during normal operations. "Normal
+operations" is not a very precise concept. But for example, disk
+errors are not expected to happen when the system is working, so those
+count as Errors. However, if a TCP connection to a compute node is
+lost, that is not considered an Error, because it doesn't affect the
+pageserver's or safekeeper's operation in any way, and happens fairly
+frequently when compute nodes are shut down, or are killed abruptly
+because of errors in the compute.
+
+**Errors are monitored, and always need human investigation to determine
+the cause.**
+
+Whether something should be logged at ERROR, WARNING or INFO level can
+depend on the callers and clients. For example, it might be unexpected
+and a sign of a serious issue if the console calls the
+"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
+would be appropriate in that case. But if the console routinely calls
+the API after deleting a timeline, to check if the deletion has
+completed, then it would be totally normal and an INFO or DEBUG level
+message would be more appropriate. If a message is logged as an ERROR,
+but it in fact happens frequently in production and never requires any
+action, it should probably be demoted to an INFO level message.
+
+### Warn
+
+Examples:
+- could not remove temporary file "foobar.temp"
+- unrecognized file "foobar" in timeline directory
+
+Warnings are similar to Errors, in that they should not happen
+when the system is operating normally. The difference between Error and
+Warning is that an Error means that the operation failed, whereas Warning
+means that something unexpected happened, but the operation continued anyway.
+For example, if deleting a file fails because the file already didn't exist,
+it should be logged as Warning.
+
+> **Note:** The python regression tests, under `test_regress`, check the
+> pageserver log after each test for any ERROR and WARN lines. If there are
+> any ERRORs or WARNs that have not been explicitly listed in the test as
+> allowed, the test is marked as failed. This is to catch unexpected errors
+> e.g. in background operations, that don't cause immediate misbehaviour in
+> the tested functionality.
+
+### Info
+
+Info level is used to log useful information when the system is
+operating normally. Info level is appropriate e.g. for logging state
+changes, background operations, and network connections.
+
+Examples:
+- "system is shutting down"
+- "tenant was created"
+- "retrying S3 upload"
+
+### Debug & Trace
+
+Debug and Trace level messages are not printed to the log in our normal
+production configuration, but could be enabled for a specific server or
+tenant, to aid debugging. (Although we don't actually have that
+capability as of this writing).
+
+## Context
+
+We use logging "spans" to hold context information about the current
+operation. Almost every operation happens on a particular tenant and
+timeline, so we enter a span with the "tenant_id" and "timeline_id"
+very early when processing an incoming API request, for example. All
+background operations should also run in a span containing at least
+those two fields, and any other parameters or information that might
+be useful when debugging an error that might happen when performing
+the operation.
+
+TODO: Spans are not captured in the Error when it is created, but when
+the error is logged. It would be more useful to capture them at Error
+creation. We should consider using `tracing_error::SpanTrace` to do
+that.
+
+## Error message style
+
+PostgreSQL has a style guide for writing error messages:
+
+https://www.postgresql.org/docs/current/error-style-guide.html
+
+Follow that guide when writing error messages in the PostgreSQL
+extension. We don't follow it strictly in the pageserver and
+safekeeper, but the advice in the PostgreSQL style guide is generally
+good, and you can't go wrong by following it.
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -83,6 +83,16 @@ A subject for future modularization.
 `/libs/metrics`:
 Helpers for exposing Prometheus metrics from the server.

+### Adding dependencies
+When you add a Cargo dependency, you should update the hakari manifest by running the commands below and committing the updated `Cargo.lock` and `workspace_hack/`. There may be no changes, that's fine.
+
+```bash
+cargo hakari generate
+cargo hakari manage-deps
+```
+
+If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
+
 ## Using Python
 Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
 so manual installation of dependencies is not recommended.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -73,6 +73,7 @@ pub struct TenantCreateRequest {
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
 }

 #[serde_as]
@@ -112,6 +113,7 @@ pub struct TenantConfigRequest {
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
 }

 impl TenantConfigRequest {
@@ -130,6 +132,7 @@ impl TenantConfigRequest {
            walreceiver_connect_timeout: None,
            lagging_wal_timeout: None,
            max_lsn_wal_lag: None,
+            trace_read_requests: None,
        }
    }
 }
--- a/libs/persistent_range_query/Cargo.toml
+++ b/libs/persistent_range_query/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "persistent_range_query"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[dev-dependencies]
+rand = "0.8.3"
--- a/libs/persistent_range_query/src/lib.rs
+++ b/libs/persistent_range_query/src/lib.rs
@@ -0,0 +1,78 @@
+use std::ops::Range;
+
+pub mod naive;
+pub mod ops;
+pub mod segment_tree;
+
+/// Should be a monoid:
+/// * Identity element: for all a: combine(new_for_empty_range(), a) = combine(a, new_for_empty_range()) = a
+/// * Associativity: for all a, b, c: combine(combine(a, b), c) == combine(a, combine(b, c))
+pub trait RangeQueryResult<Key>: Sized + Clone {
+    // Clone is equivalent to combine with an empty range.
+
+    fn new_for_empty_range() -> Self;
+
+    // Contract: left_range.end == right_range.start
+    // left_range.start == left_range.end == right_range.start == right_range.end is still possible
+    fn combine(
+        left: &Self,
+        left_range: &Range<Key>,
+        right: &Self,
+        right_range: &Range<Key>,
+    ) -> Self;
+
+    fn add(left: &mut Self, left_range: &Range<Key>, right: &Self, right_range: &Range<Key>);
+}
+
+pub trait LazyRangeInitializer<Result: RangeQueryResult<Key>, Key> {
+    fn get(&self, range: &Range<Key>) -> Result;
+}
+
+/// Should be a monoid:
+/// * Identity element: for all op: compose(no_op(), op) == compose(op, no_op()) == op
+/// * Associativity: for all op_1, op_2, op_3: compose(compose(op_1, op_2), op_3) == compose(op_1, compose(op_2, op_3))
+///
+/// Should be a left action on Result:
+/// * Identity operation: for all r: no_op().apply(r) == r
+/// * Compatibility: for all op_1, op_2, r: op_1.apply(op_2.apply(r)) == compose(op_1, op_2).apply(r)
+pub trait RangeModification<Key> {
+    type Result: RangeQueryResult<Key>;
+
+    fn no_op() -> Self;
+    fn is_no_op(&self) -> bool;
+    fn is_reinitialization(&self) -> bool;
+    fn apply(&self, result: &mut Self::Result, range: &Range<Key>);
+    fn compose(later: &Self, earlier: &mut Self);
+}
+
+pub trait VecReadableVersion<Modification: RangeModification<Key>, Key> {
+    fn get(&self, keys: &Range<Key>) -> Modification::Result;
+}
+
+// TODO: use trait alias when stabilized
+pub trait VecFrozenVersion<Modification: RangeModification<Key>, Key>:
+    Clone + VecReadableVersion<Modification, Key>
+{
+}
+
+impl<
+        T: Clone + VecReadableVersion<Modification, Key>,
+        Modification: RangeModification<Key>,
+        Key,
+    > VecFrozenVersion<Modification, Key> for T
+{
+}
+
+pub trait PersistentVecStorage<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key,
+>: VecReadableVersion<Modification, Key>
+{
+    fn new(all_keys: Range<Key>, initializer: Initializer) -> Self;
+
+    type FrozenVersion: VecFrozenVersion<Modification, Key>;
+
+    fn modify(&mut self, keys: &Range<Key>, modification: &Modification);
+    fn freeze(&mut self) -> Self::FrozenVersion;
+}
--- a/libs/persistent_range_query/src/naive.rs
+++ b/libs/persistent_range_query/src/naive.rs
@@ -0,0 +1,115 @@
+use crate::{
+    LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
+    VecReadableVersion,
+};
+use std::marker::PhantomData;
+use std::ops::Range;
+use std::rc::Rc;
+
+pub struct NaiveFrozenVersion<Modification: RangeModification<Key>, Key> {
+    all_keys: Range<Key>,
+    values: Rc<Box<Vec<Modification::Result>>>,
+}
+
+pub trait IndexableKey: Clone {
+    fn index(all_keys: &Range<Self>, key: &Self) -> usize;
+    fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self>;
+}
+
+/// Folds the stored per-element results covering `keys` into one result.
+///
+/// Elements are visited in increasing index order, from the index of
+/// `keys.start` up to (not including) the index of `keys.end`, and are
+/// accumulated with `RangeQueryResult::add`. If that index span is empty the
+/// value of `new_for_empty_range()` is returned unchanged.
+///
+/// `values` is taken as a slice so that both a plain `Vec` and a shared,
+/// reference-counted vector can be passed via deref coercion.
+fn get<Modification: RangeModification<Key>, Key: IndexableKey>(
+    all_keys: &Range<Key>,
+    values: &[Modification::Result],
+    keys: &Range<Key>,
+) -> Modification::Result {
+    let mut result = Modification::Result::new_for_empty_range();
+    // Range already folded into `result`; starts empty at `keys.start`.
+    let mut result_range = keys.start.clone()..keys.start.clone();
+    let first = IndexableKey::index(all_keys, &keys.start);
+    let last = IndexableKey::index(all_keys, &keys.end);
+    for index in first..last {
+        let element_range = IndexableKey::element_range(all_keys, index);
+        Modification::Result::add(&mut result, &result_range, &values[index], &element_range);
+        result_range.end = element_range.end;
+    }
+    result
+}
+
+impl<Modification: RangeModification<Key>, Key: IndexableKey> VecReadableVersion<Modification, Key>
+    for NaiveFrozenVersion<Modification, Key>
+{
+    fn get(&self, keys: &Range<Key>) -> Modification::Result {
+        get::<Modification, Key>(&self.all_keys, &self.values, keys)
+    }
+}
+
+// Manual implementation of `Clone` because `derive` requires `Modification: Clone`
+impl<Modification: RangeModification<Key>, Key: Clone> Clone
+    for NaiveFrozenVersion<Modification, Key>
+{
+    fn clone(&self) -> Self {
+        Self {
+            all_keys: self.all_keys.clone(),
+            values: self.values.clone(),
+        }
+    }
+}
+
+// TODO: is it at all possible to store previous versions in this struct,
+// without any Rc<>?
+pub struct NaiveVecStorage<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: IndexableKey,
+> {
+    all_keys: Range<Key>,
+    last_version: Vec<Modification::Result>,
+    _initializer: PhantomData<Initializer>,
+}
+
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: IndexableKey,
+    > VecReadableVersion<Modification, Key> for NaiveVecStorage<Modification, Initializer, Key>
+{
+    fn get(&self, keys: &Range<Key>) -> Modification::Result {
+        get::<Modification, Key>(&self.all_keys, &self.last_version, keys)
+    }
+}
+
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: IndexableKey,
+    > PersistentVecStorage<Modification, Initializer, Key>
+    for NaiveVecStorage<Modification, Initializer, Key>
+{
+    /// Builds the storage by eagerly asking `initializer` for the value of
+    /// each element index in `all_keys`.
+    fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
+        // The element count comes from the key range itself. It must not be
+        // read back from `Vec::capacity()`, which is only guaranteed to be
+        // *at least* the requested size (and is `usize::MAX` for zero-sized
+        // element types), so looping up to it could run past `all_keys`.
+        let len = IndexableKey::index(&all_keys, &all_keys.end);
+        let values: Vec<_> = (0..len)
+            .map(|index| initializer.get(&IndexableKey::element_range(&all_keys, index)))
+            .collect();
+        NaiveVecStorage {
+            all_keys,
+            last_version: values,
+            _initializer: PhantomData,
+        }
+    }
+
+    type FrozenVersion = NaiveFrozenVersion<Modification, Key>;
+
+    fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
+        for index in IndexableKey::index(&self.all_keys, &keys.start)
+            ..IndexableKey::index(&self.all_keys, &keys.end)
+        {
+            let element_range = IndexableKey::element_range(&self.all_keys, index);
+            modification.apply(&mut self.last_version[index], &element_range);
+        }
+    }
+
+    fn freeze(&mut self) -> Self::FrozenVersion {
+        NaiveFrozenVersion::<Modification, Key> {
+            all_keys: self.all_keys.clone(),
+            values: Rc::new(Box::new(self.last_version.clone())),
+        }
+    }
+}
--- a/libs/persistent_range_query/src/ops/mod.rs
+++ b/libs/persistent_range_query/src/ops/mod.rs
@@ -0,0 +1,14 @@
+pub mod rsq;
+
+#[derive(Copy, Clone, Debug)]
+pub struct SameElementsInitializer<T> {
+    initial_element_value: T,
+}
+
+impl<T> SameElementsInitializer<T> {
+    pub fn new(initial_element_value: T) -> Self {
+        SameElementsInitializer {
+            initial_element_value,
+        }
+    }
+}
--- a/libs/persistent_range_query/src/ops/rsq.rs
+++ b/libs/persistent_range_query/src/ops/rsq.rs
@@ -0,0 +1,118 @@
+//! # Range Sum Query
+
+use crate::ops::SameElementsInitializer;
+use crate::{LazyRangeInitializer, RangeModification, RangeQueryResult};
+use std::borrow::Borrow;
+use std::ops::{Add, AddAssign, Range};
+
+// TODO: commutative Add
+
+#[derive(Clone, Copy, Debug)]
+pub struct SumResult<T> {
+    sum: T,
+}
+
+impl<T> SumResult<T> {
+    pub fn sum(&self) -> &T {
+        &self.sum
+    }
+}
+
+impl<T: Clone + for<'a> AddAssign<&'a T> + From<u8>, Key> RangeQueryResult<Key> for SumResult<T>
+where
+    for<'a> &'a T: Add<&'a T, Output = T>,
+{
+    fn new_for_empty_range() -> Self {
+        SumResult { sum: 0.into() }
+    }
+
+    fn combine(
+        left: &Self,
+        _left_range: &Range<Key>,
+        right: &Self,
+        _right_range: &Range<Key>,
+    ) -> Self {
+        SumResult {
+            sum: &left.sum + &right.sum,
+        }
+    }
+
+    fn add(left: &mut Self, _left_range: &Range<Key>, right: &Self, _right_range: &Range<Key>) {
+        left.sum += &right.sum
+    }
+}
+
+pub trait SumOfSameElements<Key> {
+    fn sum(initial_element_value: &Self, keys: &Range<Key>) -> Self;
+}
+
+impl<T: SumOfSameElements<Key>, TB: Borrow<T>, Key> LazyRangeInitializer<SumResult<T>, Key>
+    for SameElementsInitializer<TB>
+where
+    SumResult<T>: RangeQueryResult<Key>,
+{
+    fn get(&self, range: &Range<Key>) -> SumResult<T> {
+        SumResult {
+            sum: SumOfSameElements::sum(self.initial_element_value.borrow(), range),
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum AddAssignModification<T> {
+    None,
+    Add(T),
+    Assign(T),
+}
+
+impl<T: Clone + for<'a> AddAssign<&'a T>, Key> RangeModification<Key> for AddAssignModification<T>
+where
+    SumResult<T>: RangeQueryResult<Key>,
+    for<'a> SameElementsInitializer<&'a T>: LazyRangeInitializer<SumResult<T>, Key>,
+{
+    type Result = SumResult<T>;
+
+    fn no_op() -> Self {
+        AddAssignModification::None
+    }
+
+    /// Returns `true` only for the `None` variant, the value produced by `no_op()`.
+    fn is_no_op(&self) -> bool {
+        matches!(self, AddAssignModification::None)
+    }
+
+    /// Returns `true` only for `Assign(_)`, which overwrites the previous
+    /// result instead of adding to it (see `apply`).
+    fn is_reinitialization(&self) -> bool {
+        matches!(self, AddAssignModification::Assign(_))
+    }
+
+    fn apply(&self, result: &mut SumResult<T>, range: &Range<Key>) {
+        use AddAssignModification::*;
+        match self {
+            None => {}
+            Add(x) | Assign(x) => {
+                let to_add = SameElementsInitializer::new(x).get(range).sum;
+                if let Assign(_) = self {
+                    result.sum = to_add;
+                } else {
+                    result.sum += &to_add;
+                }
+            }
+        }
+    }
+
+    fn compose(later: &Self, earlier: &mut Self) {
+        use AddAssignModification::*;
+        match (later, earlier) {
+            (_, e @ None) => *e = later.clone(),
+            (None, _) => {}
+            (Assign(_), e) => *e = later.clone(),
+            (Add(x), Add(y)) => *y += x,
+            (Add(x), Assign(value)) => *value += x,
+        }
+    }
+}
--- a/libs/persistent_range_query/src/segment_tree.rs
+++ b/libs/persistent_range_query/src/segment_tree.rs
@@ -0,0 +1,255 @@
+//! # Segment Tree
+//! It is a competitive programming folklore data structure. Do not confuse with the interval tree.
+
+use crate::{LazyRangeInitializer, PersistentVecStorage, RangeQueryResult, VecReadableVersion};
+use std::ops::Range;
+use std::rc::Rc;
+
+pub trait MidpointableKey: Clone + Ord + Sized {
+    fn midpoint(range: &Range<Self>) -> Self;
+}
+
+pub trait RangeModification<Key>: Clone + crate::RangeModification<Key> {}
+
+// TODO: use trait alias when stabilized
+impl<T: Clone + crate::RangeModification<Key>, Key> RangeModification<Key> for T {}
+
+#[derive(Debug)]
+struct Node<Modification: RangeModification<Key>, Key> {
+    result: Modification::Result,
+    modify_children: Modification,
+    left: Option<Rc<Self>>,
+    right: Option<Rc<Self>>,
+}
+
+// Manual implementation because we don't need `Key: Clone` for this, unlike with `derive`.
+impl<Modification: RangeModification<Key>, Key> Clone for Node<Modification, Key> {
+    fn clone(&self) -> Self {
+        Node {
+            result: self.result.clone(),
+            modify_children: self.modify_children.clone(),
+            left: self.left.clone(),
+            right: self.right.clone(),
+        }
+    }
+}
+
+impl<Modification: RangeModification<Key>, Key> Node<Modification, Key> {
+    fn new<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
+        range: &Range<Key>,
+        initializer: &Initializer,
+    ) -> Self {
+        Node {
+            result: initializer.get(range),
+            modify_children: Modification::no_op(),
+            left: None,
+            right: None,
+        }
+    }
+
+    pub fn apply(&mut self, modification: &Modification, range: &Range<Key>) {
+        modification.apply(&mut self.result, range);
+        Modification::compose(modification, &mut self.modify_children);
+        if self.modify_children.is_reinitialization() {
+            self.left = None;
+            self.right = None;
+        }
+    }
+
+    pub fn force_children<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
+        &mut self,
+        initializer: &Initializer,
+        range_left: &Range<Key>,
+        range_right: &Range<Key>,
+    ) {
+        let left = Rc::make_mut(
+            self.left
+                .get_or_insert_with(|| Rc::new(Node::new(&range_left, initializer))),
+        );
+        let right = Rc::make_mut(
+            self.right
+                .get_or_insert_with(|| Rc::new(Node::new(&range_right, initializer))),
+        );
+        left.apply(&self.modify_children, &range_left);
+        right.apply(&self.modify_children, &range_right);
+        self.modify_children = Modification::no_op();
+    }
+
+    pub fn recalculate_from_children(&mut self, range_left: &Range<Key>, range_right: &Range<Key>) {
+        assert!(self.modify_children.is_no_op());
+        assert!(self.left.is_some());
+        assert!(self.right.is_some());
+        self.result = Modification::Result::combine(
+            &self.left.as_ref().unwrap().result,
+            &range_left,
+            &self.right.as_ref().unwrap().result,
+            &range_right,
+        );
+    }
+}
+
+fn split_range<Key: MidpointableKey>(range: &Range<Key>) -> (Range<Key>, Range<Key>) {
+    let range_left = range.start.clone()..MidpointableKey::midpoint(range);
+    let range_right = range_left.end.clone()..range.end.clone();
+    (range_left, range_right)
+}
+
+pub struct PersistentSegmentTreeVersion<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: Clone,
+> {
+    root: Rc<Node<Modification, Key>>,
+    all_keys: Range<Key>,
+    initializer: Rc<Initializer>,
+}
+
+// Manual implementation because we don't need `Key: Clone` for this, unlike with `derive`.
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: Clone,
+    > Clone for PersistentSegmentTreeVersion<Modification, Initializer, Key>
+{
+    fn clone(&self) -> Self {
+        Self {
+            root: self.root.clone(),
+            all_keys: self.all_keys.clone(),
+            initializer: self.initializer.clone(),
+        }
+    }
+}
+
+fn get<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: MidpointableKey,
+>(
+    node: &mut Rc<Node<Modification, Key>>,
+    node_keys: &Range<Key>,
+    initializer: &Initializer,
+    keys: &Range<Key>,
+) -> Modification::Result {
+    if node_keys.end <= keys.start || keys.end <= node_keys.start {
+        return Modification::Result::new_for_empty_range();
+    }
+    if keys.start <= node_keys.start && node_keys.end <= keys.end {
+        return node.result.clone();
+    }
+    let node = Rc::make_mut(node);
+    let (left_keys, right_keys) = split_range(node_keys);
+    node.force_children(initializer, &left_keys, &right_keys);
+    let mut result = get(node.left.as_mut().unwrap(), &left_keys, initializer, keys);
+    Modification::Result::add(
+        &mut result,
+        &left_keys,
+        &get(node.right.as_mut().unwrap(), &right_keys, initializer, keys),
+        &right_keys,
+    );
+    result
+}
+
+fn modify<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: MidpointableKey,
+>(
+    node: &mut Rc<Node<Modification, Key>>,
+    node_keys: &Range<Key>,
+    initializer: &Initializer,
+    keys: &Range<Key>,
+    modification: &Modification,
+) {
+    if modification.is_no_op() || node_keys.end <= keys.start || keys.end <= node_keys.start {
+        return;
+    }
+    let node = Rc::make_mut(node);
+    if keys.start <= node_keys.start && node_keys.end <= keys.end {
+        node.apply(modification, node_keys);
+        return;
+    }
+    let (left_keys, right_keys) = split_range(node_keys);
+    node.force_children(initializer, &left_keys, &right_keys);
+    modify(
+        node.left.as_mut().unwrap(),
+        &left_keys,
+        initializer,
+        keys,
+        &modification,
+    );
+    modify(
+        node.right.as_mut().unwrap(),
+        &right_keys,
+        initializer,
+        keys,
+        &modification,
+    );
+    node.recalculate_from_children(&left_keys, &right_keys);
+}
+
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: MidpointableKey,
+    > VecReadableVersion<Modification, Key>
+    for PersistentSegmentTreeVersion<Modification, Initializer, Key>
+{
+    fn get(&self, keys: &Range<Key>) -> Modification::Result {
+        get(
+            &mut self.root.clone(), // TODO: do not always force a branch
+            &self.all_keys,
+            self.initializer.as_ref(),
+            keys,
+        )
+    }
+}
+
+pub struct PersistentSegmentTree<
+    Modification: RangeModification<Key>,
+    Initializer: LazyRangeInitializer<Modification::Result, Key>,
+    Key: MidpointableKey,
+>(PersistentSegmentTreeVersion<Modification, Initializer, Key>);
+
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: MidpointableKey,
+    > VecReadableVersion<Modification, Key>
+    for PersistentSegmentTree<Modification, Initializer, Key>
+{
+    fn get(&self, keys: &Range<Key>) -> Modification::Result {
+        self.0.get(keys)
+    }
+}
+
+impl<
+        Modification: RangeModification<Key>,
+        Initializer: LazyRangeInitializer<Modification::Result, Key>,
+        Key: MidpointableKey,
+    > PersistentVecStorage<Modification, Initializer, Key>
+    for PersistentSegmentTree<Modification, Initializer, Key>
+{
+    /// Creates a tree consisting of a single root node covering `all_keys`.
+    ///
+    /// The root's result comes from `initializer`; child nodes are not built
+    /// here, they are created on demand by `Node::force_children`.
+    fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
+        PersistentSegmentTree(PersistentSegmentTreeVersion {
+            // Built before `all_keys` and `initializer` are moved below.
+            root: Rc::new(Node::new(&all_keys, &initializer)),
+            all_keys,
+            initializer: Rc::new(initializer),
+        })
+    }
+
+    type FrozenVersion = PersistentSegmentTreeVersion<Modification, Initializer, Key>;
+
+    fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
+        modify(
+            &mut self.0.root, // TODO: do not always force a branch
+            &self.0.all_keys,
+            self.0.initializer.as_ref(),
+            keys,
+            modification,
+        )
+    }
+
+    fn freeze(&mut self) -> Self::FrozenVersion {
+        self.0.clone()
+    }
+}
--- a/libs/persistent_range_query/tests/layer_map_test.rs
+++ b/libs/persistent_range_query/tests/layer_map_test.rs
@@ -0,0 +1,295 @@
+use persistent_range_query::naive::{IndexableKey, NaiveVecStorage};
+use persistent_range_query::ops::SameElementsInitializer;
+use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
+use persistent_range_query::{
+    LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
+    VecReadableVersion,
+};
+use std::cmp::Ordering;
+use std::ops::Range;
+
+/// Key type used by the layer-map test: a page number wrapped in a newtype.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
+struct PageIndex(u32);
+/// Layers are identified by plain strings in this test.
+type LayerId = String;
+
+impl IndexableKey for PageIndex {
+    /// Offset of `key` from the start of `all_keys`.
+    fn index(all_keys: &Range<Self>, key: &Self) -> usize {
+        let base = all_keys.start.0 as usize;
+        key.0 as usize - base
+    }
+
+    /// One-element range for the key located `index` positions after the start of `all_keys`.
+    fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
+        let first = all_keys.start.0 + index as u32;
+        PageIndex(first)..PageIndex(first + 1)
+    }
+}
+
+impl MidpointableKey for PageIndex {
+    /// Key halfway between `range.start` and `range.end`, rounded towards the start.
+    fn midpoint(range: &Range<Self>) -> Self {
+        let Range { start, end } = range;
+        let half_len = (end.0 - start.0) / 2;
+        PageIndex(start.0 + half_len)
+    }
+}
+
+/// Aggregated layer-map facts about a range of pages; the query result type of this test.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct LayerMapInformation {
+    // These two fields only make sense for a range of length 1.
+    last_layer: Option<LayerId>,
+    last_image_layer: Option<LayerId>,
+    // Works for ranges of any length: a delta-layer count paired with a range of
+    // pages. `combine` keeps the larger count of its two inputs.
+    max_delta_layers: (usize, Range<PageIndex>),
+}
+
+impl LayerMapInformation {
+    /// Borrows the pair `(last_layer, last_image_layer)`.
+    fn last_layers(&self) -> (&Option<LayerId>, &Option<LayerId>) {
+        let LayerMapInformation {
+            last_layer,
+            last_image_layer,
+            ..
+        } = self;
+        (last_layer, last_image_layer)
+    }
+
+    /// Borrows the `(count, range)` pair stored in `max_delta_layers`.
+    fn max_delta_layers(&self) -> &(usize, Range<PageIndex>) {
+        let LayerMapInformation {
+            max_delta_layers, ..
+        } = self;
+        max_delta_layers
+    }
+}
+
+/// Joins two ranges when that yields a single contiguous range.
+///
+/// An empty side is ignored in favour of the other one; two non-empty ranges are
+/// joined only when `left` ends exactly where `right` starts. In every other
+/// case `left` is returned unchanged.
+fn merge_ranges(left: &Range<PageIndex>, right: &Range<PageIndex>) -> Range<PageIndex> {
+    if left.is_empty() {
+        return right.clone();
+    }
+    if !right.is_empty() && left.end == right.start {
+        return left.start..right.end;
+    }
+    left.clone()
+}
+
+impl RangeQueryResult<PageIndex> for LayerMapInformation {
+    /// Result for an empty range: no layers, and a zero delta-layer count over an empty range.
+    fn new_for_empty_range() -> Self {
+        LayerMapInformation {
+            last_layer: None,
+            last_image_layer: None,
+            max_delta_layers: (0, PageIndex(0)..PageIndex(0)),
+        }
+    }
+
+    /// Combines the results of two ranges into one result.
+    ///
+    /// For `last_layer` and `last_image_layer` the left value is used when present,
+    /// otherwise the right one. For `max_delta_layers` the side with the larger
+    /// count wins; on a tie the count is kept and the ranges go through
+    /// `merge_ranges`.
+    fn combine(
+        left: &Self,
+        _left_range: &Range<PageIndex>,
+        right: &Self,
+        _right_range: &Range<PageIndex>,
+    ) -> Self {
+        // Note that either range may be empty.
+        LayerMapInformation {
+            last_layer: left
+                .last_layer
+                .as_ref()
+                .or_else(|| right.last_layer.as_ref())
+                .cloned(),
+            last_image_layer: left
+                .last_image_layer
+                .as_ref()
+                .or_else(|| right.last_image_layer.as_ref())
+                .cloned(),
+            max_delta_layers: match left.max_delta_layers.0.cmp(&right.max_delta_layers.0) {
+                Ordering::Less => right.max_delta_layers.clone(),
+                Ordering::Greater => left.max_delta_layers.clone(),
+                Ordering::Equal => (
+                    left.max_delta_layers.0,
+                    merge_ranges(&left.max_delta_layers.1, &right.max_delta_layers.1),
+                ),
+            },
+        }
+    }
+
+    /// In-place variant of `combine`: overwrites `left` with the combined result.
+    fn add(
+        left: &mut Self,
+        left_range: &Range<PageIndex>,
+        right: &Self,
+        right_range: &Range<PageIndex>,
+    ) {
+        // `left` is already a reference; it is reborrowed immutably for the call.
+        *left = Self::combine(left, left_range, right, right_range);
+    }
+}
+
+/// The delta-layer part of a modification.
+#[derive(Clone, Debug)]
+struct AddDeltaLayers {
+    // Id written to `LayerMapInformation::last_layer` when the modification is applied.
+    last_layer: LayerId,
+    // Amount added to the delta-layer count when the modification is applied.
+    count: usize,
+}
+
+/// Range modification used by the test: an optional image layer plus optional
+/// delta layers. When both are set, `apply` handles the image layer first.
+#[derive(Clone, Debug)]
+struct LayerMapModification {
+    add_image_layer: Option<LayerId>,
+    add_delta_layers: Option<AddDeltaLayers>,
+}
+
+impl LayerMapModification {
+    /// Modification that adds the image layer `layer` and no delta layers.
+    fn add_image_layer(layer: impl Into<LayerId>) -> Self {
+        let image = layer.into();
+        Self {
+            add_image_layer: Some(image),
+            add_delta_layers: None,
+        }
+    }
+
+    /// Modification that adds exactly one delta layer, `layer`, and no image layer.
+    fn add_delta_layer(layer: impl Into<LayerId>) -> Self {
+        let single_delta = AddDeltaLayers {
+            last_layer: layer.into(),
+            count: 1,
+        };
+        Self {
+            add_image_layer: None,
+            add_delta_layers: Some(single_delta),
+        }
+    }
+}
+
+impl RangeModification<PageIndex> for LayerMapModification {
+    type Result = LayerMapInformation;
+
+    /// The modification that changes nothing.
+    fn no_op() -> Self {
+        Self {
+            add_image_layer: None,
+            add_delta_layers: None,
+        }
+    }
+
+    /// True when neither an image layer nor delta layers are added.
+    fn is_no_op(&self) -> bool {
+        !(self.add_image_layer.is_some() || self.add_delta_layers.is_some())
+    }
+
+    /// True when an image layer is added.
+    fn is_reinitialization(&self) -> bool {
+        matches!(self.add_image_layer, Some(_))
+    }
+
+    /// Applies this modification to `result`, which describes `range`.
+    ///
+    /// The image layer (if any) is handled first and resets the delta-layer count
+    /// to zero over `range`; the delta layers (if any) are then added on top.
+    fn apply(&self, result: &mut Self::Result, range: &Range<PageIndex>) {
+        if let Some(image) = self.add_image_layer.as_ref() {
+            result.last_layer = Some(image.clone());
+            result.last_image_layer = Some(image.clone());
+            result.max_delta_layers = (0, range.clone());
+        }
+        if let Some(delta) = self.add_delta_layers.as_ref() {
+            result.last_layer = Some(delta.last_layer.clone());
+            result.max_delta_layers.0 += delta.count;
+        }
+    }
+
+    /// Folds `later` into `earlier`.
+    ///
+    /// A `later` modification carrying an image layer replaces `earlier` entirely.
+    /// Otherwise its delta layers are accumulated onto `earlier`'s, with `later`'s
+    /// `last_layer` taking over.
+    fn compose(later: &Self, earlier: &mut Self) {
+        if later.add_image_layer.is_some() {
+            *earlier = later.clone();
+        } else if let Some(delta) = later.add_delta_layers.as_ref() {
+            let accumulated = earlier.add_delta_layers.get_or_insert(AddDeltaLayers {
+                last_layer: LayerId::default(),
+                count: 0,
+            });
+            accumulated.last_layer = delta.last_layer.clone();
+            accumulated.count += delta.count;
+        }
+    }
+}
+
+impl LazyRangeInitializer<LayerMapInformation, PageIndex> for SameElementsInitializer<()> {
+    /// Initial information for `range`: no layers, and a zero delta-layer count over all of `range`.
+    fn get(&self, range: &Range<PageIndex>) -> LayerMapInformation {
+        let no_deltas = (0, range.clone());
+        LayerMapInformation {
+            last_layer: None,
+            last_image_layer: None,
+            max_delta_layers: no_deltas,
+        }
+    }
+}
+
+/// Layer-map scenario shared by every storage implementation under test.
+///
+/// Over pages `0..100` it adds two image layers and two delta layers, freezes a
+/// version just before the last delta layer, and then checks per-page
+/// `last_layers()` and per-range `max_delta_layers()` on both the live storage
+/// and the frozen version.
+fn test_layer_map<
+    S: PersistentVecStorage<LayerMapModification, SameElementsInitializer<()>, PageIndex>,
+>() {
+    let mut s = S::new(
+        PageIndex(0)..PageIndex(100),
+        SameElementsInitializer::new(()),
+    );
+    s.modify(
+        &(PageIndex(0)..PageIndex(70)),
+        &LayerMapModification::add_image_layer("Img0..70"),
+    );
+    s.modify(
+        &(PageIndex(50)..PageIndex(100)),
+        &LayerMapModification::add_image_layer("Img50..100"),
+    );
+    s.modify(
+        &(PageIndex(10)..PageIndex(60)),
+        &LayerMapModification::add_delta_layer("Delta10..60"),
+    );
+    // Snapshot taken before the second delta layer is added.
+    let s_before_last_delta = s.freeze();
+    s.modify(
+        &(PageIndex(20)..PageIndex(80)),
+        &LayerMapModification::add_delta_layer("Delta20..80"),
+    );
+
+    // Page 5 lies only within the range of the first image layer.
+    assert_eq!(
+        s.get(&(PageIndex(5)..PageIndex(6))).last_layers(),
+        (&Some("Img0..70".to_owned()), &Some("Img0..70".to_owned()))
+    );
+    // Page 15: first image layer, then the first delta layer.
+    assert_eq!(
+        s.get(&(PageIndex(15)..PageIndex(16))).last_layers(),
+        (
+            &Some("Delta10..60".to_owned()),
+            &Some("Img0..70".to_owned())
+        )
+    );
+    // Page 25: first image layer, then both delta layers.
+    assert_eq!(
+        s.get(&(PageIndex(25)..PageIndex(26))).last_layers(),
+        (
+            &Some("Delta20..80".to_owned()),
+            &Some("Img0..70".to_owned())
+        )
+    );
+    // Page 65: both image layers (the second one was added later), then the second delta layer.
+    assert_eq!(
+        s.get(&(PageIndex(65)..PageIndex(66))).last_layers(),
+        (
+            &Some("Delta20..80".to_owned()),
+            &Some("Img50..100".to_owned())
+        )
+    );
+    // Page 95 lies only within the range of the second image layer.
+    assert_eq!(
+        s.get(&(PageIndex(95)..PageIndex(96))).last_layers(),
+        (
+            &Some("Img50..100".to_owned()),
+            &Some("Img50..100".to_owned())
+        )
+    );
+
+    // Whole key space: the two delta layers overlap on pages 20..60.
+    assert_eq!(
+        s.get(&(PageIndex(0)..PageIndex(100))).max_delta_layers(),
+        &(2, PageIndex(20)..PageIndex(60)),
+    );
+    // The frozen version only contains the first delta layer.
+    assert_eq!(
+        *s_before_last_delta
+            .get(&(PageIndex(0)..PageIndex(100)))
+            .max_delta_layers(),
+        (1, PageIndex(10)..PageIndex(60)),
+    );
+
+    // Sub-range queries: the reported range is expected to stay within the queried range.
+    assert_eq!(
+        *s.get(&(PageIndex(10)..PageIndex(30))).max_delta_layers(),
+        (2, PageIndex(20)..PageIndex(30))
+    );
+    assert_eq!(
+        *s.get(&(PageIndex(10)..PageIndex(20))).max_delta_layers(),
+        (1, PageIndex(10)..PageIndex(20))
+    );
+
+    // Pages 70..80 are only within the range of the second delta layer, which the
+    // frozen version does not contain.
+    assert_eq!(
+        *s.get(&(PageIndex(70)..PageIndex(80))).max_delta_layers(),
+        (1, PageIndex(70)..PageIndex(80))
+    );
+    assert_eq!(
+        *s_before_last_delta
+            .get(&(PageIndex(70)..PageIndex(80)))
+            .max_delta_layers(),
+        (0, PageIndex(70)..PageIndex(80))
+    );
+}
+
+/// Runs the layer-map scenario against `NaiveVecStorage`.
+#[test]
+fn test_naive() {
+    test_layer_map::<NaiveVecStorage<_, _, _>>();
+}
+
+/// Runs the layer-map scenario against `PersistentSegmentTree`.
+#[test]
+fn test_segment_tree() {
+    test_layer_map::<PersistentSegmentTree<_, _, _>>();
+}
--- a/libs/persistent_range_query/tests/rsq_test.rs
+++ b/libs/persistent_range_query/tests/rsq_test.rs
@@ -0,0 +1,116 @@
+use persistent_range_query::naive::*;
+use persistent_range_query::ops::rsq::AddAssignModification::Add;
+use persistent_range_query::ops::rsq::*;
+use persistent_range_query::ops::SameElementsInitializer;
+use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
+use persistent_range_query::{PersistentVecStorage, VecReadableVersion};
+use rand::{Rng, SeedableRng};
+use std::ops::Range;
+
+/// Key type used by the range-sum tests: a `u16` wrapped in a newtype.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+struct K(u16);
+
+impl IndexableKey for K {
+    /// Offset of `key` from the start of `all_keys`.
+    fn index(all_keys: &Range<Self>, key: &Self) -> usize {
+        let base = all_keys.start.0 as usize;
+        key.0 as usize - base
+    }
+
+    /// One-element range for the key located `index` positions after the start of `all_keys`.
+    fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
+        let first = all_keys.start.0 + index as u16;
+        K(first)..K(first + 1)
+    }
+}
+
+impl SumOfSameElements<K> for i32 {
+    /// Sum over `keys` when every element equals `initial_element_value`:
+    /// the value multiplied by the number of keys in the range.
+    fn sum(initial_element_value: &Self, keys: &Range<K>) -> Self {
+        let key_count = (keys.end.0 - keys.start.0) as Self;
+        initial_element_value * key_count
+    }
+}
+
+impl MidpointableKey for K {
+    /// Key halfway between `range.start` and `range.end`, rounded towards the start.
+    fn midpoint(range: &Range<Self>) -> Self {
+        let Range { start, end } = range;
+        let half_len = (end.0 - start.0) / 2;
+        K(start.0 + half_len)
+    }
+}
+
+/// Range-sum scenario shared by every storage implementation under test.
+///
+/// Starts from twelve zero elements, applies an `Add`, an `Assign` and another
+/// `Add` on overlapping ranges, and checks the sums after each step as well as
+/// on a version frozen after the first step.
+fn test_storage<
+    S: PersistentVecStorage<AddAssignModification<i32>, SameElementsInitializer<i32>, K>,
+>() {
+    let mut s = S::new(K(0)..K(12), SameElementsInitializer::new(0i32));
+    assert_eq!(*s.get(&(K(0)..K(12))).sum(), 0);
+
+    // Keys 2, 3 and 4 are expected to become 3.
+    s.modify(&(K(2)..K(5)), &AddAssignModification::Add(3));
+    assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 3 + 3);
+    let s_old = s.freeze();
+
+    // Keys 3, 4 and 5 are expected to become 10; key 2 keeps its 3.
+    s.modify(&(K(3)..K(6)), &AddAssignModification::Assign(10));
+    assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 10 + 10);
+
+    // Keys 4 and 5 are expected to become 12 and key 6 to become 2.
+    s.modify(&(K(4)..K(7)), &AddAssignModification::Add(2));
+    assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 12 + 12 + 2);
+
+    assert_eq!(*s.get(&(K(4)..K(6))).sum(), 12 + 12);
+    // The frozen version is expected to still show only the first `Add`: key 4 is 3, key 5 is 0.
+    assert_eq!(*s_old.get(&(K(4)..K(6))).sum(), 3);
+}
+
+/// Runs the range-sum scenario against `NaiveVecStorage`.
+#[test]
+fn test_naive() {
+    test_storage::<NaiveVecStorage<_, _, _>>();
+}
+
+/// Runs the range-sum scenario against `PersistentSegmentTree`.
+#[test]
+fn test_segment_tree() {
+    test_storage::<PersistentSegmentTree<_, _, _>>();
+}
+
+/// Randomized cross-check of `PersistentSegmentTree` against `NaiveVecStorage`.
+///
+/// Both storages receive the same seeded stream of random `Add` modifications and
+/// must answer every random range-sum query identically. Each of the two rounds
+/// also freezes both storages up front and verifies, before and after the
+/// modifications, that the frozen versions still report the checksum recorded
+/// at freeze time.
+#[test]
+fn test_stress() {
+    const LEN: u16 = 17_238;
+    const OPERATIONS: i32 = 20_000;
+
+    let mut rng = rand::rngs::StdRng::seed_from_u64(0);
+    let mut naive: NaiveVecStorage<AddAssignModification<i32>, _, _> =
+        NaiveVecStorage::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
+    let mut segm_tree: PersistentSegmentTree<AddAssignModification<i32>, _, _> =
+        PersistentSegmentTree::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
+
+    // Draws two endpoints from `0..LEN` and orders them into a (possibly empty) range.
+    fn gen_range(rng: &mut impl Rng) -> Range<K> {
+        let l: u16 = rng.gen_range(0..LEN);
+        let r: u16 = rng.gen_range(0..LEN);
+        if l <= r {
+            K(l)..K(r)
+        } else {
+            K(r)..K(l)
+        }
+    }
+
+    for _ in 0..2 {
+        let checksum_range = gen_range(&mut rng);
+        let checksum_before: i32 = *naive.get(&checksum_range).sum();
+        assert_eq!(checksum_before, *segm_tree.get(&checksum_range).sum());
+
+        let naive_before = naive.freeze();
+        let segm_tree_before = segm_tree.freeze();
+        assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
+        // `freeze` takes `&mut self`, so make sure it left the live tree unchanged...
+        assert_eq!(checksum_before, *segm_tree.get(&checksum_range).sum());
+        // ...and that the frozen tree itself starts out with the recorded checksum.
+        assert_eq!(
+            checksum_before,
+            *segm_tree_before.get(&checksum_range).sum()
+        );
+
+        for _ in 0..OPERATIONS {
+            {
+                let range = gen_range(&mut rng);
+                assert_eq!(naive.get(&range).sum(), segm_tree.get(&range).sum());
+            }
+            {
+                let range = gen_range(&mut rng);
+                let val = rng.gen_range(-10i32..=10i32);
+                let op = Add(val);
+                naive.modify(&range, &op);
+                segm_tree.modify(&range, &op);
+            }
+        }
+
+        // The frozen versions must be unaffected by all the modifications above.
+        assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
+        assert_eq!(
+            checksum_before,
+            *segm_tree_before.get(&checksum_range).sum()
+        );
+    }
+}
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -33,8 +33,8 @@ pub struct Segment {
    /// Logical size before this state
    start_size: u64,

-    /// Logical size at this state
-    pub end_size: u64,
+    /// Logical size at this state. Can be None in the last Segment of a branch.
+    pub end_size: Option<u64>,

    /// Indices to [`Storage::segments`]
    ///
@@ -115,7 +115,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
            start_lsn: 0,
            end_lsn: 0,
            start_size: 0,
-            end_size: 0,
+            end_size: Some(0),
            children_after: Vec::new(),
        };

@@ -125,6 +125,39 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
        }
    }

+    /// Advances the branch with a new point, at given LSN.
+    ///
+    /// Appends a new segment whose parent is the branch's current last segment,
+    /// registers it among that segment's `children_after`, and makes it the
+    /// branch's new last segment. `size` becomes the new segment's `end_size`
+    /// and may be `None`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `branch` is not present in `self.branches`, if `lsn` is not
+    /// greater than the `end_lsn` of the branch's last segment, or if that
+    /// segment's `end_size` is `None`.
+    pub fn insert_point<Q: ?Sized>(
+        &mut self,
+        branch: &Q,
+        op: Cow<'static, str>,
+        lsn: u64,
+        size: Option<u64>,
+    ) where
+        K: std::borrow::Borrow<Q>,
+        Q: std::hash::Hash + Eq,
+    {
+        let lastseg_id = *self.branches.get(branch).unwrap();
+        // The new segment is pushed at the end of `self.segments`, so this is its index.
+        let newseg_id = self.segments.len();
+        let lastseg = &mut self.segments[lastseg_id];
+
+        assert!(lsn > lastseg.end_lsn);
+
+        let newseg = Segment {
+            op,
+            parent: Some(lastseg_id),
+            start_lsn: lastseg.end_lsn,
+            end_lsn: lsn,
+            // The new segment starts where the previous one ended, so the previous
+            // segment must have a known end size.
+            start_size: lastseg.end_size.unwrap(),
+            end_size: size,
+            children_after: Vec::new(),
+            needed: false,
+        };
+        lastseg.children_after.push(newseg_id);
+
+        self.segments.push(newseg);
+        *self.branches.get_mut(branch).expect("read already") = newseg_id;
+    }
+
    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
    pub fn modify_branch<Q: ?Sized>(
        &mut self,
@@ -145,8 +178,8 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
            parent: Some(lastseg_id),
            start_lsn: lastseg.end_lsn,
            end_lsn: lastseg.end_lsn + lsn_bytes,
-            start_size: lastseg.end_size,
-            end_size: (lastseg.end_size as i64 + size_bytes) as u64,
+            start_size: lastseg.end_size.unwrap(),
+            end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64),
            children_after: Vec::new(),
            needed: false,
        };
@@ -321,7 +354,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
                Some(SegmentSize {
                    seg_id,
                    method: SnapshotAfter,
-                    this_size: seg.end_size,
+                    this_size: seg.end_size.unwrap(),
                    children,
                })
            } else {
--- a/libs/tenant_size_model/src/main.rs
+++ b/libs/tenant_size_model/src/main.rs
@@ -174,7 +174,7 @@ fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
    let seg_id = node.seg_id;
    let seg = segments.get(seg_id).unwrap();
    let lsn = seg.end_lsn;
-    let size = seg.end_size;
+    let size = seg.end_size.unwrap_or(0);
    let method = node.method;

    println!("  {{");
@@ -226,7 +226,7 @@ fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
            print!(
                " label=\"{} / {}\"",
                next.end_lsn - seg.end_lsn,
-                (next.end_size as i128 - seg.end_size as i128)
+                (next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
            );
        } else {
            print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -204,6 +204,17 @@ pub struct TenantId(Id);

 id_newtype!(TenantId);

+/// Neon Connection Id identifies long-lived connections (for example a pagestream
+/// connection with the page_service). It is used for better logging and tracing.
+///
+/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
+/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
+/// See [`Id`] for alternative ways to serialize it.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
+pub struct ConnectionId(Id);
+
+id_newtype!(ConnectionId);
+
 // A pair uniquely identifying Neon instance.
 #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct TenantTimelineId {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -48,6 +48,25 @@ pub mod nonblock;
 // Default signal handling
 pub mod signals;

+/// Async failpoint that sleeps for a configurable number of milliseconds.
+///
+/// Use with `fail::cfg("$name", "return(2000)")`: when the failpoint `$name`
+/// hands a value to the closure below, that value is parsed as a `u64`
+/// millisecond count and the current task sleeps for that long via
+/// `tokio::time::sleep`. Otherwise the macro does nothing.
+///
+/// The expansion contains an `.await`, so it must be invoked from async code.
+/// It also refers to the `fail`, `tokio` and `tracing` crates by name, so those
+/// must be available to the calling crate.
+///
+/// Panics if the failpoint fires without a value, or with a value that does not
+/// parse as a `u64`.
+#[macro_export]
+macro_rules! failpoint_sleep_millis_async {
+    ($name:literal) => {{
+        let should_sleep: Option<std::time::Duration> = (|| {
+            fail::fail_point!($name, |v: Option<_>| {
+                let millis = v.unwrap().parse::<u64>().unwrap();
+                // Fully qualified on purpose: paths inside a `macro_rules!` body are
+                // resolved at the call site, which may not have `Duration` imported.
+                Some(std::time::Duration::from_millis(millis))
+            });
+            None
+        })();
+        if let Some(d) = should_sleep {
+            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
+            tokio::time::sleep(d).await;
+            tracing::info!("failpoint {:?}: sleep done", $name);
+        }
+    }};
+}
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -138,7 +138,7 @@ impl FromStr for Lsn {
    ///
    /// If the input string is missing the '/' character, then use `Lsn::from_hex`
    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut splitter = s.split('/');
+        let mut splitter = s.trim().split('/');
        if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
        {
            let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
@@ -270,6 +270,11 @@ mod tests {
        );
        assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
        assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
+
+        let expected_lsn = Lsn(0x3C490F8);
+        assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
+        assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
+        assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
    }

    #[test]
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -76,3 +76,7 @@ tempfile = "3.2"
 [[bench]]
 name = "bench_layer_map"
 harness = false
+
+[[bench]]
+name = "bench_walredo"
+harness = false
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -199,6 +199,20 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
    logging::init(conf.log_format)?;
    info!("version: {}", version());

+    // If any failpoints were set from FAILPOINTS environment variable,
+    // print them to the log for debugging purposes
+    let failpoints = fail::list();
+    if !failpoints.is_empty() {
+        info!(
+            "started with failpoints: {}",
+            failpoints
+                .iter()
+                .map(|(name, actions)| format!("{name}={actions}"))
+                .collect::<Vec<String>>()
+                .join(";")
+        )
+    }
+
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
    let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
        lock_file::LockCreationResult::Created {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -8,6 +8,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use remote_storage::RemoteStorageConfig;
 use std::env;
 use utils::crashsafe::path_with_suffix_extension;
+use utils::id::ConnectionId;

 use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
@@ -414,6 +415,22 @@ impl PageServerConf {
        )
    }

+    /// Path of the `traces` directory inside the pageserver's `workdir`.
+    pub fn traces_path(&self) -> PathBuf {
+        let mut dir = self.workdir.clone();
+        dir.push("traces");
+        dir
+    }
+
+    /// Path for one connection's trace:
+    /// `<traces_path>/<tenant_id>/<timeline_id>/<connection_id>`.
+    pub fn trace_path(
+        &self,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        connection_id: &ConnectionId,
+    ) -> PathBuf {
+        let mut path = self.traces_path();
+        path.push(tenant_id.to_string());
+        path.push(timeline_id.to_string());
+        path.push(connection_id.to_string());
+        path
+    }
+
    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
    pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
@@ -597,8 +614,9 @@ impl PageServerConf {
        PathBuf::from(format!("../tmp_check/test_{test_name}"))
    }

-    #[cfg(test)]
    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
+        let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
+
        PageServerConf {
            id: NodeId(0),
            wait_lsn_timeout: Duration::from_secs(60),
@@ -609,7 +627,7 @@ impl PageServerConf {
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
            superuser: "cloud_admin".to_string(),
            workdir: repo_dir,
-            pg_distrib_dir: PathBuf::new(),
+            pg_distrib_dir,
            auth_type: AuthType::Trust,
            auth_validation_public_key_path: None,
            remote_storage_config: None,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -667,6 +667,7 @@ components:
        - disk_consistent_lsn
        - awaits_download
        - state
+        - latest_gc_cutoff_lsn
      properties:
        timeline_id:
          type: string
@@ -711,6 +712,9 @@ components:
          type: boolean
        state:
          type: string
+        latest_gc_cutoff_lsn:
+          type: string
+          format: hex

        # These 'local' and 'remote' fields just duplicate some of the fields
        # above. They are kept for backwards-compatibility. They can be removed,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -618,6 +618,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    check_permission(&request, None)?;

    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    println!("tenant create: {:?}", request_data.trace_read_requests);
    let remote_index = get_state(&request).remote_index.clone();

    let mut tenant_conf = TenantConfOpt::default();
@@ -659,6 +660,9 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
    }
+    if let Some(trace_read_requests) = request_data.trace_read_requests {
+        tenant_conf.trace_read_requests = Some(trace_read_requests);
+    }

    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -746,6 +750,9 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
    }
+    if let Some(trace_read_requests) = request_data.trace_read_requests {
+        tenant_conf.trace_read_requests = Some(trace_read_requests);
+    }

    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -15,6 +15,7 @@ pub mod tenant;
 pub mod tenant_config;
 pub mod tenant_mgr;
 pub mod tenant_tasks;
+pub mod trace;
 pub mod virtual_file;
 pub mod walingest;
 pub mod walreceiver;
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -29,6 +29,7 @@ use tokio::pin;
 use tokio_util::io::StreamReader;
 use tokio_util::io::SyncIoBridge;
 use tracing::*;
+use utils::id::ConnectionId;
 use utils::{
    auth::{self, Claims, JwtAuth, Scope},
    id::{TenantId, TimelineId},
@@ -47,6 +48,7 @@ use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::Timeline;
 use crate::tenant_mgr;
+use crate::trace::Tracer;
 use crate::CheckpointConfig;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -74,6 +76,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                        FeMessage::CopyData(bytes) => bytes,
                        FeMessage::CopyDone => { break },
                        FeMessage::Sync => continue,
+                        FeMessage::Terminate => {
+                            let msg = format!("client terminated connection with Terminate message during COPY");
+                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                            break;
+                        }
                        m => {
                            let msg = format!("unexpected message {:?}", m);
                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
@@ -85,10 +93,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                    yield copy_data_bytes;
                }
                Ok(None) => {
-                    let msg = "client closed connection";
+                    let msg = "client closed connection during COPY";
                    pgb.write_message(&BeMessage::ErrorResponse(msg))?;
                    pgb.flush().await?;
-                    Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                }
                Err(e) => {
                    Err(io::Error::new(io::ErrorKind::Other, e))?;
@@ -269,6 +277,18 @@ impl PageServerHandler {
        //       so there is no need to reset the association
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

+        // Make request tracer if needed
+        let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
+        let mut tracer = if tenant.get_trace_read_requests() {
+            let connection_id = ConnectionId::generate();
+            let path = tenant
+                .conf
+                .trace_path(&tenant_id, &timeline_id, &connection_id);
+            Some(Tracer::new(path))
+        } else {
+            None
+        };
+
        // Check that the timeline exists
        let timeline = get_local_timeline(tenant_id, timeline_id)?;

@@ -301,6 +321,11 @@ impl PageServerHandler {

            trace!("query: {copy_data_bytes:?}");

+            // Trace request if needed
+            if let Some(t) = tracer.as_mut() {
+                t.trace(&copy_data_bytes)
+            }
+
            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;

            let response = match neon_fe_msg {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -461,14 +461,7 @@ impl Tenant {
                    .context("Cannot branch off the timeline that's not present in pageserver")?;

                if let Some(lsn) = ancestor_start_lsn.as_mut() {
-                    // Wait for the WAL to arrive and be processed on the parent branch up
-                    // to the requested branch point. The repository code itself doesn't
-                    // require it, but if we start to receive WAL on the new timeline,
-                    // decoding the new WAL might need to look up previous pages, relation
-                    // sizes etc. and that would get confused if the previous page versions
-                    // are not in the repository yet.
                    *lsn = lsn.align();
-                    ancestor_timeline.wait_lsn(*lsn).await?;

                    let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
                    if ancestor_ancestor_lsn > *lsn {
@@ -480,6 +473,14 @@ impl Tenant {
                            ancestor_ancestor_lsn,
                        );
                    }
+
+                    // Wait for the WAL to arrive and be processed on the parent branch up
+                    // to the requested branch point. The repository code itself doesn't
+                    // require it, but if we start to receive WAL on the new timeline,
+                    // decoding the new WAL might need to look up previous pages, relation
+                    // sizes etc. and that would get confused if the previous page versions
+                    // are not in the repository yet.
+                    ancestor_timeline.wait_lsn(*lsn).await?;
                }

                self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
@@ -806,6 +807,13 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

+    /// Whether read requests should be traced for this tenant: the tenant's own
+    /// `trace_read_requests` setting when present, otherwise the value from
+    /// `default_tenant_conf`.
+    pub fn get_trace_read_requests(&self) -> bool {
+        let guard = self.tenant_conf.read().unwrap();
+        match guard.trace_read_requests {
+            Some(enabled) => enabled,
+            None => self.conf.default_tenant_conf.trace_read_requests,
+        }
+    }
+
    pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
        self.tenant_conf.write().unwrap().update(&new_tenant_conf);
    }
@@ -1003,6 +1011,10 @@ impl Tenant {

        let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;

+        utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
+
+        info!("starting on {} timelines", gc_timelines.len());
+
        // Perform GC for each timeline.
        //
        // Note that we don't hold the GC lock here because we don't want
@@ -1666,6 +1678,7 @@ pub mod harness {
                walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
                lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
                max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
+                trace_read_requests: Some(tenant_conf.trace_read_requests),
            }
        }
    }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -74,6 +74,7 @@ where
        };

        dstbuf.clear();
+        dstbuf.reserve(len);

        // Read the payload
        let mut remain = len;
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/delta_layer.rs
@@ -260,8 +260,9 @@ impl Layer for DeltaLayer {

            // Ok, 'offsets' now contains the offsets of all the entries we need to read
            let mut cursor = file.block_cursor();
+            let mut buf = Vec::new();
            for (entry_lsn, pos) in offsets {
-                let buf = cursor.read_blob(pos).with_context(|| {
+                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
                    format!(
                        "Failed to read blob from virtual file {}",
                        file.file.path.display()
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -183,6 +183,19 @@ pub(super) async fn gather_inputs(
            }
        }

+        // all timelines also have an end point if they have made any progress
+        if last_record_lsn > timeline.get_ancestor_lsn()
+            && !interesting_lsns
+                .iter()
+                .any(|(lsn, _)| lsn == &last_record_lsn)
+        {
+            updates.push(Update {
+                lsn: last_record_lsn,
+                command: Command::EndOfBranch,
+                timeline_id: timeline.timeline_id,
+            });
+        }
+
        timeline_inputs.insert(
            timeline.timeline_id,
            TimelineInputs {
@@ -270,48 +283,22 @@ impl ModelInputs {
        // impossible to always determine the a one main branch.
        let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);

-        // tracking these not to require modifying the current implementation of the size model,
-        // which works in relative LSNs and sizes.
-        let mut last_state: HashMap<TimelineId, (Lsn, u64)> = HashMap::new();
-
        for update in &self.updates {
            let Update {
                lsn,
                command: op,
                timeline_id,
            } = update;
+            let Lsn(now) = *lsn;
            match op {
                Command::Update(sz) => {
-                    let latest = last_state.get_mut(timeline_id).ok_or_else(|| {
-                        anyhow::anyhow!(
-                        "ordering-mismatch: there must had been a previous state for {timeline_id}"
-                    )
-                    })?;
-
-                    let lsn_bytes = {
-                        let Lsn(now) = lsn;
-                        let Lsn(prev) = latest.0;
-                        debug_assert!(prev <= *now, "self.updates should had been sorted");
-                        now - prev
-                    };
-
-                    let size_diff =
-                        i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| {
-                            format!("size difference i64 overflow for {timeline_id}")
-                        })?;
-
-                    storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff);
-                    *latest = (*lsn, *sz);
+                    storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz));
+                }
+                Command::EndOfBranch => {
+                    storage.insert_point(&Some(*timeline_id), "".into(), now, None);
                }
                Command::BranchFrom(parent) => {
                    storage.branch(parent, Some(*timeline_id));
-
-                    let size = parent
-                        .as_ref()
-                        .and_then(|id| last_state.get(id))
-                        .map(|x| x.1)
-                        .unwrap_or(0);
-                    last_state.insert(*timeline_id, (*lsn, size));
                }
            }
        }
@@ -320,10 +307,7 @@ impl ModelInputs {
    }
 }

-/// Single size model update.
-///
-/// Sizing model works with relative increments over latest branch state.
-/// Updates are absolute, so additional state needs to be tracked when applying.
+/// A point of interest in the tree of branches
 #[serde_with::serde_as]
 #[derive(
    Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
@@ -342,6 +326,7 @@ struct Update {
 enum Command {
    Update(u64),
    BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
+    EndOfBranch,
 }

 impl std::fmt::Debug for Command {
@@ -351,6 +336,7 @@ impl std::fmt::Debug for Command {
        match self {
            Self::Update(arg0) => write!(f, "Update({arg0})"),
            Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
+            Self::EndOfBranch => write!(f, "EndOfBranch"),
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -61,6 +61,17 @@ use crate::{
    storage_sync::{self, index::LayerFileMetadata},
 };

+/// Lifecycle of a timeline's layer flush task.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum FlushLoopState {
+    /// Initial state: no flush task has been spawned yet.
+    NotStarted,
+    /// Set by `maybe_spawn_flush_loop` once the flush task has been spawned.
+    Running,
+    /// Set by the flush task after `flush_loop` returns; it is never restarted.
+    Exited,
+}
+
 pub struct Timeline {
    conf: &'static PageServerConf,
    tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -122,7 +129,7 @@ pub struct Timeline {
    write_lock: Mutex<()>,

    /// Used to avoid multiple `flush_loop` tasks running
-    flush_loop_started: Mutex<bool>,
+    flush_loop_state: Mutex<FlushLoopState>,

    /// layer_flush_start_tx can be used to wake up the layer-flushing task.
    /// The value is a counter, incremented every time a new flush cycle is requested.
@@ -755,7 +762,7 @@ impl Timeline {

            upload_layers: AtomicBool::new(upload_layers),

-            flush_loop_started: Mutex::new(false),
+            flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

            layer_flush_start_tx,
            layer_flush_done_tx,
@@ -794,13 +801,23 @@ impl Timeline {
    }

    pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
-        let mut flush_loop_started = self.flush_loop_started.lock().unwrap();
-        if *flush_loop_started {
-            info!(
-                "skipping attempt to start flush_loop twice {}/{}",
-                self.tenant_id, self.timeline_id
-            );
-            return;
+        let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
+        match *flush_loop_state {
+            FlushLoopState::NotStarted => (),
+            FlushLoopState::Running => {
+                info!(
+                    "skipping attempt to start flush_loop twice {}/{}",
+                    self.tenant_id, self.timeline_id
+                );
+                return;
+            }
+            FlushLoopState::Exited => {
+                warn!(
+                    "ignoring attempt to restart exited flush_loop {}/{}",
+                    self.tenant_id, self.timeline_id
+                );
+                return;
+            }
        }

        let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
@@ -813,11 +830,16 @@ impl Timeline {
                    Some(self.timeline_id),
                    "layer flush task",
                    false,
-                    async move { self_clone.flush_loop(layer_flush_start_rx).await; Ok(()) }
+                    async move {
+                         self_clone.flush_loop(layer_flush_start_rx).await;
+                         let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
+                         assert_eq!(*flush_loop_state, FlushLoopState::Running);
+                         *flush_loop_state  = FlushLoopState::Exited;
+                         Ok(()) }
                    .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
                );

-        *flush_loop_started = true;
+        *flush_loop_state = FlushLoopState::Running;
    }

    pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
@@ -1365,8 +1387,9 @@ impl Timeline {
        // finished, instead of some other flush that was started earlier.
        let mut my_flush_request = 0;

-        if !&*self.flush_loop_started.lock().unwrap() {
-            anyhow::bail!("cannot flush frozen layers when flush_loop is not running")
+        let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
+        if flush_loop_state != FlushLoopState::Running {
+            anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
        }

        self.layer_flush_start_tx.send_modify(|counter| {
--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -82,6 +82,7 @@ pub struct TenantConf {
    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
    /// to avoid eager reconnects.
    pub max_lsn_wal_lag: NonZeroU64,
+    pub trace_read_requests: bool,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -105,6 +106,7 @@ pub struct TenantConfOpt {
    #[serde(with = "humantime_serde")]
    pub lagging_wal_timeout: Option<Duration>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
 }

 impl TenantConfOpt {
@@ -138,6 +140,9 @@ impl TenantConfOpt {
                .lagging_wal_timeout
                .unwrap_or(global_conf.lagging_wal_timeout),
            max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
+            trace_read_requests: self
+                .trace_read_requests
+                .unwrap_or(global_conf.trace_read_requests),
        }
    }

@@ -207,10 +212,10 @@ impl TenantConf {
                .expect("cannot parse default walreceiver lagging wal timeout"),
            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                .expect("cannot parse default max walreceiver Lsn wal lag"),
+            trace_read_requests: false,
        }
    }

-    #[cfg(test)]
    pub fn dummy_conf() -> Self {
        TenantConf {
            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
@@ -232,6 +237,7 @@ impl TenantConf {
            .unwrap(),
            max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                .unwrap(),
+            trace_read_requests: false,
        }
    }
 }
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant_tasks.rs
@@ -71,9 +71,7 @@ async fn compaction_loop(tenant_id: TenantId) {
            let mut sleep_duration = tenant.get_compaction_period();
            if let Err(e) = tenant.compaction_iteration() {
                sleep_duration = wait_duration;
-                error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
-                #[cfg(feature = "testing")]
-                std::process::abort();
+                error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
            }

            // Sleep
@@ -122,9 +120,7 @@ async fn gc_loop(tenant_id: TenantId) {
                if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
                {
                    sleep_duration = wait_duration;
-                    error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
-                    #[cfg(feature = "testing")]
-                    std::process::abort();
+                    error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
                }
            }

--- a/pageserver/src/trace.rs
+++ b/pageserver/src/trace.rs
@@ -0,0 +1,60 @@
+use bytes::Bytes;
+use std::{
+    fs::{create_dir_all, File},
+    io::{BufWriter, Write},
+    path::PathBuf,
+};
+
+/// Buffered writer that appends raw messages to a single trace file.
+pub struct Tracer {
+    writer: BufWriter<File>,
+}
+
+impl Drop for Tracer {
+    /// Flushes buffered data on a best-effort basis.
+    ///
+    /// Unlike [`Tracer::flush`], this never panics: a panic raised from `drop`
+    /// while the thread is already unwinding would abort the whole process.
+    fn drop(&mut self) {
+        if let Err(e) = self.writer.flush() {
+            tracing::error!(error = %e, "failed to flush trace file on drop");
+        }
+    }
+}
+
+impl Tracer {
+    /// Creates the trace file at `path` (truncating an existing file), after
+    /// creating any missing parent directories.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `path` has no parent, or if the directory or the file cannot
+    /// be created.
+    pub fn new(path: PathBuf) -> Self {
+        let parent = path.parent().expect("failed to parse parent path");
+        create_dir_all(parent).expect("failed to create trace dir");
+
+        let file = File::create(path).expect("failed to create trace file");
+        Tracer {
+            writer: BufWriter::new(file),
+        }
+    }
+
+    /// Appends `msg` verbatim to the buffered trace output.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the write fails.
+    pub fn trace(&mut self, msg: &Bytes) {
+        self.writer.write_all(msg).expect("failed to write trace");
+    }
+
+    /// Writes any buffered data through to the underlying file.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the flush fails.
+    pub fn flush(&mut self) {
+        self.writer.flush().expect("failed to flush trace file");
+    }
+}
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,10 +22,10 @@ use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
 use serde::Serialize;
-use std::fs;
 use std::fs::OpenOptions;
 use std::io::prelude::*;
 use std::io::{Error, ErrorKind};
+use std::ops::{Deref, DerefMut};
 use std::os::unix::io::AsRawFd;
 use std::os::unix::prelude::CommandExt;
 use std::path::PathBuf;
@@ -34,6 +34,7 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
 use std::sync::Mutex;
 use std::time::Duration;
 use std::time::Instant;
+use std::{fs, io};
 use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
@@ -44,6 +45,7 @@ use crate::metrics::{
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
+use crate::task_mgr::BACKGROUND_RUNTIME;
 use crate::walrecord::NeonWalRecord;
 use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -208,6 +210,16 @@ impl PostgresRedoManager {
        }
    }

+    /// Eagerly start the WAL redo process if it is not running yet.
+    /// It is otherwise launched on first use; this exists for benchmarking.
+    pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> {
+        let slot = self.process.get_mut().unwrap();
+        if slot.is_none() {
+            *slot = Some(PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?);
+        }
+        Ok(())
+    }
+
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
@@ -229,7 +241,7 @@ impl PostgresRedoManager {

        // launch the WAL redo process on first use
        if process_guard.is_none() {
-            let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?;
+            let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
            *process_guard = Some(p);
        }
        let process = process_guard.as_mut().unwrap();
@@ -579,7 +591,8 @@ impl<C: CommandExt> CloseFileDescriptors for C {
 /// Handle to the Postgres WAL redo process
 ///
 struct PostgresRedoProcess {
-    child: Child,
+    tenant_id: TenantId,
+    child: NoLeakChild,
    stdin: ChildStdin,
    stdout: ChildStdout,
    stderr: ChildStderr,
@@ -589,16 +602,17 @@ impl PostgresRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
+    #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
    fn launch(
        conf: &PageServerConf,
-        tenant_id: &TenantId,
+        tenant_id: TenantId,
        pg_version: u32,
    ) -> Result<PostgresRedoProcess, Error> {
        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
        // just create one with constant name. That fails if you try to launch more than
        // one WAL redo manager concurrently.
        let datadir = path_with_suffix_extension(
-            conf.tenant_path(tenant_id).join("wal-redo-datadir"),
+            conf.tenant_path(&tenant_id).join("wal-redo-datadir"),
            TEMP_FILE_SUFFIX,
        );

@@ -653,7 +667,7 @@ impl PostgresRedoProcess {
        }

        // Start postgres itself
-        let mut child = Command::new(pg_bin_dir_path.join("postgres"))
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
            .arg("--wal-redo")
            .stdin(Stdio::piped())
            .stderr(Stdio::piped())
@@ -672,7 +686,7 @@ impl PostgresRedoProcess {
            // as close-on-exec by default, but that's not enough, since we use
            // libraries that directly call libc open without setting that flag.
            .close_fds()
-            .spawn()
+            .spawn_no_leak_child()
            .map_err(|e| {
                Error::new(
                    e.kind(),
@@ -680,20 +694,33 @@ impl PostgresRedoProcess {
                )
            })?;

-        info!(
-            "launched WAL redo postgres process on {}",
-            datadir.display()
-        );
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait();
+        });

        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();

-        set_nonblock(stdin.as_raw_fd())?;
-        set_nonblock(stdout.as_raw_fd())?;
-        set_nonblock(stderr.as_raw_fd())?;
+        macro_rules! set_nonblock_or_log_err {
+            ($file:ident) => {{
+                let res = set_nonblock($file.as_raw_fd());
+                if let Err(e) = &res {
+                    error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
+                }
+                res
+            }};
+        }
+        set_nonblock_or_log_err!(stdin)?;
+        set_nonblock_or_log_err!(stdout)?;
+        set_nonblock_or_log_err!(stderr)?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);

        Ok(PostgresRedoProcess {
+            tenant_id,
            child,
            stdin,
            stdout,
@@ -701,18 +728,16 @@ impl PostgresRedoProcess {
        })
    }

-    fn kill(mut self) {
-        let _ = self.child.kill();
-        if let Ok(exit_status) = self.child.wait() {
-            error!("wal-redo-postgres exited with code {}", exit_status);
-        }
-        drop(self);
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
+    fn kill(self) {
+        self.child.kill_and_wait();
    }

    //
    // Apply given WAL records ('records') over an old page image. Returns
    // new page image.
    //
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
    fn apply_wal_records(
        &mut self,
        tag: BufferTag,
@@ -725,7 +750,11 @@ impl PostgresRedoProcess {
        // This could be problematic if there are millions of records to replay,
        // but in practice the number of records is usually so small that it doesn't
        // matter, and it's better to keep this code simple.
-        let mut writebuf: Vec<u8> = Vec::new();
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed
+        // by some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
        build_begin_redo_for_block_msg(tag, &mut writebuf);
        if let Some(img) = base_img {
            build_push_page_msg(tag, &img, &mut writebuf);
@@ -838,6 +867,99 @@ impl PostgresRedoProcess {
    }
 }

+/// Wrapper type around `std::process::Child` which guarantees that the child
+/// will be killed and waited-for by this process before being dropped.
+struct NoLeakChild {
+    child: Option<Child>,
+}
+
+impl Deref for NoLeakChild {
+    type Target = Child;
+
+    fn deref(&self) -> &Self::Target {
+        self.child.as_ref().expect("must not use from drop")
+    }
+}
+
+impl DerefMut for NoLeakChild {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.child.as_mut().expect("must not use from drop")
+    }
+}
+
+impl NoLeakChild {
+    fn spawn(command: &mut Command) -> io::Result<Self> {
+        let child = command.spawn()?;
+        Ok(NoLeakChild { child: Some(child) })
+    }
+
+    fn kill_and_wait(mut self) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return,
+        };
+        Self::kill_and_wait_impl(child);
+    }
+
+    #[instrument(skip_all, fields(pid=child.id()))]
+    fn kill_and_wait_impl(mut child: Child) {
+        let res = child.kill();
+        if let Err(e) = res {
+            // This branch is very unlikely because:
+            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
+            // - This is the only place that calls .kill()
+            // - We take `child` by value, so, .kill() can't be called twice.
+            // - If the process exited by itself or was killed by someone else,
+            //   .kill() will still succeed because we haven't wait()'ed yet.
+            //
+            // So, if we arrive here, we have really no idea what happened,
+            // whether the PID stored in `child` is still valid, etc.
+            // If this function were fallible, we'd return an error, but
+            // since it isn't, all we can do is log an error and proceed
+            // with the wait().
+            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
+        }
+
+        match child.wait() {
+            Ok(exit_status) => {
+                // log at error level since .kill() is something we only do on errors ATM
+                error!(exit_status = %exit_status, "wait successful");
+            }
+            Err(e) => {
+                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
+            }
+        }
+    }
+}
+
+impl Drop for NoLeakChild {
+    fn drop(&mut self) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return,
+        };
+        // Offload the kill+wait of the child process into the background.
+        // If someone stops the runtime, we'll leak the child process.
+        // We can ignore that case because we only stop the runtime on pageserver exit.
+        BACKGROUND_RUNTIME.spawn(async move {
+            tokio::task::spawn_blocking(move || {
+                Self::kill_and_wait_impl(child);
+            })
+            .await
+        });
+    }
+}
+
+trait NoLeakChildCommandExt {
+    fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild>;
+}
+
+impl NoLeakChildCommandExt for Command {
+    fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild> {
+        NoLeakChild::spawn(self)
+    }
+}
+
 // Functions for constructing messages to send to the postgres WAL redo
 // process. See pgxn/neon_walredo/walredoproc.c for
 // explanation of the protocol.
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -32,18 +32,22 @@

 #define PageStoreTrace DEBUG5

-#define NEON_TAG "[NEON_SMGR] "
-#define neon_log(tag, fmt, ...) ereport(tag,                                  \
-										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
-										 errhidestmt(true), errhidecontext(true)))
-
 bool		connected = false;
 PGconn	   *pageserver_conn = NULL;

+/*
+ * WaitEventSet containing:
+ * - WL_SOCKET_READABLE on pageserver_conn,
+ * - WL_LATCH_SET on MyLatch, and
+ * - WL_EXIT_ON_PM_DEATH.
+ */
+WaitEventSet *pageserver_conn_wes = NULL;
+
 char	   *page_server_connstring_raw;

 int			n_unflushed_requests = 0;
 int			flush_every_n_requests = 8;
+int			readahead_buffer_size = 128;

 static void pageserver_flush(void);

@@ -63,6 +67,7 @@ pageserver_connect()

 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;
+
 		ereport(ERROR,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
 				 errmsg(NEON_TAG "could not establish connection to pageserver"),
@@ -78,22 +83,25 @@ pageserver_connect()
 		neon_log(ERROR, "could not send pagestream command to pageserver");
 	}

+	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
+			  MyLatch, NULL);
+	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+			  NULL, NULL);
+	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
+
 	while (PQisBusy(pageserver_conn))
 	{
-		int			wc;
+		WaitEvent	event;

 		/* Sleep until there's something to do */
-		wc = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_SOCKET_READABLE |
-							   WL_EXIT_ON_PM_DEATH,
-							   PQsocket(pageserver_conn),
-							   -1L, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();

 		/* Data available in socket? */
-		if (wc & WL_SOCKET_READABLE)
+		if (event.events & WL_SOCKET_READABLE)
 		{
 			if (!PQconsumeInput(pageserver_conn))
 			{
@@ -101,6 +109,9 @@ pageserver_connect()

				PQfinish(pageserver_conn);
				pageserver_conn = NULL;
+				/* Clear the pointer so a later pageserver_disconnect() cannot free it again. */
+				FreeWaitEventSet(pageserver_conn_wes);
+				pageserver_conn_wes = NULL;

 				neon_log(ERROR, "could not complete handshake with pageserver: %s",
 						 msg);
@@ -117,33 +126,29 @@ pageserver_connect()
 * A wrapper around PQgetCopyData that checks for interrupts while sleeping.
 */
 static int
-call_PQgetCopyData(PGconn *conn, char **buffer)
+call_PQgetCopyData(char **buffer)
 {
 	int			ret;

 retry:
-	ret = PQgetCopyData(conn, buffer, 1 /* async */ );
+	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );

 	if (ret == 0)
 	{
-		int			wc;
+		WaitEvent	event;

 		/* Sleep until there's something to do */
-		wc = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_SOCKET_READABLE |
-							   WL_EXIT_ON_PM_DEATH,
-							   PQsocket(conn),
-							   -1L, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();

 		/* Data available in socket? */
-		if (wc & WL_SOCKET_READABLE)
+		if (event.events & WL_SOCKET_READABLE)
 		{
-			if (!PQconsumeInput(conn))
+			if (!PQconsumeInput(pageserver_conn))
 				neon_log(ERROR, "could not get response from pageserver: %s",
-						 PQerrorMessage(conn));
+						 PQerrorMessage(pageserver_conn));
 		}

 		goto retry;
@@ -172,6 +177,15 @@ pageserver_disconnect(void)

		prefetch_on_ps_disconnect();
	}
+	/*
+	 * Release the wait event set and clear the pointer, so that calling
+	 * pageserver_disconnect() again does not free it a second time.
+	 */
+	if (pageserver_conn_wes != NULL)
+	{
+		FreeWaitEventSet(pageserver_conn_wes);
+		pageserver_conn_wes = NULL;
+	}
 }

 static void
@@ -225,16 +232,30 @@ pageserver_receive(void)
	StringInfoData resp_buff;
	NeonResponse *resp;

+	/* not connected: there is no response to read */
+	if (!connected)
+		return NULL;
+
	PG_TRY();
	{
		/* read response */
-		resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data);
+		resp_buff.len = call_PQgetCopyData(&resp_buff.data);
		resp_buff.cursor = 0;

		if (resp_buff.len < 0)
		{
			if (resp_buff.len == -1)
-				neon_log(ERROR, "end of COPY");
+			{
+				/*
+				 * End of COPY: drop the connection and report "no response".
+				 *
+				 * NOTE(review): this returns from inside PG_TRY() without
+				 * reaching PG_END_TRY(), so the saved exception stack is
+				 * presumably not restored. TODO: leave the block normally
+				 * (e.g. set the result to NULL) instead of returning here.
+				 */
+				pageserver_disconnect();
+				return NULL;
+			}
 			else if (resp_buff.len == -2)
 				neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
 		}
@@ -436,9 +449,22 @@ pg_init_libpagestore(void)
 							NULL,
 							&flush_every_n_requests,
 							8, -1, INT_MAX,
-							PGC_SIGHUP,
+							PGC_USERSET,
 							0,	/* no flags required */
 							NULL, NULL, NULL);
+	DefineCustomIntVariable("neon.readahead_buffer_size",
+							"number of prefetches to buffer",
+							"This buffer is used to store prefetched data; so "
+							"it is important that this buffer is at least as "
+							"large as the configured value of all tablespaces' "
+							"effective_io_concurrency and maintenance_io_concurrency, "
+							"your sessions' values of these, and the value for "
+							"seqscan_prefetch_buffers.",
+							&readahead_buffer_size,
+							128, 16, 1024,
+							PGC_USERSET,
+							0,	/* no flags required */
+							NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);

 	relsize_hash_init();

--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -49,6 +49,11 @@ typedef struct

 #define messageTag(m) (((const NeonMessage *)(m))->tag)

+#define NEON_TAG "[NEON_SMGR] "
+#define neon_log(tag, fmt, ...) ereport(tag,                                  \
+										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
+										 errhidestmt(true), errhidecontext(true)))
+
 /*
 * supertype of all the Neon*Request structs below
 *
@@ -150,6 +155,8 @@ extern void prefetch_on_ps_disconnect(void);
 extern page_server_api * page_server;

 extern char *page_server_connstring;
+extern int flush_every_n_requests;
+extern int readahead_buffer_size;
 extern bool seqscan_prefetch_enabled;
 extern int seqscan_prefetch_distance;
 extern char *neon_timeline;
@@ -159,6 +166,7 @@ extern int32 max_cluster_size;

 extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode);
 extern void smgr_init_neon(void);
+extern void readahead_buffer_resize(int newsize, void *extra);

 /* Neon storage manager functionality */

--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -116,10 +116,10 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 * 
 * Prefetch is performed locally by each backend.
 *
- * There can be up to READ_BUFFER_SIZE active IO requests registered at any
- * time. Requests using smgr_prefetch are sent to the pageserver, but we don't
- * wait on the response. Requests using smgr_read are either read from the
- * buffer, or (if that's not possible) we wait on the response to arrive -
+ * There can be up to readahead_buffer_size active IO requests registered at
+ * any time. Requests using smgr_prefetch are sent to the pageserver, but we
+ * don't wait on the response. Requests using smgr_read are either read from
+ * the buffer, or (if that's not possible) we wait on the response to arrive -
 * this also will allow us to receive other prefetched pages. 
 * Each request is immediately written to the output buffer of the pageserver
 * connection, but may not be flushed if smgr_prefetch is used: pageserver
@@ -136,15 +136,25 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 * the connection; the responses are stored for later use.
 *
 * NOTE: The current implementation of the prefetch system implements a ring
- * buffer of up to READ_BUFFER_SIZE requests. If there are more _read and
+ * buffer of up to readahead_buffer_size requests. If there are more _read and
 * _prefetch requests between the initial _prefetch and the _read of a buffer,
 * the prefetch request will have been dropped from this prefetch buffer, and
 * your prefetch was wasted.
 */

-/* Max amount of tracked buffer reads */
-#define READ_BUFFER_SIZE 128
-
+/*
+ * State machine:
+ *        
+ * not in hash : in hash
+ *             :
+ * UNUSED ------> REQUESTED --> RECEIVED
+ *   ^         :      |            |
+ *   |         :      v            |
+ *   |         : TAG_UNUSED        |
+ *   |         :      |            |
+ *   +----------------+------------+
+ *             :
+ */
 typedef enum PrefetchStatus {
 	PRFS_UNUSED = 0,	/* unused slot */
 	PRFS_REQUESTED,		/* request was written to the sendbuffer to PS, but not
@@ -192,7 +202,7 @@ typedef struct PrfHashEntry {
 * It maintains a (ring) buffer of in-flight requests and responses.
 * 
 * We maintain several indexes into the ring buffer:
- * ring_unused >= ring_receive >= ring_last >= 0
+ * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
 * 
 * ring_unused points to the first unused slot of the buffer
 * ring_receive is the next request that is to be received
@@ -208,6 +218,7 @@ typedef struct PrefetchState {

 	/* buffer indexes */
 	uint64	ring_unused;		/* first unused slot */
+	uint64	ring_flush;			/* next request to flush */
 	uint64	ring_receive;		/* next slot that is to receive a response */
 	uint64	ring_last;			/* min slot with a response value */

@@ -218,11 +229,19 @@ typedef struct PrefetchState {

 	/* the buffers */
 	prfh_hash *prf_hash;
-	PrefetchRequest prf_buffer[READ_BUFFER_SIZE]; /* prefetch buffers */
+	PrefetchRequest prf_buffer[]; /* prefetch buffers */
 } PrefetchState;

 PrefetchState *MyPState;

+#define GetPrfSlot(ring_index) ( \
+	( \
+		AssertMacro((ring_index) < MyPState->ring_unused && \
+					(ring_index) >= MyPState->ring_last), \
+		&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+	) \
+)
+
 int			n_prefetch_hits = 0;
 int			n_prefetch_misses = 0;
 int			n_prefetch_missed_caches = 0;
@@ -232,18 +251,116 @@ XLogRecPtr	prefetch_lsn = 0;

 static void consume_prefetch_responses(void);
 static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
-static void prefetch_read(PrefetchRequest *slot);
+static bool prefetch_read(PrefetchRequest *slot);
 static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
-static void prefetch_wait_for(uint64 ring_index);
+static bool prefetch_wait_for(uint64 ring_index);
 static void prefetch_cleanup(void);
-static inline void prefetch_set_unused(uint64 ring_index, bool hash_cleanup);
+static inline void prefetch_set_unused(uint64 ring_index);

 static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode,
 									   ForkNumber forknum, BlockNumber blkno);

+/*
+ * Resize the prefetch ring to `newsize` slots: the newest live requests are
+ * compacted into the tail of a new PrefetchState, older ones are released.
+ * NOTE(review): presumably a GUC assign hook, so readahead_buffer_size (used
+ * by GetPrfSlot) still describes the OLD ring here; `extra` is unused.
+ */
+void
+readahead_buffer_resize(int newsize, void *extra)
+{
+	uint64		end, nfree = newsize;
+	PrefetchState *newPState;
+	/* the new state must be sized for newsize slots, not the old ring size */
+	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) +
+		(sizeof(PrefetchRequest) * newsize);
+
+	/* don't try to re-initialize if we haven't initialized yet */
+	if (MyPState == NULL)
+		return;
+
+	/*
+	 * Make sure that we don't lose track of active prefetch requests by
+	 * ensuring we have received all but the last n requests (n = newsize).
+	 */
+	if (MyPState->n_requests_inflight > newsize)
+		prefetch_wait_for(MyPState->ring_unused - newsize);
+
+	/* construct the new PrefetchState, and copy over the memory contexts */
+	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
+	newPState->bufctx = MyPState->bufctx;
+	newPState->errctx = MyPState->errctx;
+	newPState->hashctx = MyPState->hashctx;
+	newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL);
+	newPState->n_unused = newsize;
+	newPState->n_requests_inflight = 0;
+	newPState->n_responses_buffered = 0;
+	newPState->ring_last = newsize;
+	newPState->ring_unused = newsize;
+	newPState->ring_receive = newsize;
+	/* NOTE(review): treats every copied request as already flushed; verify */
+	newPState->ring_flush = newsize;
+
+	/*
+	 * Copy over the prefetches, newest first, filling the new array from its
+	 * end. This retains the most recent prefetches, needs only one pass over
+	 * the old ring, and compacts away unused slots for free.
+	 */
+	for (end = MyPState->ring_unused - 1;
+		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
+		 end -= 1)
+	{
+		PrefetchRequest *slot = GetPrfSlot(end);
+		PrefetchRequest *newslot;
+		bool	found;
+
+		if (slot->status == PRFS_UNUSED)
+			continue;
+
+		nfree -= 1;
+		newslot = &newPState->prf_buffer[nfree];
+		*newslot = *slot;
+		newslot->my_ring_index = nfree;
+
+		prfh_insert(newPState->prf_hash, newslot, &found);
+		Assert(!found);
+
+		switch (newslot->status)
+		{
+			case PRFS_UNUSED:
+				pg_unreachable();
+			case PRFS_REQUESTED:
+				newPState->n_requests_inflight += 1;
+				newPState->ring_receive -= 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_RECEIVED:
+				newPState->n_responses_buffered += 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_TAG_REMAINS:
+				newPState->ring_last -= 1;
+				break;
+		}
+		newPState->n_unused -= 1;
+	}
+
+	/* release whatever did not fit into the new ring */
+	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
+		prefetch_set_unused(end);
+
+	prfh_destroy(MyPState->prf_hash);
+	pfree(MyPState);
+	MyPState = newPState;
+}
+
+

 /*
 * Make sure that there are no responses still in the buffer.
+ *
+ * NOTE: this function may indirectly update MyPState->prf_hash, which
+ * invalidates any active pointers into the hash table.
 */
 static void
 consume_prefetch_responses(void)
@@ -255,14 +372,12 @@ consume_prefetch_responses(void)
 static void
 prefetch_cleanup(void)
 {
-	int		index;
 	uint64	ring_index;
 	PrefetchRequest *slot;

 	while (MyPState->ring_last < MyPState->ring_receive) {
 		ring_index = MyPState->ring_last;
-		index = (ring_index % READ_BUFFER_SIZE);
-		slot = &MyPState->prf_buffer[index];
+		slot = GetPrfSlot(ring_index);

 		if (slot->status == PRFS_UNUSED)
 			MyPState->ring_last += 1;
@@ -274,23 +389,33 @@ prefetch_cleanup(void)
 /*
 * Wait for slot of ring_index to have received its response.
 * The caller is responsible for making sure the request buffer is flushed.
+ * 
+ * NOTE: this function may indirectly update MyPState->prf_hash, which
+ * invalidates any active pointers into the hash table.
 */
-static void
+static bool
 prefetch_wait_for(uint64 ring_index)
 {
-	int index;
 	PrefetchRequest *entry;

+	if (MyPState->ring_flush <= ring_index &&
+		MyPState->ring_unused > MyPState->ring_flush)
+	{
+		page_server->flush();
+		MyPState->ring_flush = MyPState->ring_unused;
+	}
+
 	Assert(MyPState->ring_unused > ring_index);

 	while (MyPState->ring_receive <= ring_index)
 	{
-		index = (MyPState->ring_receive % READ_BUFFER_SIZE);
-		entry = &MyPState->prf_buffer[index];
+		entry = GetPrfSlot(MyPState->ring_receive);

 		Assert(entry->status == PRFS_REQUESTED);
-		prefetch_read(entry);
+		if (!prefetch_read(entry))
+			return false;
 	}
+	return true;
 }

 /*
@@ -298,8 +423,11 @@ prefetch_wait_for(uint64 ring_index)
 * 
 * The caller is responsible for making sure that the request for this buffer
 * was flushed to the PageServer.
+ *
+ * NOTE: this function may indirectly update MyPState->prf_hash, which
+ * invalidates any active pointers into the hash table.
 */
-static void
+static bool
 prefetch_read(PrefetchRequest *slot)
 {
 	NeonResponse *response;
@@ -312,15 +440,22 @@ prefetch_read(PrefetchRequest *slot)
 	old = MemoryContextSwitchTo(MyPState->errctx);
 	response = (NeonResponse *) page_server->receive();
 	MemoryContextSwitchTo(old);
+	if (response)
+	{
+		/* update prefetch state */
+		MyPState->n_responses_buffered += 1;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;

-	/* update prefetch state */
-	MyPState->n_responses_buffered += 1;
-	MyPState->n_requests_inflight -= 1;
-	MyPState->ring_receive += 1;
-
-	/* update slot state */
-	slot->status = PRFS_RECEIVED;
-	slot->response = response;
+		/* update slot state */
+		slot->status = PRFS_RECEIVED;
+		slot->response = response;
+		return true;
+	}
+	else
+	{
+		return false;
+	}
 }

 /*
@@ -332,19 +467,22 @@ prefetch_read(PrefetchRequest *slot)
 void
 prefetch_on_ps_disconnect(void)
 {
-	for (; MyPState->ring_receive < MyPState->ring_unused; MyPState->ring_receive++)
+	MyPState->ring_flush = MyPState->ring_unused;
+	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
-		int		index = MyPState->ring_receive % READ_BUFFER_SIZE;
+		uint64 ring_index = MyPState->ring_receive;
+
+		slot = GetPrfSlot(ring_index);

-		slot = &MyPState->prf_buffer[index];
 		Assert(slot->status == PRFS_REQUESTED);
-		Assert(slot->my_ring_index == MyPState->ring_receive);
+		Assert(slot->my_ring_index == ring_index);

 		/* clean up the request */
 		slot->status = PRFS_TAG_REMAINS;
-		MyPState->n_requests_inflight--;
-		prefetch_set_unused(MyPState->ring_receive, true);
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+		prefetch_set_unused(ring_index);
 	}
 }

@@ -353,21 +491,24 @@ prefetch_on_ps_disconnect(void)
 *
 * The slot at ring_index must be a current member of the ring buffer,
 * and may not be in the PRFS_REQUESTED state.
+ *
+ * NOTE: this function will update MyPState->prf_hash, which invalidates any
+ * active pointers into the hash table.
 */
 static inline void
-prefetch_set_unused(uint64 ring_index, bool hash_cleanup)
+prefetch_set_unused(uint64 ring_index)
 {
-	PrefetchRequest *slot = &MyPState->prf_buffer[ring_index % READ_BUFFER_SIZE];
+	PrefetchRequest *slot = GetPrfSlot(ring_index);

-	Assert(MyPState->ring_last <= ring_index &&
-		   MyPState->ring_unused > ring_index);
+	if (ring_index < MyPState->ring_last)
+		return; /* Should already be unused */
+
+	Assert(MyPState->ring_unused > ring_index);

 	if (slot->status == PRFS_UNUSED)
 		return;

 	Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);
-	Assert(ring_index >= MyPState->ring_last &&
-		   ring_index < MyPState->ring_unused);

 	if (slot->status == PRFS_RECEIVED)
 	{
@@ -382,8 +523,7 @@ prefetch_set_unused(uint64 ring_index, bool hash_cleanup)
 		Assert(slot->response == NULL);
 	}

-	if (hash_cleanup)
-		prfh_delete(MyPState->prf_hash, slot);
+	prfh_delete(MyPState->prf_hash, slot);

 	/* clear all fields */
 	MemSet(slot, 0, sizeof(PrefetchRequest));
@@ -397,6 +537,7 @@ prefetch_set_unused(uint64 ring_index, bool hash_cleanup)
 static void
 prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
 {
+	bool found;
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		.req.latest = false,
@@ -454,6 +595,9 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
+
+	prfh_insert(MyPState->prf_hash, slot, &found);
+	Assert(!found);
 }

 /*
@@ -464,13 +608,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 * If force_latest and force_lsn are not NULL, those values are sent to the
 * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
 * to fill in these values manually.
+ *
+ * NOTE: this function may indirectly update MyPState->prf_hash, which
+ * invalidates any active pointers into the hash table.
 */

 static uint64
 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
 {
-	int		index;
-	bool	found;
 	uint64	ring_index;
 	PrefetchRequest req;
 	PrefetchRequest *slot;
@@ -485,28 +630,49 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	{
 		slot = entry->slot;
 		ring_index = slot->my_ring_index;
-		index = (ring_index % READ_BUFFER_SIZE);
-		Assert(slot == &MyPState->prf_buffer[index]);
+		Assert(slot == GetPrfSlot(ring_index));

 		Assert(slot->status != PRFS_UNUSED);
+		Assert(MyPState->ring_last <= ring_index &&
+			   ring_index < MyPState->ring_unused);
 		Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
-		
+
 		/*
 		 * If we want a specific lsn, we do not accept requests that were made
 		 * with a potentially different LSN.
 		 */
-		if (force_lsn && slot->effective_request_lsn != *force_lsn)
+		if (force_latest && force_lsn)
 		{
-			prefetch_wait_for(ring_index);
-			prefetch_set_unused(ring_index, true);
+			/* if we want the latest version, any effective_request_lsn < request lsn is OK */
+			if (*force_latest)
+			{
+				if (*force_lsn > slot->effective_request_lsn)
+				{
+					prefetch_wait_for(ring_index);
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+				}
+			}
+			/* if we don't want the latest version, only accept requests with the exact same LSN */
+			else
+			{
+				if (*force_lsn != slot->effective_request_lsn)
+				{
+					prefetch_wait_for(ring_index);
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+				}
+			}
 		}
+
 		/*
 		 * We received a prefetch for a page that was recently read and
 		 * removed from the buffers. Remove that request from the buffers.
 		 */
 		else if (slot->status == PRFS_TAG_REMAINS)
 		{
-			prefetch_set_unused(ring_index, true);
+			prefetch_set_unused(ring_index);
+			entry = NULL;
 		}
 		else
 		{
@@ -529,9 +695,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	 * output buffer, and 'not sending' a prefetch request kind of goes
 	 * against the principles of prefetching)
 	 */
-	if (MyPState->ring_last + READ_BUFFER_SIZE - 1 == MyPState->ring_unused)
+	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
 	{
-		slot = &MyPState->prf_buffer[(MyPState->ring_last % READ_BUFFER_SIZE)];
+		uint64 cleanup_index = MyPState->ring_last;
+		slot = GetPrfSlot(cleanup_index);

 		Assert(slot->status != PRFS_UNUSED);

@@ -539,13 +706,13 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 		switch (slot->status)
 		{
 			case PRFS_REQUESTED:
-				Assert(MyPState->ring_receive == MyPState->ring_last);
-				prefetch_wait_for(MyPState->ring_last);
-				prefetch_set_unused(MyPState->ring_last, true);
+				Assert(MyPState->ring_receive == cleanup_index);
+				prefetch_wait_for(cleanup_index);
+				prefetch_set_unused(cleanup_index);
 				break;
 			case PRFS_RECEIVED:
 			case PRFS_TAG_REMAINS:
-				prefetch_set_unused(MyPState->ring_last, true);
+				prefetch_set_unused(cleanup_index);
 				break;
 			default:
 				pg_unreachable();
@@ -553,12 +720,11 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	}

 	/*
-	 * The next buffer pointed to by `ring_unused` is now unused, so we can insert
-	 * the new request to it.
+	 * The next buffer pointed to by `ring_unused` is now definitely empty,
+	 * so we can insert the new request to it.
 	 */
 	ring_index = MyPState->ring_unused;
-	index = (ring_index % READ_BUFFER_SIZE);
-	slot = &MyPState->prf_buffer[index];
+	slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];

 	Assert(MyPState->ring_last <= ring_index);

@@ -571,22 +737,34 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	slot->buftag = tag;
 	slot->my_ring_index = ring_index;

-	prfh_insert(MyPState->prf_hash, slot, &found);
-	Assert(!found);
-
 	prefetch_do_request(slot, force_latest, force_lsn);
 	Assert(slot->status == PRFS_REQUESTED);
-	Assert(ring_index < MyPState->ring_unused);
+	Assert(MyPState->ring_last <= ring_index &&
+		   ring_index < MyPState->ring_unused);
+
+	if (flush_every_n_requests > 0 &&
+		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
+	{
+		page_server->flush();
+		MyPState->ring_flush = MyPState->ring_unused;
+	}
+
 	return ring_index;
 }

 static NeonResponse *
 page_server_request(void const *req)
 {
-	page_server->send((NeonRequest *) req);
-	page_server->flush();
-	consume_prefetch_responses();
-	return page_server->receive();
+	NeonResponse* resp;
+	do {
+		page_server->send((NeonRequest *) req);
+		page_server->flush();
+		MyPState->ring_flush = MyPState->ring_unused;
+		consume_prefetch_responses();
+		resp = page_server->receive();
+	} while (resp == NULL);
+	return resp;
+
 }


@@ -1052,14 +1230,18 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 void
 neon_init(void)
 {
-	HASHCTL info;
+	Size prfs_size;

 	if (MyPState != NULL)
 		return;

-	MyPState = MemoryContextAllocZero(TopMemoryContext, sizeof(PrefetchState));
+	prfs_size = offsetof(PrefetchState, prf_buffer) + (
+		sizeof(PrefetchRequest) * readahead_buffer_size
+	);
+
+	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
 	
-	MyPState->n_unused = READ_BUFFER_SIZE;
+	MyPState->n_unused = readahead_buffer_size;

 	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
 										 "NeonSMGR/prefetch",
@@ -1072,11 +1254,8 @@ neon_init(void)
 											  "NeonSMGR/prefetch",
 											  ALLOCSET_DEFAULT_SIZES);

-	info.keysize = sizeof(BufferTag);
-	info.entrysize = sizeof(uint64);
-
 	MyPState->prf_hash = prfh_create(MyPState->hashctx,
-									 READ_BUFFER_SIZE, NULL);
+									 readahead_buffer_size, NULL);

 #ifdef DEBUG_COMPARE_LOCAL
 	mdinit();
@@ -1470,7 +1649,8 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
 bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
-	uint64 ring_index;
+	BufferTag	tag;
+	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;

 	switch (reln->smgr_relpersistence)
 	{
@@ -1486,7 +1666,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	BufferTag tag = (BufferTag) {
+	tag = (BufferTag) {
 		.rnode = reln->smgr_rnode.node,
 		.forkNum = forknum,
 		.blockNum = blocknum
@@ -1565,9 +1745,9 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,

 	if (entry != NULL)
 	{
-		if (entry->slot->effective_request_lsn >= prefetch_lsn)
+		slot = entry->slot;
+		if (slot->effective_request_lsn >= request_lsn)
 		{
-			slot = entry->slot;
 			ring_index = slot->my_ring_index;
 			n_prefetch_hits += 1;
 		}
@@ -1578,36 +1758,36 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 			 * unlikely this happens, but it can happen if prefetch distance is
 			 * large enough and a backend didn't consume all prefetch requests.
 			 */
-			if (entry->slot->status == PRFS_REQUESTED)
+			if (slot->status == PRFS_REQUESTED)
 			{
-				page_server->flush();
-				prefetch_wait_for(entry->slot->my_ring_index);
+				prefetch_wait_for(slot->my_ring_index);
 			}
 			/* drop caches */
-			prefetch_set_unused(entry->slot->my_ring_index, true);
+			prefetch_set_unused(slot->my_ring_index);
 			n_prefetch_missed_caches += 1;
 			/* make it look like a prefetch cache miss */
 			entry = NULL;
 		}
 	}

-	if (entry == NULL)
+	do
 	{
-		n_prefetch_misses += 1;
+		if (entry == NULL)
+		{
+			n_prefetch_misses += 1;

-		ring_index = prefetch_register_buffer(buftag, &request_latest,
-											  &request_lsn);
-		slot = &MyPState->prf_buffer[(ring_index % READ_BUFFER_SIZE)];
-	}
+			ring_index = prefetch_register_buffer(buftag, &request_latest,
+												  &request_lsn);
+			slot = GetPrfSlot(ring_index);
+		}

-	Assert(MyPState->ring_last <= ring_index &&
-		   MyPState->ring_unused > ring_index);
-	Assert(slot->my_ring_index == ring_index);
-	Assert(slot->status != PRFS_UNUSED);
-	Assert(&MyPState->prf_buffer[(ring_index % READ_BUFFER_SIZE)] == slot);
+		Assert(slot->my_ring_index == ring_index);
+		Assert(MyPState->ring_last <= ring_index &&
+			   MyPState->ring_unused > ring_index);
+		Assert(slot->status != PRFS_UNUSED);
+		Assert(GetPrfSlot(ring_index) == slot);

-	page_server->flush();
-	prefetch_wait_for(ring_index);
+	} while (!prefetch_wait_for(ring_index));

 	Assert(slot->status == PRFS_RECEIVED);

@@ -1637,7 +1817,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	}

 	/* buffer was used, clean up for later reuse */
-	prefetch_set_unused(ring_index, true);
+	prefetch_set_unused(ring_index);
 	prefetch_cleanup();
 }

--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -75,7 +75,7 @@ static bool syncSafekeepers = false;

 char	   *wal_acceptors_list;
 int			wal_acceptor_reconnect_timeout;
-int			wal_acceptor_connect_timeout;
+int			wal_acceptor_connection_timeout;
 bool		am_wal_proposer;

 char	   *neon_timeline_walproposer = NULL;
@@ -119,6 +119,7 @@ static TimestampTz last_reconnect_attempt;
 static WalproposerShmemState * walprop_shared;

 /* Prototypes for private functions */
+static void WalProposerRegister(void);
 static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId);
 static void WalProposerStart(void);
 static void WalProposerLoop(void);
@@ -266,9 +267,9 @@ nwp_register_gucs(void)

 	DefineCustomIntVariable(
 							"neon.safekeeper_connect_timeout",
-							"Timeout after which give up connection attempt to safekeeper.",
+							"Timeout for connection establishement and it's maintenance against safekeeper",
 							NULL,
-							&wal_acceptor_connect_timeout,
+							&wal_acceptor_connection_timeout,
 							5000, 0, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_MS,
@@ -417,7 +418,9 @@ WalProposerPoll(void)
 			ResetLatch(MyLatch);
 			break;
 		}
-		if (rc == 0)			/* timeout expired: poll state */
+
+		now = GetCurrentTimestamp();
+		if (rc == 0 || TimeToReconnect(now) <= 0)			/* timeout expired: poll state */
 		{
 			TimestampTz now;

@@ -438,13 +441,11 @@ WalProposerPoll(void)
 			{
 				Safekeeper *sk = &safekeeper[i];

-				if ((sk->state == SS_CONNECTING_WRITE ||
-					 sk->state == SS_CONNECTING_READ) &&
-					TimestampDifferenceExceeds(sk->startedConnAt, now,
-											   wal_acceptor_connect_timeout))
+				if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
+											   wal_acceptor_connection_timeout))
 				{
-					elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms",
-						 sk->host, sk->port, wal_acceptor_connect_timeout);
+					elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
+						 sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
 					ShutdownConnection(sk);
 				}
 			}
@@ -455,7 +456,7 @@ WalProposerPoll(void)
 /*
 * Register a background worker proposing WAL to wal acceptors.
 */
-void
+static void
 WalProposerRegister(void)
 {
 	BackgroundWorker bgw;
@@ -760,7 +761,7 @@ ResetConnection(Safekeeper *sk)
 	elog(LOG, "connecting with node %s:%s", sk->host, sk->port);

 	sk->state = SS_CONNECTING_WRITE;
-	sk->startedConnAt = GetCurrentTimestamp();
+	sk->latestMsgReceivedAt = GetCurrentTimestamp();

 	sock = walprop_socket(sk->conn);
 	sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
@@ -918,7 +919,7 @@ HandleConnectionEvent(Safekeeper *sk)
 		case WP_CONN_POLLING_OK:
 			elog(LOG, "connected with node %s:%s", sk->host,
 				 sk->port);
-
+			sk->latestMsgReceivedAt = GetCurrentTimestamp();
 			/*
 			 * We have to pick some event to update event set. We'll
 			 * eventually need the socket to be readable, so we go with that.
@@ -2304,7 +2305,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
 		ResetConnection(sk);
 		return false;
 	}
-
+	sk->latestMsgReceivedAt = GetCurrentTimestamp();
 	switch (tag)
 	{
 		case 'g':
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -30,7 +30,7 @@

 extern char *wal_acceptors_list;
 extern int	wal_acceptor_reconnect_timeout;
-extern int	wal_acceptor_connect_timeout;
+extern int	wal_acceptor_connection_timeout;
 extern bool am_wal_proposer;

 struct WalProposerConn;			/* Defined in libpqwalproposer */
@@ -371,24 +371,24 @@ typedef struct Safekeeper
 	int			eventPos;		/* position in wait event set. Equal to -1 if*
 								 * no event */
 	SafekeeperState state;		/* safekeeper state machine state */
-	TimestampTz startedConnAt;	/* when connection attempt started */
+	TimestampTz latestMsgReceivedAt;        /* when latest msg is received */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
 	AppendResponse appendResponse;	/* feedback for master */
 } Safekeeper;

-extern PGDLLIMPORT void WalProposerMain(Datum main_arg);
-void		WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
-void		WalProposerPoll(void);
-void		WalProposerRegister(void);
-void		ParseReplicationFeedbackMessage(StringInfo reply_message,
-											ReplicationFeedback * rf);
+extern void WalProposerSync(int argc, char *argv[]);
+extern void WalProposerMain(Datum main_arg);
+extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
+extern void WalProposerPoll(void);
+extern void ParseReplicationFeedbackMessage(StringInfo reply_message,
+											ReplicationFeedback *rf);
 extern void StartProposerReplication(StartReplicationCmd *cmd);

-Size		WalproposerShmemSize(void);
-bool		WalproposerShmemInit(void);
-void		replication_feedback_set(ReplicationFeedback * rf);
-void		replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
+extern Size WalproposerShmemSize(void);
+extern bool WalproposerShmemInit(void);
+extern void replication_feedback_set(ReplicationFeedback *rf);
+extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

 /* libpqwalproposer hooks & helper type */

--- a/poetry.lock
+++ b/poetry.lock
@@ -1077,6 +1077,17 @@ python-versions = ">=3.6"
 [package.extras]
 twisted = ["twisted"]

+[[package]]
+name = "psutil"
+version = "5.9.4"
+description = "Cross-platform lib for process and system monitoring in Python."
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[package.extras]
+test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
+
 [[package]]
 name = "psycopg2-binary"
 version = "2.9.3"
@@ -1207,18 +1218,6 @@ pytest = ">=6.1.0"
 [package.extras]
 testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]

-[[package]]
-name = "pytest-forked"
-version = "1.4.0"
-description = "run tests in isolated forked subprocesses"
-category = "main"
-optional = false
-python-versions = ">=3.6"
-
-[package.dependencies]
-py = "*"
-pytest = ">=3.10"
-
 [[package]]
 name = "pytest-lazy-fixture"
 version = "0.6.3"
@@ -1240,8 +1239,8 @@ python-versions = ">=3.6"

 [package.dependencies]
 pytest = [
-    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
    {version = ">=5.0", markers = "python_version < \"3.10\""},
+    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
 ]

 [[package]]
@@ -1257,7 +1256,7 @@ pytest = ">=5.0.0"

 [[package]]
 name = "pytest-xdist"
-version = "2.5.0"
+version = "3.0.2"
 description = "pytest xdist plugin for distributed testing and loop-on-failing modes"
 category = "main"
 optional = false
@@ -1266,7 +1265,6 @@ python-versions = ">=3.6"
 [package.dependencies]
 execnet = ">=1.1"
 pytest = ">=6.2.0"
-pytest-forked = "*"

 [package.extras]
 psutil = ["psutil (>=3.0)"]
@@ -1449,6 +1447,14 @@ category = "dev"
 optional = false
 python-versions = ">=3.7"

+[[package]]
+name = "types-psutil"
+version = "5.9.5.4"
+description = "Typing stubs for psutil"
+category = "main"
+optional = false
+python-versions = "*"
+
 [[package]]
 name = "types-psycopg2"
 version = "2.9.18"
@@ -1568,7 +1574,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "9352a89d49d34807f6a58f6c3f898acbd8cf3570e0f45ede973673644bde4d0e"
+content-hash = "c95c184fccaf40815405ad616ec1c55869c7f87b72777cc3a9cbaff41de98977"

 [metadata.files]
 aiopg = [
@@ -1979,9 +1985,26 @@ prometheus-client = [
    {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"},
    {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"},
 ]
+psutil = [
+    {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"},
+    {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"},
+    {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ff89f9b835100a825b14c2808a106b6fdcc4b15483141482a12c725e7f78549"},
+    {file = "psutil-5.9.4-cp27-cp27m-win32.whl", hash = "sha256:852dd5d9f8a47169fe62fd4a971aa07859476c2ba22c2254d4a1baa4e10b95ad"},
+    {file = "psutil-5.9.4-cp27-cp27m-win_amd64.whl", hash = "sha256:9120cd39dca5c5e1c54b59a41d205023d436799b1c8c4d3ff71af18535728e94"},
+    {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6b92c532979bafc2df23ddc785ed116fced1f492ad90a6830cf24f4d1ea27d24"},
+    {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:efeae04f9516907be44904cc7ce08defb6b665128992a56957abc9b61dca94b7"},
+    {file = "psutil-5.9.4-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7"},
+    {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1"},
+    {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08"},
+    {file = "psutil-5.9.4-cp36-abi3-win32.whl", hash = "sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff"},
+    {file = "psutil-5.9.4-cp36-abi3-win_amd64.whl", hash = "sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4"},
+    {file = "psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e"},
+    {file = "psutil-5.9.4.tar.gz", hash = "sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62"},
+]
 psycopg2-binary = [
    {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
+    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
@@ -2015,6 +2038,7 @@ psycopg2-binary = [
    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
+    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
@@ -2026,6 +2050,7 @@ psycopg2-binary = [
    {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
+    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
@@ -2042,18 +2067,7 @@ py = [
    {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
 ]
 pyasn1 = [
-    {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
-    {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
-    {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
-    {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
    {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
-    {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
-    {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
-    {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
-    {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
-    {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
-    {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
-    {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
    {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
 ]
 pycodestyle = [
@@ -2111,10 +2125,6 @@ pytest-asyncio = [
    {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"},
    {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"},
 ]
-pytest-forked = [
-    {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"},
-    {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"},
-]
 pytest-lazy-fixture = [
    {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"},
    {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"},
@@ -2128,8 +2138,8 @@ pytest-timeout = [
    {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"},
 ]
 pytest-xdist = [
-    {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"},
-    {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"},
+    {file = "pytest-xdist-3.0.2.tar.gz", hash = "sha256:688da9b814370e891ba5de650c9327d1a9d861721a524eb917e620eec3e90291"},
+    {file = "pytest_xdist-3.0.2-py3-none-any.whl", hash = "sha256:9feb9a18e1790696ea23e1434fa73b325ed4998b0e9fcb221f16fd1945e6df1b"},
 ]
 python-dateutil = [
    {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
@@ -2163,6 +2173,13 @@ pyyaml = [
    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
    {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
    {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
+    {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
+    {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
    {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
@@ -2230,6 +2247,10 @@ tomli = [
    {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
    {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
 ]
+types-psutil = [
+    {file = "types-psutil-5.9.5.4.tar.gz", hash = "sha256:aa09102b80c65a3b4573216614372398dab78972d650488eaff1ff05482cc18f"},
+    {file = "types_psutil-5.9.5.4-py3-none-any.whl", hash = "sha256:28e59764630187e462d43788efa16d59d5e77b510115f9e25901b2d4007fca62"},
+]
 types-psycopg2 = [
    {file = "types-psycopg2-2.9.18.tar.gz", hash = "sha256:9b0e9e1f097b15cd9fa8aad2596a9e3082fd72f8d9cfe52b190cfa709105b6c0"},
    {file = "types_psycopg2-2.9.18-py3-none-any.whl", hash = "sha256:14c779dcab18c31453fa1cad3cf4b1601d33540a344adead3c47a6b8091cd2fa"},
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -1,7 +1,7 @@
 //! Client authentication mechanisms.

 pub mod backend;
-pub use backend::{BackendType, ConsoleReqExtra, DatabaseInfo};
+pub use backend::{BackendType, ConsoleReqExtra};

 mod credentials;
 pub use credentials::ClientCredentials;
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -12,7 +12,6 @@ use crate::{
    waiters::{self, Waiter, Waiters},
 };
 use once_cell::sync::Lazy;
-use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};
@@ -36,45 +35,6 @@ pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), wait
    CPLANE_WAITERS.notify(psql_session_id, msg)
 }

-/// Compute node connection params provided by the cloud.
-/// Note how it implements serde traits, since we receive it over the wire.
-#[derive(Serialize, Deserialize, Default)]
-pub struct DatabaseInfo {
-    pub host: String,
-    pub port: u16,
-    pub dbname: String,
-    pub user: String,
-    pub password: Option<String>,
-}
-
-// Manually implement debug to omit personal and sensitive info.
-impl std::fmt::Debug for DatabaseInfo {
-    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
-        fmt.debug_struct("DatabaseInfo")
-            .field("host", &self.host)
-            .field("port", &self.port)
-            .finish_non_exhaustive()
-    }
-}
-
-impl From<DatabaseInfo> for tokio_postgres::Config {
-    fn from(db_info: DatabaseInfo) -> Self {
-        let mut config = tokio_postgres::Config::new();
-
-        config
-            .host(&db_info.host)
-            .port(db_info.port)
-            .dbname(&db_info.dbname)
-            .user(&db_info.user);
-
-        if let Some(password) = db_info.password {
-            config.password(password);
-        }
-
-        config
-    }
-}
-
 /// Extra query params we'd like to pass to the console.
 pub struct ConsoleReqExtra<'a> {
    /// A unique identifier for a connection.
@@ -158,54 +118,107 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
    }
 }

+/// The outcome of a successful authentication, wrapping the produced value.
+pub struct AuthSuccess<T> {
+    /// Whether [`pq_proto::BeMessage::AuthenticationOk`] has already been sent to the client.
+    pub reported_auth_ok: bool,
+    /// The value produced by the auth flow (e.g. compute node connection info).
+    pub value: T,
+}
+
+impl<T> AuthSuccess<T> {
+    /// Very similar to [`std::option::Option::map`].
+    /// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying `f`
+    /// to the contained value; `reported_auth_ok` is carried over unchanged.
+    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
+        AuthSuccess {
+            reported_auth_ok: self.reported_auth_ok,
+            value: f(self.value),
+        }
+    }
+}
+
+/// Info for establishing a connection to a compute node.
+/// This is what we get after auth succeeded, but not before!
+pub struct NodeInfo {
+    /// Project name: from [`auth::ClientCredentials`], the password hack payload, or the console's reply.
+    pub project: String,
+    /// Compute node connection params.
+    pub config: compute::ConnCfg,
+}
+
 impl BackendType<'_, ClientCredentials<'_>> {
+    /// Run the password hack auth flow if `creds` lack a `project`; in any other case return `Ok(None)`.
+    async fn try_password_hack(
+        &mut self,
+        extra: &ConsoleReqExtra<'_>,
+        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
+    ) -> auth::Result<Option<AuthSuccess<NodeInfo>>> {
+        use BackendType::*;
+
+        // No project so far means the client doesn't support SNI or other means of
+        // passing the project name, so we expect a very specific payload in place of
+        // the password. The future is lazy: it only runs in the arms that await it.
+        let fetch_magic_payload = async {
+            warn!("project name not specified, resorting to the password hack auth flow");
+            let payload = AuthFlow::new(client)
+                .begin(auth::PasswordHack)
+                .await?
+                .authenticate()
+                .await?;
+
+            info!(project = &payload.project, "received missing parameter");
+            auth::Result::Ok(payload)
+        };
+
+        // TODO: find a proper way to merge those very similar blocks.
+        let (mut config, payload) = match self {
+            Console(endpoint, creds) if creds.project.is_none() => {
+                let payload = fetch_magic_payload.await?;
+
+                let mut creds = creds.as_ref();
+                creds.project = Some(payload.project.as_str().into());
+                let config = console::Api::new(endpoint, extra, &creds)
+                    .wake_compute()
+                    .await?;
+
+                (config, payload)
+            }
+            Postgres(endpoint, creds) if creds.project.is_none() => {
+                let payload = fetch_magic_payload.await?;
+
+                let mut creds = creds.as_ref();
+                creds.project = Some(payload.project.as_str().into());
+                let config = postgres::Api::new(endpoint, &creds).wake_compute().await?;
+
+                (config, payload)
+            }
+            _ => return Ok(None), // project is already known, or this is the `Link` backend
+        };
+
+        config.password(payload.password); // the password comes from the payload as well
+        Ok(Some(AuthSuccess {
+            reported_auth_ok: false,
+            value: NodeInfo {
+                project: payload.project,
+                config,
+            },
+        }))
+    }
+
    /// Authenticate the client via the requested backend, possibly using credentials.
    pub async fn authenticate(
        mut self,
        extra: &ConsoleReqExtra<'_>,
        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
-    ) -> super::Result<compute::NodeInfo> {
+    ) -> auth::Result<AuthSuccess<NodeInfo>> {
        use BackendType::*;

-        if let Console(_, creds) | Postgres(_, creds) = &mut self {
-            // If there's no project so far, that entails that client doesn't
-            // support SNI or other means of passing the project name.
-            // We now expect to see a very specific payload in the place of password.
-            if creds.project().is_none() {
-                warn!("project name not specified, resorting to the password hack auth flow");
-
-                let payload = AuthFlow::new(client)
-                    .begin(auth::PasswordHack)
-                    .await?
-                    .authenticate()
-                    .await?;
-
-                // Finally we may finish the initialization of `creds`.
-                // TODO: add missing type safety to ClientCredentials.
-                info!(project = &payload.project, "received missing parameter");
-                creds.project = Some(payload.project.into());
-
-                let mut config = match &self {
-                    Console(endpoint, creds) => {
-                        console::Api::new(endpoint, extra, creds)
-                            .wake_compute()
-                            .await?
-                    }
-                    Postgres(endpoint, creds) => {
-                        postgres::Api::new(endpoint, creds).wake_compute().await?
-                    }
-                    _ => unreachable!("see the patterns above"),
-                };
-
-                // We should use a password from payload as well.
-                config.password(payload.password);
-
-                info!("user successfully authenticated (using the password hack)");
-                return Ok(compute::NodeInfo {
-                    reported_auth_ok: false,
-                    config,
-                });
-            }
+        // Handle cases when `project` is missing in `creds`.
+        // TODO: type safety: return `creds` with irrefutable `project`.
+        if let Some(res) = self.try_password_hack(extra, client).await? {
+            info!("user successfully authenticated (using the password hack)");
+            return Ok(res);
        }

        let res = match self {
@@ -215,22 +228,34 @@ impl BackendType<'_, ClientCredentials<'_>> {
                    project = creds.project(),
                    "performing authentication using the console"
                );
+
+                assert!(creds.project.is_some());
                console::Api::new(&endpoint, extra, &creds)
                    .handle_user(client)
-                    .await
+                    .await?
+                    .map(|config| NodeInfo {
+                        project: creds.project.unwrap().into_owned(),
+                        config,
+                    })
            }
            Postgres(endpoint, creds) => {
                info!("performing mock authentication using a local postgres instance");
+
+                assert!(creds.project.is_some());
                postgres::Api::new(&endpoint, &creds)
                    .handle_user(client)
-                    .await
+                    .await?
+                    .map(|config| NodeInfo {
+                        project: creds.project.unwrap().into_owned(),
+                        config,
+                    })
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
                info!("performing link authentication");
-                link::handle_user(&url, client).await
+                link::handle_user(&url, client).await?
            }
-        }?;
+        };

        info!("user successfully authenticated");
        Ok(res)
--- a/proxy/src/auth/backend/console.rs
+++ b/proxy/src/auth/backend/console.rs
@@ -1,9 +1,9 @@
 //! Cloud API V2.

-use super::ConsoleReqExtra;
+use super::{AuthSuccess, ConsoleReqExtra};
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
-    compute::{self, ComputeConnCfg},
+    compute,
    error::{io_error, UserFacingError},
    http, scram,
    stream::PqStream,
@@ -128,7 +128,7 @@ impl<'a> Api<'a> {
    pub(super) async fn handle_user(
        self,
        client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
-    ) -> auth::Result<compute::NodeInfo> {
+    ) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
        handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
    }

@@ -164,7 +164,7 @@ impl<'a> Api<'a> {
    }

    /// Wake up the compute node and return the corresponding connection info.
-    pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
+    pub(super) async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
        let request_id = uuid::Uuid::new_v4().to_string();
        let req = self
            .endpoint
@@ -195,7 +195,7 @@ impl<'a> Api<'a> {
            Some(x) => x,
        };

-        let mut config = ComputeConnCfg::new();
+        let mut config = compute::ConnCfg::new();
        config
            .host(host)
            .port(port)
@@ -213,10 +213,10 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
    endpoint: &'a Endpoint,
    get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo,
    wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute,
-) -> auth::Result<compute::NodeInfo>
+) -> auth::Result<AuthSuccess<compute::ConnCfg>>
 where
    GetAuthInfo: Future<Output = Result<AuthInfo, GetAuthInfoError>>,
-    WakeCompute: Future<Output = Result<ComputeConnCfg, WakeComputeError>>,
+    WakeCompute: Future<Output = Result<compute::ConnCfg, WakeComputeError>>,
 {
    info!("fetching user's authentication info");
    let auth_info = get_auth_info(endpoint).await?;
@@ -243,9 +243,9 @@ where
        config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys));
    }

-    Ok(compute::NodeInfo {
+    Ok(AuthSuccess {
        reported_auth_ok: false,
-        config,
+        value: config,
    })
 }

--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,3 +1,4 @@
+use super::{AuthSuccess, NodeInfo};
 use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters};
 use pq_proto::{BeMessage as Be, BeParameterStatusMessage};
 use thiserror::Error;
@@ -49,7 +50,7 @@ pub fn new_psql_session_id() -> String {
 pub async fn handle_user(
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<compute::NodeInfo> {
+) -> auth::Result<AuthSuccess<NodeInfo>> {
    let psql_session_id = new_psql_session_id();
    let span = info_span!("link", psql_session_id = &psql_session_id);
    let greeting = hello_message(link_uri, &psql_session_id);
@@ -71,8 +72,22 @@ pub async fn handle_user(

    client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;

-    Ok(compute::NodeInfo {
+    let mut config = compute::ConnCfg::new();
+    config
+        .host(&db_info.host)
+        .port(db_info.port)
+        .dbname(&db_info.dbname)
+        .user(&db_info.user);
+
+    if let Some(password) = db_info.password {
+        config.password(password);
+    }
+
+    Ok(AuthSuccess {
        reported_auth_ok: true,
-        config: db_info.into(),
+        value: NodeInfo {
+            project: db_info.project,
+            config,
+        },
    })
 }
--- a/proxy/src/auth/backend/postgres.rs
+++ b/proxy/src/auth/backend/postgres.rs
@@ -1,12 +1,12 @@
 //! Local mock of Cloud API V2.

+use super::{
+    console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError},
+    AuthSuccess,
+};
 use crate::{
-    auth::{
-        self,
-        backend::console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError},
-        ClientCredentials,
-    },
-    compute::{self, ComputeConnCfg},
+    auth::{self, ClientCredentials},
+    compute,
    error::io_error,
    scram,
    stream::PqStream,
@@ -37,7 +37,7 @@ impl<'a> Api<'a> {
    pub(super) async fn handle_user(
        self,
        client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
-    ) -> auth::Result<compute::NodeInfo> {
+    ) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
        // We reuse user handling logic from a production module.
        console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
    }
@@ -82,8 +82,8 @@ impl<'a> Api<'a> {
    }

    /// We don't need to wake anything locally, so we just return the connection info.
-    pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
-        let mut config = ComputeConnCfg::new();
+    pub(super) async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
+        let mut config = compute::ConnCfg::new();
        config
            .host(self.endpoint.host_str().unwrap_or("localhost"))
            .port(self.endpoint.port().unwrap_or(5432))
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -36,11 +36,23 @@ pub struct ClientCredentials<'a> {
 }

 impl ClientCredentials<'_> {
+    #[inline]
    pub fn project(&self) -> Option<&str> {
        self.project.as_deref()
    }
 }

+impl<'a> ClientCredentials<'a> {
+    #[inline]
+    pub fn as_ref(&'a self) -> ClientCredentials<'a> {
+        Self {
+            user: self.user,
+            dbname: self.dbname,
+            project: self.project().map(Cow::Borrowed), // re-borrow the project instead of cloning it
+        }
+    }
+}
+
 impl<'a> ClientCredentials<'a> {
    pub fn parse(
        params: &'a StartupMessageParams,
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -40,17 +40,36 @@ impl UserFacingError for ConnectionError {
 /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
 pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;

-pub type ComputeConnCfg = tokio_postgres::Config;
+/// A config for establishing a connection to a compute node.
+/// Eventually, `tokio_postgres` will be replaced with something better.
+/// The newtype allows us to implement our own methods on top of it.
+#[repr(transparent)]
+pub struct ConnCfg(pub tokio_postgres::Config);

-/// Various compute node info for establishing connection etc.
-pub struct NodeInfo {
-    /// Did we send [`pq_proto::BeMessage::AuthenticationOk`]?
-    pub reported_auth_ok: bool,
-    /// Compute node connection params.
-    pub config: tokio_postgres::Config,
+impl ConnCfg {
+    /// Construct a new connection config wrapping a fresh [`tokio_postgres::Config`].
+    pub fn new() -> Self {
+        Self(tokio_postgres::Config::new())
+    }
 }

-impl NodeInfo {
+impl std::ops::Deref for ConnCfg {
+    type Target = tokio_postgres::Config;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+/// For now, let's make it easier to set up the config: the inner type's builder methods work directly.
+impl std::ops::DerefMut for ConnCfg {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl ConnCfg {
+    /// Establish a raw TCP connection to the compute node.
    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
        use tokio_postgres::config::Host;

@@ -68,8 +87,8 @@ impl NodeInfo {
        // because it has no means for extracting the underlying socket which we
        // require for our business.
        let mut connection_error = None;
-        let ports = self.config.get_ports();
-        let hosts = self.config.get_hosts();
+        let ports = self.0.get_ports();
+        let hosts = self.0.get_hosts();
        // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array
        if ports.len() > 1 && ports.len() != hosts.len() {
            return Err(io::Error::new(
@@ -77,7 +96,7 @@ impl NodeInfo {
                format!(
                    "couldn't connect: bad compute config, \
                        ports and hosts entries' count does not match: {:?}",
-                    self.config
+                    self.0
                ),
            ));
        }
@@ -103,7 +122,7 @@ impl NodeInfo {
        Err(connection_error.unwrap_or_else(|| {
            io::Error::new(
                io::ErrorKind::Other,
-                format!("couldn't connect: bad compute config: {:?}", self.config),
+                format!("couldn't connect: bad compute config: {:?}", self.0),
            )
        }))
    }
@@ -116,7 +135,7 @@ pub struct PostgresConnection {
    pub version: String,
 }

-impl NodeInfo {
+impl ConnCfg {
    /// Connect to a corresponding compute node.
    pub async fn connect(
        mut self,
@@ -130,21 +149,21 @@ impl NodeInfo {
                .intersperse(" ") // TODO: use impl from std once it's stabilized
                .collect();

-            self.config.options(&options);
+            self.0.options(&options);
        }

        if let Some(app_name) = params.get("application_name") {
-            self.config.application_name(app_name);
+            self.0.application_name(app_name);
        }

        if let Some(replication) = params.get("replication") {
            use tokio_postgres::config::ReplicationMode;
            match replication {
                "true" | "on" | "yes" | "1" => {
-                    self.config.replication_mode(ReplicationMode::Physical);
+                    self.0.replication_mode(ReplicationMode::Physical);
                }
                "database" => {
-                    self.config.replication_mode(ReplicationMode::Logical);
+                    self.0.replication_mode(ReplicationMode::Logical);
                }
                _other => {}
            }
@@ -160,7 +179,7 @@ impl NodeInfo {
            .map_err(|_| ConnectionError::FailedToConnectToCompute)?;

        // TODO: establish a secure connection to the DB
-        let (client, conn) = self.config.connect_raw(&mut stream, NoTls).await?;
+        let (client, conn) = self.0.connect_raw(&mut stream, NoTls).await?;
        let version = conn
            .parameter("server_version")
            .ok_or(ConnectionError::FailedToFetchPgVersion)?
--- a/proxy/src/mgmt.rs
+++ b/proxy/src/mgmt.rs
@@ -6,16 +6,11 @@ use std::{
    net::{TcpListener, TcpStream},
    thread,
 };
-use tracing::{error, info};
+use tracing::{error, info, info_span};
 use utils::postgres_backend::{self, AuthType, PostgresBackend};

-/// TODO: move all of that to auth-backend/link.rs when we ditch legacy-console backend
-
-///
-/// Main proxy listener loop.
-///
-/// Listens for connections, and launches a new handler thread for each.
-///
+/// Console management API listener thread.
+/// It spawns console response handlers needed for the link auth.
 pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
    scopeguard::defer! {
        info!("mgmt has shut down");
@@ -24,6 +19,7 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
    listener
        .set_nonblocking(false)
        .context("failed to set listener to blocking")?;
+
    loop {
        let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?;
        info!("accepted connection from {peer_addr}");
@@ -31,9 +27,19 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
            .set_nodelay(true)
            .context("failed to set client socket option")?;

+        // TODO: replace with async tasks.
        thread::spawn(move || {
-            if let Err(err) = handle_connection(socket) {
-                error!("{err}");
+            let tid = std::thread::current().id();
+            let span = info_span!("mgmt", thread = format_args!("{tid:?}"));
+            let _enter = span.enter();
+
+            info!("started a new console management API thread");
+            scopeguard::defer! {
+                info!("console management API thread is about to finish");
+            }
+
+            if let Err(e) = handle_connection(socket) {
+                error!("thread failed with an error: {e}");
            }
        });
    }
@@ -44,44 +50,21 @@ fn handle_connection(socket: TcpStream) -> anyhow::Result<()> {
    pgbackend.run(&mut MgmtHandler)
 }

-struct MgmtHandler;
-
-/// Serialized examples:
-// {
-//     "session_id": "71d6d03e6d93d99a",
-//     "result": {
-//         "Success": {
-//             "host": "127.0.0.1",
-//             "port": 5432,
-//             "dbname": "stas",
-//             "user": "stas",
-//             "password": "mypass"
-//         }
-//     }
-// }
-// {
-//     "session_id": "71d6d03e6d93d99a",
-//     "result": {
-//         "Failure": "oops"
-//     }
-// }
-//
-// // to test manually by sending a query to mgmt interface:
-// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
-#[derive(Deserialize)]
+/// Known as `kickResponse` in the console.
+#[derive(Debug, Deserialize)]
 struct PsqlSessionResponse {
    session_id: String,
    result: PsqlSessionResult,
 }

-#[derive(Deserialize)]
+#[derive(Debug, Deserialize)]
 enum PsqlSessionResult {
-    Success(auth::DatabaseInfo),
+    Success(DatabaseInfo),
    Failure(String),
 }

 /// A message received by `mgmt` when a compute node is ready.
-pub type ComputeReady = Result<auth::DatabaseInfo, String>;
+pub type ComputeReady = Result<DatabaseInfo, String>;

 impl PsqlSessionResult {
    fn into_compute_ready(self) -> ComputeReady {
@@ -92,25 +75,51 @@ impl PsqlSessionResult {
    }
 }

-impl postgres_backend::Handler for MgmtHandler {
-    fn process_query(
-        &mut self,
-        pgb: &mut PostgresBackend,
-        query_string: &str,
-    ) -> anyhow::Result<()> {
-        let res = try_process_query(pgb, query_string);
-        // intercept and log error message
-        if res.is_err() {
-            error!("mgmt query failed: {res:?}");
-        }
-        res
+/// Compute node connection params provided by the console.
+/// This struct and the types wrapping it are mgmt API implementation
+/// details and thus should remain in this module.
+// TODO: restore deserialization tests from git history.
+#[derive(Deserialize)]
+pub struct DatabaseInfo {
+    pub host: String,
+    pub port: u16,
+    pub dbname: String,
+    pub user: String,
+    /// Console always provides a password, but requiring it might be
+    /// inconvenient when debugging with a local PG instance, hence `Option`.
+    pub password: Option<String>,
+    pub project: String,
+}
+
+// Manually implement `Debug` so that the password never ends up in the output (e.g. logs).
+impl std::fmt::Debug for DatabaseInfo {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
+        fmt.debug_struct("DatabaseInfo")
+            .field("host", &self.host)
+            .field("port", &self.port)
+            .field("dbname", &self.dbname)
+            .field("user", &self.user)
+            .finish_non_exhaustive() // `password` and `project` are deliberately left out
     }
 }

-fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::Result<()> {
-    info!("got mgmt query [redacted]"); // Content contains password, don't print it
+// Handles console responses arriving as postgres queries. TODO: replace with an http-based protocol.
+struct MgmtHandler;
+impl postgres_backend::Handler for MgmtHandler {
+    fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
+        try_process_query(pgb, query).map_err(|e| {
+            error!("failed to process response: {e:?}");
+            e // logged above, then returned to the caller unchanged
+        })
+    }
+}

-    let resp: PsqlSessionResponse = serde_json::from_str(query_string)?;
+fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
+    let resp: PsqlSessionResponse = serde_json::from_str(query)?;
+
+    let span = info_span!("event", session_id = resp.session_id);
+    let _enter = span.enter();
+    info!("got response: {:?}", resp.result);

    match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) {
        Ok(()) => {
@@ -119,9 +128,50 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R
                .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
        }
        Err(e) => {
+            error!("failed to deliver response to per-client task");
            pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
        }
    }

    Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn parse_db_info() -> anyhow::Result<()> {
+        // with password
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "password": "password",
+            "project": "hello_world",
+        }))?;
+
+        // without password
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "project": "hello_world",
+        }))?;
+
+        // new field (forward compatibility)
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "project": "hello_world",
+            "N.E.W": "forward compatibility check",
+        }))?;
+
+        Ok(())
+    }
+}
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -4,7 +4,7 @@ use crate::config::{ProxyConfig, TlsConfig};
 use crate::stream::{MeasuredStream, PqStream, Stream};
 use anyhow::{bail, Context};
 use futures::TryFutureExt;
-use metrics::{register_int_counter, IntCounter};
+use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, *};
 use std::sync::Arc;
@@ -30,10 +30,16 @@ static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

-static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "proxy_io_bytes_total",
-        "Number of bytes sent/received between any client and backend."
+static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_io_bytes_per_client",
+        "Number of bytes sent/received between client and backend.",
+        &[
+            // Received (rx) / sent (tx).
+            "direction",
+            // Proxy can keep calling it `project` internally.
+            "endpoint_id"
+        ]
    )
    .unwrap()
 });
@@ -230,16 +236,17 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
            application_name: params.get("application_name"),
        };

-        // Authenticate and connect to a compute node.
-        let auth = creds
-            .authenticate(&extra, &mut stream)
-            .instrument(info_span!("auth"))
-            .await;
-
-        let node = async { auth }.or_else(|e| stream.throw_error(e)).await?;
-        let reported_auth_ok = node.reported_auth_ok;
+        let auth_result = async {
+            // `&mut stream` doesn't let us merge those 2 lines.
+            let res = creds.authenticate(&extra, &mut stream).await;
+            async { res }.or_else(|e| stream.throw_error(e)).await
+        }
+        .instrument(info_span!("auth"))
+        .await?;

+        let node = auth_result.value;
        let (db, cancel_closure) = node
+            .config
            .connect(params)
            .or_else(|e| stream.throw_error(e))
            .await?;
@@ -247,7 +254,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
        let cancel_key_data = session.enable_query_cancellation(cancel_closure);

        // Report authentication success if we haven't done this already.
-        if !reported_auth_ok {
+        // Note that we do this only (for the most part) after we've connected
+        // to a compute (see above) which performs its own authentication.
+        if !auth_result.reported_auth_ok {
            stream
                .write_message_noflush(&Be::AuthenticationOk)?
                .write_message_noflush(&BeParameterStatusMessage::encoding())?;
@@ -261,17 +270,23 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
            .write_message(&BeMessage::ReadyForQuery)
            .await?;

-        /// This function will be called for writes to either direction.
-        fn inc_proxied(cnt: usize) {
-            // Consider inventing something more sophisticated
-            // if this ever becomes a bottleneck (cacheline bouncing).
-            NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64);
-        }
+        // TODO: add more identifiers.
+        let metric_id = node.project;
+
+        let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx", &metric_id]);
+        let mut client = MeasuredStream::new(stream.into_inner(), |cnt| {
+            // Number of bytes we sent to the client (outbound).
+            m_sent.inc_by(cnt as u64);
+        });
+
+        let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx", &metric_id]);
+        let mut db = MeasuredStream::new(db.stream, |cnt| {
+            // Number of bytes the client sent to the compute node (inbound).
+            m_recv.inc_by(cnt as u64);
+        });

        // Starting from here we only proxy the client's traffic.
        info!("performing the proxy pass...");
-        let mut db = MeasuredStream::new(db.stream, inc_proxied);
-        let mut client = MeasuredStream::new(stream.into_inner(), inc_proxied);
        let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?;

        Ok(())
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ psycopg2-binary = "^2.9.1"
 typing-extensions = "^4.1.0"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
 requests = "^2.26.0"
-pytest-xdist = "^2.3.0"
+pytest-xdist = "^3.0.2"
 asyncpg = "^0.24.0"
 aiopg = "^1.3.1"
 Jinja2 = "^3.0.2"
@@ -29,6 +29,8 @@ pytest-order = "^1.0.1"
 allure-pytest = "^2.10.0"
 pytest-asyncio = "^0.19.0"
 toml = "^0.10.2"
+psutil = "^5.9.4"
+types-psutil = "^5.9.5.4"

 [tool.poetry.dev-dependencies]
 flake8 = "^5.0.4"
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -556,10 +556,6 @@ impl Timeline {
                .pageserver_feedback
                .map(|f| Lsn(f.ps_applylsn))
                .unwrap_or(Lsn::INVALID);
-            info!(
-                "checking should ws stop ttid {} lsn {} rcl {}",
-                self.ttid, reported_remote_consistent_lsn, shared_state.sk.inmem.commit_lsn
-            );
            let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
            (reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
            reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn);
--- a/scripts/docker-compose_test.sh
+++ b/scripts/docker-compose_test.sh
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-# this is a shortcut script to avoid duplication in CI
-set -eux -o pipefail
-
-SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-COMPOSE_FILE=$SCRIPT_DIR/../docker-compose/docker-compose.yml
-
-COMPUTE_CONTAINER_NAME=dockercompose_compute_1
-SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
-PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
-
-cleanup() {
-	echo "show container information"
-	docker ps
-	docker-compose -f $COMPOSE_FILE logs
-	echo "stop containers..."
-	docker-compose -f $COMPOSE_FILE down
-}
-
-echo "clean up containers if exists"
-cleanup
-
-for pg_version in 14 15; do
-	echo "start containers (pg_version=$pg_version)."
-	PG_VERSION=$pg_version TAG=latest docker-compose -f $COMPOSE_FILE up --build -d
-
-	echo "wait until the compute is ready. timeout after 60s. "
-	cnt=0
-	while sleep 1; do
-		# check timeout
-		cnt=`expr $cnt + 1`
-		if [ $cnt -gt 60 ]; then
-			echo "timeout before the compute is ready."
-			cleanup
-			exit 1
-		fi
-
-		# check if the compute is ready
-		set +o pipefail
-		result=`docker-compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
-		set -o pipefail
-		if [ $result -eq 1 ]; then
-			echo "OK. The compute is ready to connect."
-			echo "execute simple queries."
-			docker exec -it $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
-			cleanup
-			break
-		fi
-	done
-done
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -11,39 +11,37 @@ from datetime import datetime
 from pathlib import Path

 # Type-related stuff
-from typing import Iterator, Optional
+from typing import Callable, ClassVar, Iterator, Optional

 import pytest
 from _pytest.config import Config
+from _pytest.config.argparsing import Parser
 from _pytest.terminal import TerminalReporter
+from fixtures.neon_fixtures import NeonPageserver
 from fixtures.types import TenantId, TimelineId

 """
 This file contains fixtures for micro-benchmarks.

-To use, declare the 'zenbenchmark' fixture in the test function. Run the
-bencmark, and then record the result by calling zenbenchmark.record. For example:
+To use, declare the `zenbenchmark` fixture in the test function. Run the
+benchmark, and then record the result by calling `zenbenchmark.record`. For example:

-import timeit
-from fixtures.neon_fixtures import NeonEnv
-
-def test_mybench(neon_simple_env: env, zenbenchmark):
-
-    # Initialize the test
-    ...
-
-    # Run the test, timing how long it takes
-    with zenbenchmark.record_duration('test_query'):
-        cur.execute('SELECT test_query(...)')
-
-    # Record another measurement
-    zenbenchmark.record('speed_of_light', 300000, 'km/s')
+>>> import timeit
+>>> from fixtures.neon_fixtures import NeonEnv
+>>> def test_mybench(neon_simple_env: NeonEnv, zenbenchmark):
+...     # Initialize the test
+...     ...
+...     # Run the test, timing how long it takes
+...     with zenbenchmark.record_duration('test_query'):
+...         cur.execute('SELECT test_query(...)')
+...     # Record another measurement
+...     zenbenchmark.record('speed_of_light', 300000, 'km/s')

 There's no need to import this file to use it. It should be declared as a plugin
-inside conftest.py, and that makes it available to all tests.
+inside `conftest.py`, and that makes it available to all tests.

 You can measure multiple things in one test, and record each one with a separate
-call to zenbenchmark. For example, you could time the bulk loading that happens
+call to `zenbenchmark`. For example, you could time the bulk loading that happens
 in the test initialization, or measure disk usage after the test query.

 """
@@ -117,7 +115,7 @@ class PgBenchRunResult:
            # tps = 309.281539 (without initial connection time)
            if line.startswith("tps = ") and (
                "(excluding connections establishing)" in line
-                or "(without initial connection time)"
+                or "(without initial connection time)" in line
            ):
                tps = float(line.split()[2])

@@ -137,6 +135,17 @@ class PgBenchRunResult:

@dataclasses.dataclass
 class PgBenchInitResult:
+    REGEX: ClassVar[re.Pattern] = re.compile(  # type: ignore[type-arg]
+        r"done in (\d+\.\d+) s "
+        r"\("
+        r"(?:drop tables (\d+\.\d+) s)?(?:, )?"
+        r"(?:create tables (\d+\.\d+) s)?(?:, )?"
+        r"(?:client-side generate (\d+\.\d+) s)?(?:, )?"
+        r"(?:vacuum (\d+\.\d+) s)?(?:, )?"
+        r"(?:primary keys (\d+\.\d+) s)?(?:, )?"
+        r"\)\."
+    )
+
    total: float
    drop_tables: Optional[float]
    create_tables: Optional[float]
@@ -160,18 +169,7 @@ class PgBenchInitResult:

        last_line = stderr.splitlines()[-1]

-        regex = re.compile(
-            r"done in (\d+\.\d+) s "
-            r"\("
-            r"(?:drop tables (\d+\.\d+) s)?(?:, )?"
-            r"(?:create tables (\d+\.\d+) s)?(?:, )?"
-            r"(?:client-side generate (\d+\.\d+) s)?(?:, )?"
-            r"(?:vacuum (\d+\.\d+) s)?(?:, )?"
-            r"(?:primary keys (\d+\.\d+) s)?(?:, )?"
-            r"\)\."
-        )
-
-        if (m := regex.match(last_line)) is not None:
+        if (m := cls.REGEX.match(last_line)) is not None:
            total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [
                float(v) for v in m.groups() if v is not None
            ]
@@ -208,7 +206,7 @@ class NeonBenchmarker:
    function by the zenbenchmark fixture
    """

-    def __init__(self, property_recorder):
+    def __init__(self, property_recorder: Callable[[str, object], None]):
        # property recorder here is a pytest fixture provided by junitxml module
        # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property
        self.property_recorder = property_recorder
@@ -236,7 +234,7 @@ class NeonBenchmarker:
        )

    @contextmanager
-    def record_duration(self, metric_name: str):
+    def record_duration(self, metric_name: str) -> Iterator[None]:
        """
        Record a duration. Usage:

@@ -337,21 +335,21 @@ class NeonBenchmarker:
                    f"{prefix}.{metric}", value, unit="s", report=MetricReport.LOWER_IS_BETTER
                )

-    def get_io_writes(self, pageserver) -> int:
+    def get_io_writes(self, pageserver: NeonPageserver) -> int:
        """
        Fetch the "cumulative # of bytes written" metric from the pageserver
        """
        metric_name = r'libmetrics_disk_io_bytes_total{io_operation="write"}'
        return self.get_int_counter_value(pageserver, metric_name)

-    def get_peak_mem(self, pageserver) -> int:
+    def get_peak_mem(self, pageserver: NeonPageserver) -> int:
        """
        Fetch the "maxrss" metric from the pageserver
        """
        metric_name = r"libmetrics_maxrss_kb"
        return self.get_int_counter_value(pageserver, metric_name)

-    def get_int_counter_value(self, pageserver, metric_name) -> int:
+    def get_int_counter_value(self, pageserver: NeonPageserver, metric_name: str) -> int:
        """Fetch the value of given int counter from pageserver metrics."""
        # TODO: If we start to collect more of the prometheus metrics in the
        # performance test suite like this, we should refactor this to load and
@@ -365,7 +363,9 @@ class NeonBenchmarker:
        assert matches, f"metric {metric_name} not found"
        return int(round(float(matches.group(1))))

-    def get_timeline_size(self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId):
+    def get_timeline_size(
+        self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> int:
        """
        Calculate the on-disk size of a timeline
        """
@@ -379,7 +379,9 @@ class NeonBenchmarker:
        return totalbytes

    @contextmanager
-    def record_pageserver_writes(self, pageserver, metric_name):
+    def record_pageserver_writes(
+        self, pageserver: NeonPageserver, metric_name: str
+    ) -> Iterator[None]:
        """
        Record bytes written by the pageserver during a test.
        """
@@ -396,7 +398,7 @@ class NeonBenchmarker:


@pytest.fixture(scope="function")
-def zenbenchmark(record_property) -> Iterator[NeonBenchmarker]:
+def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[NeonBenchmarker]:
    """
    This is a python decorator for benchmark fixtures. It contains functions for
    recording measurements, and prints them out at the end.
@@ -405,7 +407,7 @@ def zenbenchmark(record_property) -> Iterator[NeonBenchmarker]:
    yield benchmarker


-def pytest_addoption(parser):
+def pytest_addoption(parser: Parser):
    parser.addoption(
        "--out-dir",
        dest="out_dir",
@@ -429,7 +431,9 @@ def get_out_path(target_dir: Path, revision: str) -> Path:

 # Hook to print the results at the end
@pytest.hookimpl(hookwrapper=True)
-def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config):
+def pytest_terminal_summary(
+    terminalreporter: TerminalReporter, exitstatus: int, config: Config
+) -> Iterator[None]:
    yield
    revision = os.getenv("GITHUB_SHA", "local")
    platform = os.getenv("PLATFORM", "local")
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -1,10 +1,11 @@
 from abc import ABC, abstractmethod
-from contextlib import contextmanager
+from contextlib import _GeneratorContextManager, contextmanager

 # Type-related stuff
-from typing import Dict, List
+from typing import Dict, Iterator, List

 import pytest
+from _pytest.fixtures import FixtureRequest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnv, PgBin, PgProtocol, RemotePostgres, VanillaPostgres
 from fixtures.pg_stats import PgStatTable
@@ -28,19 +29,20 @@ class PgCompare(ABC):
        pass

    @property
+    @abstractmethod
    def zenbenchmark(self) -> NeonBenchmarker:
        pass

    @abstractmethod
-    def flush(self) -> None:
+    def flush(self):
        pass

    @abstractmethod
-    def report_peak_memory_use(self) -> None:
+    def report_peak_memory_use(self):
        pass

    @abstractmethod
-    def report_size(self) -> None:
+    def report_size(self):
        pass

    @contextmanager
@@ -54,7 +56,7 @@ class PgCompare(ABC):
        pass

    @contextmanager
-    def record_pg_stats(self, pg_stats: List[PgStatTable]):
+    def record_pg_stats(self, pg_stats: List[PgStatTable]) -> Iterator[None]:
        init_data = self._retrieve_pg_stats(pg_stats)

        yield
@@ -84,7 +86,11 @@ class NeonCompare(PgCompare):
    """PgCompare interface for the neon stack."""

    def __init__(
-        self, zenbenchmark: NeonBenchmarker, neon_simple_env: NeonEnv, pg_bin: PgBin, branch_name
+        self,
+        zenbenchmark: NeonBenchmarker,
+        neon_simple_env: NeonEnv,
+        pg_bin: PgBin,
+        branch_name: str,
    ):
        self.env = neon_simple_env
        self._zenbenchmark = zenbenchmark
@@ -97,15 +103,15 @@ class NeonCompare(PgCompare):
        self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0]

    @property
-    def pg(self):
+    def pg(self) -> PgProtocol:
        return self._pg

    @property
-    def zenbenchmark(self):
+    def zenbenchmark(self) -> NeonBenchmarker:
        return self._zenbenchmark

    @property
-    def pg_bin(self):
+    def pg_bin(self) -> PgBin:
        return self._pg_bin

    def flush(self):
@@ -114,7 +120,7 @@ class NeonCompare(PgCompare):
    def compact(self):
        self.pageserver_http_client.timeline_compact(self.env.initial_tenant, self.timeline)

-    def report_peak_memory_use(self) -> None:
+    def report_peak_memory_use(self):
        self.zenbenchmark.record(
            "peak_mem",
            self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024,
@@ -122,7 +128,7 @@ class NeonCompare(PgCompare):
            report=MetricReport.LOWER_IS_BETTER,
        )

-    def report_size(self) -> None:
+    def report_size(self):
        timeline_size = self.zenbenchmark.get_timeline_size(
            self.env.repo_dir, self.env.initial_tenant, self.timeline
        )
@@ -144,17 +150,17 @@ class NeonCompare(PgCompare):
            "num_files_uploaded", total_files, "", report=MetricReport.LOWER_IS_BETTER
        )

-    def record_pageserver_writes(self, out_name):
+    def record_pageserver_writes(self, out_name: str) -> _GeneratorContextManager[None]:
        return self.zenbenchmark.record_pageserver_writes(self.env.pageserver, out_name)

-    def record_duration(self, out_name):
+    def record_duration(self, out_name: str) -> _GeneratorContextManager[None]:
        return self.zenbenchmark.record_duration(out_name)


 class VanillaCompare(PgCompare):
    """PgCompare interface for vanilla postgres."""

-    def __init__(self, zenbenchmark, vanilla_pg: VanillaPostgres):
+    def __init__(self, zenbenchmark: NeonBenchmarker, vanilla_pg: VanillaPostgres):
        self._pg = vanilla_pg
        self._zenbenchmark = zenbenchmark
        vanilla_pg.configure(
@@ -170,24 +176,24 @@ class VanillaCompare(PgCompare):
        self.cur = self.conn.cursor()

    @property
-    def pg(self):
+    def pg(self) -> PgProtocol:
        return self._pg

    @property
-    def zenbenchmark(self):
+    def zenbenchmark(self) -> NeonBenchmarker:
        return self._zenbenchmark

    @property
-    def pg_bin(self):
+    def pg_bin(self) -> PgBin:
        return self._pg.pg_bin

    def flush(self):
        self.cur.execute("checkpoint")

-    def report_peak_memory_use(self) -> None:
+    def report_peak_memory_use(self):
        pass  # TODO find something

-    def report_size(self) -> None:
+    def report_size(self):
        data_size = self.pg.get_subdir_size("base")
        self.zenbenchmark.record(
            "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
@@ -198,17 +204,17 @@ class VanillaCompare(PgCompare):
        )

    @contextmanager
-    def record_pageserver_writes(self, out_name):
+    def record_pageserver_writes(self, out_name: str) -> Iterator[None]:
        yield  # Do nothing

-    def record_duration(self, out_name):
+    def record_duration(self, out_name: str) -> _GeneratorContextManager[None]:
        return self.zenbenchmark.record_duration(out_name)


 class RemoteCompare(PgCompare):
    """PgCompare interface for a remote postgres instance."""

-    def __init__(self, zenbenchmark, remote_pg: RemotePostgres):
+    def __init__(self, zenbenchmark: NeonBenchmarker, remote_pg: RemotePostgres):
        self._pg = remote_pg
        self._zenbenchmark = zenbenchmark

@@ -217,55 +223,60 @@ class RemoteCompare(PgCompare):
        self.cur = self.conn.cursor()

    @property
-    def pg(self):
+    def pg(self) -> PgProtocol:
        return self._pg

    @property
-    def zenbenchmark(self):
+    def zenbenchmark(self) -> NeonBenchmarker:
        return self._zenbenchmark

    @property
-    def pg_bin(self):
+    def pg_bin(self) -> PgBin:
        return self._pg.pg_bin

    def flush(self):
        # TODO: flush the remote pageserver
        pass

-    def report_peak_memory_use(self) -> None:
+    def report_peak_memory_use(self):
        # TODO: get memory usage from remote pageserver
        pass

-    def report_size(self) -> None:
+    def report_size(self):
        # TODO: get storage size from remote pageserver
        pass

    @contextmanager
-    def record_pageserver_writes(self, out_name):
+    def record_pageserver_writes(self, out_name: str) -> Iterator[None]:
        yield  # Do nothing

-    def record_duration(self, out_name):
+    def record_duration(self, out_name: str) -> _GeneratorContextManager[None]:
        return self.zenbenchmark.record_duration(out_name)


@pytest.fixture(scope="function")
-def neon_compare(request, zenbenchmark, pg_bin, neon_simple_env) -> NeonCompare:
+def neon_compare(
+    request: FixtureRequest,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    neon_simple_env: NeonEnv,
+) -> NeonCompare:
    branch_name = request.node.name
    return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name)


@pytest.fixture(scope="function")
-def vanilla_compare(zenbenchmark, vanilla_pg) -> VanillaCompare:
+def vanilla_compare(zenbenchmark: NeonBenchmarker, vanilla_pg: VanillaPostgres) -> VanillaCompare:
    return VanillaCompare(zenbenchmark, vanilla_pg)


@pytest.fixture(scope="function")
-def remote_compare(zenbenchmark, remote_pg) -> RemoteCompare:
+def remote_compare(zenbenchmark: NeonBenchmarker, remote_pg: RemotePostgres) -> RemoteCompare:
    return RemoteCompare(zenbenchmark, remote_pg)


@pytest.fixture(params=["vanilla_compare", "neon_compare"], ids=["vanilla", "neon"])
-def neon_with_baseline(request) -> PgCompare:
+def neon_with_baseline(request: FixtureRequest) -> PgCompare:
    """Parameterized fixture that helps compare neon against vanilla postgres.

    A test that uses this fixture turns into a parameterized test that runs against:
@@ -286,8 +297,6 @@ def neon_with_baseline(request) -> PgCompare:
    implementation-specific logic is widely useful across multiple tests, it might
    make sense to add methods to the PgCompare class.
    """
-    fixture = request.getfixturevalue(request.param)
-    if isinstance(fixture, PgCompare):
-        return fixture
-    else:
-        raise AssertionError(f"test error: fixture {request.param} is not PgCompare")
+    fixture = request.getfixturevalue(request.param)  # type: ignore
+    assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare"
+    return fixture
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -1,5 +1,5 @@
 from collections import defaultdict
-from typing import Dict, List
+from typing import Dict, List, Optional, Tuple

 from prometheus_client.parser import text_string_to_metric_families
 from prometheus_client.samples import Sample
@@ -23,13 +23,13 @@ class Metrics:
                pass
        return res

-    def query_one(self, name: str, filter: Dict[str, str] = {}) -> Sample:
-        res = self.query_all(name, filter)
+    def query_one(self, name: str, filter: Optional[Dict[str, str]] = None) -> Sample:
+        res = self.query_all(name, filter or {})
        assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}"
        return res[0]


-def parse_metrics(text: str, name: str = ""):
+def parse_metrics(text: str, name: str = "") -> Metrics:
    metrics = Metrics(name)
    gen = text_string_to_metric_families(text)
    for family in gen:
@@ -39,7 +39,7 @@ def parse_metrics(text: str, name: str = ""):
    return metrics


-PAGESERVER_PER_TENANT_METRICS = [
+PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_current_logical_size",
    "pageserver_current_physical_size",
    "pageserver_getpage_reconstruct_seconds_bucket",
@@ -62,4 +62,4 @@ PAGESERVER_PER_TENANT_METRICS = [
    "pageserver_wait_lsn_seconds_sum",
    "pageserver_created_persistent_files_total",
    "pageserver_written_persistent_bytes_total",
-]
+)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -19,7 +19,8 @@ from dataclasses import dataclass, field
 from enum import Flag, auto
 from functools import cached_property
 from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
+from types import TracebackType
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast

 import asyncpg
 import backoff  # type: ignore
@@ -28,16 +29,18 @@ import jwt
 import psycopg2
 import pytest
 import requests
+from _pytest.config import Config
+from _pytest.fixtures import FixtureRequest
 from fixtures.log_helper import log
 from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture

 # Type-related stuff
 from psycopg2.extensions import connection as PgConnection
+from psycopg2.extensions import cursor as PgCursor
 from psycopg2.extensions import make_dsn, parse_dsn
 from typing_extensions import Literal

-from .utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture
-
 """
 This file contains pytest fixtures. A fixture is a test resource that can be
 summoned by placing its name in the test's arguments.
@@ -57,15 +60,15 @@ put directly-importable functions into utils.py or another separate file.

 Env = Dict[str, str]

-DEFAULT_OUTPUT_DIR = "test_output"
-DEFAULT_BRANCH_NAME = "main"
-DEFAULT_PG_VERSION_DEFAULT = "14"
+DEFAULT_OUTPUT_DIR: str = "test_output"
+DEFAULT_BRANCH_NAME: str = "main"
+DEFAULT_PG_VERSION_DEFAULT: str = "14"

-BASE_PORT = 15000
-WORKER_PORT_NUM = 1000
+BASE_PORT: int = 15000
+WORKER_PORT_NUM: int = 1000


-def pytest_configure(config):
+def pytest_configure(config: Config):
    """
    Check that we do not overflow available ports range.
    """
@@ -154,14 +157,14 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: str) -> Iterator[
        if not psql_bin_path.exists():
            raise Exception(f"psql not found at '{psql_bin_path}'")
    else:
-        if not postgres_bin_path.exists:
+        if not postgres_bin_path.exists():
            raise Exception(f"postgres not found at '{postgres_bin_path}'")

    log.info(f"versioned_pg_distrib_dir is {versioned_dir}")
    yield versioned_dir


-def shareable_scope(fixture_name, config) -> Literal["session", "function"]:
+def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]:
    """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar.

    This function can be used as a scope like this:
@@ -173,7 +176,7 @@ def shareable_scope(fixture_name, config) -> Literal["session", "function"]:


@pytest.fixture(scope="session")
-def worker_seq_no(worker_id: str):
+def worker_seq_no(worker_id: str) -> int:
    # worker_id is a pytest-xdist fixture
    # it can be master or gw<number>
    # parse it to always get a number
@@ -184,7 +187,7 @@ def worker_seq_no(worker_id: str):


@pytest.fixture(scope="session")
-def worker_base_port(worker_seq_no: int):
+def worker_base_port(worker_seq_no: int) -> int:
    # so we divide ports in ranges of 100 ports
    # so workers have disjoint set of ports for services
    return BASE_PORT + worker_seq_no * WORKER_PORT_NUM
@@ -228,24 +231,58 @@ def can_bind(host: str, port: int) -> bool:
 class PortDistributor:
    def __init__(self, base_port: int, port_number: int):
        self.iterator = iter(range(base_port, base_port + port_number))
+        self.port_map: Dict[int, int] = {}

    def get_port(self) -> int:
        for port in self.iterator:
            if can_bind("localhost", port):
                return port
-        else:
-            raise RuntimeError(
-                "port range configured for test is exhausted, consider enlarging the range"
-            )
+        raise RuntimeError(
+            "port range configured for test is exhausted, consider enlarging the range"
+        )
+
+    def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]:
+        """
+        Returns a new port for a port number in a string (like "localhost:1234") or int.
+        Replacements are memorised, so a substitution for the same port is always the same.
+        """
+
+        # TODO: replace with structural pattern matching for Python >= 3.10
+        if isinstance(value, int):
+            return self._replace_port_int(value)
+
+        if isinstance(value, str):
+            return self._replace_port_str(value)
+
+        raise TypeError(f"unsupported type {type(value)} of {value=}")
+
+    def _replace_port_int(self, value: int) -> int:
+        known_port = self.port_map.get(value)
+        if known_port is None:
+            known_port = self.port_map[value] = self.get_port()
+
+        return known_port
+
+    def _replace_port_str(self, value: str) -> str:
+        """Replace the single `:<port>` in `value` with its mapped port."""
+        # Regex (port must precede `/` or end of string) instead of urllib.parse.urlparse, which is
+        # inconvenient without a scheme ("localhost:5432"), see https://bugs.python.org/issue27657
+        ports = list(re.finditer(r":(\d+)(?:/|$)", value))
+        assert len(ports) == 1, f"can't find port in {value}"
+        # Splice by span: `str.replace` would also hit the same digits elsewhere (e.g. a password).
+        start, end = ports[0].span(1)
+        return f"{value[:start]}{self._replace_port_int(int(value[start:end]))}{value[end:]}"


@pytest.fixture(scope="session")
-def port_distributor(worker_base_port):
+def port_distributor(worker_base_port: int) -> PortDistributor:
    return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)


@pytest.fixture(scope="session")
-def default_broker(request: Any, port_distributor: PortDistributor, top_output_dir: Path):
+def default_broker(
+    request: FixtureRequest, port_distributor: PortDistributor, top_output_dir: Path
+) -> Iterator[Etcd]:
    client_port = port_distributor.get_port()
    # multiple pytest sessions could get launched in parallel, get them different datadirs
    etcd_datadir = get_test_output_dir(request, top_output_dir) / f"etcd_datadir_{client_port}"
@@ -260,12 +297,12 @@ def default_broker(request: Any, port_distributor: PortDistributor, top_output_d


@pytest.fixture(scope="session")
-def run_id():
+def run_id() -> Iterator[uuid.UUID]:
    yield uuid.uuid4()


@pytest.fixture(scope="session")
-def mock_s3_server(port_distributor: PortDistributor):
+def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]:
    mock_s3_server = MockS3Server(port_distributor.get_port())
    yield mock_s3_server
    mock_s3_server.kill()
@@ -274,16 +311,16 @@ def mock_s3_server(port_distributor: PortDistributor):
 class PgProtocol:
    """Reusable connection logic"""

-    def __init__(self, **kwargs):
+    def __init__(self, **kwargs: Any):
        self.default_options = kwargs

-    def connstr(self, **kwargs) -> str:
+    def connstr(self, **kwargs: Any) -> str:
        """
        Build a libpq connection string for the Postgres instance.
        """
        return str(make_dsn(**self.conn_options(**kwargs)))

-    def conn_options(self, **kwargs):
+    def conn_options(self, **kwargs: Any) -> Dict[str, Any]:
        """
        Construct a dictionary of connection options from default values and extra parameters.
        An option can be dropped from the returning dictionary by None-valued extra parameter.
@@ -305,7 +342,7 @@ class PgProtocol:
        return result

    # autocommit=True here by default because that's what we need most of the time
-    def connect(self, autocommit=True, **kwargs) -> PgConnection:
+    def connect(self, autocommit: bool = True, **kwargs: Any) -> PgConnection:
        """
        Connect to the node.
        Returns psycopg2's connection object.
@@ -318,7 +355,7 @@ class PgProtocol:
        return conn

    @contextmanager
-    def cursor(self, autocommit=True, **kwargs):
+    def cursor(self, autocommit: bool = True, **kwargs: Any) -> Iterator[PgCursor]:
        """
        Shorthand for pg.connect().cursor().
        The cursor and connection are closed when the context is exited.
@@ -326,7 +363,7 @@ class PgProtocol:
        with closing(self.connect(autocommit=autocommit, **kwargs)) as conn:
            yield conn.cursor()

-    async def connect_async(self, **kwargs) -> asyncpg.Connection:
+    async def connect_async(self, **kwargs: Any) -> asyncpg.Connection:
        """
        Connect to the node from async python.
        Returns asyncpg's connection object.
@@ -380,10 +417,10 @@ class PgProtocol:

@dataclass
 class AuthKeys:
-    pub: bytes
-    priv: bytes
+    pub: str
+    priv: str

-    def generate_management_token(self):
+    def generate_management_token(self) -> str:
        token = jwt.encode({"scope": "pageserverapi"}, self.priv, algorithm="RS256")

        # jwt.encode can return 'bytes' or 'str', depending on Python version or type
@@ -394,9 +431,11 @@ class AuthKeys:

        return token

-    def generate_tenant_token(self, tenant_id):
+    def generate_tenant_token(self, tenant_id: TenantId) -> str:
        token = jwt.encode(
-            {"scope": "tenant", "tenant_id": str(tenant_id)}, self.priv, algorithm="RS256"
+            {"scope": "tenant", "tenant_id": str(tenant_id)},
+            self.priv,
+            algorithm="RS256",
        )

        if isinstance(token, bytes):
@@ -452,7 +491,7 @@ class MockS3Server:


@enum.unique
-class RemoteStorageKind(enum.Enum):
+class RemoteStorageKind(str, enum.Enum):
    LOCAL_FS = "local_fs"
    MOCK_S3 = "mock_s3"
    REAL_S3 = "real_s3"
@@ -496,7 +535,7 @@ RemoteStorage = Union[LocalFsStorage, S3Storage]


 # serialize as toml inline table
-def remote_storage_to_toml_inline_table(remote_storage):
+def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
    if isinstance(remote_storage, LocalFsStorage):
        remote_storage_config = f"local_path='{remote_storage.root}'"
    elif isinstance(remote_storage, S3Storage):
@@ -549,7 +588,7 @@ class NeonEnvBuilder:
        safekeepers_enable_fsync: bool = False,
        auth_enabled: bool = False,
        rust_log_override: Optional[str] = None,
-        default_branch_name=DEFAULT_BRANCH_NAME,
+        default_branch_name: str = DEFAULT_BRANCH_NAME,
    ):
        self.repo_dir = repo_dir
        self.rust_log_override = rust_log_override
@@ -603,7 +642,7 @@ class NeonEnvBuilder:
        else:
            raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")

-    def enable_local_fs_remote_storage(self, force_enable=True):
+    def enable_local_fs_remote_storage(self, force_enable: bool = True):
        """
        Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
        Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
@@ -611,7 +650,7 @@ class NeonEnvBuilder:
        assert force_enable or self.remote_storage is None, "remote storage is enabled already"
        self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage"))

-    def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable=True):
+    def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True):
        """
        Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
        Starts up the mock server, if that does not run yet.
@@ -638,7 +677,7 @@ class NeonEnvBuilder:
            secret_key=self.mock_s3_server.secret_key(),
        )

-    def enable_real_s3_remote_storage(self, test_name: str, force_enable=True):
+    def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True):
        """
        Sets up configuration to use real s3 endpoint without mock server
        """
@@ -726,10 +765,15 @@ class NeonEnvBuilder:

        log.info("deleted %s objects from remote storage", cnt)

-    def __enter__(self):
+    def __enter__(self) -> "NeonEnvBuilder":
        return self

-    def __exit__(self, exc_type, exc_value, traceback):
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ):
        # Stop all the nodes.
        if self.env:
            log.info("Cleaning up all storage and compute nodes")
@@ -740,6 +784,8 @@ class NeonEnvBuilder:

            self.cleanup_remote_storage()

+            self.env.pageserver.assert_no_errors()
+

 class NeonEnv:
    """
@@ -876,7 +922,7 @@ class NeonEnv:

    def get_safekeeper_connstrs(self) -> str:
        """Get list of safekeeper endpoints suitable for safekeepers GUC"""
-        return ",".join([f"localhost:{wa.port.pg}" for wa in self.safekeepers])
+        return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)

    def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
        """Get a timeline directory's path based on the repo directory of the test environment"""
@@ -895,14 +941,14 @@ class NeonEnv:

    @cached_property
    def auth_keys(self) -> AuthKeys:
-        pub = (Path(self.repo_dir) / "auth_public_key.pem").read_bytes()
-        priv = (Path(self.repo_dir) / "auth_private_key.pem").read_bytes()
+        pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text()
+        priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
        return AuthKeys(pub=pub, priv=priv)


@pytest.fixture(scope=shareable_scope)
 def _shared_simple_env(
-    request: Any,
+    request: FixtureRequest,
    port_distributor: PortDistributor,
    mock_s3_server: MockS3Server,
    default_broker: Etcd,
@@ -960,7 +1006,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:

@pytest.fixture(scope="function")
 def neon_env_builder(
-    test_output_dir,
+    test_output_dir: str,
    port_distributor: PortDistributor,
    mock_s3_server: MockS3Server,
    neon_binpath: Path,
@@ -1026,7 +1072,7 @@ class PageserverHttpClient(requests.Session):
    def check_status(self):
        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()

-    def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]) -> None:
+    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
        self.is_testing_enabled_or_skip()

        if isinstance(config_strings, tuple):
@@ -1156,7 +1202,6 @@ class PageserverHttpClient(requests.Session):
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is None
-        return res_json

    def timeline_gc(
        self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
@@ -1188,7 +1233,6 @@ class PageserverHttpClient(requests.Session):
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is None
-        return res_json

    def timeline_get_lsn_by_timestamp(
        self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
@@ -1214,7 +1258,6 @@ class PageserverHttpClient(requests.Session):
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is None
-        return res_json

    def get_metrics(self) -> str:
        res = self.get(f"http://localhost:{self.port}/metrics")
@@ -1228,13 +1271,10 @@ class PageserverPort:
    http: int


-CREATE_TIMELINE_ID_EXTRACTOR = re.compile(
+CREATE_TIMELINE_ID_EXTRACTOR: re.Pattern = re.compile(  # type: ignore[type-arg]
    r"^Created timeline '(?P<timeline_id>[^']+)'", re.MULTILINE
 )
-CREATE_TIMELINE_ID_EXTRACTOR = re.compile(
-    r"^Created timeline '(?P<timeline_id>[^']+)'", re.MULTILINE
-)
-TIMELINE_DATA_EXTRACTOR = re.compile(
+TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile(  # type: ignore[type-arg]
    r"\s?(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]", re.MULTILINE
 )

@@ -1527,7 +1567,8 @@ class NeonCli(AbstractNeonCli):

    def pageserver_start(
        self,
-        overrides=(),
+        overrides: Tuple[str, ...] = (),
+        extra_env_vars: Optional[Dict[str, str]] = None,
    ) -> "subprocess.CompletedProcess[str]":
        start_args = ["pageserver", "start", *overrides]
        append_pageserver_param_overrides(
@@ -1537,11 +1578,11 @@ class NeonCli(AbstractNeonCli):
            pageserver_config_override=self.env.pageserver.config_override,
        )

-        s3_env_vars = None
        if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
            s3_env_vars = self.env.remote_storage.access_env_vars()
+            extra_env_vars = (extra_env_vars or {}) | s3_env_vars

-        return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
+        return self.raw_cli(start_args, extra_env_vars=extra_env_vars)

    def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]":
        cmd = ["pageserver", "stop"]
@@ -1685,7 +1726,50 @@ class NeonPageserver(PgProtocol):
        self.config_override = config_override
        self.version = env.get_pageserver_version()

-    def start(self, overrides=()) -> "NeonPageserver":
+        # After a test finishes, we will scrape the log to see if there are any
+        # unexpected error messages. If your test expects an error, add it to
+        # 'allowed_errors' in the test with something like:
+        #
+        # env.pageserver.allowed_errors.append(".*could not open garage door.*")
+        #
+        # The entries in the list are regular expressions.
+        self.allowed_errors = [
+            # All tests print these, when starting up or shutting down
+            ".*wal receiver task finished with an error: walreceiver connection handling failure.*",
+            ".*Shutdown task error: walreceiver connection handling failure.*",
+            ".*Etcd client error: grpc request error: status: Unavailable.*",
+            ".*query handler for .* failed: Connection reset by peer.*",
+            ".*serving compute connection task.*exited with error: Broken pipe.*",
+            ".*Connection aborted: error communicating with the server: Broken pipe.*",
+            ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
+            ".*Connection aborted: error communicating with the server: Connection reset by peer.*",
+            ".*kill_and_wait_impl.*: wait successful.*",
+            ".*end streaming to Some.*",
+            # safekeeper connection can fail with this, in the window between timeline creation
+            # and streaming start
+            ".*Failed to process query for timeline .*: state uninitialized, no data to read.*",
+            # Tests related to authentication and authorization print these
+            ".*Error processing HTTP request: Forbidden",
+            # intentional failpoints
+            ".*failpoint ",
+            # FIXME: there is a race condition between GC and detach, see
+            # https://github.com/neondatabase/neon/issues/2442
+            ".*could not remove ephemeral file.*No such file or directory.*",
+            # FIXME: These need investigation
+            ".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
+            ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
+            ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
+            ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
+            ".*Removing intermediate uninit mark file.*",
+            # FIXME: known race condition in TaskHandle: https://github.com/neondatabase/neon/issues/2885
+            ".*sender is dropped while join handle is still alive.*",
+        ]
+
+    def start(
+        self,
+        overrides: Tuple[str, ...] = (),
+        extra_env_vars: Optional[Dict[str, str]] = None,
+    ) -> "NeonPageserver":
        """
        Start the page server.
        `overrides` allows to add some config to this pageserver start.
@@ -1693,11 +1777,11 @@ class NeonPageserver(PgProtocol):
        """
        assert self.running is False

-        self.env.neon_cli.pageserver_start(overrides=overrides)
+        self.env.neon_cli.pageserver_start(overrides=overrides, extra_env_vars=extra_env_vars)
        self.running = True
        return self

-    def stop(self, immediate=False) -> "NeonPageserver":
+    def stop(self, immediate: bool = False) -> "NeonPageserver":
        """
        Stop the page server.
        Returns self.
@@ -1707,10 +1791,15 @@ class NeonPageserver(PgProtocol):
            self.running = False
        return self

-    def __enter__(self):
+    def __enter__(self) -> "NeonPageserver":
        return self

-    def __exit__(self, exc_type, exc, tb):
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ):
        self.stop(immediate=True)

    def is_testing_enabled_or_skip(self):
@@ -1728,6 +1817,26 @@ class NeonPageserver(PgProtocol):
            is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
        )

+    def assert_no_errors(self):
+        """
+        Scan pageserver.log and fail if any ERROR or WARN line matches none of
+        the regular expressions in `self.allowed_errors`.
+        """
+        error_or_warn = re.compile("ERROR|WARN")
+        allowed = [re.compile(pattern) for pattern in self.allowed_errors]
+
+        # `with` closes the log file even if reading fails; a bare open() leaks the handle
+        with open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") as logfile:
+            errors = [
+                line
+                for line in logfile
+                # The ERROR/WARN marker may appear anywhere in the line, while
+                # allow-list patterns are anchored at the line start (re.match)
+                if error_or_warn.search(line) and not any(a.match(line) for a in allowed)
+            ]
+
+        assert not errors
+

 def append_pageserver_param_overrides(
    params_to_update: List[str],
@@ -1822,7 +1931,7 @@ def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: str) -> PgBi


 class VanillaPostgres(PgProtocol):
-    def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True):
+    def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init: bool = True):
        super().__init__(host="localhost", port=port, dbname="postgres")
        self.pgdatadir = pgdatadir
        self.pg_bin = pg_bin
@@ -1857,10 +1966,15 @@ class VanillaPostgres(PgProtocol):
        """Return size of pgdatadir subdirectory in bytes."""
        return get_dir_size(os.path.join(self.pgdatadir, subdir))

-    def __enter__(self):
+    def __enter__(self) -> "VanillaPostgres":
        return self

-    def __exit__(self, exc_type, exc, tb):
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ):
        if self.running:
            self.stop()

@@ -1900,10 +2014,15 @@ class RemotePostgres(PgProtocol):
        # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE
        raise Exception("cannot get size of a Postgres instance")

-    def __enter__(self):
+    def __enter__(self) -> "RemotePostgres":
        return self

-    def __exit__(self, exc_type, exc, tb):
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ):
        # do nothing
        pass

@@ -1942,7 +2061,7 @@ class PSQL:
        self.path = path
        self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name"

-    async def run(self, query=None):
+    async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process:
        run_args = [self.path, "--no-psqlrc", "--quiet", "--tuples-only", self.database_url]
        if query is not None:
            run_args += ["--command", query]
@@ -1961,9 +2080,9 @@ class NeonProxy(PgProtocol):
        self,
        proxy_port: int,
        http_port: int,
+        mgmt_port: int,
        neon_binpath: Path,
        auth_endpoint=None,
-        mgmt_port=None,
    ):
        super().__init__(dsn=auth_endpoint, port=proxy_port)
        self.host = "127.0.0.1"
@@ -1975,9 +2094,10 @@ class NeonProxy(PgProtocol):
        self._popen: Optional[subprocess.Popen[bytes]] = None
        self.link_auth_uri_prefix = "http://dummy-uri"

-    def start(self) -> None:
+    def start(self):
        """
-        Starts a proxy with option '--auth-backend postgres' and a postgres instance already provided though '--auth-endpoint <postgress-instance>'."
+        Starts a proxy with option '--auth-backend postgres' and a postgres instance
+        already provided through '--auth-endpoint <postgres-instance>'.
        """
        assert self._popen is None
        assert self.auth_endpoint is not None
@@ -1987,13 +2107,14 @@ class NeonProxy(PgProtocol):
            str(self.neon_binpath / "proxy"),
            *["--http", f"{self.host}:{self.http_port}"],
            *["--proxy", f"{self.host}:{self.proxy_port}"],
+            *["--mgmt", f"{self.host}:{self.mgmt_port}"],
            *["--auth-backend", "postgres"],
            *["--auth-endpoint", self.auth_endpoint],
        ]
        self._popen = subprocess.Popen(args)
        self._wait_until_ready()

-    def start_with_link_auth(self) -> None:
+    def start_with_link_auth(self):
        """
        Starts a proxy with option '--auth-backend link' and a dummy authentication link '--uri dummy-auth-link'."
        """
@@ -2021,10 +2142,15 @@ class NeonProxy(PgProtocol):
        request_result.raise_for_status()
        return request_result.text

-    def __enter__(self):
+    def __enter__(self) -> "NeonProxy":
        return self

-    def __exit__(self, exc_type, exc, tb):
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ):
        if self._popen is not None:
            # NOTE the process will die when we're done with tests anyway, because
            # it's a child process. This is mostly to clean up in between different tests.
@@ -2032,7 +2158,7 @@ class NeonProxy(PgProtocol):


@pytest.fixture(scope="function")
-def link_proxy(port_distributor, neon_binpath: Path) -> Iterator[NeonProxy]:
+def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterator[NeonProxy]:
    """Neon proxy that routes through link auth."""
    http_port = port_distributor.get_port()
    proxy_port = port_distributor.get_port()
@@ -2043,7 +2169,9 @@ def link_proxy(port_distributor, neon_binpath: Path) -> Iterator[NeonProxy]:


@pytest.fixture(scope="function")
-def static_proxy(vanilla_pg, port_distributor, neon_binpath: Path) -> Iterator[NeonProxy]:
+def static_proxy(
+    vanilla_pg: VanillaPostgres, port_distributor: PortDistributor, neon_binpath: Path
+) -> Iterator[NeonProxy]:
    """Neon proxy that routes directly to vanilla postgres."""

    # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
@@ -2056,11 +2184,13 @@ def static_proxy(vanilla_pg, port_distributor, neon_binpath: Path) -> Iterator[N
    auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"

    proxy_port = port_distributor.get_port()
+    mgmt_port = port_distributor.get_port()
    http_port = port_distributor.get_port()

    with NeonProxy(
        proxy_port=proxy_port,
        http_port=http_port,
+        mgmt_port=mgmt_port,
        neon_binpath=neon_binpath,
        auth_endpoint=auth_endpoint,
    ) as proxy:
@@ -2243,10 +2373,15 @@ class Postgres(PgProtocol):

        return self

-    def __enter__(self):
+    def __enter__(self) -> "Postgres":
        return self

-    def __exit__(self, exc_type, exc, tb):
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ):
        self.stop()


@@ -2255,7 +2390,7 @@ class PostgresFactory:

    def __init__(self, env: NeonEnv):
        self.env = env
-        self.num_instances = 0
+        self.num_instances: int = 0
        self.instances: List[Postgres] = []

    def create_start(
@@ -2350,7 +2485,7 @@ class Safekeeper:
                break  # success
        return self

-    def stop(self, immediate=False) -> "Safekeeper":
+    def stop(self, immediate: bool = False) -> "Safekeeper":
        log.info("Stopping safekeeper {}".format(self.id))
        self.env.neon_cli.safekeeper_stop(self.id, immediate)
        self.running = False
@@ -2565,7 +2700,7 @@ class Etcd:
            self.handle.wait()


-def get_test_output_dir(request: Any, top_output_dir: Path) -> Path:
+def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
    """Compute the working directory for an individual test."""
    test_name = request.node.name
    test_dir = top_output_dir / test_name.replace("/", "-")
@@ -2585,7 +2720,7 @@ def get_test_output_dir(request: Any, top_output_dir: Path) -> Path:
 # this fixture ensures that the directory exists.  That works because
 # 'autouse' fixtures are run before other fixtures.
@pytest.fixture(scope="function", autouse=True)
-def test_output_dir(request: Any, top_output_dir: Path) -> Iterator[Path]:
+def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[Path]:
    """Create the working directory for an individual test."""

    # one directory per test
@@ -2649,7 +2784,7 @@ def should_skip_file(filename: str) -> bool:
 #
 # Test helpers
 #
-def list_files_to_compare(pgdata_dir: Path):
+def list_files_to_compare(pgdata_dir: Path) -> List[str]:
    pgdata_files = []
    for root, _file, filenames in os.walk(pgdata_dir):
        for filename in filenames:
--- a/test_runner/fixtures/pg_stats.py
+++ b/test_runner/fixtures/pg_stats.py
@@ -1,3 +1,4 @@
+from functools import cached_property
 from typing import List

 import pytest
@@ -13,7 +14,7 @@ class PgStatTable:
        self.columns = columns
        self.additional_query = filter_query

-    @property
+    @cached_property
    def query(self) -> str:
        return f"SELECT {','.join(self.columns)} FROM {self.table} {self.additional_query}"

@@ -55,6 +56,5 @@ def pg_stats_wal() -> List[PgStatTable]:
        PgStatTable(
            "pg_stat_wal",
            ["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"],
-            "",
        )
    ]
--- a/test_runner/fixtures/slow.py
+++ b/test_runner/fixtures/slow.py
@@ -1,4 +1,8 @@
+from typing import Any, List
+
 import pytest
+from _pytest.config import Config
+from _pytest.config.argparsing import Parser

 """
 This plugin allows tests to be marked as slow using pytest.mark.slow. By default slow
@@ -9,15 +13,15 @@ Copied from here: https://docs.pytest.org/en/latest/example/simple.html
 """


-def pytest_addoption(parser):
+def pytest_addoption(parser: Parser):
    parser.addoption("--runslow", action="store_true", default=False, help="run slow tests")


-def pytest_configure(config):
+def pytest_configure(config: Config):
    config.addinivalue_line("markers", "slow: mark test as slow to run")


-def pytest_collection_modifyitems(config, items):
+def pytest_collection_modifyitems(config: Config, items: List[Any]):
    if config.getoption("--runslow"):
        # --runslow given in cli: do not skip slow tests
        return
--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -1,6 +1,8 @@
 import random
 from functools import total_ordering
-from typing import Union
+from typing import Any, Type, TypeVar, Union
+
+T = TypeVar("T", bound="Id")


@total_ordering
@@ -17,31 +19,35 @@ class Lsn:
            """Convert lsn from hex notation to int."""
            l, r = x.split("/")
            self.lsn_int = (int(l, 16) << 32) + int(r, 16)
-            # FIXME: error if it doesn't look like a valid LSN
+        assert 0 <= self.lsn_int <= 0xFFFFFFFF_FFFFFFFF

-    def __str__(self):
+    def __str__(self) -> str:
        """Convert lsn from int to standard hex notation."""
-        return "{:X}/{:X}".format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF)
+        return f"{(self.lsn_int >> 32):X}/{(self.lsn_int & 0xFFFFFFFF):X}"

-    def __repr__(self):
-        return 'Lsn("{:X}/{:X}")'.format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF)
+    def __repr__(self) -> str:
+        return f'Lsn("{str(self)}")'

-    def __int__(self):
+    def __int__(self) -> int:
        return self.lsn_int

-    def __lt__(self, other: "Lsn") -> bool:
+    def __lt__(self, other: Any) -> bool:
+        if not isinstance(other, Lsn):
+            return NotImplemented
        return self.lsn_int < other.lsn_int

-    def __eq__(self, other) -> bool:
+    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, Lsn):
            return NotImplemented
        return self.lsn_int == other.lsn_int

    # Returns the difference between two Lsns, in bytes
-    def __sub__(self, other: "Lsn") -> int:
+    def __sub__(self, other: Any) -> int:
+        if not isinstance(other, Lsn):
+            return NotImplemented
        return self.lsn_int - other.lsn_int

-    def __hash__(self):
+    def __hash__(self) -> int:
        return hash(self.lsn_int)


@@ -57,7 +63,7 @@ class Id:
        self.id = bytearray.fromhex(x)
        assert len(self.id) == 16

-    def __str__(self):
+    def __str__(self) -> str:
        return self.id.hex()

    def __lt__(self, other) -> bool:
@@ -70,20 +76,20 @@ class Id:
            return NotImplemented
        return self.id == other.id

-    def __hash__(self):
+    def __hash__(self) -> int:
        return hash(str(self.id))

    @classmethod
-    def generate(cls):
+    def generate(cls: Type[T]) -> T:
        """Generate a random ID"""
        return cls(random.randbytes(16).hex())


 class TenantId(Id):
-    def __repr__(self):
-        return f'`TenantId("{self.id.hex()}")'
+    def __repr__(self) -> str:
+        return f'TenantId("{self.id.hex()}")'


 class TimelineId(Id):
-    def __repr__(self):
+    def __repr__(self) -> str:
        return f'TimelineId("{self.id.hex()}")'
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -6,7 +6,7 @@ import subprocess
 import tarfile
 import time
 from pathlib import Path
-from typing import Any, Callable, List, Tuple, TypeVar
+from typing import Any, Callable, Dict, List, Tuple, TypeVar

 import allure  # type: ignore
 from fixtures.log_helper import log
@@ -17,7 +17,6 @@ Fn = TypeVar("Fn", bound=Callable[..., Any])

 def get_self_dir() -> Path:
    """Get the path to the directory where this script lives."""
-    # return os.path.dirname(os.path.abspath(__file__))
    return Path(__file__).resolve().parent


@@ -31,11 +30,11 @@ def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str:
    If those files already exist, we will overwrite them.
    Returns basepath for files with captured output.
    """
-    assert type(cmd) is list
-    base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
+    assert isinstance(cmd, list)
+    base = f"{os.path.basename(cmd[0])}_{global_counter()}"
    basepath = os.path.join(capture_dir, base)
-    stdout_filename = basepath + ".stdout"
-    stderr_filename = basepath + ".stderr"
+    stdout_filename = f"{basepath}.stdout"
+    stderr_filename = f"{basepath}.stderr"

    try:
        with open(stdout_filename, "w") as stdout_f:
@@ -65,7 +64,7 @@ def global_counter() -> int:
    return _global_counter


-def print_gc_result(row):
+def print_gc_result(row: Dict[str, Any]):
    log.info("GC duration {elapsed} ms".format_map(row))
    log.info(
        "  total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr}"
@@ -79,8 +78,7 @@ def etcd_path() -> Path:
    path_output = shutil.which("etcd")
    if path_output is None:
        raise RuntimeError("etcd not found in PATH")
-    else:
-        return Path(path_output)
+    return Path(path_output)


 def query_scalar(cur: cursor, query: str) -> Any:
@@ -125,7 +123,6 @@ def get_timeline_dir_size(path: Path) -> int:
            # file is a delta layer
            _ = parse_delta_layer(dir_entry.name)
            sz += dir_entry.stat().st_size
-            continue
    return sz


@@ -158,8 +155,8 @@ def get_scale_for_db(size_mb: int) -> int:
    return round(0.06689 * size_mb - 0.5)


-ATTACHMENT_NAME_REGEX = re.compile(
-    r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs|.+\.html"
+ATTACHMENT_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
+    r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
 )


--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -1,3 +1,22 @@
+# Running locally
+
+First make a release build. The profiling flag is optional, used only for tests that
+generate flame graphs. The `-s` flag just silences a lot of output, and makes it
+easier to see if you have compile errors without scrolling up.
+`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8`
+
+NOTE: the `profiling` flag only works on Linux because we use Linux-specific
+libc APIs like `libc::timer_t`.
+
+Then run the tests:
+`NEON_BIN=./target/release poetry run pytest test_runner/performance`
+
+Some handy pytest flags for local development:
+- `-x` tells pytest to stop on first error
+- `-s` shows test output
+- `-k` selects a test to run
+- `--timeout=0` disables our default timeout of 300s (see `setup.cfg`)
+
 # What performance tests do we have and how we run them

 Performance tests are built using the same infrastructure as our usual python integration tests. There are some extra fixtures that help to collect performance metrics, and to run tests against both vanilla PostgreSQL and Neon for comparison.
--- a/test_runner/performance/test_branching.py
+++ b/test_runner/performance/test_branching.py
@@ -4,6 +4,7 @@ from typing import List

 from fixtures.benchmark_fixture import PgBenchRunResult
 from fixtures.compare_fixtures import NeonCompare
+from fixtures.neon_fixtures import fork_at_current_lsn
 from performance.test_perf_pgbench import utc_now_timestamp

 # -----------------------------------------------------------------------
@@ -43,7 +44,8 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare):
    pg_root = env.postgres.create_start("root")
    pg_bin.run_capture(["pgbench", "-i", pg_root.connstr(), "-s10"])

-    env.neon_cli.create_branch("child", "root")
+    fork_at_current_lsn(env, pg_root, "child", "root")
+
    pg_child = env.postgres.create_start("child")

    run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", pg_root.connstr()])
--- a/test_runner/performance/test_read_trace.py
+++ b/test_runner/performance/test_read_trace.py
@@ -0,0 +1,31 @@
+from contextlib import closing
+
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+# This test demonstrates how to collect a read trace. It's useful until
+# it gets replaced by a test that actually does stuff with the trace.
+def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start()
+
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            "trace_read_requests": "true",
+        }
+    )
+
+    timeline = env.neon_cli.create_timeline("test_trace_replay", tenant_id=tenant)
+    pg = env.postgres.create_start("test_trace_replay", "main", tenant)
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("create table t (i integer);")
+            cur.execute(f"insert into t values (generate_series(1,{10000}));")
+            cur.execute("select count(*) from t;")
+
+    # Stop pg so we drop the connection and flush the traces
+    pg.stop()
+
+    trace_path = env.repo_dir / "traces" / str(tenant) / str(timeline)
+    assert trace_path.exists()
--- a/test_runner/performance/test_seqscans.py
+++ b/test_runner/performance/test_seqscans.py
@@ -6,6 +6,7 @@ import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.compare_fixtures import PgCompare
 from fixtures.log_helper import log
+from pytest_lazyfixture import lazy_fixture  # type: ignore


@pytest.mark.parametrize(
@@ -20,11 +21,24 @@ from fixtures.log_helper import log
        pytest.param(10000000, 1, 4),
    ],
 )
-def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers: int):
-    env = neon_with_baseline
+@pytest.mark.parametrize(
+    "env, scale",
+    [
+        # Run on all envs. Use 50x larger table on remote cluster to make sure
+        # it doesn't fit in shared buffers, which are larger on remote than local.
+        pytest.param(lazy_fixture("neon_compare"), 1, id="neon"),
+        pytest.param(lazy_fixture("vanilla_compare"), 1, id="vanilla"),
+        pytest.param(
+            lazy_fixture("remote_compare"), 50, id="remote", marks=pytest.mark.remote_cluster
+        ),
+    ],
+)
+def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: int):
+    rows = scale * rows

    with closing(env.pg.connect()) as conn:
        with conn.cursor() as cur:
+            cur.execute("drop table if exists t;")
            cur.execute("create table t (i integer);")
            cur.execute(f"insert into t values (generate_series(1,{rows}));")

--- a/test_runner/performance/test_wal_backpressure.py
+++ b/test_runner/performance/test_wal_backpressure.py
@@ -2,7 +2,7 @@ import statistics
 import threading
 import time
 import timeit
-from typing import Callable
+from typing import Any, Callable, List

 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
@@ -197,7 +197,7 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte
    if not isinstance(env, NeonCompare):
        return

-    lsn_write_lags = []
+    lsn_write_lags: List[Any] = []
    last_received_lsn = Lsn(0)
    last_pg_flush_lsn = Lsn(0)

@@ -216,6 +216,7 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte
            )

            res = cur.fetchone()
+            assert isinstance(res, tuple)
            lsn_write_lags.append(res[0])

            curr_received_lsn = Lsn(res[3])
--- a/test_runner/pg_clients/python/asyncpg/asyncpg_example.py
+++ b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py
@@ -24,7 +24,6 @@ if __name__ == "__main__":
        if (v := os.environ.get(k, None)) is not None
    }

-    loop = asyncio.new_event_loop()
-    row = loop.run_until_complete(run(**kwargs))
+    row = asyncio.run(run(**kwargs))

    print(row[0])
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -116,6 +116,13 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http_client = env.pageserver.http_client()

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*invalid branch start lsn: less than latest GC cutoff.*",
+            ".*invalid branch start lsn: less than planned GC cutoff.*",
+        ]
+    )
+
    # Disable background GC but set the `pitr_interval` to be small, so GC can delete something
    tenant, _ = env.neon_cli.create_tenant(
        conf={
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -13,6 +13,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
    env = neon_env_builder.init_start()

+    env.pageserver.allowed_errors.append(".*invalid branch start lsn.*")
+    env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*")
+
    # Branch at the point where only 100 rows were inserted
    env.neon_cli.create_branch("test_branch_behind")
    pgmain = env.postgres.create_start("test_branch_behind")
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -11,10 +11,17 @@ from fixtures.types import TenantId, TimelineId
 # Test restarting page server, while safekeeper and compute node keep
 # running.
 def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
-    # One safekeeper is enough for this test.
-    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*No timelines to attach received.*",
+            ".*Failed to process timeline dir contents.*",
+            ".*Failed to load delta layer.*",
+            ".*Timeline .* was not found.*",
+        ]
+    )
+
    tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = []

    for n in range(4):
@@ -72,23 +79,24 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
    # First timeline would not get loaded into pageserver due to corrupt metadata file
    with pytest.raises(Exception, match=f"Timeline {tenant1}/{timeline1} was not found") as err:
        pg1.start()
-    log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
+    log.info(
+        f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
+    )

    # Second timeline has no ancestors, only the metadata file and no layer files
    # We don't have the remote storage enabled, which means timeline is in an incorrect state,
    # it's not loaded at all
    with pytest.raises(Exception, match=f"Timeline {tenant2}/{timeline2} was not found") as err:
        pg2.start()
-    log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
+    log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")

-    # Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline
-    for n in range(3, 4):
-        (bad_tenant, bad_timeline, pg) = tenant_timelines[n]
-        with pytest.raises(Exception, match="extracting base backup failed") as err:
-            pg.start()
-        log.info(
-            f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}"
-        )
+    # Third timeline will also fail during basebackup, because the layer file is corrupt.
+    # (We don't check layer file contents on startup, when loading the timeline)
+    with pytest.raises(Exception, match="Failed to load delta layer") as err:
+        pg3.start()
+    log.info(
+        f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
+    )


 def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
@@ -111,6 +119,13 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http = env.pageserver.http_client()

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
+            ".*Timeline got dropped without initializing, cleaning its files.*",
+        ]
+    )
+
    tenant_id, _ = env.neon_cli.create_tenant()

    timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -1,12 +1,12 @@
 import os
-import re
 import shutil
 import subprocess
 from pathlib import Path
-from typing import Any, Dict, Union
+from typing import Any, Optional

 import pytest
-import toml
+import toml  # TODO: replace with tomllib for Python >= 3.11
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonCli,
    NeonEnvBuilder,
@@ -19,94 +19,197 @@ from fixtures.neon_fixtures import (
 from fixtures.types import Lsn
 from pytest import FixtureRequest

-DEFAILT_LOCAL_SNAPSHOT_DIR = "test_output/test_prepare_snapshot/compatibility_snapshot_pg14"
+#
+# A test suite that helps to prevent unintentionally breaking backward or forward compatibility between Neon releases.
+# - `test_create_snapshot` is a script wrapped in a test that creates a data snapshot.
+# - `test_backward_compatibility` checks that the current version of Neon can start/read/interact with a data snapshot created by the previous version.
+#   The path to the snapshot is configured by the COMPATIBILITY_SNAPSHOT_DIR environment variable.
+#   If the breakage is intentional, the test can be xfailed by setting ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE=true.
+# - `test_forward_compatibility` checks that a snapshot created by the current version can be started/read/interacted with by the previous version of Neon.
+#   Paths to Neon and Postgres are configured by the COMPATIBILITY_NEON_BIN and COMPATIBILITY_POSTGRES_DISTRIB_DIR environment variables.
+#   If the breakage is intentional, the test can be xfailed by setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true.
+#
+# The file contains a couple of helper functions:
+# - prepare_snapshot copies the snapshot, cleans it up and makes it ready for the current version of Neon (replaces paths and ports in config files).
+# - check_neon_works performs the test itself, feel free to add more checks there.
+#


-def dump_differs(first: Path, second: Path, output: Path) -> bool:
-    """
-    Runs diff(1) command on two SQL dumps and write the output to the given output file.
-    Returns True if the dumps differ, False otherwise.
-    """
+# Note: if renaming this test, don't forget to update a reference to it in a workflow file:
+# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml
+@pytest.mark.xdist_group("compatibility")
+@pytest.mark.order(before="test_forward_compatibility")
+def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path):
+    # The test doesn't really test anything
+    # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`.
+    #
+    # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it.
+    neon_env_builder.pg_version = "14"
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_local_fs_remote_storage()

-    with output.open("w") as stdout:
-        rv = subprocess.run(
-            [
-                "diff",
-                "--unified",  # Make diff output more readable
-                "--ignore-matching-lines=^--",  # Ignore changes in comments
-                "--ignore-blank-lines",
-                str(first),
-                str(second),
-            ],
-            stdout=stdout,
-        )
+    env = neon_env_builder.init_start()
+    pg = env.postgres.create_start("main")

-    return rv.returncode != 0
+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
+    pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()])
+    pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()])
+    pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"])
+
+    snapshot_config = toml.load(test_output_dir / "repo" / "config")
+    tenant_id = snapshot_config["default_tenant_id"]
+    timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
+
+    pageserver_http = env.pageserver.http_client()
+    lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+
+    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
+
+    env.postgres.stop_all()
+    for sk in env.safekeepers:
+        sk.stop()
+    env.pageserver.stop()
+
+    shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14")
+    # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it


-class PortReplacer(object):
-    """
-    Class-helper for replacing ports in config files.
-    """
-
-    def __init__(self, port_distributor: PortDistributor):
-        self.port_distributor = port_distributor
-        self.port_map: Dict[int, int] = {}
-
-    def replace_port(self, value: Union[int, str]) -> Union[int, str]:
-        if isinstance(value, int):
-            if (known_port := self.port_map.get(value)) is not None:
-                return known_port
-
-            self.port_map[value] = self.port_distributor.get_port()
-            return self.port_map[value]
-
-        if isinstance(value, str):
-            # Use regex to find port in a string
-            # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432"
-            # See https://bugs.python.org/issue27657
-            ports = re.findall(r":(\d+)(?:/|$)", value)
-            assert len(ports) == 1, f"can't find port in {value}"
-            port_int = int(ports[0])
-
-            if (known_port := self.port_map.get(port_int)) is not None:
-                return value.replace(f":{port_int}", f":{known_port}")
-
-            self.port_map[port_int] = self.port_distributor.get_port()
-            return value.replace(f":{port_int}", f":{self.port_map[port_int]}")
-
-        raise TypeError(f"unsupported type {type(value)} of {value=}")
-
-
-@pytest.mark.order(after="test_prepare_snapshot")
+@pytest.mark.xdist_group("compatibility")
+@pytest.mark.order(after="test_create_snapshot")
 def test_backward_compatibility(
    pg_bin: PgBin,
    port_distributor: PortDistributor,
    test_output_dir: Path,
-    request: FixtureRequest,
    neon_binpath: Path,
    pg_distrib_dir: Path,
+    pg_version: str,
+    request: FixtureRequest,
 ):
-    compatibility_snapshot_dir = Path(
-        os.environ.get("COMPATIBILITY_SNAPSHOT_DIR", DEFAILT_LOCAL_SNAPSHOT_DIR)
-    )
-    assert compatibility_snapshot_dir.exists(), (
-        f"{compatibility_snapshot_dir} doesn't exist. Please run `test_prepare_snapshot` test first "
-        "to create the snapshot or set COMPATIBILITY_SNAPSHOT_DIR env variable to the existing snapshot"
-    )
-    compatibility_snapshot_dir = compatibility_snapshot_dir.resolve()
+    compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR")
+    assert (
+        compatibility_snapshot_dir_env is not None
+    ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)"
+    compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve()

-    # Make compatibility snapshot artifacts pickupable by Allure
-    # by copying the snapshot directory to the curent test output directory.
-    repo_dir = test_output_dir / "compatibility_snapshot" / "repo"
+    # Copy the snapshot to current directory, and prepare for the test
+    prepare_snapshot(
+        from_dir=compatibility_snapshot_dir,
+        to_dir=test_output_dir / "compatibility_snapshot",
+        port_distributor=port_distributor,
+    )

-    shutil.copytree(compatibility_snapshot_dir / "repo", repo_dir)
+    breaking_changes_allowed = (
+        os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
+    )
+    try:
+        check_neon_works(
+            test_output_dir / "compatibility_snapshot" / "repo",
+            neon_binpath,
+            pg_distrib_dir,
+            pg_version,
+            port_distributor,
+            test_output_dir,
+            pg_bin,
+            request,
+        )
+    except Exception:
+        if breaking_changes_allowed:
+            pytest.xfail(
+                "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE env var"
+            )
+        else:
+            raise
+
+    assert (
+        not breaking_changes_allowed
+    ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
+
+
+@pytest.mark.xdist_group("compatibility")
+@pytest.mark.order(after="test_create_snapshot")
+def test_forward_compatibility(
+    test_output_dir: Path,
+    port_distributor: PortDistributor,
+    pg_version: str,
+    request: FixtureRequest,
+):
+    compatibility_neon_bin_env = os.environ.get("COMPATIBILITY_NEON_BIN")
+    assert compatibility_neon_bin_env is not None, (
+        "COMPATIBILITY_NEON_BIN is not set. It should be set to a path with Neon binaries "
+        "(ideally generated by the previous version of Neon)"
+    )
+    compatibility_neon_bin = Path(compatibility_neon_bin_env).resolve()
+
+    compatibility_postgres_distrib_dir_env = os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR")
+    assert (
+        compatibility_postgres_distrib_dir_env is not None
+    ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)"
+    compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve()
+
+    compatibility_snapshot_dir = (
+        test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14"
+    )
+    # Copy the snapshot to current directory, and prepare for the test
+    prepare_snapshot(
+        from_dir=compatibility_snapshot_dir,
+        to_dir=test_output_dir / "compatibility_snapshot",
+        port_distributor=port_distributor,
+        pg_distrib_dir=compatibility_postgres_distrib_dir,
+    )
+
+    breaking_changes_allowed = (
+        os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
+    )
+    try:
+        check_neon_works(
+            test_output_dir / "compatibility_snapshot" / "repo",
+            compatibility_neon_bin,
+            compatibility_postgres_distrib_dir,
+            pg_version,
+            port_distributor,
+            test_output_dir,
+            PgBin(test_output_dir, compatibility_postgres_distrib_dir, pg_version),
+            request,
+        )
+    except Exception:
+        if breaking_changes_allowed:
+            pytest.xfail(
+                "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE env var"
+            )
+        else:
+            raise
+
+    assert (
+        not breaking_changes_allowed
+    ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
+
+
+def prepare_snapshot(
+    from_dir: Path,
+    to_dir: Path,
+    port_distributor: PortDistributor,
+    pg_distrib_dir: Optional[Path] = None,
+):
+    assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist"
+    assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory"
+    assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql"
+
+    log.info(f"Copying snapshot from {from_dir} to {to_dir}")
+    shutil.copytree(from_dir, to_dir)
+
+    repo_dir = to_dir / "repo"

    # Remove old logs to avoid confusion in test artifacts
    for logfile in repo_dir.glob("**/*.log"):
        logfile.unlink()

-    # Remove tenants data for computes
+    # Remove tenants data for compute
    for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"):
        shutil.rmtree(tenant)

@@ -115,41 +218,45 @@ def test_backward_compatibility(
        shutil.rmtree(tenant / "wal-redo-datadir.___temp")

    # Update paths and ports in config files
-    pr = PortReplacer(port_distributor)
-
    pageserver_toml = repo_dir / "pageserver.toml"
    pageserver_config = toml.load(pageserver_toml)
-    new_local_path = pageserver_config["remote_storage"]["local_path"].replace(
-        "/test_prepare_snapshot/",
-        "/test_backward_compatibility/compatibility_snapshot/",
+    pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
+    pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
+        pageserver_config["listen_http_addr"]
+    )
+    pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port(
+        pageserver_config["listen_pg_addr"]
    )
-
-    pageserver_config["remote_storage"]["local_path"] = new_local_path
-    pageserver_config["listen_http_addr"] = pr.replace_port(pageserver_config["listen_http_addr"])
-    pageserver_config["listen_pg_addr"] = pr.replace_port(pageserver_config["listen_pg_addr"])
    pageserver_config["broker_endpoints"] = [
-        pr.replace_port(ep) for ep in pageserver_config["broker_endpoints"]
+        port_distributor.replace_with_new_port(ep) for ep in pageserver_config["broker_endpoints"]
    ]

+    if pg_distrib_dir:
+        pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir)
+
    with pageserver_toml.open("w") as f:
        toml.dump(pageserver_config, f)

    snapshot_config_toml = repo_dir / "config"
    snapshot_config = toml.load(snapshot_config_toml)
    snapshot_config["etcd_broker"]["broker_endpoints"] = [
-        pr.replace_port(ep) for ep in snapshot_config["etcd_broker"]["broker_endpoints"]
+        port_distributor.replace_with_new_port(ep)
+        for ep in snapshot_config["etcd_broker"]["broker_endpoints"]
    ]
-    snapshot_config["pageserver"]["listen_http_addr"] = pr.replace_port(
+    snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port(
        snapshot_config["pageserver"]["listen_http_addr"]
    )
-    snapshot_config["pageserver"]["listen_pg_addr"] = pr.replace_port(
+    snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port(
        snapshot_config["pageserver"]["listen_pg_addr"]
    )
    for sk in snapshot_config["safekeepers"]:
-        sk["http_port"] = pr.replace_port(sk["http_port"])
-        sk["pg_port"] = pr.replace_port(sk["pg_port"])
+        sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
+        sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])

-    with (snapshot_config_toml).open("w") as f:
+    if pg_distrib_dir:
+        snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
+
+    with snapshot_config_toml.open("w") as f:
        toml.dump(snapshot_config, f)

    # Ensure that snapshot doesn't contain references to the original path
@@ -159,7 +266,7 @@ def test_backward_compatibility(
            "--recursive",
            "--binary-file=without-match",
            "--files-with-matches",
-            "test_prepare_snapshot/repo",
+            "test_create_snapshot/repo",
            str(repo_dir),
        ],
        capture_output=True,
@@ -167,44 +274,47 @@ def test_backward_compatibility(
    )
    assert (
        rv.returncode != 0
-    ), f"there're files referencing `test_prepare_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"
+    ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"

-    # NeonEnv stub to make NeonCli happy
+
+def check_neon_works(
+    repo_dir: Path,
+    neon_binpath: Path,
+    pg_distrib_dir: Path,
+    pg_version: str,
+    port_distributor: PortDistributor,
+    test_output_dir: Path,
+    pg_bin: PgBin,
+    request: FixtureRequest,
+):
+    snapshot_config_toml = repo_dir / "config"
+    snapshot_config = toml.load(snapshot_config_toml)
+    snapshot_config["neon_distrib_dir"] = str(neon_binpath)
+    snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir)
+    with (snapshot_config_toml).open("w") as f:
+        toml.dump(snapshot_config, f)
+
+    # TODO: replace with NeonEnvBuilder / NeonEnv
    config: Any = type("NeonEnvStub", (object,), {})
    config.rust_log_override = None
    config.repo_dir = repo_dir
-    config.pg_version = "14"  # Note: `pg_dumpall` (from pg_bin) version is set by DEFAULT_PG_VERSION_DEFAULT and can be overriden by DEFAULT_PG_VERSION env var
+    config.pg_version = pg_version
    config.initial_tenant = snapshot_config["default_tenant_id"]
    config.neon_binpath = neon_binpath
    config.pg_distrib_dir = pg_distrib_dir

-    # Check that we can start the project
    cli = NeonCli(config)
-    try:
-        cli.raw_cli(["start"])
-        request.addfinalizer(lambda: cli.raw_cli(["stop"]))
+    cli.raw_cli(["start"])
+    request.addfinalizer(lambda: cli.raw_cli(["stop"]))

-        result = cli.pg_start("main", port=port_distributor.get_port())
-        request.addfinalizer(lambda: cli.pg_stop("main"))
-    except Exception:
-        breaking_changes_allowed = (
-            os.environ.get("ALLOW_BREAKING_CHANGES", "false").lower() == "true"
-        )
-        if breaking_changes_allowed:
-            pytest.xfail("Breaking changes are allowed by ALLOW_BREAKING_CHANGES env var")
-        else:
-            raise
+    pg_port = port_distributor.get_port()
+    cli.pg_start("main", port=pg_port)
+    request.addfinalizer(lambda: cli.pg_stop("main"))

-    connstr_all = re.findall(r"Starting postgres node at '([^']+)'", result.stdout)
-    assert len(connstr_all) == 1, f"can't parse connstr from {result.stdout}"
-    connstr = connstr_all[0]
-
-    # Check that the project produces the same dump as the previous version.
-    # The assert itself deferred to the end of the test
-    # to allow us to perform checks that change data before failing
+    connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres"
    pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"])
    initial_dump_differs = dump_differs(
-        compatibility_snapshot_dir / "dump.sql",
+        repo_dir.parent / "dump.sql",
        test_output_dir / "dump.sql",
        test_output_dir / "dump.filediff",
    )
@@ -242,38 +352,23 @@ def test_backward_compatibility(
    assert not initial_dump_differs, "initial dump differs"


-# Note: if renaming this test, don't forget to update a reference to it in a workflow file:
-# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml
-def test_prepare_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path):
-    # The test doesn't really test anything
-    # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`.
-    #
-    # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it.
-    neon_env_builder.pg_version = "14"
-    neon_env_builder.num_safekeepers = 3
-    neon_env_builder.enable_local_fs_remote_storage()
+def dump_differs(first: Path, second: Path, output: Path) -> bool:
+    """
+    Runs diff(1) command on two SQL dumps and write the output to the given output file.
+    Returns True if the dumps differ, False otherwise.
+    """

-    env = neon_env_builder.init_start()
-    pg = env.postgres.create_start("main")
-    pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()])
-    pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()])
-    pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"])
+    with output.open("w") as stdout:
+        rv = subprocess.run(
+            [
+                "diff",
+                "--unified",  # Make diff output more readable
+                "--ignore-matching-lines=^--",  # Ignore changes in comments
+                "--ignore-blank-lines",
+                str(first),
+                str(second),
+            ],
+            stdout=stdout,
+        )

-    snapshot_config = toml.load(test_output_dir / "repo" / "config")
-    tenant_id = snapshot_config["default_tenant_id"]
-    timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
-
-    pageserver_http = env.pageserver.http_client()
-    lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
-
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
-
-    env.postgres.stop_all()
-    for sk in env.safekeepers:
-        sk.stop()
-    env.pageserver.stop()
-
-    shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14")
-    # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it
+    return rv.returncode != 0
--- a/test_runner/regress/test_compute_ctl.py
+++ b/test_runner/regress/test_compute_ctl.py
@@ -179,7 +179,16 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    # run compute_ctl and wait for 10s
    try:
        ctl.raw_cli(
-            ["--connstr", ps_connstr, "--pgdata", pgdata, "--spec", spec, "--pgbin", pg_bin_path],
+            [
+                "--connstr",
+                "postgres://invalid/",
+                "--pgdata",
+                pgdata,
+                "--spec",
+                spec,
+                "--pgbin",
+                pg_bin_path,
+            ],
            timeout=10,
        )
    except TimeoutExpired as exc:
--- a/Show More
+++ b/Show More