Add 'neon_seqscan_rel' test function, to test sequential scan performance.

Usage:

    postgres=# \timing
    Timing is on.
    postgres=# select neon_seqscan_rel('pgbench_accounts', 1000);
    INFO: scanning 491804 blocks, prefetch 1000
    INFO: blk 0/491804
    INFO: blk 1024/491804
    INFO: blk 2048/491804
    INFO: blk 3072/491804
    ...
    INFO: blk 489472/491804
    INFO: blk 490496/491804
    INFO: blk 491520/491804
     neon_seqscan_rel
    ------------------

    (1 row)

    Time: 57517.979 ms (00:57.518)

The second argument to the function is the number of pages to prefetch.

Note: the prefetching in this function works differently from the prefetching we have for sequential scans in 'main'. After receiving the result for a block, it immediately sends the request for the next page; it doesn't send requests in batches like 'main' does.
2026-04-20 16:00:37 +00:00 · 2022-10-21 13:30:35 +03:00
154 changed files with 2486 additions and 7896 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -73,14 +73,6 @@ runs:
      shell: bash -euxo pipefail {0}
      run: ./scripts/pysync

-    - name: Download compatibility snapshot for Postgres 14
-      if: inputs.build_type != 'remote'
-      uses: ./.github/actions/download
-      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
-        path: /tmp/compatibility_snapshot_pg14
-        prefix: latest
-
    - name: Run pytest
      env:
        NEON_BIN: /tmp/neon/bin
@@ -88,8 +80,6 @@ runs:
        BUILD_TYPE: ${{ inputs.build_type }}
        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
-        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
-        ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
      shell: bash -euxo pipefail {0}
      run: |
        # PLATFORM will be embedded in the perf test report
@@ -164,15 +154,6 @@ runs:
          scripts/generate_and_push_perf_report.sh
        fi

-    - name: Upload compatibility snapshot for Postgres 14
-      if: github.ref_name == 'release'
-      uses: ./.github/actions/upload
-      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
-        # The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
-        path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
-        prefix: latest
-
    - name: Create Allure report
      if: always()
      uses: ./.github/actions/allure-report
--- a/.github/ansible/neon-stress.hosts.yaml
+++ b/.github/ansible/neon-stress.hosts.yaml
@@ -3,6 +3,7 @@ storage:
    bucket_name: neon-storage-ireland
    bucket_region: eu-west-1
    console_mgmt_base_url: http://neon-stress-console.local
+    env_name: neon-stress
    etcd_endpoints: neon-stress-etcd.local:2379
    safekeeper_enable_s3_offload: 'false'
    pageserver_config_stub:
@@ -11,7 +12,6 @@ storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: neon-stress/wal
    hostname_suffix: ".local"
    remote_user: admin
  children:
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -1,35 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-prod-storage-ap-southeast-1
-    bucket_region: ap-southeast-1
-    console_mgmt_base_url: http://console-release.local
-    etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
-    hostname_suffix: ""
-    remote_user: ssm-user
-    ansible_aws_ssm_region: ap-southeast-1
-    ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
-    console_region_id: aws-ap-southeast-1
-
-  children:
-    pageservers:
-      hosts:
-        pageserver-0.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-064de8ea28bdb495b
-        pageserver-1.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-0b180defcaeeb6b93
-
-    safekeepers:
-      hosts:
-        safekeeper-0.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-0d6f1dc5161eef894
-        safekeeper-1.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-0e338adda8eb2d19f
-        safekeeper-2.ap-southeast-1.aws.neon.tech:
-          ansible_host:  i-04fb63634e4679eb9
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -1,35 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-prod-storage-eu-central-1
-    bucket_region: eu-central-1
-    console_mgmt_base_url: http://console-release.local
-    etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
-    hostname_suffix: ""
-    remote_user: ssm-user
-    ansible_aws_ssm_region: eu-central-1
-    ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
-    console_region_id: aws-eu-central-1
-
-  children:
-    pageservers:
-      hosts:
-        pageserver-0.eu-central-1.aws.neon.tech:
-          ansible_host:  i-0cd8d316ecbb715be
-        pageserver-1.eu-central-1.aws.neon.tech:
-          ansible_host:  i-090044ed3d383fef0
-
-    safekeepers:
-      hosts:
-        safekeeper-0.eu-central-1.aws.neon.tech:
-          ansible_host:  i-0b238612d2318a050
-        safekeeper-1.eu-central-1.aws.neon.tech:
-          ansible_host:  i-07b9c45e5c2637cd4
-        safekeeper-2.eu-central-1.aws.neon.tech:
-          ansible_host:  i-020257302c3c93d88
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ b/.github/ansible/prod.us-east-2.hosts.yaml
@@ -1,36 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-prod-storage-us-east-2
-    bucket_region: us-east-2
-    console_mgmt_base_url: http://console-release.local
-    etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
-    hostname_suffix: ""
-    remote_user: ssm-user
-    ansible_aws_ssm_region: us-east-2
-    ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
-    console_region_id: aws-us-east-2
-
-  children:
-    pageservers:
-      hosts:
-        pageserver-0.us-east-2.aws.neon.tech:
-          ansible_host:  i-062227ba7f119eb8c
-        pageserver-1.us-east-2.aws.neon.tech:
-          ansible_host:  i-0b3ec0afab5968938
-
-    safekeepers:
-      hosts:
-        safekeeper-0.us-east-2.aws.neon.tech:
-          ansible_host:  i-0e94224750c57d346
-        safekeeper-1.us-east-2.aws.neon.tech:
-          ansible_host:  i-06d113fb73bfddeb0
-        safekeeper-2.us-east-2.aws.neon.tech:
-          ansible_host:  i-09f66c8e04afff2e8
-          
--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -1,6 +1,7 @@
 ---
 storage:
  vars:
+    env_name: prod-1
    console_mgmt_base_url: http://console-release.local
    bucket_name: zenith-storage-oregon
    bucket_region: us-west-2
@@ -11,7 +12,6 @@ storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: prod-1/wal
    hostname_suffix: ".local"
    remote_user: admin

--- a/.github/ansible/ssm_config
+++ b/.github/ansible/ssm_config
@@ -1,2 +1,3 @@
 ansible_connection: aws_ssm
+ansible_aws_ssm_bucket_name: neon-dev-bucket
 ansible_python_interpreter: /usr/bin/python3
--- a/.github/ansible/staging.hosts.yaml
+++ b/.github/ansible/staging.hosts.yaml
@@ -3,6 +3,7 @@ storage:
    bucket_name: zenith-staging-storage-us-east-1
    bucket_region: us-east-1
    console_mgmt_base_url: http://console-staging.local
+    env_name: us-stage
    etcd_endpoints: zenith-us-stage-etcd.local:2379
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
@@ -10,7 +11,6 @@ storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: us-stage/wal
    hostname_suffix: ".local"
    remote_user: admin

--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -3,6 +3,7 @@ storage:
    bucket_name: neon-staging-storage-us-east-2
    bucket_region: us-east-2
    console_mgmt_base_url: http://console-staging.local
+    env_name: us-stage
    etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
@@ -10,11 +11,9 @@ storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
        prefix_in_bucket: "pageserver/v1"
-    safekeeper_s3_prefix: safekeeper/v1/wal
    hostname_suffix: ""
    remote_user: ssm-user
    ansible_aws_ssm_region: us-east-2
-    ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
    console_region_id: aws-us-east-2

  children:
--- a/.github/ansible/systemd/safekeeper.service
+++ b/.github/ansible/systemd/safekeeper.service
@@ -6,7 +6,7 @@ After=network.target auditd.service
 Type=simple
 User=safekeeper
 Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -1,31 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
-  domain: "*.us-east-2.aws.neon.build"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: dev
-  zenith_region: us-east-2
-  zenith_region_slug: us-east-2
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -1,31 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
-  domain: "*.ap-southeast-1.aws.neon.tech"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: prod
-  zenith_region: ap-southeast-1
-  zenith_region_slug: ap-southeast-1
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -1,31 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
-  domain: "*.eu-central-1.aws.neon.tech"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: prod
-  zenith_region: eu-central-1
-  zenith_region_slug: eu-central-1
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -1,31 +0,0 @@
-# Helm chart values for neon-proxy-scram.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
-  domain: "*.us-east-2.aws.neon.tech"
-
-# -- Additional labels for neon-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: prod
-  zenith_region: us-east-2
-  zenith_region_slug: us-east-2
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
-
-#metrics:
-#  enabled: true
-#  serviceMonitor:
-#    enabled: true
-#    selector:
-#      release: kube-prometheus-stack
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -127,8 +127,8 @@ jobs:
            target/
          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
          key: |
-            v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
-            v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-
+            v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
+            v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-

      - name: Cache postgres v14 build
        id: cache_pg_14
@@ -168,11 +168,6 @@ jobs:
          ${cov_prefix} cargo test $CARGO_FLAGS
        shell: bash -euxo pipefail {0}

-      - name: Slim binaries
-        run: |
-          scripts/strip-useless-debug.py -j$(nproc) target
-        shell: bash -euxo pipefail {0}
-
      - name: Install rust binaries
        run: |
          # Install target binaries
@@ -394,7 +389,7 @@ jobs:
            !~/.cargo/registry/src
            ~/.cargo/git/
            target/
-          key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
+          key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}

      - name: Get Neon artifact
        uses: ./.github/actions/download
@@ -630,11 +625,11 @@ jobs:
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'
        run: |
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -761,9 +756,9 @@ jobs:
    defaults:
      run:
        shell: bash
-    strategy:
-      matrix:
-        target_region: [ us-east-2 ]
+    env:
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -786,47 +781,7 @@ jobs:
          fi

          ansible-galaxy collection install sivel.toiletwater
-          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
-          rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-prod-new:
-    runs-on: prod
-    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
-    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
-    if: |
-      (github.ref_name == 'release') &&
-      github.event_name != 'workflow_dispatch'
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Redeploy
-        run: |
-          export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          cd "$(pwd)/.github/ansible"
-
-          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            ./get_binaries.sh
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            RELEASE=true ./get_binaries.sh
-          else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
-            exit 1
-          fi
-
-          ansible-galaxy collection install sivel.toiletwater
-          ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
+          ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
          rm -f neon_install.tar.gz .neon_current_version

  deploy-proxy:
@@ -870,94 +825,3 @@ jobs:
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
-
-  deploy-proxy-new:
-    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
-    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
-    if: |
-      (github.ref_name == 'main') &&
-      github.event_name != 'workflow_dispatch'
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        include:
-          - target_region:  us-east-2
-            target_cluster: dev-us-east-2-beta
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Configure environment
-        run: |
-          helm repo add neondatabase https://neondatabase.github.io/helm-charts
-          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
-
-      - name: Re-deploy proxy
-        run: |
-          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
-
-  deploy-proxy-prod-new:
-    runs-on: prod
-    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
-    if: |
-      (github.ref_name == 'release') &&
-      github.event_name != 'workflow_dispatch'
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        include:
-          - target_region:  us-east-2
-            target_cluster: prod-us-east-2-delta
-          - target_region: eu-central-1
-            target_cluster: prod-eu-central-1-gamma
-          - target_region: ap-southeast-1
-            target_cluster: prod-ap-southeast-1-epsilon
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Configure environment
-        run: |
-          helm repo add neondatabase https://neondatabase.github.io/helm-charts
-          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
-
-      - name: Re-deploy proxy
-        run: |
-          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
-
-  promote-compatibility-test-snapshot:
-    runs-on: dev
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
-      options: --init
-    needs: [ deploy, deploy-proxy ]
-    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
-    steps:
-      - name: Promote compatibility snapshot for the release
-        shell: bash -euxo pipefail {0}
-        env:
-          BUCKET: neon-github-public-dev
-          PREFIX: artifacts/latest
-        run: |
-          for build_type in debug release; do
-            OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
-            NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
-
-            time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
-          done
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -106,7 +106,7 @@ jobs:
            !~/.cargo/registry/src
            ~/.cargo/git
            target
-          key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
+          key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust

      - name: Run cargo clippy
        run: ./run_clippy.sh
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -317,6 +317,12 @@ dependencies = [
 "generic-array",
 ]

+[[package]]
+name = "boxfnonce"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
+
 [[package]]
 name = "bstr"
 version = "1.0.1"
@@ -594,7 +600,6 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
- "url",
 "utils",
 "workspace_hack",
 ]
@@ -844,6 +849,16 @@ dependencies = [
 "syn",
 ]

+[[package]]
+name = "daemonize"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
+dependencies = [
+ "boxfnonce",
+ "libc",
+]
+
 [[package]]
 name = "darling"
 version = "0.14.1"
@@ -2125,6 +2140,7 @@ dependencies = [
 "crc32c",
 "criterion",
 "crossbeam-utils",
+ "daemonize",
 "etcd_broker",
 "fail",
 "futures",
@@ -2145,7 +2161,6 @@ dependencies = [
 "postgres-types",
 "postgres_ffi",
 "pprof",
- "pq_proto",
 "rand",
 "regex",
 "remote_storage",
@@ -2155,10 +2170,8 @@ dependencies = [
 "serde_json",
 "serde_with",
 "signal-hook",
- "svg_fmt",
 "tar",
 "tempfile",
- "tenant_size_model",
 "thiserror",
 "tokio",
 "tokio-postgres",
@@ -2175,11 +2188,7 @@ dependencies = [
 name = "pageserver_api"
 version = "0.1.0"
 dependencies = [
- "anyhow",
- "byteorder",
- "bytes",
 "const_format",
- "postgres_ffi",
 "serde",
 "serde_with",
 "utils",
@@ -2439,21 +2448,6 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"

-[[package]]
-name = "pq_proto"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "bytes",
- "pin-project-lite",
- "postgres-protocol",
- "rand",
- "serde",
- "tokio",
- "tracing",
- "workspace_hack",
-]
-
 [[package]]
 name = "prettyplease"
 version = "0.1.21"
@@ -2586,7 +2580,6 @@ dependencies = [
 "once_cell",
 "parking_lot 0.12.1",
 "pin-project-lite",
- "pq_proto",
 "rand",
 "rcgen",
 "reqwest",
@@ -3090,6 +3083,7 @@ dependencies = [
 "clap 4.0.15",
 "const_format",
 "crc32c",
+ "daemonize",
 "etcd_broker",
 "fs2",
 "git-version",
@@ -3097,13 +3091,11 @@ dependencies = [
 "humantime",
 "hyper",
 "metrics",
- "nix 0.25.0",
 "once_cell",
 "parking_lot 0.12.1",
 "postgres",
 "postgres-protocol",
 "postgres_ffi",
- "pq_proto",
 "regex",
 "remote_storage",
 "safekeeper_api",
@@ -3469,12 +3461,6 @@ version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"

-[[package]]
-name = "svg_fmt"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
-
 [[package]]
 name = "symbolic-common"
 version = "8.8.0"
@@ -3552,13 +3538,6 @@ dependencies = [
 "winapi",
 ]

-[[package]]
-name = "tenant_size_model"
-version = "0.1.0"
-dependencies = [
- "workspace_hack",
-]
-
 [[package]]
 name = "termcolor"
 version = "1.1.3"
@@ -3953,16 +3932,6 @@ dependencies = [
 "tracing-core",
 ]

-[[package]]
-name = "tracing-serde"
-version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
-dependencies = [
- "serde",
- "tracing-core",
-]
-
 [[package]]
 name = "tracing-subscriber"
 version = "0.3.16"
@@ -3973,15 +3942,12 @@ dependencies = [
 "nu-ansi-term",
 "once_cell",
 "regex",
- "serde",
- "serde_json",
 "sharded-slab",
 "smallvec",
 "thread_local",
 "tracing",
 "tracing-core",
 "tracing-log",
- "tracing-serde",
 ]

 [[package]]
@@ -4064,7 +4030,9 @@ dependencies = [
 "metrics",
 "nix 0.25.0",
 "once_cell",
- "pq_proto",
+ "pin-project-lite",
+ "postgres",
+ "postgres-protocol",
 "rand",
 "routerify",
 "rustls",
@@ -4074,8 +4042,6 @@ dependencies = [
 "serde_json",
 "serde_with",
 "signal-hook",
- "strum",
- "strum_macros",
 "tempfile",
 "thiserror",
 "tokio",
@@ -4389,9 +4355,6 @@ dependencies = [
 "crossbeam-utils",
 "either",
 "fail",
- "futures-channel",
- "futures-task",
- "futures-util",
 "hashbrown",
 "indexmap",
 "libc",
@@ -4405,7 +4368,6 @@ dependencies = [
 "rand",
 "regex",
 "regex-syntax",
- "reqwest",
 "scopeguard",
 "serde",
 "stable_deref_trait",
--- a/3
+++ b/3
@@ -44,7 +44,7 @@ COPY . .
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \
+&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \
    && cachepot -s

 # Build final image
@@ -65,7 +65,6 @@ RUN set -e \

 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir   /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin

--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -1,50 +1,50 @@
-#
-# This file is identical to the Dockerfile.compute-node-v15 file
-# except for the version of Postgres that is built.
-#
-
 ARG TAG=pinned
+# apparently, ARGs don't get replaced in RUN commands in kaniko
+# ARG POSTGIS_VERSION=3.3.0
+# ARG PLV8_VERSION=3.1.4
+# ARG PG_VERSION=v14

-#########################################################################################
 #
 # Layer "build-deps"
 #
-#########################################################################################
 FROM debian:bullseye-slim AS build-deps
+RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+    apt update
 RUN apt update &&  \
-    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
-    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
+    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
+    libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev

-#########################################################################################
 #
 # Layer "pg-build"
 # Build Postgres from the neon postgres repository.
 #
-#########################################################################################
 FROM build-deps AS pg-build
 COPY vendor/postgres-v14 postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
    # Install headers
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install

-#########################################################################################
 #
 # Layer "postgis-build"
 # Build PostGIS from the upstream PostGIS mirror.
 #
-#########################################################################################
+# PostGIS compiles against neon postgres sources without changes. Perhaps we
+# could even use the upstream binaries, compiled against vanilla Postgres, but
+# it would require some investigation to check that it works, and also keeps
+# working in the future. So for now, we compile our own binaries.
 FROM build-deps AS postgis-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
    apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc

-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
-    tar xvzf postgis-3.3.1.tar.gz && \
-    cd postgis-3.3.1 && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
+    tar xvzf postgis-3.3.0.tar.gz && \
+    cd postgis-3.3.0 && \
    ./autogen.sh && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    ./configure && \
@@ -57,29 +57,19 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control

-#########################################################################################
 #
 # Layer "plv8-build"
 # Build plv8
 #
-#########################################################################################
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
+    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5

-# https://github.com/plv8/plv8/issues/475:
-#   v8 uses gold for linking and sets `--thread-count=4` which breaks
-#   gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
-# Install newer gold version manually as debian-testing binutils version updates
-# libc version, which in turn breaks other extension built against non-testing libc.
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
-    tar xvzf binutils-2.38.tar.gz && \
-    cd binutils-2.38 && \
-    cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
-    cd ../bfd && ./configure && make bfdver.h && \
-    cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
-    cp /usr/local/bin/ld.gold /usr/bin/gold
+# https://github.com/plv8/plv8/issues/475
+# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
+RUN apt update && \
+    apt install -y --no-install-recommends -t testing binutils

 # Sed is used to patch for https://github.com/plv8/plv8/issues/503
 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
@@ -87,25 +77,21 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
    cd plv8-3.1.4 && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
-    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
    rm -rf /plv8-* && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control

-#########################################################################################
 #
 # Layer "h3-pg-build"
 # Build h3_pg
 #
-#########################################################################################
 FROM build-deps AS h3-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # packaged cmake is too old
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
-      -q -O /tmp/cmake-install.sh \
-      && chmod u+x /tmp/cmake-install.sh \
-      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
-      && rm /tmp/cmake-install.sh
+RUN apt update && \
+    apt install -y --no-install-recommends -t testing cmake

 RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
    tar xvzf h3.tgz  && \
@@ -124,15 +110,12 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control

-#########################################################################################
 #
 # Layer "neon-pg-ext-build"
 # compile neon extensions
 #
-#########################################################################################
 FROM build-deps AS neon-pg-ext-build
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -145,22 +128,16 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
        -C pgxn/neon \
        -s install

-#########################################################################################
-#
 # Compile and run the Neon-specific `compute_ctl` binary
-#
-#########################################################################################
 FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
 RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto

-#########################################################################################
 #
 # Clean up postgres folder before inclusion
 #
-#########################################################################################
 FROM neon-pg-ext-build AS postgres-cleanup-layer
 COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql

@@ -178,12 +155,10 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-#########################################################################################
 #
 # Final layer
 # Put it all together into the final image
 #
-#########################################################################################
 FROM debian:bullseye-slim
 # Add user postgres
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -200,6 +175,8 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libreadline8 for psql
 # libossp-uuid16 for extension ossp-uuid
 # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
+# GLIBC 2.34 for plv8.
+#     Debian bullseye provides GLIBC 2.31, so we install the library from testing
 #
 # Lastly, link compute_ctl into zenith_ctl while we're at it,
 # so that we don't need to put this in another layer.
@@ -212,6 +189,12 @@ RUN apt update &&  \
        libproj19 \
        libprotobuf-c1 && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
+    echo "Installing GLIBC 2.34" && \
+    echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+    apt update && \
+    apt install -y --no-install-recommends -t testing libc6 && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl

 USER postgres
--- a/Dockerfile.compute-node-v15
+++ b/Dockerfile.compute-node-v15
@@ -4,39 +4,44 @@
 #

 ARG TAG=pinned
+# apparently, ARGs don't get replaced in RUN commands in kaniko
+# ARG POSTGIS_VERSION=3.3.1
+# ARG PLV8_VERSION=3.1.4
+# ARG PG_VERSION=v15

-#########################################################################################
 #
 # Layer "build-deps"
 #
-#########################################################################################
 FROM debian:bullseye-slim AS build-deps
+RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+    apt update
 RUN apt update &&  \
-    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
-    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
+    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
+    libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev

-#########################################################################################
 #
 # Layer "pg-build"
 # Build Postgres from the neon postgres repository.
 #
-#########################################################################################
 FROM build-deps AS pg-build
 COPY vendor/postgres-v15 postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
    # Install headers
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install

-#########################################################################################
 #
 # Layer "postgis-build"
 # Build PostGIS from the upstream PostGIS mirror.
 #
-#########################################################################################
+# PostGIS compiles against neon postgres sources without changes. Perhaps we
+# could even use the upstream binaries, compiled against vanilla Postgres, but
+# it would require some investigation to check that it works, and also keeps
+# working in the future. So for now, we compile our own binaries.
 FROM build-deps AS postgis-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
@@ -57,29 +62,19 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control

-#########################################################################################
 #
 # Layer "plv8-build"
 # Build plv8
 #
-#########################################################################################
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
+    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5

-# https://github.com/plv8/plv8/issues/475:
-#   v8 uses gold for linking and sets `--thread-count=4` which breaks
-#   gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
-# Install newer gold version manually as debian-testing binutils version updates
-# libc version, which in turn breaks other extension built against non-testing libc.
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
-    tar xvzf binutils-2.38.tar.gz && \
-    cd binutils-2.38 && \
-    cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
-    cd ../bfd && ./configure && make bfdver.h && \
-    cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
-    cp /usr/local/bin/ld.gold /usr/bin/gold
+# https://github.com/plv8/plv8/issues/475
+# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
+RUN apt update && \
+    apt install -y --no-install-recommends -t testing binutils

 # Sed is used to patch for https://github.com/plv8/plv8/issues/503
 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
@@ -87,25 +82,21 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
    cd plv8-3.1.4 && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
-    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
    rm -rf /plv8-* && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control

-#########################################################################################
 #
 # Layer "h3-pg-build"
 # Build h3_pg
 #
-#########################################################################################
 FROM build-deps AS h3-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # packaged cmake is too old
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
-      -q -O /tmp/cmake-install.sh \
-      && chmod u+x /tmp/cmake-install.sh \
-      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
-      && rm /tmp/cmake-install.sh
+RUN apt update && \
+    apt install -y --no-install-recommends -t testing cmake

 RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
    tar xvzf h3.tgz  && \
@@ -124,15 +115,12 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control

-#########################################################################################
 #
 # Layer "neon-pg-ext-build"
 # compile neon extensions
 #
-#########################################################################################
 FROM build-deps AS neon-pg-ext-build
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -145,22 +133,16 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
        -C pgxn/neon \
        -s install

-#########################################################################################
-#
 # Compile and run the Neon-specific `compute_ctl` binary
-#
-#########################################################################################
 FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
 RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto

-#########################################################################################
 #
 # Clean up postgres folder before inclusion
 #
-#########################################################################################
 FROM neon-pg-ext-build AS postgres-cleanup-layer
 COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql

@@ -178,12 +160,10 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-#########################################################################################
 #
 # Final layer
 # Put it all together into the final image
 #
-#########################################################################################
 FROM debian:bullseye-slim
 # Add user postgres
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
@@ -200,6 +180,8 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libreadline8 for psql
 # libossp-uuid16 for extension ossp-uuid
 # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
+# GLIBC 2.34 for plv8.
+#     Debian bullseye provides GLIBC 2.31, so we install the library from testing
 #
 # Lastly, link compute_ctl into zenith_ctl while we're at it,
 # so that we don't need to put this in another layer.
@@ -212,6 +194,12 @@ RUN apt update &&  \
        libproj19 \
        libprotobuf-c1 && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
+    echo "Installing GLIBC 2.34" && \
+    echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+    apt update && \
+    apt install -y --no-install-recommends -t testing libc6 && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl

 USER postgres
--- a/10
+++ b/10
@@ -151,11 +151,6 @@ neon-pg-ext-v14: postgres-v14
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
-	+@echo "Compiling neon_walredo v14"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
 	+@echo "Compiling neon_test_utils" v14
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
@@ -168,11 +163,6 @@ neon-pg-ext-v15: postgres-v15
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
-	+@echo "Compiling neon_walredo v15"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
 	+@echo "Compiling neon_test_utils" v15
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
 	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
--- a/README.md
+++ b/README.md
@@ -223,7 +223,10 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
 ```sh
 git clone --recursive https://github.com/neondatabase/neon.git

+# either:
 CARGO_BUILD_FLAGS="--features=testing" make
+# or:
+make debug

 ./scripts/pytest
 ```
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -424,29 +424,8 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
        db_client.simple_query(&alter_query)?;

        // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
-        // This is needed because since postgres 15 this privilege is removed by default.
-        let grant_query = "DO $$\n\
-                BEGIN\n\
-                    IF EXISTS(\n\
-                        SELECT nspname\n\
-                        FROM pg_catalog.pg_namespace\n\
-                        WHERE nspname = 'public'\n\
-                    ) AND\n\
-                    current_setting('server_version_num')::int/10000 >= 15\n\
-                    THEN\n\
-                        IF EXISTS(\n\
-                            SELECT rolname\n\
-                            FROM pg_catalog.pg_roles\n\
-                            WHERE rolname = 'web_access'\n\
-                        )\n\
-                        THEN\n\
-                            GRANT CREATE ON SCHEMA public TO web_access;\n\
-                        END IF;\n\
-                    END IF;\n\
-                END\n\
-            $$;"
-        .to_string();
-
+        // This is needed since postgres 15, where this privilege is removed by default.
+        let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string();
        info!("grant query for db {} : {}", &db.name, &grant_query);
        db_client.simple_query(&grant_query)?;
    }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -4,21 +4,20 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
-anyhow = "1.0"
 clap = "4.0"
 comfy-table = "6.1"
 git-version = "0.3.5"
-nix = "0.25"
-once_cell = "1.13.0"
+tar = "0.4.38"
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-regex = "1"
-reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_with = "2.0"
-tar = "0.4.38"
-thiserror = "1"
 toml = "0.5"
-url = "2.2.2"
+once_cell = "1.13.0"
+regex = "1"
+anyhow = "1.0"
+thiserror = "1"
+nix = "0.25"
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }

 # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
 # instead, so that recompile times are better.
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -1,264 +0,0 @@
-//! Spawns and kills background processes that are needed by Neon CLI.
-//! Applies common set-up such as log and pid files (if needed) to every process.
-//!
-//! Neon CLI does not run in background, so it needs to store the information about
-//! spawned processes, which it does in this module.
-//! We do that by storing the pid of the process in the "${process_name}.pid" file.
-//! The pid file can be created by the process itself
-//! (Neon storage binaries do that and also ensure that a lock is taken onto that file)
-//! or we create such file after starting the process
-//! (non-Neon binaries don't necessarily follow our pidfile conventions).
-//! The pid stored in the file is later used to stop the service.
-//!
-//! See [`lock_file`] module for more info.
-
-use std::ffi::OsStr;
-use std::io::Write;
-use std::path::Path;
-use std::process::{Child, Command};
-use std::time::Duration;
-use std::{fs, io, thread};
-
-use anyhow::{anyhow, bail, Context, Result};
-use nix::errno::Errno;
-use nix::sys::signal::{kill, Signal};
-use nix::unistd::Pid;
-
-use utils::lock_file;
-
-const RETRIES: u32 = 15;
-const RETRY_TIMEOUT_MILLIS: u64 = 500;
-
-/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
-/// it itself.
-pub enum InitialPidFile<'t> {
-    /// Create a pidfile, to allow future CLI invocations to manipulate the process.
-    Create(&'t Path),
-    /// The process will create the pidfile itself, need to wait for that event.
-    Expect(&'t Path),
-}
-
-/// Start a background child process using the parameters given.
-pub fn start_process<F, S: AsRef<OsStr>>(
-    process_name: &str,
-    datadir: &Path,
-    command: &Path,
-    args: &[S],
-    initial_pid_file: InitialPidFile,
-    process_status_check: F,
-) -> anyhow::Result<Child>
-where
-    F: Fn() -> anyhow::Result<bool>,
-{
-    let log_path = datadir.join(format!("{process_name}.log"));
-    let process_log_file = fs::OpenOptions::new()
-        .create(true)
-        .write(true)
-        .append(true)
-        .open(&log_path)
-        .with_context(|| {
-            format!("Could not open {process_name} log file {log_path:?} for writing")
-        })?;
-    let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
-        format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
-    })?;
-
-    let mut command = Command::new(command);
-    let background_command = command
-        .stdout(process_log_file)
-        .stderr(same_file_for_stderr)
-        .args(args);
-    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
-
-    let mut spawned_process = filled_cmd.spawn().with_context(|| {
-        format!("Could not spawn {process_name}, see console output and log files for details.")
-    })?;
-    let pid = spawned_process.id();
-    let pid = Pid::from_raw(
-        i32::try_from(pid)
-            .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
-    );
-
-    let pid_file_to_check = match initial_pid_file {
-        InitialPidFile::Create(target_pid_file_path) => {
-            match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
-                lock_file::LockCreationResult::Created { .. } => {
-                    // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
-                    // as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
-                }
-                lock_file::LockCreationResult::AlreadyLocked { .. } => {
-                    anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
-                }
-                lock_file::LockCreationResult::CreationFailed(e) => {
-                    return Err(e.context(format!(
-                    "Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
-                )))
-                }
-            }
-            None
-        }
-        InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
-    };
-
-    for retries in 0..RETRIES {
-        match process_started(pid, pid_file_to_check, &process_status_check) {
-            Ok(true) => {
-                println!("\n{process_name} started, pid: {pid}");
-                return Ok(spawned_process);
-            }
-            Ok(false) => {
-                if retries < 5 {
-                    print!(".");
-                    io::stdout().flush().unwrap();
-                } else {
-                    if retries == 5 {
-                        println!() // put a line break after dots for second message
-                    }
-                    println!("{process_name} has not started yet, retrying ({retries})...");
-                }
-                thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
-            }
-            Err(e) => {
-                println!("{process_name} failed to start: {e:#}");
-                if let Err(e) = spawned_process.kill() {
-                    println!("Could not stop {process_name} subprocess: {e:#}")
-                };
-                return Err(e);
-            }
-        }
-    }
-    anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
-}
-
-/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
-pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
-    if !pid_file.exists() {
-        println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
-        return Ok(());
-    }
-    let pid = read_pidfile(pid_file)?;
-
-    let sig = if immediate {
-        print!("Stopping {process_name} with pid {pid} immediately..");
-        Signal::SIGQUIT
-    } else {
-        print!("Stopping {process_name} with pid {pid} gracefully..");
-        Signal::SIGTERM
-    };
-    io::stdout().flush().unwrap();
-    match kill(pid, sig) {
-        Ok(()) => (),
-        Err(Errno::ESRCH) => {
-            println!(
-                "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
-            );
-            return Ok(());
-        }
-        Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"),
-    }
-
-    // Wait until process is gone
-    for _ in 0..RETRIES {
-        match process_has_stopped(pid) {
-            Ok(true) => {
-                println!("\n{process_name} stopped");
-                if let Err(e) = fs::remove_file(pid_file) {
-                    if e.kind() != io::ErrorKind::NotFound {
-                        eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
-                    }
-                }
-                return Ok(());
-            }
-            Ok(false) => {
-                print!(".");
-                io::stdout().flush().unwrap();
-                thread::sleep(Duration::from_secs(1))
-            }
-            Err(e) => {
-                println!("{process_name} with pid {pid} failed to stop: {e:#}");
-                return Err(e);
-            }
-        }
-    }
-
-    anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
-}
-
-fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
-    let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
-
-    let var = "LLVM_PROFILE_FILE";
-    if let Some(val) = std::env::var_os(var) {
-        filled_cmd = filled_cmd.env(var, val);
-    }
-
-    const RUST_LOG_KEY: &str = "RUST_LOG";
-    if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
-        filled_cmd.env(RUST_LOG_KEY, rust_log_value)
-    } else {
-        filled_cmd
-    }
-}
-
-fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
-    for env_key in [
-        "AWS_ACCESS_KEY_ID",
-        "AWS_SECRET_ACCESS_KEY",
-        "AWS_SESSION_TOKEN",
-    ] {
-        if let Ok(value) = std::env::var(env_key) {
-            cmd = cmd.env(env_key, value);
-        }
-    }
-    cmd
-}
-
-fn process_started<F>(
-    pid: Pid,
-    pid_file_to_check: Option<&Path>,
-    status_check: &F,
-) -> anyhow::Result<bool>
-where
-    F: Fn() -> anyhow::Result<bool>,
-{
-    match status_check() {
-        Ok(true) => match pid_file_to_check {
-            Some(pid_file_path) => {
-                if pid_file_path.exists() {
-                    let pid_in_file = read_pidfile(pid_file_path)?;
-                    Ok(pid_in_file == pid)
-                } else {
-                    Ok(false)
-                }
-            }
-            None => Ok(true),
-        },
-        Ok(false) => Ok(false),
-        Err(e) => anyhow::bail!("process failed to start: {e}"),
-    }
-}
-
-/// Read a PID file
-///
-/// We expect a file that contains a single integer.
-fn read_pidfile(pidfile: &Path) -> Result<Pid> {
-    let pid_str = fs::read_to_string(pidfile)
-        .with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
-    let pid: i32 = pid_str
-        .parse()
-        .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
-    if pid < 1 {
-        bail!("pidfile {pidfile:?} contained bad value '{pid}'");
-    }
-    Ok(Pid::from_raw(pid))
-}
-
-fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
-    match kill(pid, None) {
-        // Process exists, keep waiting
-        Ok(_) => Ok(false),
-        // Process not found, we're done
-        Err(Errno::ESRCH) => Ok(true),
-        Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"),
-    }
-}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use control_plane::compute::ComputeControlPlane;
 use control_plane::local_env::{EtcdBroker, LocalEnv};
-use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
+use control_plane::storage::PageServerNode;
 use control_plane::{etcd, local_env};
 use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -12,14 +12,15 @@ use std::time::Duration;

 use anyhow::{Context, Result};
 use utils::{
+    connstring::connection_host_port,
    id::{TenantId, TimelineId},
    lsn::Lsn,
    postgres_backend::AuthType,
 };

 use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
-use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;
+use crate::storage::PageServerNode;

 //
 // ComputeControlPlane
@@ -281,7 +282,9 @@ impl PostgresNode {
    fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
-        conf.append("wal_log_hints", "off");
+        // wal_log_hints is mandatory when running against pageserver (see gh issue#192)
+        // TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
+        conf.append("wal_log_hints", "on");
        conf.append("max_replication_slots", "10");
        conf.append("hot_standby", "on");
        conf.append("shared_buffers", "1MB");
@@ -299,8 +302,7 @@ impl PostgresNode {

        // Configure the node to fetch pages from pageserver
        let pageserver_connstr = {
-            let config = &self.pageserver.pg_connection_config;
-            let (host, port) = (config.host(), config.port());
+            let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);

            // Set up authentication
            //
--- a/control_plane/src/connection.rs
+++ b/control_plane/src/connection.rs
@@ -1,57 +0,0 @@
-use url::Url;
-
-#[derive(Debug)]
-pub struct PgConnectionConfig {
-    url: Url,
-}
-
-impl PgConnectionConfig {
-    pub fn host(&self) -> &str {
-        self.url.host_str().expect("BUG: no host")
-    }
-
-    pub fn port(&self) -> u16 {
-        self.url.port().expect("BUG: no port")
-    }
-
-    /// Return a `<host>:<port>` string.
-    pub fn raw_address(&self) -> String {
-        format!("{}:{}", self.host(), self.port())
-    }
-
-    /// Connect using postgres protocol with TLS disabled.
-    pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
-        postgres::Client::connect(self.url.as_str(), postgres::NoTls)
-    }
-}
-
-impl std::str::FromStr for PgConnectionConfig {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut url: Url = s.parse()?;
-
-        match url.scheme() {
-            "postgres" | "postgresql" => {}
-            other => anyhow::bail!("invalid scheme: {other}"),
-        }
-
-        // It's not a valid connection url if host is unavailable.
-        if url.host().is_none() {
-            anyhow::bail!(url::ParseError::EmptyHost);
-        }
-
-        // E.g. `postgres:bar`.
-        if url.cannot_be_a_base() {
-            anyhow::bail!("URL cannot be a base");
-        }
-
-        // Set the default PG port if it's missing.
-        if url.port().is_none() {
-            url.set_port(Some(5432))
-                .expect("BUG: couldn't set the default port");
-        }
-
-        Ok(Self { url })
-    }
-}
--- a/control_plane/src/etcd.rs
+++ b/control_plane/src/etcd.rs
@@ -1,75 +1,95 @@
-use std::{fs, path::PathBuf};
+use std::{
+    fs,
+    path::PathBuf,
+    process::{Command, Stdio},
+};

 use anyhow::Context;
+use nix::{
+    sys::signal::{kill, Signal},
+    unistd::Pid,
+};

-use crate::{background_process, local_env};
+use crate::{local_env, read_pidfile};

 pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    let etcd_broker = &env.etcd_broker;
    println!(
-        "Starting etcd broker using {:?}",
-        etcd_broker.etcd_binary_path
+        "Starting etcd broker using {}",
+        etcd_broker.etcd_binary_path.display()
    );

    let etcd_data_dir = env.base_data_dir.join("etcd");
-    fs::create_dir_all(&etcd_data_dir)
-        .with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
+    fs::create_dir_all(&etcd_data_dir).with_context(|| {
+        format!(
+            "Failed to create etcd data dir: {}",
+            etcd_data_dir.display()
+        )
+    })?;

+    let etcd_stdout_file =
+        fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
+            format!(
+                "Failed to create etcd stdout file in directory {}",
+                etcd_data_dir.display()
+            )
+        })?;
+    let etcd_stderr_file =
+        fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
+            format!(
+                "Failed to create etcd stderr file in directory {}",
+                etcd_data_dir.display()
+            )
+        })?;
    let client_urls = etcd_broker.comma_separated_endpoints();
-    let args = [
-        format!("--data-dir={}", etcd_data_dir.display()),
-        format!("--listen-client-urls={client_urls}"),
-        format!("--advertise-client-urls={client_urls}"),
-        // Set --quota-backend-bytes to keep the etcd virtual memory
-        // size smaller. Our test etcd clusters are very small.
-        // See https://github.com/etcd-io/etcd/issues/7910
-        "--quota-backend-bytes=100000000".to_string(),
-        // etcd doesn't compact (vacuum) with default settings,
-        // enable it to prevent space exhaustion.
-        "--auto-compaction-mode=revision".to_string(),
-        "--auto-compaction-retention=1".to_string(),
-    ];

-    let pid_file_path = etcd_pid_file_path(env);
+    let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
+        .args(&[
+            format!("--data-dir={}", etcd_data_dir.display()),
+            format!("--listen-client-urls={client_urls}"),
+            format!("--advertise-client-urls={client_urls}"),
+            // Set --quota-backend-bytes to keep the etcd virtual memory
+            // size smaller. Our test etcd clusters are very small.
+            // See https://github.com/etcd-io/etcd/issues/7910
+            "--quota-backend-bytes=100000000".to_string(),
+        ])
+        .stdout(Stdio::from(etcd_stdout_file))
+        .stderr(Stdio::from(etcd_stderr_file))
+        .spawn()
+        .context("Failed to spawn etcd subprocess")?;
+    let pid = etcd_process.id();

-    let client = reqwest::blocking::Client::new();
-
-    background_process::start_process(
-        "etcd",
-        &etcd_data_dir,
-        &etcd_broker.etcd_binary_path,
-        &args,
-        background_process::InitialPidFile::Create(&pid_file_path),
-        || {
-            for broker_endpoint in &etcd_broker.broker_endpoints {
-                let request = broker_endpoint
-                    .join("health")
-                    .with_context(|| {
-                        format!(
-                            "Failed to append /health path to broker endopint {}",
-                            broker_endpoint
-                        )
-                    })
-                    .and_then(|url| {
-                        client.get(&url.to_string()).build().with_context(|| {
-                            format!("Failed to construct request to etcd endpoint {url}")
-                        })
-                    })?;
-                if client.execute(request).is_ok() {
-                    return Ok(true);
-                }
-            }
-
-            Ok(false)
-        },
-    )
-    .context("Failed to spawn etcd subprocess")?;
+    let etcd_pid_file_path = etcd_pid_file_path(env);
+    fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
+        format!(
+            "Failed to create etcd pid file at {}",
+            etcd_pid_file_path.display()
+        )
+    })?;

    Ok(())
 }

 pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
-    background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
+    let etcd_path = &env.etcd_broker.etcd_binary_path;
+    println!("Stopping etcd broker at {}", etcd_path.display());
+
+    let etcd_pid_file_path = etcd_pid_file_path(env);
+    let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
+        format!(
+            "Failed to read etcd pid file at {}",
+            etcd_pid_file_path.display()
+        )
+    })?);
+
+    kill(pid, Signal::SIGTERM).with_context(|| {
+        format!(
+            "Failed to stop etcd with pid {pid} at {}",
+            etcd_pid_file_path.display()
+        )
+    })?;
+
+    Ok(())
 }

 fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -6,12 +6,59 @@
 // Intended to be used in integration tests and in CLI tools for
 // local installations.
 //
+use anyhow::{anyhow, bail, Context, Result};
+use std::fs;
+use std::path::Path;
+use std::process::Command;

-mod background_process;
 pub mod compute;
-pub mod connection;
 pub mod etcd;
 pub mod local_env;
-pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod storage;
+
+/// Read a PID file
+///
+/// We expect a file that contains a single integer; surrounding whitespace
+/// (such as a trailing newline) is ignored. Returns an i32 for libc/nix compatibility.
+pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
+    let pid: i32 = fs::read_to_string(pidfile)
+        .with_context(|| format!("failed to read pidfile {:?}", pidfile))?
+        .trim()
+        .parse()
+        .map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
+    if pid < 1 {
+        bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
+    }
+    Ok(pid)
+}
+
+/// Reset the child's environment, then set `RUST_BACKTRACE=1` and forward
+/// `LLVM_PROFILE_FILE` and `RUST_LOG` from our own environment when set.
+fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
+    let filled = cmd.env_clear().env("RUST_BACKTRACE", "1");
+
+    const LLVM_PROFILE_KEY: &str = "LLVM_PROFILE_FILE";
+    if let Some(profile_file) = std::env::var_os(LLVM_PROFILE_KEY) {
+        filled.env(LLVM_PROFILE_KEY, profile_file);
+    }
+
+    match std::env::var("RUST_LOG") {
+        Ok(log_filter) => filled.env("RUST_LOG", log_filter),
+        Err(_) => filled,
+    }
+}
+
+/// Copy AWS credentials from our own environment into the child's environment.
+/// Variables that are unset (or not valid unicode) are simply not forwarded.
+fn fill_aws_secrets_vars(cmd: &mut Command) -> &mut Command {
+    const KEYS: [&str; 3] = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"];
+    for key in KEYS {
+        match std::env::var(key) {
+            Ok(secret) => cmd.env(key, secret),
+            Err(_) => continue,
+        };
+    }
+    cmd
+}
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -226,12 +226,12 @@ impl LocalEnv {
        }
    }

-    pub fn pageserver_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("pageserver")
+    pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
+        Ok(self.neon_distrib_dir.join("pageserver"))
    }

-    pub fn safekeeper_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("safekeeper")
+    pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
+        Ok(self.neon_distrib_dir.join("safekeeper"))
    }

    pub fn pg_data_dirs_path(&self) -> PathBuf {
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,21 +1,23 @@
 use std::io::Write;
 use std::path::PathBuf;
-use std::process::Child;
+use std::process::Command;
 use std::sync::Arc;
-use std::{io, result};
+use std::time::Duration;
+use std::{io, result, thread};

-use anyhow::Context;
+use anyhow::bail;
+use nix::errno::Errno;
+use nix::sys::signal::{kill, Signal};
+use nix::unistd::Pid;
+use postgres::Config;
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
-use utils::{http::error::HttpErrorBody, id::NodeId};
+use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};

-use crate::connection::PgConnectionConfig;
-use crate::pageserver::PageServerNode;
-use crate::{
-    background_process,
-    local_env::{LocalEnv, SafekeeperConf},
-};
+use crate::local_env::{LocalEnv, SafekeeperConf};
+use crate::storage::PageServerNode;
+use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};

 #[derive(Error, Debug)]
 pub enum SafekeeperHttpError {
@@ -61,7 +63,7 @@ pub struct SafekeeperNode {

    pub conf: SafekeeperConf,

-    pub pg_connection_config: PgConnectionConfig,
+    pub pg_connection_config: Config,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
@@ -85,15 +87,15 @@ impl SafekeeperNode {
    }

    /// Construct libpq connection string for connecting to this safekeeper.
-    fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
+    fn safekeeper_connection_config(port: u16) -> Config {
        // TODO safekeeper authentication not implemented yet
-        format!("postgresql://no_user@127.0.0.1:{port}/no_db")
+        format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
            .parse()
            .unwrap()
    }

    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
-        env.safekeeper_data_dir(&format!("sk{sk_id}"))
+        env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
    }

    pub fn datadir_path(&self) -> PathBuf {
@@ -104,78 +106,92 @@ impl SafekeeperNode {
        self.datadir_path().join("safekeeper.pid")
    }

-    pub fn start(&self) -> anyhow::Result<Child> {
+    pub fn start(&self) -> anyhow::Result<()> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
-            self.pg_connection_config.raw_address(),
+            connection_address(&self.pg_connection_config),
            self.datadir_path().display()
        );
        io::stdout().flush().unwrap();

        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
-        let id = self.id;
-        let datadir = self.datadir_path();

-        let id_string = id.to_string();
-        let mut args = vec![
-            "-D",
-            datadir.to_str().with_context(|| {
-                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
-            })?,
-            "--id",
-            &id_string,
-            "--listen-pg",
-            &listen_pg,
-            "--listen-http",
-            &listen_http,
-        ];
+        let mut cmd = Command::new(self.env.safekeeper_bin()?);
+        fill_rust_env_vars(
+            cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
+                .args(&["--id", self.id.to_string().as_ref()])
+                .args(&["--listen-pg", &listen_pg])
+                .args(&["--listen-http", &listen_http])
+                .args(&["--recall", "1 second"])
+                .arg("--daemonize"),
+        );
        if !self.conf.sync {
-            args.push("--no-sync");
+            cmd.arg("--no-sync");
        }

        let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
        if !comma_separated_endpoints.is_empty() {
-            args.extend(["--broker-endpoints", &comma_separated_endpoints]);
+            cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
        }
        if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
-            args.extend(["--broker-etcd-prefix", prefix]);
+            cmd.args(&["--broker-etcd-prefix", prefix]);
        }
-
-        let mut backup_threads = String::new();
        if let Some(threads) = self.conf.backup_threads {
-            backup_threads = threads.to_string();
-            args.extend(["--backup-threads", &backup_threads]);
-        } else {
-            drop(backup_threads);
+            cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
        }
-
        if let Some(ref remote_storage) = self.conf.remote_storage {
-            args.extend(["--remote-storage", remote_storage]);
+            cmd.args(&["--remote-storage", remote_storage]);
        }
-
-        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
-            args.extend([
-                "--auth-validation-public-key-path",
-                key_path.to_str().with_context(|| {
-                    format!("Key path {key_path:?} cannot be represented as a unicode string")
-                })?,
-            ]);
+            cmd.arg("--auth-validation-public-key-path");
+            // PathBuf is better be passed as is, not via `String`.
+            cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
        }

-        background_process::start_process(
-            &format!("safekeeper {id}"),
-            &datadir,
-            &self.env.safekeeper_bin(),
-            &args,
-            background_process::InitialPidFile::Expect(&self.pid_file()),
-            || match self.check_status() {
-                Ok(()) => Ok(true),
-                Err(SafekeeperHttpError::Transport(_)) => Ok(false),
-                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
-            },
-        )
+        fill_aws_secrets_vars(&mut cmd);
+
+        if !cmd.status()?.success() {
+            bail!(
+                "Safekeeper failed to start. See '{}' for details.",
+                self.datadir_path().join("safekeeper.log").display()
+            );
+        }
+
+        // It takes a while for the safekeeper to start up. Wait until it is
+        // open for business.
+        const RETRIES: i8 = 15;
+        for retries in 1..=RETRIES {
+            match self.check_status() {
+                Ok(_) => {
+                    println!("\nSafekeeper started");
+                    return Ok(());
+                }
+                Err(err) => {
+                    match err {
+                        SafekeeperHttpError::Transport(err) => {
+                            if err.is_connect() && retries < 5 {
+                                print!(".");
+                                io::stdout().flush().unwrap();
+                            } else {
+                                if retries == 5 {
+                                    println!() // put a line break after dots for second message
+                                }
+                                println!(
+                                    "Safekeeper not responding yet, err {} retrying ({})...",
+                                    err, retries
+                                );
+                            }
+                        }
+                        SafekeeperHttpError::Response(msg) => {
+                            bail!("safekeeper failed to start: {} ", msg)
+                        }
+                    }
+                    thread::sleep(Duration::from_secs(1));
+                }
+            }
+        }
+        bail!("safekeeper failed to start in {} seconds", RETRIES);
    }

    ///
@@ -187,11 +203,63 @@ impl SafekeeperNode {
    /// If the server is not running, returns success
    ///
    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(
-            immediate,
-            &format!("safekeeper {}", self.id),
-            &self.pid_file(),
-        )
+        let pid_file = self.pid_file();
+        if !pid_file.exists() {
+            println!("Safekeeper {} is already stopped", self.id);
+            return Ok(());
+        }
+        let pid = read_pidfile(&pid_file)?;
+        let pid = Pid::from_raw(pid);
+
+        let sig = if immediate {
+            print!("Stopping safekeeper {} immediately..", self.id);
+            Signal::SIGQUIT
+        } else {
+            print!("Stopping safekeeper {} gracefully..", self.id);
+            Signal::SIGTERM
+        };
+        io::stdout().flush().unwrap();
+        match kill(pid, sig) {
+            Ok(_) => (),
+            Err(Errno::ESRCH) => {
+                println!(
+                    "Safekeeper with pid {} does not exist, but a PID file was found",
+                    pid
+                );
+                return Ok(());
+            }
+            Err(err) => bail!(
+                "Failed to send signal to safekeeper with pid {}: {}",
+                pid,
+                err.desc()
+            ),
+        }
+
+        // Wait until process is gone
+        for i in 0..600 {
+            let signal = None; // Send no signal, just get the error code
+            match kill(pid, signal) {
+                Ok(_) => (), // Process exists, keep waiting
+                Err(Errno::ESRCH) => {
+                    // Process not found, we're done
+                    println!("done!");
+                    return Ok(());
+                }
+                Err(err) => bail!(
+                    "Failed to send signal to safekeeper with pid {}: {}",
+                    pid,
+                    err.desc()
+                ),
+            };
+
+            if i % 10 == 0 {
+                print!(".");
+                io::stdout().flush().unwrap();
+            }
+            thread::sleep(Duration::from_millis(100));
+        }
+
+        bail!("Failed to stop safekeeper with pid {}", pid);
    }

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,27 +1,33 @@
 use std::collections::HashMap;
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::{BufReader, Write};
 use std::num::NonZeroU64;
 use std::path::{Path, PathBuf};
-use std::process::Child;
-use std::{io, result};
+use std::process::Command;
+use std::time::Duration;
+use std::{io, result, thread};

-use crate::connection::PgConnectionConfig;
 use anyhow::{bail, Context};
+use nix::errno::Errno;
+use nix::sys::signal::{kill, Signal};
+use nix::unistd::Pid;
 use pageserver_api::models::{
    TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
 };
+use postgres::{Config, NoTls};
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::{
+    connstring::connection_address,
    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
    lsn::Lsn,
    postgres_backend::AuthType,
 };

-use crate::{background_process, local_env::LocalEnv};
+use crate::local_env::LocalEnv;
+use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};

 #[derive(Error, Debug)]
 pub enum PageserverHttpError {
@@ -69,7 +75,7 @@ impl ResponseErrorMessageExt for Response {
 //
 #[derive(Debug)]
 pub struct PageServerNode {
-    pub pg_connection_config: PgConnectionConfig,
+    pub pg_connection_config: Config,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
@@ -95,7 +101,7 @@ impl PageServerNode {
    }

    /// Construct libpq connection string for connecting to the pageserver.
-    fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
+    fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
        format!("postgresql://no_user:{password}@{listen_addr}/no_db")
            .parse()
            .unwrap()
@@ -155,15 +161,7 @@ impl PageServerNode {
            init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
        }

-        let mut pageserver_process = self
-            .start_node(&init_config_overrides, &self.env.base_data_dir, true)
-            .with_context(|| {
-                format!(
-                    "Failed to start a process for pageserver {}",
-                    self.env.pageserver.id,
-                )
-            })?;
-
+        self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
        let init_result = self
            .try_init_timeline(create_tenant, initial_timeline_id, pg_version)
            .context("Failed to create initial tenant and timeline for pageserver");
@@ -173,29 +171,7 @@ impl PageServerNode {
            }
            Err(e) => eprintln!("{e:#}"),
        }
-        match pageserver_process.kill() {
-            Err(e) => {
-                eprintln!(
-                    "Failed to stop pageserver {} process with pid {}: {e:#}",
-                    self.env.pageserver.id,
-                    pageserver_process.id(),
-                )
-            }
-            Ok(()) => {
-                println!(
-                    "Stopped pageserver {} process with pid {}",
-                    self.env.pageserver.id,
-                    pageserver_process.id(),
-                );
-                // cleanup after pageserver startup, since we do not call regular `stop_process` during init
-                let pid_file = self.pid_file();
-                if let Err(e) = fs::remove_file(&pid_file) {
-                    if e.kind() != io::ErrorKind::NotFound {
-                        eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
-                    }
-                }
-            }
-        }
+        self.stop(false)?;
        init_result
    }

@@ -220,14 +196,11 @@ impl PageServerNode {
        self.env.pageserver_data_dir()
    }

-    /// The pid file is created by the pageserver process, with its pid stored inside.
-    /// Other pageservers cannot lock the same file and overwrite it for as long as the current
-    /// pageserver runs. (Unless someone removes the file manually; never do that!)
-    fn pid_file(&self) -> PathBuf {
+    pub fn pid_file(&self) -> PathBuf {
        self.repo_path().join("pageserver.pid")
    }

-    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
+    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        self.start_node(config_overrides, &self.repo_path(), false)
    }

@@ -236,10 +209,10 @@ impl PageServerNode {
        config_overrides: &[&str],
        datadir: &Path,
        update_config: bool,
-    ) -> anyhow::Result<Child> {
+    ) -> anyhow::Result<()> {
        println!(
            "Starting pageserver at '{}' in '{}'",
-            self.pg_connection_config.raw_address(),
+            connection_address(&self.pg_connection_config),
            datadir.display()
        );
        io::stdout().flush()?;
@@ -247,7 +220,10 @@ impl PageServerNode {
        let mut args = vec![
            "-D",
            datadir.to_str().with_context(|| {
-                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
+                format!(
+                    "Datadir path '{}' cannot be represented as a unicode string",
+                    datadir.display()
+                )
            })?,
        ];

@@ -259,18 +235,48 @@ impl PageServerNode {
            args.extend(["-c", config_override]);
        }

-        background_process::start_process(
-            "pageserver",
-            datadir,
-            &self.env.pageserver_bin(),
-            &args,
-            background_process::InitialPidFile::Expect(&self.pid_file()),
-            || match self.check_status() {
-                Ok(()) => Ok(true),
-                Err(PageserverHttpError::Transport(_)) => Ok(false),
-                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
-            },
-        )
+        let mut cmd = Command::new(self.env.pageserver_bin()?);
+        let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
+        filled_cmd = fill_aws_secrets_vars(filled_cmd);
+
+        if !filled_cmd.status()?.success() {
+            bail!(
+                "Pageserver failed to start. See console output and '{}' for details.",
+                datadir.join("pageserver.log").display()
+            );
+        }
+
+        // It takes a while for the page server to start up. Wait until it is
+        // open for business.
+        const RETRIES: i8 = 15;
+        for retries in 1..RETRIES {
+            match self.check_status() {
+                Ok(()) => {
+                    println!("\nPageserver started");
+                    return Ok(());
+                }
+                Err(err) => {
+                    match err {
+                        PageserverHttpError::Transport(err) => {
+                            if err.is_connect() && retries < 5 {
+                                print!(".");
+                                io::stdout().flush().unwrap();
+                            } else {
+                                if retries == 5 {
+                                    println!() // put a line break after dots for second message
+                                }
+                                println!("Pageserver not responding yet, err {err} retrying ({retries})...");
+                            }
+                        }
+                        PageserverHttpError::Response(msg) => {
+                            bail!("pageserver failed to start: {msg} ")
+                        }
+                    }
+                    thread::sleep(Duration::from_secs(1));
+                }
+            }
+        }
+        bail!("pageserver failed to start in {RETRIES} seconds");
    }

    ///
@@ -282,18 +288,69 @@ impl PageServerNode {
    /// If the server is not running, returns success
    ///
    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(immediate, "pageserver", &self.pid_file())
+        let pid_file = self.pid_file();
+        if !pid_file.exists() {
+            println!("Pageserver is already stopped");
+            return Ok(());
+        }
+        let pid = Pid::from_raw(read_pidfile(&pid_file)?);
+
+        let sig = if immediate {
+            print!("Stopping pageserver immediately..");
+            Signal::SIGQUIT
+        } else {
+            print!("Stopping pageserver gracefully..");
+            Signal::SIGTERM
+        };
+        io::stdout().flush().unwrap();
+        match kill(pid, sig) {
+            Ok(_) => (),
+            Err(Errno::ESRCH) => {
+                println!("Pageserver with pid {pid} does not exist, but a PID file was found");
+                return Ok(());
+            }
+            Err(err) => bail!(
+                "Failed to send signal to pageserver with pid {pid}: {}",
+                err.desc()
+            ),
+        }
+
+        // Wait until process is gone
+        for i in 0..600 {
+            let signal = None; // Send no signal, just get the error code
+            match kill(pid, signal) {
+                Ok(_) => (), // Process exists, keep waiting
+                Err(Errno::ESRCH) => {
+                    // Process not found, we're done
+                    println!("done!");
+                    return Ok(());
+                }
+                Err(err) => bail!(
+                    "Failed to send signal to pageserver with pid {}: {}",
+                    pid,
+                    err.desc()
+                ),
+            };
+
+            if i % 10 == 0 {
+                print!(".");
+                io::stdout().flush().unwrap();
+            }
+            thread::sleep(Duration::from_millis(100));
+        }
+
+        bail!("Failed to stop pageserver with pid {pid}");
    }

    pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
-        let mut client = self.pg_connection_config.connect_no_tls().unwrap();
+        let mut client = self.pg_connection_config.connect(NoTls).unwrap();

        println!("Pageserver query: '{sql}'");
        client.simple_query(sql).unwrap()
    }

    pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
-        self.pg_connection_config.connect_no_tls()
+        self.pg_connection_config.connect(NoTls)
    }

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
@@ -492,7 +549,7 @@ impl PageServerNode {
        pg_wal: Option<(Lsn, PathBuf)>,
        pg_version: u32,
    ) -> anyhow::Result<()> {
-        let mut client = self.pg_connection_config.connect_no_tls().unwrap();
+        let mut client = self.pg_connection_config.connect(NoTls).unwrap();

        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
--- a/docker-compose/compute/shell/compute.sh
+++ b/docker-compose/compute/shell/compute.sh
@@ -1,48 +0,0 @@
-#!/bin/bash
-set -eux
-
-PG_VERSION=${PG_VERSION:-14}
-
-SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
-SPEC_FILE=/tmp/spec.json
-
-echo "Waiting pageserver become ready."
-while ! nc -z pageserver 6400; do
-     sleep 1;
-done
-echo "Page server is ready."
-
-echo "Create a tenant and timeline"
-PARAMS=(
-     -sb 
-     -X POST
-     -H "Content-Type: application/json"
-     -d "{}"
-     http://pageserver:9898/v1/tenant/
-)
-tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
-
-PARAMS=(
-     -sb 
-     -X POST
-     -H "Content-Type: application/json"
-     -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
-     "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
-)
-result=$(curl "${PARAMS[@]}")
-echo $result | jq .
-
-echo "Overwrite tenant id and timeline id in spec file"
-tenant_id=$(echo ${result} | jq -r .tenant_id)
-timeline_id=$(echo ${result} | jq -r .timeline_id)
-
-sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
-sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
-
-cat ${SPEC_FILE}
-
-echo "Start compute node"
-/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-     -C "postgresql://cloud_admin@localhost:55433/postgres"  \
-     -b /usr/local/bin/postgres                              \
-     -S ${SPEC_FILE}
--- a/docker-compose/compute/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute/var/db/postgres/specs/spec.json
@@ -1,141 +0,0 @@
-{
-    "format_version": 1.0,
-
-    "timestamp": "2022-10-12T18:00:00.000Z",
-    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
-
-    "cluster": {
-        "cluster_id": "docker_compose",
-        "name": "docker_compose_test",
-        "state": "restarted",
-        "roles": [
-            {
-                "name": "cloud_admin",
-                "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
-                "options": null
-            }
-        ],
-        "databases": [
-        ],
-        "settings": [
-            {
-                "name": "fsync",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "wal_level",
-                "value": "replica",
-                "vartype": "enum"
-            },
-            {
-                "name": "hot_standby",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "wal_log_hints",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "log_connections",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "port",
-                "value": "55433",
-                "vartype": "integer"
-            },
-            {
-                "name": "shared_buffers",
-                "value": "1MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_connections",
-                "value": "100",
-                "vartype": "integer"
-            },
-            {
-                "name": "listen_addresses",
-                "value": "0.0.0.0",
-                "vartype": "string"
-            },
-            {
-                "name": "max_wal_senders",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "max_replication_slots",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "wal_sender_timeout",
-                "value": "5s",
-                "vartype": "string"
-            },
-            {
-                "name": "wal_keep_size",
-                "value": "0",
-                "vartype": "integer"
-            },
-            {
-                "name": "password_encryption",
-                "value": "md5",
-                "vartype": "enum"
-            },
-            {
-                "name": "restart_after_crash",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "synchronous_standby_names",
-                "value": "walproposer",
-                "vartype": "string"
-            },
-            {
-                "name": "shared_preload_libraries",
-                "value": "neon",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.safekeepers",
-                "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.timeline_id",
-                "value": "TIMELINE_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.tenant_id",
-                "value": "TENANT_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.pageserver_connstring",
-                "value": "host=pageserver port=6400",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_write_lag",
-                "value": "500MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_flush_lag",
-                "value": "10GB",
-                "vartype": "string"
-            }
-        ]
-    },
-
-    "delta_operations": [
-    ]
-}
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -1,200 +0,0 @@
-version: '3'
-
-services:
-  etcd:
-    image: quay.io/coreos/etcd:v3.5.4
-    ports:
-      - 2379:2379
-      - 2380:2380
-    environment:
-      # This significantly speeds up etcd, and we don't need data persistence there anyway.
-      ETCD_UNSAFE_NO_FSYNC: "1"
-    command: 
-      - "etcd"
-      - "--auto-compaction-mode=revision"
-      - "--auto-compaction-retention=1"
-      - "--name=etcd-cluster"
-      - "--initial-cluster-state=new"
-      - "--initial-cluster-token=etcd-cluster-1"
-      - "--initial-cluster=etcd-cluster=http://etcd:2380"
-      - "--initial-advertise-peer-urls=http://etcd:2380"
-      - "--advertise-client-urls=http://etcd:2379"
-      - "--listen-client-urls=http://0.0.0.0:2379"
-      - "--listen-peer-urls=http://0.0.0.0:2380"
-      - "--quota-backend-bytes=134217728" # 128 MB
-
-  minio:
-    image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
-    ports:
-      - 9000:9000
-      - 9001:9001
-    environment:
-      - MINIO_ROOT_USER=minio
-      - MINIO_ROOT_PASSWORD=password
-    command: server /data --address :9000 --console-address ":9001"
-
-  minio_create_buckets:
-    image: minio/mc
-    environment:
-      - MINIO_ROOT_USER=minio
-      - MINIO_ROOT_PASSWORD=password
-    entrypoint:
-      - "/bin/sh"
-      - "-c"
-    command: 
-      - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
-             echo 'Waiting to start minio...' && sleep 1;
-         done;
-         /usr/bin/mc mb minio/neon --region=eu-north-1;
-         exit 0;"
-    depends_on:
-      - minio
-
-  pageserver:
-    image: neondatabase/neon:${TAG:-latest}
-    environment:
-      - BROKER_ENDPOINT='http://etcd:2379'
-      - AWS_ACCESS_KEY_ID=minio
-      - AWS_SECRET_ACCESS_KEY=password
-      #- RUST_BACKTRACE=1
-    ports:
-       #- 6400:6400  # pg protocol handler
-       - 9898:9898 # http endpoints
-    entrypoint:
-      - "/bin/sh"
-      - "-c"
-    command:
-      - "/usr/local/bin/pageserver -D /data/.neon/
-                                   -c \"broker_endpoints=[$$BROKER_ENDPOINT]\"
-                                   -c \"listen_pg_addr='0.0.0.0:6400'\"
-                                   -c \"listen_http_addr='0.0.0.0:9898'\"
-                                   -c \"remote_storage={endpoint='http://minio:9000',
-                                                        bucket_name='neon',
-                                                        bucket_region='eu-north-1',
-                                                        prefix_in_bucket='/pageserver/'}\""
-    depends_on:
-      - etcd
-      - minio_create_buckets
-
-  safekeeper1:
-    image: neondatabase/neon:${TAG:-latest}
-    environment:
-      - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
-      - SAFEKEEPER_ID=1
-      - BROKER_ENDPOINT=http://etcd:2379
-      - AWS_ACCESS_KEY_ID=minio
-      - AWS_SECRET_ACCESS_KEY=password
-      #- RUST_BACKTRACE=1
-    ports:
-      #- 5454:5454 # pg protocol handler
-      - 7676:7676 # http endpoints
-    entrypoint:
-      - "/bin/sh"
-      - "-c"
-    command:
-      - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
-                    --listen-http='0.0.0.0:7676'
-                    --id=$$SAFEKEEPER_ID
-                    --broker-endpoints=$$BROKER_ENDPOINT
-                    -D /data
-                    --remote-storage=\"{endpoint='http://minio:9000',
-                                        bucket_name='neon',
-                                        bucket_region='eu-north-1',
-                                        prefix_in_bucket='/safekeeper/'}\""
-    depends_on:
-      - etcd
-      - minio_create_buckets
-
-  safekeeper2:
-    image: neondatabase/neon:${TAG:-latest}
-    environment:
-      - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
-      - SAFEKEEPER_ID=2
-      - BROKER_ENDPOINT=http://etcd:2379
-      - AWS_ACCESS_KEY_ID=minio
-      - AWS_SECRET_ACCESS_KEY=password
-      #- RUST_BACKTRACE=1
-    ports:
-      #- 5454:5454 # pg protocol handler
-      - 7677:7676 # http endpoints
-    entrypoint:
-      - "/bin/sh"
-      - "-c"
-    command:
-      - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
-                    --listen-http='0.0.0.0:7676'
-                    --id=$$SAFEKEEPER_ID
-                    --broker-endpoints=$$BROKER_ENDPOINT
-                    -D /data
-                    --remote-storage=\"{endpoint='http://minio:9000',
-                                        bucket_name='neon',
-                                        bucket_region='eu-north-1',
-                                        prefix_in_bucket='/safekeeper/'}\""
-    depends_on:
-      - etcd
-      - minio_create_buckets
-
-  safekeeper3:
-    image: neondatabase/neon:${TAG:-latest}
-    environment:
-      - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
-      - SAFEKEEPER_ID=3
-      - BROKER_ENDPOINT=http://etcd:2379
-      - AWS_ACCESS_KEY_ID=minio
-      - AWS_SECRET_ACCESS_KEY=password
-      #- RUST_BACKTRACE=1
-    ports:
-      #- 5454:5454 # pg protocol handler
-      - 7678:7676 # http endpoints
-    entrypoint:
-      - "/bin/sh"
-      - "-c"
-    command:
-      - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL
-                    --listen-http='0.0.0.0:7676'
-                    --id=$$SAFEKEEPER_ID
-                    --broker-endpoints=$$BROKER_ENDPOINT
-                    -D /data
-                    --remote-storage=\"{endpoint='http://minio:9000',
-                                        bucket_name='neon',
-                                        bucket_region='eu-north-1',
-                                        prefix_in_bucket='/safekeeper/'}\""
-    depends_on:
-      - etcd
-      - minio_create_buckets
-
-  compute:
-    build:
-      context: ./image/compute
-      args:
-        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
-        - http_proxy=$http_proxy
-        - https_proxy=$https_proxy
-    environment:
-      - PG_VERSION=${PG_VERSION:-14}
-      #- RUST_BACKTRACE=1
-    volumes:
-      - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
-      - ./compute/shell/:/shell/
-    ports:
-      - 55433:55433 # pg protocol handler
-      - 3080:3080 # http endpoints
-    entrypoint:
-      - "/shell/compute.sh"
-    depends_on:
-      - safekeeper1
-      - safekeeper2
-      - safekeeper3
-      - pageserver
-
-  compute_is_ready:
-    image: postgres:latest
-    entrypoint:
-      - "/bin/bash"
-      - "-c"
-    command:
-      - "until pg_isready -h compute -p 55433 ; do
-            echo 'Waiting to start compute...' && sleep 1;
-         done"
-    depends_on:
-      - compute
--- a/docker-compose/image/compute/Dockerfile
+++ b/docker-compose/image/compute/Dockerfile
@@ -1,10 +0,0 @@
-ARG COMPUTE_IMAGE=compute-node-v14:latest
-FROM neondatabase/${COMPUTE_IMAGE}
-
-USER root
-RUN apt-get update &&       \
-    apt-get install -y curl \
-                       jq   \
-                       netcat
-
-USER postgres
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -18,67 +18,3 @@ We build all images after a successful `release` tests run and push automaticall
 1. `neondatabase/compute-tools` and `neondatabase/compute-node`

 2. `neondatabase/neon`
-
-## Docker Compose example
-
-You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers.
-
- etcd x 1
- pageserver x 1
- safekeeper x 3
- compute x 1
- MinIO x 1        # This is Amazon S3 compatible object storage
-
-### How to use
-
-1. create containers
-
-You can specify the version of the neon cluster using the following environment variables.
- PG_VERSION: postgres version for compute (default is 14)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
-```
-$ cd docker-compose
-$ docker-compose down   # remove the containers if they exist
-$ PG_VERSION=15 TAG=2221 docker-compose up --build -d  # You can specify the postgres and image version
-Creating network "dockercompose_default" with the default driver
-Creating dockercompose_etcd3_1 ...
-(...omit...)
-```
-
-2. connect compute node
-```
-$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
-$ psql -h localhost -p 55433 -U cloud_admin
-postgres=# CREATE TABLE t(key int primary key, value text);
-CREATE TABLE
-postgres=# insert into t values(1,1);
-INSERT 0 1
-postgres=# select * from t;
- key | value
-----+-------
-   1 | 1
-(1 row)
-```
-
-3. If you want to see the logs, you can use the `docker-compose logs` command.
-```
-# check the container name you want to see
-$ docker ps
-CONTAINER ID   IMAGE                                              COMMAND                  CREATED         STATUS         PORTS                                                                                                                                  NAMES
-d6968a5ae912   dockercompose_compute                              "/shell/compute.sh"      5 minutes ago   Up 5 minutes   0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp                                                                                       dockercompose_compute_1
-(...omit...)
-
-$ docker logs -f dockercompose_compute_1
-2022-10-21 06:15:48.757 GMT [56] LOG:  connection authorized: user=cloud_admin database=postgres application_name=psql
-2022-10-21 06:17:00.307 GMT [56] LOG:  [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
-(...omit...)
-```
-
-4. If you want to see durable data in MinIO, which is S3-compatible storage
-
-Access http://localhost:9001 and sign in.
-
- Username: `minio`
- Password: `password`
-
-You can see durable pages and WAL data in `neon` bucket.
--- a/docs/rfcs/020-pageserver-s3-coordination.md
+++ b/docs/rfcs/020-pageserver-s3-coordination.md
@@ -1,246 +0,0 @@
-# Coordinating access of multiple pageservers to the same s3 data
-
-## Motivation
-
-There are some blind spots around coordinating access of multiple pageservers
-to the same s3 data. Currently this is applicable only to tenant relocation
-case, but in the future we'll need to solve similar problems for
-replica/standby pageservers.
-
-## Impacted components (e.g. pageserver, safekeeper, console, etc)
-
-Pageserver
-
-## The problem
-
-### Relocation
-
-During relocation both pageservers can write to s3. This should be ok for all
-data except the `index_part.json`. For index part it causes problems during
-compaction/gc because they remove files from index/s3.
-
-Imagine this case:
-
-```mermaid
-sequenceDiagram
-    autonumber
-    participant PS1
-    participant S3
-    participant PS2
-
-    PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
-    PS2->>S3: Attach called, sees L1, L2
-    PS1->>S3: Compaction comes <br/> Removes L1, adds L3
-    note over S3: Index now L2, L3
-    PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
-    note over S3: Index now L1, L2, L4
-```
-
-At this point it is not possible to restore from the index: it contains L1,
-which is no longer available in s3, and doesn't contain L3, which was added by
-compaction on the first pageserver. So if any of the pageservers restarts, the
-initial sync will fail (or, in the on-demand world, it will fail a bit later,
-during a page request that needs the missing layer).
-
-### Standby pageserver
-
-Another related case is standby pageserver. In this case second pageserver can
-be used as a replica to scale reads and serve as a failover target in case
-first one fails.
-
-In this mode second pageserver needs to have the same picture of s3 files to
-be able to load layers on-demand. To accomplish that second pageserver
-cannot run gc/compaction jobs. Instead it needs to receive updates for index
-contents. (There is no need to run walreceiver on the second pageserver then).
-
-## Observations
-
- If both pageservers ingest wal then their layer set diverges, because layer
-  file generation is not deterministic
- If one of the pageservers does not ingest wal (and just picks up layer
-  updates) then it lags behind and cannot really answer queries in the same
-  pace as the primary one
- Can compaction help make layers deterministic? E g we do not upload level
-  zero layers and construction of higher levels should be deterministic.
-  This way we can guarantee that layer creation by timeout wont mess things up.
-  This way one pageserver uploads data and second one can just ingest it.
-  But we still need some form of election
-
-## Solutions
-
-### Manual orchestration
-
-One possible solution for relocation case is to orchestrate background jobs
-from outside. The oracle who runs migration can turn off background jobs on
-PS1 before migration and then run migration -> enable them on PS2. The problem
-comes if migration fails. In this case in order to resume background jobs
-oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt
-respond then PS1 is stuck unable to run compaction/gc. This cannot be solved
-without human ensuring that no upload from PS2 can happen. In order to be able
-to resolve this automatically CAS is required on S3 side so pageserver can
-avoid overwriting index part if it is no longer the leading one
-
-Note that flag that disables background jobs needs to be persistent, because
-otherwise pageserver restart will clean it
-
-### Avoid index_part.json
-
-Index part consists of two parts, list of layers and metadata. List of layers
-can be easily obtained by `ListObjects` S3 API method. But what to do with
-metadata? Create metadata instance for each checkpoint and add some counter
-to the file name?
-
-Back to potentially long s3 ls.
-
-### Coordination based approach
-
-Do it the way safekeepers choose a leader for WAL upload: ping each other and
-decide based on some heuristic, e.g. the smallest node id. During relocation PS1
-sends a "resign" ping message so others can start an election without waiting for a timeout.
-
-This still leaves metadata question open and non deterministic layers are a
-problem as well
-
-### Avoid metadata file
-
-One way to eliminate metadata file is to store it in layer files under some
-special key. This may resonate with intention to keep all relation sizes in
-some special segment to avoid initial download during size calculation.
-Maybe with that we can even store pre calculated value.
-
-As a downside each checkpoint gets 512 bytes larger.
-
-If we entirely avoid metadata file this opens up many approaches
-
-* * *
-
-During discussion it seems that we converged on the approach consisting of:
-
- index files stored per pageserver in the same timeline directory. With that
-  index file name starts to look like: `<pageserver_node_id>_index_part.json`.
-  In such set up there are no concurrent overwrites of index file by different
-  pageservers.
- For replica pageservers the solution would be for primary to broadcast index
-  changes to any followers with an ability to check index files in s3 and
-  restore the full state. To properly merge changes with index files we can use
-  a counter that is persisted in an index file, is incremented on every change
-  to it and passed along with broadcasted change. This way we can determine
-  whether we need to apply change to the index state or not.
- Responsibility for running background jobs is assigned externally. Pageserver
-  keeps a locally persistent flag for each tenant that indicates whether this
-  pageserver is considered the primary one or not. TODO: what happens if we
-  crash and cannot start for some extended period of time? Control plane can
-  assign ownership to some other pageserver. Pageserver needs some way to check
-  if it's still the blessed one. Maybe by explicit request to control plane on
-  start.
-
-Requirement for deterministic layer generation was considered overly strict
-because of two reasons:
-
- It can limit possible optimizations e g when pageserver wants to reshuffle
-  some data locally and doesnt want to coordinate this
- The deterministic algorithm itself can change so during deployments for some
-  time there will be two different version running at the same time which can
-  cause non determinism
-
-### External elections
-
-The above case with lost state in this schema with externally managed
-leadership is represented like this:
-
-Note that here we keep objects list in the index file.
-
-```mermaid
-sequenceDiagram
-    autonumber
-    participant PS1
-    participant CP as Control Plane
-    participant S3
-    participant PS2
-
-    note over PS1,PS2: PS1 starts up and is still a leader
-    PS1->>CP: Am I still the leader for Tenant X?
-    activate CP
-    CP->>PS1: Yes
-    deactivate CP
-    PS1->>S3: Fetch PS1 index.
-    note over PS1: Continue operations, start background jobs
-    note over PS1,PS2: PS1 starts up and is not a leader anymore
-    PS1->>CP: Am I still the leader for Tenant X?
-    CP->>PS1: No
-    PS1->>PS2: Subscribe to index changes
-    PS1->>S3: Fetch PS1 and PS2 indexes
-    note over PS1: Combine index file to include layers <br> from both indexes to be able <br> to see newer files from leader (PS2)
-    note over PS1: Continue operations, do not start background jobs
-```
-
-### Internal elections
-
-To manage leadership internally we can use broker to exchange pings so nodes
-can decide on the leader roles. In case multiple pageservers are active leader
-is the one with lowest node id.
-
-Operations with internally managed elections:
-
-```mermaid
-sequenceDiagram
-    autonumber
-    participant PS1
-    participant S3
-
-    note over PS1: Starts up
-    note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
-    PS1->>S3: Fetch indexes from s3
-    alt there is a leader
-        note over PS1: do not start background jobs, <br> continue applying index updates
-    else there is no leader
-        note over PS1: start background jobs, <br> broadcast index changes
-    end
-
-    note over PS1,S3: Then the picture is similar to external elections <br> the difference is that follower can become a leader <br> if there are no pings after some timeout new leader gets elected
-```
-
-### Eviction
-
-When two pageservers operate on a tenant for extended period of time follower
-doesnt perform write operations in s3. When layer is evicted follower relies
-on updates from primary to get info about layers it needs to cover range for
-evicted layer.
-
-Note that it wont match evicted layer exactly, so layers will overlap and
-lookup code needs to correctly handle that.
-
-### Relocation flow
-
-Actions become:
-
- Attach tenant to new pageserver
- New pageserver becomes follower since previous one is still leading
- New pageserver starts replicating from safekeepers but does not upload layers
- Detach is called on the old one
- New pageserver becomes leader after it realizes that old one disappeared
-
-### Index File
-
-Using `s3 ls` on startup simplifies things, but we still need metadata, so we
-need to fetch index files anyway. If they contain list of files we can combine
-them and avoid costly `s3 ls`
-
-### Remaining issues
-
- More than one remote consistent lsn for safekeepers to know
-
-Anything else?
-
-### Proposed solution
-
-To recap: at the meeting we converged on the approach with external elections,
-but I think it will be harder to manage overall and will introduce a dependency
-on the control plane for the pageserver. Using separate index files for each
-pageserver, consisting of a log of operations and a metadata snapshot, should be enough.
-
-### What we need to get there?
-
- Change index file structure to contain log of changes instead of just the
-  file list
- Implement pinging/elections for pageservers
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -52,10 +52,6 @@ PostgreSQL extension that implements storage manager API and network communicati

 PostgreSQL extension that contains functions needed for testing and debugging.

-`/pgxn/neon_walredo`:
-
-Library to run Postgres as a "WAL redo process" in the pageserver.
-
 `/safekeeper`:

 The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
--- a/libs/etcd_broker/src/subscription_value.rs
+++ b/libs/etcd_broker/src/subscription_value.rs
@@ -29,9 +29,6 @@ pub struct SkTimelineInfo {
    #[serde_as(as = "Option<DisplayFromStr>")]
    #[serde(default)]
    pub peer_horizon_lsn: Option<Lsn>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    #[serde(default)]
-    pub local_start_lsn: Option<Lsn>,
    /// A connection string to use for WAL receiving.
    #[serde(default)]
    pub safekeeper_connstr: Option<String>,
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -7,10 +7,6 @@ edition = "2021"
 serde = { version = "1.0", features = ["derive"] }
 serde_with = "2.0"
 const_format = "0.2.21"
-anyhow = { version = "1.0", features = ["backtrace"] }
-bytes = "1.0.1"
-byteorder = "1.4.3"

 utils = { path = "../utils" }
-postgres_ffi = { path = "../postgres_ffi" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -2,7 +2,6 @@ use const_format::formatcp;

 /// Public API types
 pub mod models;
-pub mod reltag;

 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,6 +1,5 @@
 use std::num::NonZeroU64;

-use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::{
@@ -8,10 +7,6 @@ use utils::{
    lsn::Lsn,
 };

-use crate::reltag::RelTag;
-use anyhow::bail;
-use bytes::{BufMut, Bytes, BytesMut};
-
 /// A state of a tenant in pageserver's memory.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TenantState {
@@ -24,22 +19,6 @@ pub enum TenantState {
    Broken,
 }

-/// A state of a timeline in pageserver's memory.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-pub enum TimelineState {
-    /// Timeline is fully operational, its background jobs are running.
-    Active,
-    /// A timeline is recognized by pageserver, but not yet ready to operate.
-    /// The status indicates, that the timeline could eventually go back to Active automatically:
-    /// for example, if the owning tenant goes back to Active again.
-    Suspended,
-    /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
-    /// automatically become Active after certain events: only a management call can change this status.
-    Paused,
-    /// A timeline is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
-    Broken,
-}
-
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
@@ -181,8 +160,6 @@ pub struct TimelineInfo {
    pub remote_consistent_lsn: Option<Lsn>,
    pub awaits_download: bool,

-    pub state: TimelineState,
-
    // Some of the above fields are duplicated in 'local' and 'remote', for backwards-
    // compatibility with older clients.
    pub local: LocalTimelineInfo,
@@ -224,262 +201,3 @@ pub struct FailpointConfig {
 pub struct TimelineGcRequest {
    pub gc_horizon: Option<u64>,
 }
-
-// Wrapped in libpq CopyData
-#[derive(PartialEq, Eq)]
-pub enum PagestreamFeMessage {
-    Exists(PagestreamExistsRequest),
-    Nblocks(PagestreamNblocksRequest),
-    GetPage(PagestreamGetPageRequest),
-    DbSize(PagestreamDbSizeRequest),
-}
-
-// Wrapped in libpq CopyData
-pub enum PagestreamBeMessage {
-    Exists(PagestreamExistsResponse),
-    Nblocks(PagestreamNblocksResponse),
-    GetPage(PagestreamGetPageResponse),
-    Error(PagestreamErrorResponse),
-    DbSize(PagestreamDbSizeResponse),
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub struct PagestreamExistsRequest {
-    pub latest: bool,
-    pub lsn: Lsn,
-    pub rel: RelTag,
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub struct PagestreamNblocksRequest {
-    pub latest: bool,
-    pub lsn: Lsn,
-    pub rel: RelTag,
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub struct PagestreamGetPageRequest {
-    pub latest: bool,
-    pub lsn: Lsn,
-    pub rel: RelTag,
-    pub blkno: u32,
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub struct PagestreamDbSizeRequest {
-    pub latest: bool,
-    pub lsn: Lsn,
-    pub dbnode: u32,
-}
-
-#[derive(Debug)]
-pub struct PagestreamExistsResponse {
-    pub exists: bool,
-}
-
-#[derive(Debug)]
-pub struct PagestreamNblocksResponse {
-    pub n_blocks: u32,
-}
-
-#[derive(Debug)]
-pub struct PagestreamGetPageResponse {
-    pub page: Bytes,
-}
-
-#[derive(Debug)]
-pub struct PagestreamErrorResponse {
-    pub message: String,
-}
-
-#[derive(Debug)]
-pub struct PagestreamDbSizeResponse {
-    pub db_size: i64,
-}
-
-impl PagestreamFeMessage {
-    pub fn serialize(&self) -> Bytes {
-        let mut bytes = BytesMut::new();
-
-        match self {
-            Self::Exists(req) => {
-                bytes.put_u8(0);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
-                bytes.put_u64(req.lsn.0);
-                bytes.put_u32(req.rel.spcnode);
-                bytes.put_u32(req.rel.dbnode);
-                bytes.put_u32(req.rel.relnode);
-                bytes.put_u8(req.rel.forknum);
-            }
-
-            Self::Nblocks(req) => {
-                bytes.put_u8(1);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
-                bytes.put_u64(req.lsn.0);
-                bytes.put_u32(req.rel.spcnode);
-                bytes.put_u32(req.rel.dbnode);
-                bytes.put_u32(req.rel.relnode);
-                bytes.put_u8(req.rel.forknum);
-            }
-
-            Self::GetPage(req) => {
-                bytes.put_u8(2);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
-                bytes.put_u64(req.lsn.0);
-                bytes.put_u32(req.rel.spcnode);
-                bytes.put_u32(req.rel.dbnode);
-                bytes.put_u32(req.rel.relnode);
-                bytes.put_u8(req.rel.forknum);
-                bytes.put_u32(req.blkno);
-            }
-
-            Self::DbSize(req) => {
-                bytes.put_u8(3);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
-                bytes.put_u64(req.lsn.0);
-                bytes.put_u32(req.dbnode);
-            }
-        }
-
-        bytes.into()
-    }
-
-    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
-        // TODO these gets can fail
-
-        // these correspond to the NeonMessageTag enum in pagestore_client.h
-        //
-        // TODO: consider using protobuf or serde bincode for less error prone
-        // serialization.
-        let msg_tag = body.read_u8()?;
-        match msg_tag {
-            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                latest: body.read_u8()? != 0,
-                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
-                rel: RelTag {
-                    spcnode: body.read_u32::<BigEndian>()?,
-                    dbnode: body.read_u32::<BigEndian>()?,
-                    relnode: body.read_u32::<BigEndian>()?,
-                    forknum: body.read_u8()?,
-                },
-            })),
-            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                latest: body.read_u8()? != 0,
-                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
-                rel: RelTag {
-                    spcnode: body.read_u32::<BigEndian>()?,
-                    dbnode: body.read_u32::<BigEndian>()?,
-                    relnode: body.read_u32::<BigEndian>()?,
-                    forknum: body.read_u8()?,
-                },
-            })),
-            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                latest: body.read_u8()? != 0,
-                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
-                rel: RelTag {
-                    spcnode: body.read_u32::<BigEndian>()?,
-                    dbnode: body.read_u32::<BigEndian>()?,
-                    relnode: body.read_u32::<BigEndian>()?,
-                    forknum: body.read_u8()?,
-                },
-                blkno: body.read_u32::<BigEndian>()?,
-            })),
-            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                latest: body.read_u8()? != 0,
-                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
-                dbnode: body.read_u32::<BigEndian>()?,
-            })),
-            _ => bail!("unknown smgr message tag: {:?}", msg_tag),
-        }
-    }
-}
-
-impl PagestreamBeMessage {
-    pub fn serialize(&self) -> Bytes {
-        let mut bytes = BytesMut::new();
-
-        match self {
-            Self::Exists(resp) => {
-                bytes.put_u8(100); /* tag from pagestore_client.h */
-                bytes.put_u8(resp.exists as u8);
-            }
-
-            Self::Nblocks(resp) => {
-                bytes.put_u8(101); /* tag from pagestore_client.h */
-                bytes.put_u32(resp.n_blocks);
-            }
-
-            Self::GetPage(resp) => {
-                bytes.put_u8(102); /* tag from pagestore_client.h */
-                bytes.put(&resp.page[..]);
-            }
-
-            Self::Error(resp) => {
-                bytes.put_u8(103); /* tag from pagestore_client.h */
-                bytes.put(resp.message.as_bytes());
-                bytes.put_u8(0); // null terminator
-            }
-            Self::DbSize(resp) => {
-                bytes.put_u8(104); /* tag from pagestore_client.h */
-                bytes.put_i64(resp.db_size);
-            }
-        }
-
-        bytes.into()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use bytes::Buf;
-
-    use super::*;
-
-    #[test]
-    fn test_pagestream() {
-        // Test serialization/deserialization of PagestreamFeMessage
-        let messages = vec![
-            PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                latest: true,
-                lsn: Lsn(4),
-                rel: RelTag {
-                    forknum: 1,
-                    spcnode: 2,
-                    dbnode: 3,
-                    relnode: 4,
-                },
-            }),
-            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                latest: false,
-                lsn: Lsn(4),
-                rel: RelTag {
-                    forknum: 1,
-                    spcnode: 2,
-                    dbnode: 3,
-                    relnode: 4,
-                },
-            }),
-            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                latest: true,
-                lsn: Lsn(4),
-                rel: RelTag {
-                    forknum: 1,
-                    spcnode: 2,
-                    dbnode: 3,
-                    relnode: 4,
-                },
-                blkno: 7,
-            }),
-            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                latest: true,
-                lsn: Lsn(4),
-                dbnode: 7,
-            }),
-        ];
-        for msg in messages {
-            let bytes = msg.serialize();
-            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
-            assert!(msg == reconstructed);
-        }
-    }
-}
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -1,16 +0,0 @@
-[package]
-name = "pq_proto"
-version = "0.1.0"
-edition = "2021"
-
-[dependencies]
-anyhow = "1.0"
-bytes = "1.0.1"
-pin-project-lite = "0.2.7"
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-rand = "0.8.3"
-serde = { version = "1.0", features = ["derive"] }
-tokio = { version = "1.17", features = ["macros"] }
-tracing = "0.1"
-
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/tenant_size_model/.gitignore
+++ b/libs/tenant_size_model/.gitignore
@@ -1,3 +0,0 @@
-*.dot
-*.png
-*.svg
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -1,8 +0,0 @@
-[package]
-name = "tenant_size_model"
-version = "0.1.0"
-edition = "2021"
-publish = false
-
-[dependencies]
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/tenant_size_model/Makefile
+++ b/libs/tenant_size_model/Makefile
@@ -1,13 +0,0 @@
-all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png
-
-../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs
-	cargo build --bin tenant_size_model
-
-%.svg: %.dot
-	dot -Tsvg $< > $@
-
-%.png: %.dot
-	dot -Tpng $< > $@
-
-%.dot: ../../target/debug/tenant_size_model
-	../../target/debug/tenant_size_model $* > $@
--- a/libs/tenant_size_model/README.md
+++ b/libs/tenant_size_model/README.md
@@ -1,7 +0,0 @@
-# Logical size + WAL pricing
-
-This is a simulator to calculate the tenant size in different scenarios,
-using the "Logical size + WAL" method. Makefile produces diagrams used in a
-private presentation:
-
-https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,349 +0,0 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
-
-/// Pricing model or history size builder.
-///
-/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
-/// type.
-pub struct Storage<K: 'static> {
-    segments: Vec<Segment>,
-
-    /// Mapping from the branch name to the index of a segment describing it's latest state.
-    branches: HashMap<K, usize>,
-}
-
-/// Snapshot of a branch.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub struct Segment {
-    /// Previous segment index into ['Storage::segments`], if any.
-    parent: Option<usize>,
-
-    /// Description of how did we get to this state.
-    ///
-    /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
-    /// modifying a branch directly.
-    pub op: Cow<'static, str>,
-
-    /// LSN before this state
-    start_lsn: u64,
-
-    /// LSN at this state
-    pub end_lsn: u64,
-
-    /// Logical size before this state
-    start_size: u64,
-
-    /// Logical size at this state
-    pub end_size: u64,
-
-    /// Indices to [`Storage::segments`]
-    ///
-    /// FIXME: this could be an Option<usize>
-    children_after: Vec<usize>,
-
-    /// Determined by `retention_period` given to [`Storage::calculate`]
-    pub needed: bool,
-}
-
-//
-//
-//
-//
-//                 *-g--*---D--->
-//                /
-//               /
-//              /                 *---b----*-B--->
-//             /                 /
-//            /                 /
-//      -----*--e---*-----f----* C
-//           E                  \
-//                               \
-//                                *--a---*---A-->
-//
-// If A and B need to be retained, is it cheaper to store
-// snapshot at C+a+b, or snapshots at A and B ?
-//
-// If D also needs to be retained, which is cheaper:
-//
-// 1. E+g+e+f+a+b
-// 2. D+C+a+b
-// 3. D+A+B
-
-/// [`Segment`] which has had it's size calculated.
-pub struct SegmentSize {
-    pub seg_id: usize,
-
-    pub method: SegmentMethod,
-
-    this_size: u64,
-
-    pub children: Vec<SegmentSize>,
-}
-
-impl SegmentSize {
-    fn total(&self) -> u64 {
-        self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
-    }
-
-    pub fn total_children(&self) -> u64 {
-        if self.method == SnapshotAfter {
-            self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
-        } else {
-            self.children.iter().fold(0, |acc, x| acc + x.total())
-        }
-    }
-}
-
-/// Different methods to retain history from a particular state
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub enum SegmentMethod {
-    SnapshotAfter,
-    Wal,
-    WalNeeded,
-    Skipped,
-}
-
-use SegmentMethod::*;
-
-impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
-    /// Creates a new storage with the given default branch name.
-    pub fn new(initial_branch: K) -> Storage<K> {
-        let init_segment = Segment {
-            op: "".into(),
-            needed: false,
-            parent: None,
-            start_lsn: 0,
-            end_lsn: 0,
-            start_size: 0,
-            end_size: 0,
-            children_after: Vec::new(),
-        };
-
-        Storage {
-            segments: vec![init_segment],
-            branches: HashMap::from([(initial_branch, 0)]),
-        }
-    }
-
-    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
-    pub fn modify_branch<Q: ?Sized>(
-        &mut self,
-        branch: &Q,
-        op: Cow<'static, str>,
-        lsn_bytes: u64,
-        size_bytes: i64,
-    ) where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq,
-    {
-        let lastseg_id = *self.branches.get(branch).unwrap();
-        let newseg_id = self.segments.len();
-        let lastseg = &mut self.segments[lastseg_id];
-
-        let newseg = Segment {
-            op,
-            parent: Some(lastseg_id),
-            start_lsn: lastseg.end_lsn,
-            end_lsn: lastseg.end_lsn + lsn_bytes,
-            start_size: lastseg.end_size,
-            end_size: (lastseg.end_size as i64 + size_bytes) as u64,
-            children_after: Vec::new(),
-            needed: false,
-        };
-        lastseg.children_after.push(newseg_id);
-
-        self.segments.push(newseg);
-        *self.branches.get_mut(branch).expect("read already") = newseg_id;
-    }
-
-    pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq,
-    {
-        self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
-    }
-
-    pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq,
-    {
-        self.modify_branch(branch, "update".into(), bytes, 0i64);
-    }
-
-    pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq,
-    {
-        self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
-    }
-
-    /// Panics if the parent branch cannot be found.
-    pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K)
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq,
-    {
-        // Find the right segment
-        let branchseg_id = *self
-            .branches
-            .get(parent)
-            .expect("should had found the parent by key");
-        let _branchseg = &mut self.segments[branchseg_id];
-
-        // Create branch name for it
-        self.branches.insert(name, branchseg_id);
-    }
-
-    pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
-        // Phase 1: Mark all the segments that need to be retained
-        for (_branch, &last_seg_id) in self.branches.iter() {
-            let last_seg = &self.segments[last_seg_id];
-            let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
-            let mut seg_id = last_seg_id;
-            loop {
-                let seg = &mut self.segments[seg_id];
-                if seg.end_lsn < cutoff_lsn {
-                    break;
-                }
-                seg.needed = true;
-                if let Some(prev_seg_id) = seg.parent {
-                    seg_id = prev_seg_id;
-                } else {
-                    break;
-                }
-            }
-        }
-
-        // Phase 2: For each oldest segment in a chain that needs to be retained,
-        // calculate if we should store snapshot or WAL
-        self.size_from_snapshot_later(0)
-    }
-
-    fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
-        let seg = &self.segments[seg_id];
-
-        let this_size = seg.end_lsn - seg.start_lsn;
-
-        let mut children = Vec::new();
-
-        // try both ways
-        for &child_id in seg.children_after.iter() {
-            // try each child both ways
-            let child = &self.segments[child_id];
-            let p1 = self.size_from_wal(child_id);
-
-            let p = if !child.needed {
-                let p2 = self.size_from_snapshot_later(child_id);
-                if p1.total() < p2.total() {
-                    p1
-                } else {
-                    p2
-                }
-            } else {
-                p1
-            };
-            children.push(p);
-        }
-        SegmentSize {
-            seg_id,
-            method: if seg.needed { WalNeeded } else { Wal },
-            this_size,
-            children,
-        }
-    }
-
-    fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
-        // If this is needed, then it's time to do the snapshot and continue
-        // with wal method.
-        let seg = &self.segments[seg_id];
-        //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
-        if seg.needed {
-            let mut children = Vec::new();
-
-            for &child_id in seg.children_after.iter() {
-                // try each child both ways
-                let child = &self.segments[child_id];
-                let p1 = self.size_from_wal(child_id);
-
-                let p = if !child.needed {
-                    let p2 = self.size_from_snapshot_later(child_id);
-                    if p1.total() < p2.total() {
-                        p1
-                    } else {
-                        p2
-                    }
-                } else {
-                    p1
-                };
-                children.push(p);
-            }
-            SegmentSize {
-                seg_id,
-                method: WalNeeded,
-                this_size: seg.start_size,
-                children,
-            }
-        } else {
-            // If any of the direct children are "needed", need to be able to reconstruct here
-            let mut children_needed = false;
-            for &child in seg.children_after.iter() {
-                let seg = &self.segments[child];
-                if seg.needed {
-                    children_needed = true;
-                    break;
-                }
-            }
-
-            let method1 = if !children_needed {
-                let mut children = Vec::new();
-                for child in seg.children_after.iter() {
-                    children.push(self.size_from_snapshot_later(*child));
-                }
-                Some(SegmentSize {
-                    seg_id,
-                    method: Skipped,
-                    this_size: 0,
-                    children,
-                })
-            } else {
-                None
-            };
-
-            // If this a junction, consider snapshotting here
-            let method2 = if children_needed || seg.children_after.len() >= 2 {
-                let mut children = Vec::new();
-                for child in seg.children_after.iter() {
-                    children.push(self.size_from_wal(*child));
-                }
-                Some(SegmentSize {
-                    seg_id,
-                    method: SnapshotAfter,
-                    this_size: seg.end_size,
-                    children,
-                })
-            } else {
-                None
-            };
-
-            match (method1, method2) {
-                (None, None) => panic!(),
-                (Some(method), None) => method,
-                (None, Some(method)) => method,
-                (Some(method1), Some(method2)) => {
-                    if method1.total() < method2.total() {
-                        method1
-                    } else {
-                        method2
-                    }
-                }
-            }
-        }
-    }
-
-    pub fn into_segments(self) -> Vec<Segment> {
-        self.segments
-    }
-}
--- a/libs/tenant_size_model/src/main.rs
+++ b/libs/tenant_size_model/src/main.rs
@@ -1,268 +0,0 @@
-//! Tenant size model testing ground.
-//!
-//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
-//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
-//! into pngs.
-
-use tenant_size_model::{Segment, SegmentSize, Storage};
-
-// Main branch only. Some updates on it.
-fn scenario_1() -> (Vec<Segment>, SegmentSize) {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000);
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000);
-    }
-
-    let size = storage.calculate(1000);
-
-    (storage.into_segments(), size)
-}
-
-// Main branch only. Some updates on it.
-fn scenario_2() -> (Vec<Segment>, SegmentSize) {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000);
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000);
-    }
-
-    // Branch
-    storage.branch("main", "child");
-    storage.update("child", 1_000);
-
-    // More updates on parent
-    storage.update("main", 1_000);
-
-    let size = storage.calculate(1000);
-
-    (storage.into_segments(), size)
-}
-
-// Like 2, but more updates on main
-fn scenario_3() -> (Vec<Segment>, SegmentSize) {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000);
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000);
-    }
-
-    // Branch
-    storage.branch("main", "child");
-    storage.update("child", 1_000);
-
-    // More updates on parent
-    for _ in 0..5 {
-        storage.update("main", 1_000);
-    }
-
-    let size = storage.calculate(1000);
-
-    (storage.into_segments(), size)
-}
-
-// Diverged branches
-fn scenario_4() -> (Vec<Segment>, SegmentSize) {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000);
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000);
-    }
-
-    // Branch
-    storage.branch("main", "child");
-    storage.update("child", 1_000);
-
-    // More updates on parent
-    for _ in 0..8 {
-        storage.update("main", 1_000);
-    }
-
-    let size = storage.calculate(1000);
-
-    (storage.into_segments(), size)
-}
-
-fn scenario_5() -> (Vec<Segment>, SegmentSize) {
-    let mut storage = Storage::new("a");
-    storage.insert("a", 5000);
-    storage.branch("a", "b");
-    storage.update("b", 4000);
-    storage.update("a", 2000);
-    storage.branch("a", "c");
-    storage.insert("c", 4000);
-    storage.insert("a", 2000);
-
-    let size = storage.calculate(5000);
-
-    (storage.into_segments(), size)
-}
-
-fn scenario_6() -> (Vec<Segment>, SegmentSize) {
-    use std::borrow::Cow;
-
-    const NO_OP: Cow<'static, str> = Cow::Borrowed("");
-
-    let branches = [
-        Some(0x7ff1edab8182025f15ae33482edb590a_u128),
-        Some(0xb1719e044db05401a05a2ed588a3ad3f),
-        Some(0xb68d6691c895ad0a70809470020929ef),
-    ];
-
-    // compared to other scenarios, this one uses bytes instead of kB
-
-    let mut storage = Storage::new(None);
-
-    storage.branch(&None, branches[0]); // at 0
-    storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
-    storage.branch(&branches[0], branches[1]); // at 108951064
-    storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
-    storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
-    storage.branch(&branches[0], branches[2]); // at 283415424
-    storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
-    storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
-
-    let size = storage.calculate(100_000);
-
-    (storage.into_segments(), size)
-}
-
-fn main() {
-    let args: Vec<String> = std::env::args().collect();
-
-    let scenario = if args.len() < 2 { "1" } else { &args[1] };
-
-    let (segments, size) = match scenario {
-        "1" => scenario_1(),
-        "2" => scenario_2(),
-        "3" => scenario_3(),
-        "4" => scenario_4(),
-        "5" => scenario_5(),
-        "6" => scenario_6(),
-        other => {
-            eprintln!("invalid scenario {}", other);
-            std::process::exit(1);
-        }
-    };
-
-    graphviz_tree(&segments, &size);
-}
-
-fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
-    use tenant_size_model::SegmentMethod::*;
-
-    let seg_id = node.seg_id;
-    let seg = segments.get(seg_id).unwrap();
-    let lsn = seg.end_lsn;
-    let size = seg.end_size;
-    let method = node.method;
-
-    println!("  {{");
-    println!("    node [width=0.1 height=0.1 shape=oval]");
-
-    let tenant_size = node.total_children();
-
-    let penwidth = if seg.needed { 6 } else { 3 };
-    let x = match method {
-        SnapshotAfter =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
-        Wal =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
-        WalNeeded =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
-        Skipped =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
-    };
-
-    println!("    \"seg{seg_id}\" [{x}]");
-    println!("  }}");
-
-    // Recurse. Much of the data is actually on the edge
-    for child in node.children.iter() {
-        let child_id = child.seg_id;
-        graphviz_recurse(segments, child);
-
-        let edge_color = match child.method {
-            SnapshotAfter => "gray",
-            Wal => "black",
-            WalNeeded => "black",
-            Skipped => "gray",
-        };
-
-        println!("  {{");
-        println!("    edge [] ");
-        print!("    \"seg{seg_id}\" -> \"seg{child_id}\" [");
-        print!("color={edge_color}");
-        if child.method == WalNeeded {
-            print!(" penwidth=6");
-        }
-        if child.method == Wal {
-            print!(" penwidth=3");
-        }
-
-        let next = segments.get(child_id).unwrap();
-
-        if next.op.is_empty() {
-            print!(
-                " label=\"{} / {}\"",
-                next.end_lsn - seg.end_lsn,
-                (next.end_size as i128 - seg.end_size as i128)
-            );
-        } else {
-            print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
-        }
-        println!("]");
-        println!("  }}");
-    }
-}
-
-fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
-    println!("digraph G {{");
-    println!("  fontname=\"Helvetica,Arial,sans-serif\"");
-    println!("  node [fontname=\"Helvetica,Arial,sans-serif\"]");
-    println!("  edge [fontname=\"Helvetica,Arial,sans-serif\"]");
-    println!("  graph [center=1 rankdir=LR]");
-    println!("  edge [dir=none]");
-
-    graphviz_recurse(segments, tree);
-
-    println!("}}");
-}
-
-#[test]
-fn scenarios_return_same_size() {
-    type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
-    let truths: &[(u32, ScenarioFn, _)] = &[
-        (line!(), scenario_1, 8000),
-        (line!(), scenario_2, 9000),
-        (line!(), scenario_3, 13000),
-        (line!(), scenario_4, 16000),
-        (line!(), scenario_5, 17000),
-        (line!(), scenario_6, 333_792_000),
-    ];
-
-    for (line, scenario, expected) in truths {
-        let (_, size) = scenario();
-        assert_eq!(*expected, size.total_children(), "scenario on line {line}");
-    }
-}
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -9,6 +9,9 @@ anyhow = "1.0"
 bincode = "1.3"
 bytes = "1.0.1"
 hyper = { version = "0.14.7", features = ["full"] }
+pin-project-lite = "0.2.7"
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 routerify = "3"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
@@ -16,7 +19,7 @@ thiserror = "1.0"
 tokio = { version = "1.17", features = ["macros"]}
 tokio-rustls = "0.23"
 tracing = "0.1"
-tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 nix = "0.25"
 signal-hook = "0.3.10"
 rand = "0.8.3"
@@ -27,11 +30,9 @@ rustls-split = "0.3.0"
 git-version = "0.3.5"
 serde_with = "2.0"
 once_cell = "1.13.0"
-strum = "0.24"
-strum_macros = "0.24"
+

 metrics = { path = "../metrics" }
-pq_proto = { path = "../pq_proto" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [dev-dependencies]
--- a/libs/utils/src/connstring.rs
+++ b/libs/utils/src/connstring.rs
@@ -0,0 +1,69 @@
+use postgres::Config;
+
+/// Returns the single `(host, port)` pair stored in a connection `config`.
+///
+/// A `Host::Tcp` entry yields its host name; a `Host::Unix` entry yields the
+/// result of `to_str()` on its value.
+///
+/// # Panics
+///
+/// Panics if `config` does not contain exactly one host and exactly one port,
+/// or if a `Host::Unix` value cannot be represented as UTF-8.
+pub fn connection_host_port(config: &Config) -> (String, u16) {
+    assert_eq!(
+        config.get_hosts().len(),
+        1,
+        "only one pair of host and port is supported in connection string"
+    );
+    assert_eq!(
+        config.get_ports().len(),
+        1,
+        "only one pair of host and port is supported in connection string"
+    );
+    let host = match &config.get_hosts()[0] {
+        postgres::config::Host::Tcp(host) => host.as_ref(),
+        // Name the failure instead of a bare `unwrap()` so the panic is diagnosable.
+        postgres::config::Host::Unix(host) => host
+            .to_str()
+            .expect("unix socket path in connection string is not valid UTF-8"),
+    };
+    (host.to_owned(), config.get_ports()[0])
+}
+
+/// Formats the single host and port of `config` as `host:port`.
+///
+/// # Panics
+///
+/// Panics under the same conditions as [`connection_host_port`].
+pub fn connection_address(config: &Config) -> String {
+    let (host, port) = connection_host_port(config);
+    format!("{}:{}", host, port)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_connection_host_port() {
+        let config: Config = "postgresql://no_user@localhost:64000/no_db"
+            .parse()
+            .unwrap();
+        assert_eq!(
+            connection_host_port(&config),
+            ("localhost".to_owned(), 64000)
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "only one pair of host and port is supported in connection string")]
+    fn test_connection_host_port_multiple_ports() {
+        let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
+            .parse()
+            .unwrap();
+        assert_eq!(
+            connection_host_port(&config),
+            ("localhost".to_owned(), 64000)
+        );
+    }
+}
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -75,12 +75,6 @@ impl From<[u8; 16]> for Id {
    }
 }

-impl From<Id> for u128 {
-    fn from(id: Id) -> Self {
-        u128::from_le_bytes(id.0)
-    }
-}
-
 impl fmt::Display for Id {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.hex_encode())
@@ -142,12 +136,6 @@ macro_rules! id_newtype {
            }
        }

-        impl From<$t> for u128 {
-            fn from(id: $t) -> Self {
-                u128::from(id.0)
-            }
-        }
-
        impl fmt::Display for $t {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                self.0.fmt(f)
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,6 +1,8 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.

+#![allow(clippy::manual_range_contains)]
+
 /// `Lsn` type implements common tasks on Log Sequence Numbers
 pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
@@ -15,6 +17,10 @@ pub mod vec_map;
 pub mod bin_ser;
 pub mod postgres_backend;
 pub mod postgres_backend_async;
+pub mod pq_proto;
+
+// dealing with connstring parsing and handy access to its parts
+pub mod connstring;

 // helper functions for creating and fsyncing
 pub mod crashsafe;
@@ -33,12 +39,13 @@ pub mod sock_split;
 // common log initialisation routine
 pub mod logging;

-pub mod lock_file;
-
 // Misc
 pub mod accum;
 pub mod shutdown;

+// Tools for calling certain async methods in sync contexts
+pub mod sync;
+
 // Utility for binding TcpListeners with proper socket options.
 pub mod tcp_listener;

--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,81 +0,0 @@
-//! A module to create and read lock files. A lock file ensures that only one
-//! process is running at a time, in a particular directory.
-//!
-//! File locking is done using [`fcntl::flock`], which means that holding the
-//! lock on file only prevents acquiring another lock on it; all other
-//! operations are still possible on files. Other process can still open, read,
-//! write, or remove the file, for example.
-//! If the file is removed while a process is holding a lock on it,
-//! the process that holds the lock does not get any error or notification.
-//! Furthermore, you can create a new file with the same name and lock the new file,
-//! while the old process is still running.
-//! Deleting the lock file while the locking process is still running is a bad idea!
-
-use std::{fs, os::unix::prelude::AsRawFd, path::Path};
-
-use anyhow::Context;
-use nix::fcntl;
-
-use crate::crashsafe;
-
-pub enum LockCreationResult {
-    Created {
-        new_lock_contents: String,
-        file: fs::File,
-    },
-    AlreadyLocked {
-        existing_lock_contents: String,
-    },
-    CreationFailed(anyhow::Error),
-}
-
-/// Creates a lock file in the path given and writes the given contents into the file.
-/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program.
-pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
-    let lock_file = match fs::OpenOptions::new()
-        .create(true) // O_CREAT
-        .write(true)
-        .open(lock_file_path)
-        .context("Failed to open lock file")
-    {
-        Ok(file) => file,
-        Err(e) => return LockCreationResult::CreationFailed(e),
-    };
-
-    match fcntl::flock(
-        lock_file.as_raw_fd(),
-        fcntl::FlockArg::LockExclusiveNonblock,
-    ) {
-        Ok(()) => {
-            match lock_file
-                .set_len(0)
-                .context("Failed to truncate lockfile")
-                .and_then(|()| {
-                    fs::write(lock_file_path, &contents).with_context(|| {
-                        format!("Failed to write '{contents}' contents into lockfile")
-                    })
-                })
-                .and_then(|()| {
-                    crashsafe::fsync_file_and_parent(lock_file_path)
-                        .context("Failed to fsync lockfile")
-                }) {
-                Ok(()) => LockCreationResult::Created {
-                    new_lock_contents: contents,
-                    file: lock_file,
-                },
-                Err(e) => LockCreationResult::CreationFailed(e),
-            }
-        }
-        Err(nix::errno::Errno::EAGAIN) => {
-            match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
-                Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
-                    existing_lock_contents,
-                },
-                Err(e) => LockCreationResult::CreationFailed(e),
-            }
-        }
-        Err(e) => {
-            LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
-        }
-    }
-}
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,28 +1,19 @@
-use std::str::FromStr;
+use std::{
+    fs::{File, OpenOptions},
+    path::Path,
+};

-use anyhow::Context;
-use strum_macros::{EnumString, EnumVariantNames};
+use anyhow::{Context, Result};

-#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
-#[strum(serialize_all = "snake_case")]
-pub enum LogFormat {
-    Plain,
-    Json,
-}
+pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
+    // Don't open the same file for output multiple times;
+    // the different fds could overwrite each other's output.
+    let log_file = OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(&log_filename)
+        .with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;

-impl LogFormat {
-    pub fn from_config(s: &str) -> anyhow::Result<LogFormat> {
-        use strum::VariantNames;
-        LogFormat::from_str(s).with_context(|| {
-            format!(
-                "Unrecognized log format. Please specify one of: {:?}",
-                LogFormat::VARIANTS
-            )
-        })
-    }
-}
-
-pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
    let default_filter_str = "info";

    // We fall back to printing all spans at info-level or above if
@@ -32,14 +23,20 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {

    let base_logger = tracing_subscriber::fmt()
        .with_env_filter(env_filter)
-        .with_target(false)
-        .with_ansi(false)
-        .with_writer(std::io::stdout);
+        .with_target(false) // don't include event targets
+        .with_ansi(false); // don't use colors in log file;

-    match log_format {
-        LogFormat::Json => base_logger.json().init(),
-        LogFormat::Plain => base_logger.init(),
+    // We clone and return the log file so that the daemonized process's stdout and stderr can be redirected to it.
+    // If we do not daemonize (e.g. in Docker), it is better to log to stdout directly,
+    // for example to be in line with the `docker logs` command, which expects logs coming from stdout.
+    if daemonize {
+        let x = log_file.try_clone().unwrap();
+        base_logger
+            .with_writer(move || x.try_clone().unwrap())
+            .init();
+    } else {
+        base_logger.init();
    }

-    Ok(())
+    Ok(log_file)
 }
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -13,7 +13,7 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Serialize, Deserialize)]
 #[serde(transparent)]
 pub struct Lsn(pub u64);

--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -3,10 +3,10 @@
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.

+use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use crate::sock_split::{BidiStream, ReadStream, WriteStream};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 use std::fmt;
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -4,9 +4,9 @@
 //! is rather narrow, but we can extend it once required.

 use crate::postgres_backend::AuthType;
+use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use anyhow::{bail, Context, Result};
 use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use std::future::Future;
 use std::net::SocketAddr;
--- a/libs/utils/src/pq_proto.rs
+++ b/libs/utils/src/pq_proto.rs
@@ -2,9 +2,7 @@
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.

-// Tools for calling certain async methods in sync contexts.
-pub mod sync;
-
+use crate::sync::{AsyncishRead, SyncFuture};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use postgres_protocol::PG_EPOCH;
@@ -18,7 +16,6 @@ use std::{
    str,
    time::{Duration, SystemTime},
 };
-use sync::{AsyncishRead, SyncFuture};
 use tokio::io::AsyncReadExt;
 use tracing::{trace, warn};

@@ -201,7 +198,7 @@ impl FeMessage {
    ///
    /// ```
    /// # use std::io;
-    /// # use pq_proto::FeMessage;
+    /// # use utils::pq_proto::FeMessage;
    /// #
    /// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
    /// #     Ok(())
@@ -305,7 +302,6 @@ impl FeStartupPacket {
                Err(e) => return Err(e.into()),
            };

-            #[allow(clippy::manual_range_contains)]
            if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
                bail!("invalid message length");
            }
--- a/libs/pq_proto/src/sync.rs
+++ b/libs/pq_proto/src/sync.rs
@@ -29,7 +29,7 @@ impl<S, T: Future> SyncFuture<S, T> {
    /// Example:
    ///
    /// ```
-    /// # use pq_proto::sync::SyncFuture;
+    /// # use utils::sync::SyncFuture;
    /// # use std::future::Future;
    /// # use tokio::io::AsyncReadExt;
    /// #
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,61 +12,61 @@ testing = ["fail/failpoints"]
 profiling = ["pprof"]

 [dependencies]
-amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
-anyhow = { version = "1.0", features = ["backtrace"] }
 async-stream = "0.3"
 async-trait = "0.1"
-byteorder = "1.4.3"
-bytes = "1.0.1"
 chrono = "0.4.19"
-clap = { version = "4.0", features = ["string"] }
-close_fds = "0.3.2"
-const_format = "0.2.21"
-crc32c = "0.6.0"
-crossbeam-utils = "0.8.5"
-fail = "0.5.0"
-futures = "0.3.13"
-git-version = "0.3.5"
-hex = "0.4.3"
-humantime = "2.1.0"
-humantime-serde = "1.1.1"
-hyper = "0.14"
-itertools = "0.10.3"
-nix = "0.25"
-num-traits = "0.2.15"
-once_cell = "1.13.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
 rand = "0.8.3"
 regex = "1.4.5"
-rstar = "0.9.3"
-scopeguard = "1.1.0"
+bytes = "1.0.1"
+byteorder = "1.4.3"
+futures = "0.3.13"
+hex = "0.4.3"
+hyper = "0.14"
+itertools = "0.10.3"
+clap = { version = "4.0", features = ["string"] }
+daemonize = "0.4.1"
+tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
+tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+anyhow = { version = "1.0", features = ["backtrace"] }
+crc32c = "0.6.0"
+thiserror = "1.0"
+tar = "0.4.33"
+humantime = "2.1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
-signal-hook = "0.3.10"
-svg_fmt = "0.4.1"
-tar = "0.4.33"
-thiserror = "1.0"
-tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
-toml_edit = { version = "0.14", features = ["easy"] }
-tracing = "0.1.36"
-url = "2"
-walkdir = "2.3.2"
+humantime-serde = "1.1.1"
+
+pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
+
+toml_edit = { version = "0.14", features = ["easy"] }
+scopeguard = "1.1.0"
+const_format = "0.2.21"
+tracing = "0.1.36"
+signal-hook = "0.3.10"
+url = "2"
+nix = "0.25"
+once_cell = "1.13.0"
+crossbeam-utils = "0.8.5"
+fail = "0.5.0"
+git-version = "0.3.5"
+rstar = "0.9.3"
+num-traits = "0.2.15"
+amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }

-etcd_broker = { path = "../libs/etcd_broker" }
-metrics = { path = "../libs/metrics" }
 pageserver_api = { path = "../libs/pageserver_api" }
 postgres_ffi = { path = "../libs/postgres_ffi" }
-pq_proto = { path = "../libs/pq_proto" }
-remote_storage = { path = "../libs/remote_storage" }
-tenant_size_model = { path = "../libs/tenant_size_model" }
+etcd_broker = { path = "../libs/etcd_broker" }
+metrics = { path = "../libs/metrics" }
 utils = { path = "../libs/utils" }
+remote_storage = { path = "../libs/remote_storage" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
+close_fds = "0.3.2"
+walkdir = "2.3.2"

 [dev-dependencies]
 criterion = "0.4"
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -22,8 +22,8 @@ use std::time::SystemTime;
 use tar::{Builder, EntryType, Header};
 use tracing::*;

+use crate::reltag::{RelTag, SlruKind};
 use crate::tenant::Timeline;
-use pageserver_api::reltag::{RelTag, SlruKind};

 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
 use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
--- a/pageserver/src/bin/draw_timeline_dir.rs
+++ b/pageserver/src/bin/draw_timeline_dir.rs
@@ -1,150 +0,0 @@
-//! A tool for visualizing the arrangement of layerfiles within a timeline.
-//!
-//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in
-//! page-lsn space, where every delta layer is a rectangle and every image layer is a
-//! thick line. Legend:
-//! - The x axis (left to right) represents page index.
-//! - The y axis represents LSN, growing upwards.
-//!
-//! Coordinates in both axis are compressed for better readability.
-//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
-//!
-//! Example use:
-//! ```
-//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
-//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
-//! $ firefox out.svg
-//! ```
-//!
-//! This API was chosen so that we can easily work with filenames extracted from ssh,
-//! or from pageserver log files.
-//!
-//! TODO Consider shipping this as a grafana panel plugin:
-//!      https://grafana.com/tutorials/build-a-panel-plugin/
-use anyhow::Result;
-use pageserver::repository::Key;
-use std::cmp::Ordering;
-use std::io::{self, BufRead};
-use std::{
-    collections::{BTreeMap, BTreeSet},
-    ops::Range,
-};
-use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke};
-use utils::{lsn::Lsn, project_git_version};
-
-project_git_version!(GIT_VERSION);
-
-// Map values to their compressed coordinate - the index the value
-// would have in a sorted and deduplicated list of all values.
-fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T, usize> {
-    let set: BTreeSet<T> = coords.into_iter().collect();
-
-    let mut map: BTreeMap<T, usize> = BTreeMap::new();
-    for (i, e) in set.iter().enumerate() {
-        map.insert(*e, i);
-    }
-
-    map
-}
-
-fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
-    let split: Vec<&str> = name.split("__").collect();
-    let keys: Vec<&str> = split[0].split('-').collect();
-    let mut lsns: Vec<&str> = split[1].split('-').collect();
-    if lsns.len() == 1 {
-        lsns.push(lsns[0]);
-    }
-
-    let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
-    let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
-    (keys, lsns)
-}
-
-fn main() -> Result<()> {
-    // Parse layer filenames from stdin
-    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
-    let stdin = io::stdin();
-    for line in stdin.lock().lines() {
-        let range = parse_filename(&line.unwrap());
-        ranges.push(range);
-    }
-
-    // Collect all coordinates
-    let mut keys: Vec<Key> = vec![];
-    let mut lsns: Vec<Lsn> = vec![];
-    for (keyr, lsnr) in &ranges {
-        keys.push(keyr.start);
-        keys.push(keyr.end);
-        lsns.push(lsnr.start);
-        lsns.push(lsnr.end);
-    }
-
-    // Analyze
-    let key_map = build_coordinate_compression_map(keys);
-    let lsn_map = build_coordinate_compression_map(lsns);
-
-    // Initialize stats
-    let mut num_deltas = 0;
-    let mut num_images = 0;
-
-    // Draw
-    let stretch = 3.0; // Stretch out vertically for better visibility
-    println!(
-        "{}",
-        BeginSvg {
-            w: key_map.len() as f32,
-            h: stretch * lsn_map.len() as f32
-        }
-    );
-    for (keyr, lsnr) in &ranges {
-        let key_start = *key_map.get(&keyr.start).unwrap();
-        let key_end = *key_map.get(&keyr.end).unwrap();
-        let key_diff = key_end - key_start;
-        let lsn_max = lsn_map.len();
-
-        if key_start >= key_end {
-            panic!("Invalid key range {}-{}", key_start, key_end);
-        }
-
-        let lsn_start = *lsn_map.get(&lsnr.start).unwrap();
-        let lsn_end = *lsn_map.get(&lsnr.end).unwrap();
-
-        let mut lsn_diff = (lsn_end - lsn_start) as f32;
-        let mut fill = Fill::None;
-        let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
-        let mut lsn_offset = 0.0;
-
-        // Fill in and thicken rectangle if it's an
-        // image layer so that we can see it.
-        match lsn_start.cmp(&lsn_end) {
-            Ordering::Less => num_deltas += 1,
-            Ordering::Equal => {
-                num_images += 1;
-                lsn_diff = 0.3;
-                lsn_offset = -lsn_diff / 2.0;
-                margin = 0.05;
-                fill = Fill::Color(rgb(0, 0, 0));
-            }
-            Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
-        }
-
-        println!(
-            "    {}",
-            rectangle(
-                key_start as f32 + stretch * margin,
-                stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
-                key_diff as f32 - stretch * 2.0 * margin,
-                stretch * (lsn_diff - 2.0 * margin)
-            )
-            .fill(fill)
-            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
-            .border_radius(0.4)
-        );
-    }
-    println!("{}", EndSvg);
-
-    eprintln!("num_images: {}", num_images);
-    eprintln!("num_deltas: {}", num_deltas);
-
-    Ok(())
-}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -1,14 +1,17 @@
 //! Main entry point for the Page Server executable.

+use remote_storage::GenericRemoteStorage;
 use std::{env, ops::ControlFlow, path::Path, str::FromStr};
-
-use anyhow::{anyhow, Context};
-use clap::{Arg, ArgAction, Command};
-use fail::FailScenario;
-use nix::unistd::Pid;
 use tracing::*;

+use anyhow::{anyhow, bail, Context, Result};
+
+use clap::{Arg, ArgAction, Command};
+use daemonize::Daemonize;
+
+use fail::FailScenario;
 use metrics::set_build_info_metric;
+
 use pageserver::{
    config::{defaults::*, PageServerConf},
    http, page_cache, page_service, profiling, task_mgr,
@@ -16,22 +19,20 @@ use pageserver::{
    task_mgr::{
        BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
    },
-    tenant_mgr, virtual_file,
+    tenant_mgr, virtual_file, LOG_FILE_NAME,
 };
-use remote_storage::GenericRemoteStorage;
 use utils::{
    auth::JwtAuth,
-    lock_file, logging,
+    logging,
    postgres_backend::AuthType,
    project_git_version,
+    shutdown::exit_now,
    signals::{self, Signal},
    tcp_listener,
 };

 project_git_version!(GIT_VERSION);

-const PID_FILE_NAME: &str = "pageserver.pid";
-
 const FEATURES: &[&str] = &[
    #[cfg(feature = "testing")]
    "testing",
@@ -64,7 +65,6 @@ fn main() -> anyhow::Result<()> {
    let workdir = workdir
        .canonicalize()
        .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
-
    let cfg_file_path = workdir.join("pageserver.toml");

    // Set CWD to workdir for non-daemon modes
@@ -75,6 +75,8 @@ fn main() -> anyhow::Result<()> {
        )
    })?;

+    let daemonize = arg_matches.get_flag("daemonize");
+
    let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
        ControlFlow::Continue(conf) => conf,
        ControlFlow::Break(()) => {
@@ -100,7 +102,7 @@ fn main() -> anyhow::Result<()> {
    virtual_file::init(conf.max_file_descriptors);
    page_cache::init(conf.page_cache_size);

-    start_pageserver(conf).context("Failed to start pageserver")?;
+    start_pageserver(conf, daemonize).context("Failed to start pageserver")?;

    scenario.teardown();
    Ok(())
@@ -195,33 +197,11 @@ fn initialize_config(
    })
 }

-fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
-    logging::init(conf.log_format)?;
-    info!("version: {}", version());
+fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
+    // Initialize logger
+    let log_file = logging::init(LOG_FILE_NAME, daemonize)?;

-    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
-    let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
-        lock_file::LockCreationResult::Created {
-            new_lock_contents,
-            file,
-        } => {
-            info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
-            file
-        }
-        lock_file::LockCreationResult::AlreadyLocked {
-            existing_lock_contents,
-        } => anyhow::bail!(
-            "Could not lock pid file; pageserver is already running in {:?} with PID {}",
-            conf.workdir,
-            existing_lock_contents
-        ),
-        lock_file::LockCreationResult::CreationFailed(e) => {
-            return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
-        }
-    };
-    // ensure that the lock file is held even if the main thread of the process is panics
-    // we need to release the lock file only when the current process is gone
-    let _ = Box::leak(Box::new(lock_file));
+    info!("version: {}", version());

    // TODO: Check that it looks like a valid repository before going further

@@ -238,6 +218,33 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
    );
    let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;

+    // NB: Don't spawn any threads before daemonizing!
+    if daemonize {
+        info!("daemonizing...");
+
+        // There shouldn't be any logging to stdout/stderr. Redirect both to the main log so
+        // that we will see any accidental manual fprintf's or backtraces.
+        let stdout = log_file
+            .try_clone()
+            .with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
+        let stderr = log_file;
+
+        let daemonize = Daemonize::new()
+            .pid_file("pageserver.pid")
+            .working_directory(".")
+            .stdout(stdout)
+            .stderr(stderr);
+
+        // XXX: The parent process should exit abruptly right after
+        // it has spawned a child to prevent coverage machinery from
+        // dumping stats into a `profraw` file now owned by the child.
+        // Otherwise, the coverage data will be damaged.
+        match daemonize.exit_action(|| exit_now(0)).start() {
+            Ok(_) => info!("Success, daemonized"),
+            Err(err) => bail!("{err}. could not daemonize. bailing."),
+        }
+    }
+
    let signals = signals::install_shutdown_handlers()?;

    // start profiler (if enabled)
@@ -340,6 +347,14 @@ fn cli() -> Command {
    Command::new("Neon page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(version())
+        .arg(
+
+            Arg::new("daemonize")
+                .short('d')
+                .long("daemonize")
+                .action(ArgAction::SetTrue)
+                .help("Run in the background"),
+        )
        .arg(
            Arg::new("init")
                .long("init")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -9,7 +9,6 @@ use remote_storage::RemoteStorageConfig;
 use std::env;
 use utils::crashsafe::path_with_suffix_extension;

-use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
 use std::time::Duration;
@@ -18,7 +17,6 @@ use toml_edit::{Document, Item};
 use url::Url;
 use utils::{
    id::{NodeId, TenantId, TimelineId},
-    logging::LogFormat,
    postgres_backend::AuthType,
 };

@@ -47,11 +45,6 @@ pub mod defaults {
    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;

-    pub const DEFAULT_LOG_FORMAT: &str = "plain";
-
-    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
-        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
-
    ///
    /// Default built-in configuration file.
    ///
@@ -70,10 +63,6 @@ pub mod defaults {
 # initial superuser role name to use when creating a new tenant
 #initial_superuser_name = '{DEFAULT_SUPERUSER}'

-#log_format = '{DEFAULT_LOG_FORMAT}'
-
-#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
-
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -137,11 +126,6 @@ pub struct PageServerConf {

    /// Etcd broker endpoints to connect to.
    pub broker_endpoints: Vec<Url>,
-
-    pub log_format: LogFormat,
-
-    /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
-    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
 }

 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -208,10 +192,6 @@ struct PageServerConfigBuilder {
    profiling: BuilderValue<ProfilingConfig>,
    broker_etcd_prefix: BuilderValue<String>,
    broker_endpoints: BuilderValue<Vec<Url>>,
-
-    log_format: BuilderValue<LogFormat>,
-
-    concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
 }

 impl Default for PageServerConfigBuilder {
@@ -239,9 +219,6 @@ impl Default for PageServerConfigBuilder {
            profiling: Set(ProfilingConfig::Disabled),
            broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
            broker_endpoints: Set(Vec::new()),
-            log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
-
-            concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
        }
    }
 }
@@ -314,14 +291,6 @@ impl PageServerConfigBuilder {
        self.profiling = BuilderValue::Set(profiling)
    }

-    pub fn log_format(&mut self, log_format: LogFormat) {
-        self.log_format = BuilderValue::Set(log_format)
-    }
-
-    pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
-        self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let broker_endpoints = self
            .broker_endpoints
@@ -366,12 +335,6 @@ impl PageServerConfigBuilder {
            broker_etcd_prefix: self
                .broker_etcd_prefix
                .ok_or(anyhow!("missing broker_etcd_prefix"))?,
-            log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
-            concurrent_tenant_size_logical_size_queries: self
-                .concurrent_tenant_size_logical_size_queries
-                .ok_or(anyhow!(
-                    "missing concurrent_tenant_size_logical_size_queries"
-                ))?,
        })
    }
 }
@@ -496,15 +459,6 @@ impl PageServerConf {
                        })
                        .collect::<anyhow::Result<_>>()?,
                ),
-                "log_format" => builder.log_format(
-                    LogFormat::from_config(&parse_toml_string(key, item)?)?
-                ),
-                "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
-                    let input = parse_toml_string(key, item)?;
-                    let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
-                    let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
-                    ConfigurableSemaphore::new(permits)
-                }),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -617,8 +571,6 @@ impl PageServerConf {
            default_tenant_conf: TenantConf::dummy_conf(),
            broker_endpoints: Vec::new(),
            broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
-            log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
-            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
        }
    }
 }
@@ -684,58 +636,6 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
        .collect()
 }

-/// Configurable semaphore permits setting.
-///
-/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
-/// semaphore cannot be distinguished, leading any feature using these to await forever (or until
-/// new permits are added).
-#[derive(Debug, Clone)]
-pub struct ConfigurableSemaphore {
-    initial_permits: NonZeroUsize,
-    inner: std::sync::Arc<tokio::sync::Semaphore>,
-}
-
-impl ConfigurableSemaphore {
-    pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
-        Some(x) => x,
-        None => panic!("const unwrap is not yet stable"),
-    };
-
-    /// Initializse using a non-zero amount of permits.
-    ///
-    /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
-    /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
-    /// behave like [`futures::future::pending`], just waiting until new permits are added.
-    pub fn new(initial_permits: NonZeroUsize) -> Self {
-        ConfigurableSemaphore {
-            initial_permits,
-            inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
-        }
-    }
-}
-
-impl Default for ConfigurableSemaphore {
-    fn default() -> Self {
-        Self::new(Self::DEFAULT_INITIAL)
-    }
-}
-
-impl PartialEq for ConfigurableSemaphore {
-    fn eq(&self, other: &Self) -> bool {
-        // the number of permits can be increased at runtime, so we cannot really fulfill the
-        // PartialEq value equality otherwise
-        self.initial_permits == other.initial_permits
-    }
-}
-
-impl Eq for ConfigurableSemaphore {}
-
-impl ConfigurableSemaphore {
-    pub fn inner(&self) -> &std::sync::Arc<tokio::sync::Semaphore> {
-        &self.inner
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use std::{
@@ -765,8 +665,6 @@ max_file_descriptors = 333
 initial_superuser_name = 'zzzz'
 id = 10

-log_format = 'json'
-
 "#;

    #[test]
@@ -806,8 +704,6 @@ log_format = 'json'
                    .parse()
                    .expect("Failed to parse a valid broker endpoint URL")],
                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
-                log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
-                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -852,8 +748,6 @@ log_format = 'json'
                    .parse()
                    .expect("Failed to parse a valid broker endpoint URL")],
                broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
-                log_format: LogFormat::Json,
-                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -354,54 +354,6 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

-  /v1/tenant/{tenant_id}/size:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    get:
-      description: |
-        Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
-      responses:
-        "200":
-          description: OK,
-          content:
-            application/json:
-              schema:
-                type: object
-                required:
-                  - id
-                  - size
-                properties:
-                  id:
-                    type: string
-                    format: hex
-                  size:
-                    type: integer
-                    description: |
-                      Size metric in bytes.
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-
  /v1/tenant/{tenant_id}/timeline/:
    parameters:
      - name: tenant_id
@@ -666,7 +618,6 @@ components:
        - last_record_lsn
        - disk_consistent_lsn
        - awaits_download
-        - state
      properties:
        timeline_id:
          type: string
@@ -709,8 +660,6 @@ components:
          type: integer
        awaits_download:
          type: boolean
-        state:
-          type: string

        # These 'local' and 'remote' fields just duplicate some of the fields
        # above. They are kept for backwards-compatibility. They can be removed,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -129,7 +129,6 @@ async fn build_timeline_info(
        }
    };
    let current_physical_size = Some(timeline.get_physical_size());
-    let state = timeline.current_state();

    let info = TimelineInfo {
        tenant_id: timeline.tenant_id,
@@ -159,7 +158,6 @@ async fn build_timeline_info(

        remote_consistent_lsn,
        awaits_download,
-        state,

        // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
        // with the control plane.
@@ -227,10 +225,13 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,

    let state = get_state(&request);

-    let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
+    let timelines = tokio::task::spawn_blocking(move || {
+        let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
        let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
        Ok(tenant.list_timelines())
-    })?;
+    })
+    .await
+    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;

    let mut response_data = Vec::with_capacity(timelines.len());
    for timeline in timelines {
@@ -293,7 +294,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body

    let timeline_info = async {
        let timeline = tokio::task::spawn_blocking(move || {
-            tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id, false)
+            tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
        })
        .await
        .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
@@ -330,13 +331,14 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
    let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);

    let timeline = tenant_mgr::get_tenant(tenant_id, true)
-        .and_then(|tenant| tenant.get_timeline(timeline_id, true))
+        .and_then(|tenant| tenant.get_timeline(timeline_id))
+        .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
        .map_err(ApiError::NotFound)?;
    let result = match timeline
        .find_lsn_for_timestamp(timestamp_pg)
        .map_err(ApiError::InternalServerError)?
    {
-        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
+        LsnForTimestamp::Present(lsn) => format!("{}", lsn),
        LsnForTimestamp::Future(_lsn) => "future".into(),
        LsnForTimestamp::Past(_lsn) => "past".into(),
        LsnForTimestamp::NoData(_lsn) => "nodata".into(),
@@ -520,7 +522,9 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
    check_permission(&request, Some(tenant_id))?;

    // if tenant is in progress of downloading it can be absent in global tenant map
-    let tenant = tenant_mgr::get_tenant(tenant_id, false);
+    let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
+        .await
+        .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;

    let state = get_state(&request);
    let remote_index = &state.remote_index;
@@ -566,44 +570,6 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
    )
 }

-async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?;
-
-    // this can be long operation, it currently is not backed by any request coalescing or similar
-    let inputs = tenant
-        .gather_size_inputs()
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
-
-    /// Private response type with the additional "unstable" `inputs` field.
-    ///
-    /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
-    /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
-    #[serde_with::serde_as]
-    #[derive(serde::Serialize)]
-    struct TenantHistorySize {
-        #[serde_as(as = "serde_with::DisplayFromStr")]
-        id: TenantId,
-        /// Size is a mixture of WAL and logical size, so the unit is bytes.
-        size: u64,
-        inputs: crate::tenant::size::ModelInputs,
-    }
-
-    json_response(
-        StatusCode::OK,
-        TenantHistorySize {
-            id: tenant_id,
-            size,
-            inputs,
-        },
-    )
-}
-
 // Helper function to standardize the error messages we produce on bad durations
 //
 // Intended to be used with anyhow's `with_context`, e.g.:
@@ -822,17 +788,17 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
    check_permission(&request, Some(tenant_id))?;

    // FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX
-    let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
+    let repo = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

-    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
+    let _span_guard =
+        info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
+    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon());

    // Use tenant's pitr setting
-    let pitr = tenant.get_pitr_interval();
-    let result = tenant
+    let pitr = repo.get_pitr_interval();
+    let result = repo
        .gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
-        .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
-        .await
        // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
        // better once the types support it.
        .map_err(ApiError::InternalServerError)?;
@@ -846,9 +812,10 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
-    let timeline = tenant
-        .get_timeline(timeline_id, true)
+    let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
+    let timeline = repo
+        .get_timeline(timeline_id)
+        .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
        .map_err(ApiError::NotFound)?;
    timeline.compact().map_err(ApiError::InternalServerError)?;

@@ -862,13 +829,13 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
-    let timeline = tenant
-        .get_timeline(timeline_id, true)
+    let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
+    let timeline = repo
+        .get_timeline(timeline_id)
+        .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
        .map_err(ApiError::NotFound)?;
    timeline
        .checkpoint(CheckpointConfig::Forced)
-        .await
        .map_err(ApiError::InternalServerError)?;

    json_response(StatusCode::OK, ())
@@ -932,7 +899,6 @@ pub fn make_router(
        .get("/v1/tenant", tenant_list_handler)
        .post("/v1/tenant", tenant_create_handler)
        .get("/v1/tenant/:tenant_id", tenant_status)
-        .get("/v1/tenant/:tenant_id/size", tenant_size_handler)
        .put("/v1/tenant/config", tenant_config_handler)
        .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
        .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -12,10 +12,10 @@ use tracing::*;
 use walkdir::WalkDir;

 use crate::pgdatadir_mapping::*;
+use crate::reltag::{RelTag, SlruKind};
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
 use crate::walrecord::DecodedWALRecord;
-use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::*;
 use postgres_ffi::waldecoder::WalStreamDecoder;
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -8,6 +8,7 @@ pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod profiling;
+pub mod reltag;
 pub mod repository;
 pub mod storage_sync;
 pub mod task_mgr;
@@ -43,6 +44,8 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
 pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

+pub const LOG_FILE_NAME: &str = "pageserver.log";
+
 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 /// Config for the Repository checkpointer
@@ -79,6 +82,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {

    // There should be nothing left, but let's be sure
    task_mgr::shutdown_tasks(None, None, None).await;
+
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -31,7 +31,6 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
    "compact",
    "create images",
    "init logical size",
-    "logical size",
    "load layer map",
    "gc",
 ];
@@ -366,7 +365,6 @@ pub struct TimelineMetrics {
    pub compact_time_histo: Histogram,
    pub create_images_time_histo: Histogram,
    pub init_logical_size_histo: Histogram,
-    pub logical_size_histo: Histogram,
    pub load_layer_map_histo: Histogram,
    pub last_record_gauge: IntGauge,
    pub wait_lsn_time_histo: Histogram,
@@ -399,9 +397,6 @@ impl TimelineMetrics {
        let init_logical_size_histo = STORAGE_TIME
            .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
            .unwrap();
-        let logical_size_histo = STORAGE_TIME
-            .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id])
-            .unwrap();
        let load_layer_map_histo = STORAGE_TIME
            .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
            .unwrap();
@@ -433,7 +428,6 @@ impl TimelineMetrics {
            compact_time_histo,
            create_images_time_histo,
            init_logical_size_histo,
-            logical_size_histo,
            load_layer_map_histo,
            last_record_gauge,
            wait_lsn_time_histo,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,22 +10,13 @@
 //

 use anyhow::{bail, ensure, Context, Result};
-use bytes::Buf;
-use bytes::Bytes;
+use bytes::{Buf, BufMut, Bytes, BytesMut};
 use futures::{Stream, StreamExt};
-use pageserver_api::models::{
-    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
-    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
-    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
-    PagestreamNblocksRequest, PagestreamNblocksResponse,
-};
-use pq_proto::{BeMessage, FeMessage, RowDescriptor};
 use std::io;
 use std::net::TcpListener;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
-use tokio::pin;
 use tokio_util::io::StreamReader;
 use tokio_util::io::SyncIoBridge;
 use tracing::*;
@@ -35,6 +26,7 @@ use utils::{
    lsn::Lsn,
    postgres_backend::AuthType,
    postgres_backend_async::{self, PostgresBackend},
+    pq_proto::{BeMessage, FeMessage, RowDescriptor},
    simple_rcu::RcuReadGuard,
 };

@@ -43,6 +35,7 @@ use crate::config::{PageServerConf, ProfilingConfig};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
 use crate::profiling::profpoint_start;
+use crate::reltag::RelTag;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::Timeline;
@@ -52,6 +45,163 @@ use crate::CheckpointConfig;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

+// Request messages, one variant per tag accepted by parse(). Wrapped in libpq CopyData
+enum PagestreamFeMessage {
+    Exists(PagestreamExistsRequest),
+    Nblocks(PagestreamNblocksRequest),
+    GetPage(PagestreamGetPageRequest),
+    DbSize(PagestreamDbSizeRequest),
+}
+
+// Response messages, one variant per tag written by serialize(). Wrapped in libpq CopyData
+enum PagestreamBeMessage {
+    Exists(PagestreamExistsResponse),
+    Nblocks(PagestreamNblocksResponse),
+    GetPage(PagestreamGetPageResponse),
+    Error(PagestreamErrorResponse),
+    DbSize(PagestreamDbSizeResponse),
+}
+
+#[derive(Debug)]
+struct PagestreamExistsRequest {
+    latest: bool, // read as one byte; any nonzero value is true
+    lsn: Lsn, // read with get_u64
+    rel: RelTag, // read as spcnode, dbnode, relnode (u32 each), then forknum (u8)
+}
+
+#[derive(Debug)]
+struct PagestreamNblocksRequest {
+    latest: bool, // read as one byte; any nonzero value is true
+    lsn: Lsn, // read with get_u64
+    rel: RelTag, // read as spcnode, dbnode, relnode (u32 each), then forknum (u8)
+}
+
+#[derive(Debug)]
+struct PagestreamGetPageRequest {
+    latest: bool, // read as one byte; any nonzero value is true
+    lsn: Lsn, // read with get_u64
+    rel: RelTag, // read as spcnode, dbnode, relnode (u32 each), then forknum (u8)
+    blkno: u32, // read with get_u32, after rel
+}
+
+#[derive(Debug)]
+struct PagestreamDbSizeRequest {
+    latest: bool, // read as one byte; any nonzero value is true
+    lsn: Lsn, // read with get_u64
+    dbnode: u32, // read with get_u32
+}
+
+#[derive(Debug)]
+struct PagestreamExistsResponse {
+    exists: bool, // written as one byte (0 or 1)
+}
+
+#[derive(Debug)]
+struct PagestreamNblocksResponse {
+    n_blocks: u32, // written with put_u32
+}
+
+#[derive(Debug)]
+struct PagestreamGetPageResponse {
+    page: Bytes, // written verbatim after the tag byte
+}
+
+#[derive(Debug)]
+struct PagestreamErrorResponse {
+    message: String, // written as its bytes followed by a zero byte
+}
+
+#[derive(Debug)]
+struct PagestreamDbSizeResponse {
+    db_size: i64, // written with put_i64
+}
+
+impl PagestreamFeMessage {
+    /// Decodes a tagged request (NeonMessageTag in pagestore_client.h); fails on short input.
+    // TODO: consider protobuf or serde bincode for less error prone serialization.
+    fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
+        ensure!(body.has_remaining(), "empty smgr message");
+        let msg_tag = body.get_u8();
+        // Payload bytes required after the tag for tags 0..=3; `Buf::get_*` panics if short.
+        let need = [22, 22, 26, 13].get(msg_tag as usize).copied().unwrap_or(0);
+        ensure!(body.remaining() >= need, "smgr message {} is too short", msg_tag);
+        match msg_tag {
+            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
+                latest: body.get_u8() != 0,
+                lsn: Lsn::from(body.get_u64()),
+                rel: RelTag {
+                    spcnode: body.get_u32(),
+                    dbnode: body.get_u32(),
+                    relnode: body.get_u32(),
+                    forknum: body.get_u8(),
+                },
+            })),
+            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
+                latest: body.get_u8() != 0,
+                lsn: Lsn::from(body.get_u64()),
+                rel: RelTag {
+                    spcnode: body.get_u32(),
+                    dbnode: body.get_u32(),
+                    relnode: body.get_u32(),
+                    forknum: body.get_u8(),
+                },
+            })),
+            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                latest: body.get_u8() != 0,
+                lsn: Lsn::from(body.get_u64()),
+                rel: RelTag {
+                    spcnode: body.get_u32(),
+                    dbnode: body.get_u32(),
+                    relnode: body.get_u32(),
+                    forknum: body.get_u8(),
+                },
+                blkno: body.get_u32(),
+            })),
+            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                latest: body.get_u8() != 0,
+                lsn: Lsn::from(body.get_u64()),
+                dbnode: body.get_u32(),
+            })),
+            _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
+        }
+    }
+}
+
+impl PagestreamBeMessage {
+    /// Encodes the response as a tag byte followed by its payload (integers big-endian).
+    fn serialize(&self) -> Bytes {
+        let mut bytes = BytesMut::new();
+        match self {
+            Self::Exists(resp) => {
+                bytes.put_u8(100); /* tag from pagestore_client.h */
+                bytes.put_u8(resp.exists as u8);
+            }
+
+            Self::Nblocks(resp) => {
+                bytes.put_u8(101); /* tag from pagestore_client.h */
+                bytes.put_u32(resp.n_blocks);
+            }
+
+            Self::GetPage(resp) => {
+                bytes.put_u8(102); /* tag from pagestore_client.h */
+                bytes.put(&resp.page[..]);
+            }
+
+            Self::Error(resp) => {
+                bytes.put_u8(103); /* tag from pagestore_client.h */
+                bytes.put(resp.message.as_bytes());
+                bytes.put_u8(0); // null terminator
+            }
+            Self::DbSize(resp) => {
+                bytes.put_u8(104); /* tag from pagestore_client.h */
+                bytes.put_i64(resp.db_size);
+            }
+        }
+
+        bytes.into()
+    }
+}
+
 fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
    async_stream::try_stream! {
        loop {
@@ -301,7 +451,7 @@ impl PageServerHandler {

            trace!("query: {copy_data_bytes:?}");

-            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+            let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;

            let response = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
@@ -368,12 +518,14 @@ impl PageServerHandler {
        pgb.write_message(&BeMessage::CopyInResponse)?;
        pgb.flush().await?;

-        let copyin_stream = copyin_stream(pgb);
-        pin!(copyin_stream);
-
-        timeline
-            .import_basebackup_from_tar(&mut copyin_stream, base_lsn)
-            .await?;
+        // import_basebackup_from_tar() is not async, mainly because the Tar crate
+        // it uses is not async. So we need to jump through some hoops:
+        // - convert the input from client connection to a synchronous Read
+        // - use block_in_place()
+        let mut copyin_stream = Box::pin(copyin_stream(pgb));
+        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
+        tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
+        timeline.initialize()?;

        // Drain the rest of the Copy data
        let mut bytes_after_tar = 0;
@@ -438,7 +590,7 @@ impl PageServerHandler {
        // We only want to persist the data, and it doesn't matter if it's in the
        // shape of deltas or images.
        info!("flushing layers");
-        timeline.checkpoint(CheckpointConfig::Flush).await?;
+        timeline.checkpoint(CheckpointConfig::Flush)?;

        info!("done");
        Ok(())
@@ -908,8 +1060,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
 }

 fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result<Arc<Timeline>> {
-    tenant_mgr::get_tenant(tenant_id, true)
-        .and_then(|tenant| tenant.get_timeline(timeline_id, true))
+    tenant_mgr::get_tenant(tenant_id, true).and_then(|tenant| tenant.get_timeline(timeline_id))
 }

 ///
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -7,12 +7,12 @@
 //! Clarify that)
 //!
 use crate::keyspace::{KeySpace, KeySpaceAccum};
+use crate::reltag::{RelTag, SlruKind};
 use crate::repository::*;
 use crate::tenant::Timeline;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{bail, ensure, Result};
 use bytes::{Buf, Bytes};
-use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, TimestampTz, TransactionId};
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/delta_layer.rs
@@ -610,9 +610,9 @@ impl DeltaLayer {
 ///
 /// 3. Call `finish`.
 ///
-struct DeltaLayerWriterInner {
+pub struct DeltaLayerWriter {
    conf: &'static PageServerConf,
-    pub path: PathBuf,
+    path: PathBuf,
    timeline_id: TimelineId,
    tenant_id: TenantId,

@@ -624,17 +624,17 @@ struct DeltaLayerWriterInner {
    blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
 }

-impl DeltaLayerWriterInner {
+impl DeltaLayerWriter {
    ///
    /// Start building a new delta layer.
    ///
-    fn new(
+    pub fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
        key_start: Key,
        lsn_range: Range<Lsn>,
-    ) -> anyhow::Result<Self> {
+    ) -> Result<DeltaLayerWriter> {
        // Create the file initially with a temporary filename. We don't know
        // the end key yet, so we cannot form the final filename yet. We will
        // rename it when we're done.
@@ -653,7 +653,7 @@ impl DeltaLayerWriterInner {
        let block_buf = BlockBuf::new();
        let tree_builder = DiskBtreeBuilder::new(block_buf);

-        Ok(Self {
+        Ok(DeltaLayerWriter {
            conf,
            path,
            timeline_id,
@@ -670,17 +670,17 @@ impl DeltaLayerWriterInner {
    ///
    /// The values must be appended in key, lsn order.
    ///
-    fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
+    pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
        self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
    }

-    fn put_value_bytes(
+    pub fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: &[u8],
        will_init: bool,
-    ) -> anyhow::Result<()> {
+    ) -> Result<()> {
        assert!(self.lsn_range.start <= lsn);

        let off = self.blob_writer.write_blob(val)?;
@@ -693,14 +693,14 @@ impl DeltaLayerWriterInner {
        Ok(())
    }

-    fn size(&self) -> u64 {
+    pub fn size(&self) -> u64 {
        self.blob_writer.size() + self.tree.borrow_writer().size()
    }

    ///
    /// Finish writing the delta layer.
    ///
-    fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+    pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -768,102 +768,6 @@ impl DeltaLayerWriterInner {
    }
 }

-/// A builder object for constructing a new delta layer.
-///
-/// Usage:
-///
-/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
-///
-/// 2. Write the contents by calling `put_value` for every page
-///    version to store in the layer.
-///
-/// 3. Call `finish`.
-///
-/// # Note
-///
-/// As described in https://github.com/neondatabase/neon/issues/2650, it's
-/// possible for the writer to drop before `finish` is actually called. So this
-/// could lead to odd temporary files in the directory, exhausting file system.
-/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
-/// implementation that cleans up the temporary file in failure. It's not
-/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves
-/// out some fields, making it impossible to implement `Drop`.
-///
-#[must_use]
-pub struct DeltaLayerWriter {
-    inner: Option<DeltaLayerWriterInner>,
-}
-
-impl DeltaLayerWriter {
-    ///
-    /// Start building a new delta layer.
-    ///
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        key_start: Key,
-        lsn_range: Range<Lsn>,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
-            inner: Some(DeltaLayerWriterInner::new(
-                conf,
-                timeline_id,
-                tenant_id,
-                key_start,
-                lsn_range,
-            )?),
-        })
-    }
-
-    ///
-    /// Append a key-value pair to the file.
-    ///
-    /// The values must be appended in key, lsn order.
-    ///
-    pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
-        self.inner.as_mut().unwrap().put_value(key, lsn, val)
-    }
-
-    pub fn put_value_bytes(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        val: &[u8],
-        will_init: bool,
-    ) -> anyhow::Result<()> {
-        self.inner
-            .as_mut()
-            .unwrap()
-            .put_value_bytes(key, lsn, val, will_init)
-    }
-
-    pub fn size(&self) -> u64 {
-        self.inner.as_ref().unwrap().size()
-    }
-
-    ///
-    /// Finish writing the delta layer.
-    ///
-    pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
-        self.inner.take().unwrap().finish(key_end)
-    }
-}
-
-impl Drop for DeltaLayerWriter {
-    fn drop(&mut self) {
-        if let Some(inner) = self.inner.take() {
-            match inner.blob_writer.into_inner().into_inner() {
-                Ok(vfile) => vfile.remove(),
-                Err(err) => warn!(
-                    "error while flushing buffer of image layer temporary file: {}",
-                    err
-                ),
-            }
-        }
-    }
-}
-
 ///
 /// Iterator over all key-value pairse stored in a delta layer
 ///
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/image_layer.rs
@@ -411,7 +411,7 @@ impl ImageLayer {
 ///
 /// 3. Call `finish`.
 ///
-struct ImageLayerWriterInner {
+pub struct ImageLayerWriter {
    conf: &'static PageServerConf,
    path: PathBuf,
    timeline_id: TimelineId,
@@ -423,17 +423,14 @@ struct ImageLayerWriterInner {
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 }

-impl ImageLayerWriterInner {
-    ///
-    /// Start building a new image layer.
-    ///
-    fn new(
+impl ImageLayerWriter {
+    pub fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
-    ) -> anyhow::Result<Self> {
+    ) -> anyhow::Result<ImageLayerWriter> {
        // Create the file initially with a temporary filename.
        // We'll atomically rename it to the final name when we're done.
        let path = ImageLayer::temp_path_for(
@@ -458,7 +455,7 @@ impl ImageLayerWriterInner {
        let block_buf = BlockBuf::new();
        let tree_builder = DiskBtreeBuilder::new(block_buf);

-        let writer = Self {
+        let writer = ImageLayerWriter {
            conf,
            path,
            timeline_id,
@@ -477,7 +474,7 @@ impl ImageLayerWriterInner {
    ///
    /// The page versions must be appended in blknum order.
    ///
-    fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
+    pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
        ensure!(self.key_range.contains(&key));
        let off = self.blob_writer.write_blob(img)?;

@@ -488,10 +485,7 @@ impl ImageLayerWriterInner {
        Ok(())
    }

-    ///
-    /// Finish writing the image layer.
-    ///
-    fn finish(self) -> anyhow::Result<ImageLayer> {
+    pub fn finish(self) -> anyhow::Result<ImageLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -558,76 +552,3 @@ impl ImageLayerWriterInner {
        Ok(layer)
    }
 }
-
-/// A builder object for constructing a new image layer.
-///
-/// Usage:
-///
-/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
-///
-/// 2. Write the contents by calling `put_page_image` for every key-value
-///    pair in the key range.
-///
-/// 3. Call `finish`.
-///
-/// # Note
-///
-/// As described in https://github.com/neondatabase/neon/issues/2650, it's
-/// possible for the writer to drop before `finish` is actually called. So this
-/// could lead to odd temporary files in the directory, exhausting file system.
-/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
-/// implementation that cleans up the temporary file in failure. It's not
-/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
-/// out some fields, making it impossible to implement `Drop`.
-///
-#[must_use]
-pub struct ImageLayerWriter {
-    inner: Option<ImageLayerWriterInner>,
-}
-
-impl ImageLayerWriter {
-    ///
-    /// Start building a new image layer.
-    ///
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        key_range: &Range<Key>,
-        lsn: Lsn,
-    ) -> anyhow::Result<ImageLayerWriter> {
-        Ok(Self {
-            inner: Some(ImageLayerWriterInner::new(
-                conf,
-                timeline_id,
-                tenant_id,
-                key_range,
-                lsn,
-            )?),
-        })
-    }
-
-    ///
-    /// Write next value to the file.
-    ///
-    /// The page versions must be appended in blknum order.
-    ///
-    pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
-        self.inner.as_mut().unwrap().put_image(key, img)
-    }
-
-    ///
-    /// Finish writing the image layer.
-    ///
-    pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
-        self.inner.take().unwrap().finish()
-    }
-}
-
-impl Drop for ImageLayerWriter {
-    fn drop(&mut self) {
-        if let Some(inner) = self.inner.take() {
-            inner.blob_writer.into_inner().remove();
-        }
-    }
-}
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -1,475 +0,0 @@
-use std::cmp;
-use std::collections::{HashMap, HashSet};
-use std::sync::Arc;
-
-use anyhow::Context;
-use tokio::sync::Semaphore;
-
-use super::Tenant;
-use utils::id::TimelineId;
-use utils::lsn::Lsn;
-
-use tracing::*;
-
-/// Inputs to the actual tenant sizing model
-///
-/// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to
-/// be a transferrable format between execution environments and developer.
-#[serde_with::serde_as]
-#[derive(Debug, serde::Serialize, serde::Deserialize)]
-pub struct ModelInputs {
-    updates: Vec<Update>,
-    retention_period: u64,
-    #[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
-    timeline_inputs: HashMap<TimelineId, TimelineInputs>,
-}
-
-/// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
-/// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
-#[serde_with::serde_as]
-#[derive(Debug, serde::Serialize, serde::Deserialize)]
-struct TimelineInputs {
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    last_record: Lsn,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    latest_gc_cutoff: Lsn,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    horizon_cutoff: Lsn,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    pitr_cutoff: Lsn,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    next_gc_cutoff: Lsn,
-}
-
-/// Gathers the inputs for the tenant sizing model.
-///
-/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
-/// is updated on-demand, during the start of this calculation and separate from the
-/// [`Timeline::latest_gc_cutoff`].
-///
-/// For timelines in general:
-///
-/// ```ignore
-/// 0-----|---------|----|------------| · · · · · |·> lsn
-///   initdb_lsn  branchpoints*  next_gc_cutoff  latest
-/// ```
-///
-/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
-/// tenant size will be zero.
-pub(super) async fn gather_inputs(
-    tenant: &Tenant,
-    limit: &Arc<Semaphore>,
-    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
-) -> anyhow::Result<ModelInputs> {
-    // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
-    // our advantage with `?` error handling.
-    let mut joinset = tokio::task::JoinSet::new();
-
-    let timelines = tenant
-        .refresh_gc_info()
-        .context("Failed to refresh gc_info before gathering inputs")?;
-
-    if timelines.is_empty() {
-        // All timelines are below tenant's gc_horizon; alternative would be to use
-        // Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
-        // missing GcInfo::retain_lsns or having obsolete values for cutoff's.
-        return Ok(ModelInputs {
-            updates: vec![],
-            retention_period: 0,
-            timeline_inputs: HashMap::new(),
-        });
-    }
-
-    // record the used/inserted cache keys here, to remove extras not to start leaking
-    // after initial run the cache should be quite stable, but live timelines will eventually
-    // require new lsns to be inspected.
-    let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new();
-
-    let mut updates = Vec::new();
-
-    // record the per timline values used to determine `retention_period`
-    let mut timeline_inputs = HashMap::with_capacity(timelines.len());
-
-    // used to determine the `retention_period` for the size model
-    let mut max_cutoff_distance = None;
-
-    // this will probably conflict with on-demand downloaded layers, or at least force them all
-    // to be downloaded
-    for timeline in timelines {
-        let last_record_lsn = timeline.get_last_record_lsn();
-
-        let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
-            // there's a race between the update (holding tenant.gc_lock) and this read but it
-            // might not be an issue, because it's not for Timeline::gc
-            let gc_info = timeline.gc_info.read().unwrap();
-
-            // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
-            // new gc run, which we have no control over. however differently from `Timeline::gc`
-            // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
-            // actually removing files.
-            let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
-
-            // the minimum where we should find the next_gc_cutoff for our calculations.
-            //
-            // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
-            // want to query any logical size before initdb_lsn.
-            let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
-
-            let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
-                Some((next_gc_cutoff, LsnKind::GcCutOff))
-            } else {
-                None
-            };
-
-            // this assumes there are no other lsns than the branchpoints
-            let lsns = gc_info
-                .retain_lsns
-                .iter()
-                .inspect(|&&lsn| {
-                    trace!(
-                        timeline_id=%timeline.timeline_id,
-                        "retained lsn: {lsn:?}, is_before_ancestor_lsn={}",
-                        lsn < timeline.get_ancestor_lsn()
-                    )
-                })
-                .filter(|&&lsn| lsn > timeline.get_ancestor_lsn())
-                .copied()
-                .map(|lsn| (lsn, LsnKind::BranchPoint))
-                .chain(maybe_cutoff)
-                .collect::<Vec<_>>();
-
-            (
-                lsns,
-                gc_info.horizon_cutoff,
-                gc_info.pitr_cutoff,
-                next_gc_cutoff,
-            )
-        };
-
-        // update this to have a retention_period later for the tenant_size_model
-        // tenant_size_model compares this to the last segments start_lsn
-        if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) {
-            match max_cutoff_distance.as_mut() {
-                Some(max) => {
-                    *max = std::cmp::max(*max, cutoff_distance);
-                }
-                _ => {
-                    max_cutoff_distance = Some(cutoff_distance);
-                }
-            }
-        }
-
-        // all timelines branch from something, because it might be impossible to pinpoint
-        // which is the tenant_size_model's "default" branch.
-        updates.push(Update {
-            lsn: timeline.get_ancestor_lsn(),
-            command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
-            timeline_id: timeline.timeline_id,
-        });
-
-        for (lsn, _kind) in &interesting_lsns {
-            if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
-                updates.push(Update {
-                    lsn: *lsn,
-                    timeline_id: timeline.timeline_id,
-                    command: Command::Update(*size),
-                });
-
-                needed_cache.insert((timeline.timeline_id, *lsn));
-            } else {
-                let timeline = Arc::clone(&timeline);
-                let parallel_size_calcs = Arc::clone(limit);
-                joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn));
-            }
-        }
-
-        timeline_inputs.insert(
-            timeline.timeline_id,
-            TimelineInputs {
-                last_record: last_record_lsn,
-                // this is not used above, because it might not have updated recently enough
-                latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
-                horizon_cutoff,
-                pitr_cutoff,
-                next_gc_cutoff,
-            },
-        );
-    }
-
-    let mut have_any_error = false;
-
-    while let Some(res) = joinset.join_next().await {
-        // each of these come with Result<Result<_, JoinError>, JoinError>
-        // because of spawn + spawn_blocking
-        let res = res.and_then(|inner| inner);
-        match res {
-            Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
-                debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
-
-                logical_size_cache.insert((timeline.timeline_id, lsn), size);
-                needed_cache.insert((timeline.timeline_id, lsn));
-
-                updates.push(Update {
-                    lsn,
-                    timeline_id: timeline.timeline_id,
-                    command: Command::Update(size),
-                });
-            }
-            Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
-                warn!(
-                    timeline_id=%timeline.timeline_id,
-                    "failed to calculate logical size at {lsn}: {error:#}"
-                );
-                have_any_error = true;
-            }
-            Err(join_error) if join_error.is_cancelled() => {
-                unreachable!("we are not cancelling any of the futures, nor should be");
-            }
-            Err(join_error) => {
-                // cannot really do anything, as this panic is likely a bug
-                error!("logical size query panicked: {join_error:#}");
-                have_any_error = true;
-            }
-        }
-    }
-
-    // prune any keys not needed anymore; we record every used key and added key.
-    logical_size_cache.retain(|key, _| needed_cache.contains(key));
-
-    if have_any_error {
-        // we cannot complete this round, because we are missing data.
-        // we have however cached all we were able to request calculation on.
-        anyhow::bail!("failed to calculate some logical_sizes");
-    }
-
-    // the data gathered to updates is per lsn, regardless of the branch, so we can use it to
-    // our advantage, not requiring a sorted container or graph walk.
-    //
-    // for branch points, which come as multiple updates at the same LSN, the Command::Update
-    // is needed before a branch is made out of that branch Command::BranchFrom. this is
-    // handled by the variant order in `Command`.
-    updates.sort_unstable();
-
-    let retention_period = match max_cutoff_distance {
-        Some(max) => max.0,
-        None => {
-            anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0")
-        }
-    };
-
-    Ok(ModelInputs {
-        updates,
-        retention_period,
-        timeline_inputs,
-    })
-}
-
-impl ModelInputs {
-    pub fn calculate(&self) -> anyhow::Result<u64> {
-        // Option<TimelineId> is used for "naming" the branches because it is assumed to be
-        // impossible to always determine the a one main branch.
-        let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
-
-        // tracking these not to require modifying the current implementation of the size model,
-        // which works in relative LSNs and sizes.
-        let mut last_state: HashMap<TimelineId, (Lsn, u64)> = HashMap::new();
-
-        for update in &self.updates {
-            let Update {
-                lsn,
-                command: op,
-                timeline_id,
-            } = update;
-            match op {
-                Command::Update(sz) => {
-                    let latest = last_state.get_mut(timeline_id).ok_or_else(|| {
-                        anyhow::anyhow!(
-                        "ordering-mismatch: there must had been a previous state for {timeline_id}"
-                    )
-                    })?;
-
-                    let lsn_bytes = {
-                        let Lsn(now) = lsn;
-                        let Lsn(prev) = latest.0;
-                        debug_assert!(prev <= *now, "self.updates should had been sorted");
-                        now - prev
-                    };
-
-                    let size_diff =
-                        i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| {
-                            format!("size difference i64 overflow for {timeline_id}")
-                        })?;
-
-                    storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff);
-                    *latest = (*lsn, *sz);
-                }
-                Command::BranchFrom(parent) => {
-                    storage.branch(parent, Some(*timeline_id));
-
-                    let size = parent
-                        .as_ref()
-                        .and_then(|id| last_state.get(id))
-                        .map(|x| x.1)
-                        .unwrap_or(0);
-                    last_state.insert(*timeline_id, (*lsn, size));
-                }
-            }
-        }
-
-        Ok(storage.calculate(self.retention_period).total_children())
-    }
-}
-
-/// Single size model update.
-///
-/// Sizing model works with relative increments over latest branch state.
-/// Updates are absolute, so additional state needs to be tracked when applying.
-#[serde_with::serde_as]
-#[derive(
-    Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
-)]
-struct Update {
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    lsn: utils::lsn::Lsn,
-    command: Command,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    timeline_id: TimelineId,
-}
-
-#[serde_with::serde_as]
-#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
-#[serde(rename_all = "snake_case")]
-enum Command {
-    Update(u64),
-    BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
-}
-
-impl std::fmt::Debug for Command {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3
-        // linebreaks
-        match self {
-            Self::Update(arg0) => write!(f, "Update({arg0})"),
-            Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-enum LsnKind {
-    BranchPoint,
-    GcCutOff,
-}
-
-/// Newtype around the tuple that carries the timeline at lsn logical size calculation.
-struct TimelineAtLsnSizeResult(
-    Arc<crate::tenant::Timeline>,
-    utils::lsn::Lsn,
-    anyhow::Result<u64>,
-);
-
-#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
-async fn calculate_logical_size(
-    limit: Arc<tokio::sync::Semaphore>,
-    timeline: Arc<crate::tenant::Timeline>,
-    lsn: utils::lsn::Lsn,
-) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
-    let permit = tokio::sync::Semaphore::acquire_owned(limit)
-        .await
-        .expect("global semaphore should not had been closed");
-
-    tokio::task::spawn_blocking(move || {
-        let _permit = permit;
-        let size_res = timeline.calculate_logical_size(lsn);
-        TimelineAtLsnSizeResult(timeline, lsn, size_res)
-    })
-    .await
-}
-
-#[test]
-fn updates_sort() {
-    use std::str::FromStr;
-    use utils::id::TimelineId;
-    use utils::lsn::Lsn;
-
-    let ids = [
-        TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(),
-        TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(),
-        TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(),
-    ];
-
-    // try through all permutations
-    let ids = [
-        [&ids[0], &ids[1], &ids[2]],
-        [&ids[0], &ids[2], &ids[1]],
-        [&ids[1], &ids[0], &ids[2]],
-        [&ids[1], &ids[2], &ids[0]],
-        [&ids[2], &ids[0], &ids[1]],
-        [&ids[2], &ids[1], &ids[0]],
-    ];
-
-    for ids in ids {
-        // apply a fixture which uses a permutation of ids
-        let commands = [
-            Update {
-                lsn: Lsn(0),
-                command: Command::BranchFrom(None),
-                timeline_id: *ids[0],
-            },
-            Update {
-                lsn: Lsn::from_str("0/67E7618").unwrap(),
-                command: Command::Update(43696128),
-                timeline_id: *ids[0],
-            },
-            Update {
-                lsn: Lsn::from_str("0/67E7618").unwrap(),
-                command: Command::BranchFrom(Some(*ids[0])),
-                timeline_id: *ids[1],
-            },
-            Update {
-                lsn: Lsn::from_str("0/76BE4F0").unwrap(),
-                command: Command::Update(41844736),
-                timeline_id: *ids[1],
-            },
-            Update {
-                lsn: Lsn::from_str("0/10E49380").unwrap(),
-                command: Command::Update(42164224),
-                timeline_id: *ids[0],
-            },
-            Update {
-                lsn: Lsn::from_str("0/10E49380").unwrap(),
-                command: Command::BranchFrom(Some(*ids[0])),
-                timeline_id: *ids[2],
-            },
-            Update {
-                lsn: Lsn::from_str("0/11D74910").unwrap(),
-                command: Command::Update(42172416),
-                timeline_id: *ids[2],
-            },
-            Update {
-                lsn: Lsn::from_str("0/12051E98").unwrap(),
-                command: Command::Update(42196992),
-                timeline_id: *ids[0],
-            },
-        ];
-
-        let mut sorted = commands;
-
-        // these must sort in the same order, regardless of how the ids sort
-        // which is why the timeline_id is the last field
-        sorted.sort_unstable();
-
-        assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted);
-    }
-}
-
-#[test]
-fn verify_size_for_multiple_branches() {
-    // this is generated from integration test test_tenant_size_with_multiple_branches, but this way
-    // it has the stable lsn's
-    let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
-
-    let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
-
-    assert_eq!(inputs.calculate().unwrap(), 36_409_872);
-}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,12 +1,10 @@
 //!

-use anyhow::{anyhow, bail, ensure, Context};
+use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::Bytes;
 use fail::fail_point;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
-use pageserver_api::models::TimelineState;
-use tokio::sync::watch;
 use tokio::task::spawn_blocking;
 use tracing::*;

@@ -16,7 +14,7 @@ use std::fs;
 use std::ops::{Deref, Range};
 use std::path::PathBuf;
 use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
-use std::sync::{Arc, Mutex, MutexGuard, RwLock};
+use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
 use std::time::{Duration, Instant, SystemTime};

 use crate::tenant::{
@@ -37,8 +35,8 @@ use crate::metrics::TimelineMetrics;
 use crate::pgdatadir_mapping::BlockNumber;
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
+use crate::reltag::RelTag;
 use crate::tenant_config::TenantConfOpt;
-use pageserver_api::reltag::RelTag;

 use postgres_ffi::to_pg_timestamp;
 use utils::{
@@ -121,16 +119,8 @@ pub struct Timeline {
    /// to avoid deadlock.
    write_lock: Mutex<()>,

-    /// Used to avoid multiple `flush_loop` tasks running
-    flush_loop_started: Mutex<bool>,
-
-    /// layer_flush_start_tx can be used to wake up the layer-flushing task.
-    /// The value is a counter, incremented every time a new flush cycle is requested.
-    /// The flush cycle counter is sent back on the layer_flush_done channel when
-    /// the flush finishes. You can use that to wait for the flush to finish.
-    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
-    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
-    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
+    /// Used to ensure that there is only task performing flushing at a time
+    layer_flush_lock: Mutex<()>,

    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -170,8 +160,6 @@ pub struct Timeline {

    /// Relation size cache
    pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
-
-    state: watch::Sender<TimelineState>,
 }

 /// Internal structure to hold all data needed for logical size calculation.
@@ -280,11 +268,6 @@ impl LogicalSize {
        self.size_added_after_initial
            .fetch_add(delta, AtomicOrdering::SeqCst);
    }
-
-    /// Returns the initialized (already calculated) value, if any.
-    fn initialized_size(&self) -> Option<u64> {
-        self.initial_logical_size.get().copied()
-    }
 }

 pub struct WalReceiverInfo {
@@ -324,6 +307,10 @@ pub struct GcInfo {

 /// Public interface functions
 impl Timeline {
+    //------------------------------------------------------------------------------
+    // Public GET functions
+    //------------------------------------------------------------------------------
+
    /// Get the LSN where this branch was created
    pub fn get_ancestor_lsn(&self) -> Lsn {
        self.ancestor_lsn
@@ -433,11 +420,9 @@ impl Timeline {
    /// those functions with an LSN that has been processed yet is an error.
    ///
    pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
-        anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");
-
        // This should never be called from the WAL receiver, because that could lead
        // to a deadlock.
-        anyhow::ensure!(
+        ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection),
            "wait_lsn cannot be called in WAL receiver"
        );
@@ -460,7 +445,7 @@ impl Timeline {
        &self,
        lsn: Lsn,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
-    ) -> anyhow::Result<()> {
+    ) -> Result<()> {
        ensure!(
            lsn >= **latest_gc_cutoff_lsn,
            "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
@@ -470,110 +455,30 @@ impl Timeline {
        Ok(())
    }

+    //------------------------------------------------------------------------------
+    // Public PUT functions, to update the repository with new page versions.
+    //
+    // These are called by the WAL receiver to digest WAL records.
+    //------------------------------------------------------------------------------
+
    /// Flush to disk all data that was written with the put_* functions
    ///
    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
    /// know anything about them here in the repository.
-    #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
-    pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
+    pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
        match cconf {
            CheckpointConfig::Flush => {
                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers_and_wait().await
+                self.flush_frozen_layers(true)
            }
            CheckpointConfig::Forced => {
                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers_and_wait().await?;
+                self.flush_frozen_layers(true)?;
                self.compact()
            }
        }
    }

-    pub fn compact(&self) -> anyhow::Result<()> {
-        let last_record_lsn = self.get_last_record_lsn();
-
-        // Last record Lsn could be zero in case the timelie was just created
-        if !last_record_lsn.is_valid() {
-            warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
-            return Ok(());
-        }
-
-        //
-        // High level strategy for compaction / image creation:
-        //
-        // 1. First, calculate the desired "partitioning" of the
-        // currently in-use key space. The goal is to partition the
-        // key space into roughly fixed-size chunks, but also take into
-        // account any existing image layers, and try to align the
-        // chunk boundaries with the existing image layers to avoid
-        // too much churn. Also try to align chunk boundaries with
-        // relation boundaries.  In principle, we don't know about
-        // relation boundaries here, we just deal with key-value
-        // pairs, and the code in pgdatadir_mapping.rs knows how to
-        // map relations into key-value pairs. But in practice we know
-        // that 'field6' is the block number, and the fields 1-5
-        // identify a relation. This is just an optimization,
-        // though.
-        //
-        // 2. Once we know the partitioning, for each partition,
-        // decide if it's time to create a new image layer. The
-        // criteria is: there has been too much "churn" since the last
-        // image layer? The "churn" is fuzzy concept, it's a
-        // combination of too many delta files, or too much WAL in
-        // total in the delta file. Or perhaps: if creating an image
-        // file would allow to delete some older files.
-        //
-        // 3. After that, we compact all level0 delta files if there
-        // are too many of them.  While compacting, we also garbage
-        // collect any page versions that are no longer needed because
-        // of the new image layers we created in step 2.
-        //
-        // TODO: This high level strategy hasn't been implemented yet.
-        // Below are functions compact_level0() and create_image_layers()
-        // but they are a bit ad hoc and don't quite work like it's explained
-        // above. Rewrite it.
-        let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
-
-        let target_file_size = self.get_checkpoint_distance();
-
-        // Define partitioning schema if needed
-
-        match self.repartition(
-            self.get_last_record_lsn(),
-            self.get_compaction_target_size(),
-        ) {
-            Ok((partitioning, lsn)) => {
-                // 2. Create new image layers for partitions that have been modified
-                // "enough".
-                let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
-                if !layer_paths_to_upload.is_empty()
-                    && self.upload_layers.load(atomic::Ordering::Relaxed)
-                {
-                    storage_sync::schedule_layer_upload(
-                        self.tenant_id,
-                        self.timeline_id,
-                        layer_paths_to_upload,
-                        None,
-                    );
-                }
-
-                // 3. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(target_file_size)?;
-                timer.stop_and_record();
-            }
-            Err(err) => {
-                // no partitioning? This is normal, if the timeline was just created
-                // as an empty timeline. Also in unit tests, when we use the timeline
-                // as a simple key-value store, ignoring the datadir layout. Log the
-                // error but continue.
-                error!("could not compact, repartitioning keyspace failed: {err:?}");
-            }
-        };
-
-        Ok(())
-    }
-
    /// Mutate the timeline with a [`TimelineWriter`].
    pub fn writer(&self) -> TimelineWriter<'_> {
        TimelineWriter {
@@ -581,93 +486,6 @@ impl Timeline {
            _write_guard: self.write_lock.lock().unwrap(),
        }
    }
-
-    /// Retrieve current logical size of the timeline.
-    ///
-    /// The size could be lagging behind the actual number, in case
-    /// the initial size calculation has not been run (gets triggered on the first size access).
-    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
-        let current_size = self.current_logical_size.current_size()?;
-        debug!("Current size: {current_size:?}");
-
-        let size = current_size.size();
-        if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
-            (current_size, self.current_logical_size.initial_part_end)
-        {
-            self.try_spawn_size_init_task(init_lsn);
-        }
-
-        Ok(size)
-    }
-
-    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
-    /// the in-memory layer, and initiate flushing it if so.
-    ///
-    /// Also flush after a period of time without new data -- it helps
-    /// safekeepers to regard pageserver as caught up and suspend activity.
-    pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
-        let last_lsn = self.get_last_record_lsn();
-        let layers = self.layers.read().unwrap();
-        if let Some(open_layer) = &layers.open_layer {
-            let open_layer_size = open_layer.size()?;
-            drop(layers);
-            let last_freeze_at = self.last_freeze_at.load();
-            let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
-            let distance = last_lsn.widening_sub(last_freeze_at);
-            // Checkpointing the open layer can be triggered by layer size or LSN range.
-            // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
-            // we want to stay below that with a big margin.  The LSN distance determines how
-            // much WAL the safekeepers need to store.
-            if distance >= self.get_checkpoint_distance().into()
-                || open_layer_size > self.get_checkpoint_distance()
-                || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
-            {
-                info!(
-                    "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
-                    distance,
-                    open_layer_size,
-                    last_freeze_ts.elapsed()
-                );
-
-                self.freeze_inmem_layer(true);
-                self.last_freeze_at.store(last_lsn);
-                *(self.last_freeze_ts.write().unwrap()) = Instant::now();
-
-                // Wake up the layer flusher
-                self.flush_frozen_layers();
-            }
-        }
-        Ok(())
-    }
-
-    pub fn set_state(&self, new_state: TimelineState) {
-        match (self.current_state(), new_state) {
-            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
-                debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
-            }
-            (TimelineState::Broken, _) => {
-                error!("Ignoring state update {new_state:?} for broken tenant");
-            }
-            (TimelineState::Paused, TimelineState::Active) => {
-                debug!("Not activating a paused timeline");
-            }
-            (_, new_state) => {
-                self.state.send_replace(new_state);
-            }
-        }
-    }
-
-    pub fn current_state(&self) -> TimelineState {
-        *self.state.borrow()
-    }
-
-    pub fn is_active(&self) -> bool {
-        self.current_state() == TimelineState::Active
-    }
-
-    pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
-        self.state.subscribe()
-    }
 }

 // Private functions
@@ -711,7 +529,7 @@ impl Timeline {
    ///
    /// Loads the metadata for the timeline into memory, but not the layer map.
    #[allow(clippy::too_many_arguments)]
-    pub(super) fn new(
+    pub fn new(
        conf: &'static PageServerConf,
        tenant_conf: Arc<RwLock<TenantConfOpt>>,
        metadata: TimelineMetadata,
@@ -721,12 +539,8 @@ impl Timeline {
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        upload_layers: bool,
        pg_version: u32,
-    ) -> Self {
+    ) -> Timeline {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
-        let (state, _) = watch::channel(TimelineState::Suspended);
-
-        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
-        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));

        let mut result = Timeline {
            conf,
@@ -755,12 +569,8 @@ impl Timeline {

            upload_layers: AtomicBool::new(upload_layers),

-            flush_loop_started: Mutex::new(false),
-
-            layer_flush_start_tx,
-            layer_flush_done_tx,
-
            write_lock: Mutex::new(()),
+            layer_flush_lock: Mutex::new(()),
            layer_removal_cs: Mutex::new(()),

            gc_info: RwLock::new(GcInfo {
@@ -787,40 +597,12 @@ impl Timeline {

            last_received_wal: Mutex::new(None),
            rel_size_cache: RwLock::new(HashMap::new()),
-            state,
        };
        result.repartition_threshold = result.get_checkpoint_distance() / 10;
        result
    }

-    pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
-        let mut flush_loop_started = self.flush_loop_started.lock().unwrap();
-        if *flush_loop_started {
-            info!(
-                "skipping attempt to start flush_loop twice {}/{}",
-                self.tenant_id, self.timeline_id
-            );
-            return;
-        }
-
-        let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
-        let self_clone = Arc::clone(self);
-        info!("spawning flush loop");
-        task_mgr::spawn(
-                    task_mgr::BACKGROUND_RUNTIME.handle(),
-                    task_mgr::TaskKind::LayerFlushTask,
-                    Some(self.tenant_id),
-                    Some(self.timeline_id),
-                    "layer flush task",
-                    false,
-                    async move { self_clone.flush_loop(layer_flush_start_rx).await; Ok(()) }
-                    .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
-                );
-
-        *flush_loop_started = true;
-    }
-
-    pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
+    pub fn launch_wal_receiver(self: &Arc<Self>) {
        if !is_etcd_client_initialized() {
            if cfg!(test) {
                info!("not launching WAL receiver because etcd client hasn't been initialized");
@@ -859,7 +641,7 @@ impl Timeline {
    /// Scan the timeline directory to populate the layer map.
    /// Returns all timeline-related files that were found and loaded.
    ///
-    pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
+    pub fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
        let mut layers = self.layers.write().unwrap();
        let mut num_layers = 0;

@@ -945,13 +727,33 @@ impl Timeline {
        Ok(())
    }

-    pub(super) fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
+    pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
        self.layer_removal_cs
            .try_lock()
            .map_err(|e| anyhow!("cannot lock compaction critical section {e}"))
    }

+    /// Retrieve current logical size of the timeline.
+    ///
+    /// The size could be lagging behind the actual number, in case
+    /// the initial size calculation has not been run (gets triggered on the first size access).
+    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
+        let current_size = self.current_logical_size.current_size()?;
+        debug!("Current size: {current_size:?}");
+
+        let size = current_size.size();
+        if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
+            (current_size, self.current_logical_size.initial_part_end)
+        {
+            self.try_spawn_size_init_task(init_lsn);
+        }
+
+        Ok(size)
+    }
+
    fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn) {
+        let timeline_id = self.timeline_id;
+
        // Atomically check if the timeline size calculation had already started.
        // If the flag was not already set, this sets it.
        if !self
@@ -968,42 +770,17 @@ impl Timeline {
                "initial size calculation",
                false,
                async move {
-                    let mut timeline_state_updates = self_clone.subscribe_for_state_updates();
-                    let self_calculation = Arc::clone(&self_clone);
-                    tokio::select! {
-                        calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => {
-                            let calculated_size = calculation_result
-                                .context("Failed to spawn calculation result task")?
-                                .context("Failed to calculate logical size")?;
-                            match self_clone.current_logical_size.initial_logical_size.set(calculated_size) {
-                                Ok(()) => info!("Successfully calculated initial logical size"),
-                                Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
-                            }
-                            Ok(())
-                        },
-                        new_event = async {
-                            loop {
-                                match timeline_state_updates.changed().await {
-                                    Ok(()) => {
-                                        let new_state = *timeline_state_updates.borrow();
-                                        match new_state {
-                                            // we're running this job for active timelines only
-                                            TimelineState::Active => continue,
-                                            TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return Some(new_state),
-                                        }
-                                    }
-                                    Err(_sender_dropped_error) => return None,
-                                }
-                            }
-                        } => {
-                            match new_event {
-                                Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"),
-                                None => info!("Timeline dropped state updates sender, stopping init size calculation"),
-                            }
-                            Ok(())
-                        },
+                    let calculated_size = self_clone.calculate_logical_size(init_lsn)?;
+                    let result = spawn_blocking(move || {
+                        self_clone.current_logical_size.initial_logical_size.set(calculated_size)
+                    }).await?;
+                    match result {
+                        Ok(()) => info!("Successfully calculated initial logical size"),
+                        Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
                    }
-                }.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)),
+                    Ok(())
+                }
+                .instrument(info_span!("initial_logical_size_calculation", timeline = %timeline_id))
            );
        }
    }
@@ -1011,26 +788,9 @@ impl Timeline {
    /// Calculate the logical size of the database at the latest LSN.
    ///
    /// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
-    pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
-        info!(
-            "Calculating logical size for timeline {} at {}",
-            self.timeline_id, up_to_lsn
-        );
-        let timer = if up_to_lsn == self.initdb_lsn {
-            if let Some(size) = self.current_logical_size.initialized_size() {
-                if size != 0 {
-                    // non-zero size means that the size has already been calculated by this method
-                    // after startup. if the logical size is for a new timeline without layers the
-                    // size will be zero, and we cannot use that, or this caching strategy until
-                    // pageserver restart.
-                    return Ok(size);
-                }
-            }
-
-            self.metrics.init_logical_size_histo.start_timer()
-        } else {
-            self.metrics.logical_size_histo.start_timer()
-        };
+    fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
+        info!("Calculating logical size for timeline {}", self.timeline_id);
+        let timer = self.metrics.init_logical_size_histo.start_timer();
        let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?;
        debug!("calculated logical size: {logical_size}");
        timer.stop_and_record();
@@ -1211,7 +971,7 @@ impl Timeline {
        Some((lsn, img))
    }

-    fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
+    fn get_ancestor_timeline(&self) -> Result<Arc<Timeline>> {
        let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
            format!(
                "Ancestor is missing. Timeline id: {} Ancestor id {:?}",
@@ -1270,14 +1030,14 @@ impl Timeline {
        Ok(layer)
    }

-    fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
+    fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        //info!("PUT: key {} at {}", key, lsn);
        let layer = self.get_layer_for_write(lsn)?;
        layer.put_value(key, lsn, val)?;
        Ok(())
    }

-    fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
+    fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
        let layer = self.get_layer_for_write(lsn)?;
        layer.put_tombstone(key_range, lsn)?;

@@ -1316,94 +1076,111 @@ impl Timeline {
        drop(layers);
    }

-    /// Layer flusher task's main loop.
-    async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>) {
-        info!("started flush loop");
-        loop {
-            tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
-                    info!("shutting down layer flush task");
-                    break;
-                },
-                _ = layer_flush_start_rx.changed() => {}
-            }
-
-            trace!("waking up");
-            let timer = self.metrics.flush_time_histo.start_timer();
-            let flush_counter = *layer_flush_start_rx.borrow();
-            let result = loop {
-                let layer_to_flush = {
-                    let layers = self.layers.read().unwrap();
-                    layers.frozen_layers.front().cloned()
-                    // drop 'layers' lock to allow concurrent reads and writes
-                };
-                if let Some(layer_to_flush) = layer_to_flush {
-                    if let Err(err) = self.flush_frozen_layer(layer_to_flush).await {
-                        error!("could not flush frozen layer: {err:?}");
-                        break Err(err);
-                    }
-                    continue;
-                } else {
-                    break Ok(());
-                }
-            };
-            // Notify any listeners that we're done
-            let _ = self
-                .layer_flush_done_tx
-                .send_replace((flush_counter, result));
-
-            timer.stop_and_record();
-        }
-    }
-
-    async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
-        let mut rx = self.layer_flush_done_tx.subscribe();
-
-        // Increment the flush cycle counter and wake up the flush task.
-        // Remember the new value, so that when we listen for the flush
-        // to finish, we know when the flush that we initiated has
-        // finished, instead of some other flush that was started earlier.
-        let mut my_flush_request = 0;
-
-        if !&*self.flush_loop_started.lock().unwrap() {
-            anyhow::bail!("cannot flush frozen layers when flush_loop is not running")
-        }
-
-        self.layer_flush_start_tx.send_modify(|counter| {
-            my_flush_request = *counter + 1;
-            *counter = my_flush_request;
-        });
-
-        loop {
+    ///
+    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
+    /// the in-memory layer, and initiate flushing it if so.
+    ///
+    /// Also flush after a period of time without new data -- it helps
+    /// safekeepers to regard pageserver as caught up and suspend activity.
+    ///
+    pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> Result<()> {
+        let last_lsn = self.get_last_record_lsn();
+        let layers = self.layers.read().unwrap();
+        if let Some(open_layer) = &layers.open_layer {
+            let open_layer_size = open_layer.size()?;
+            drop(layers);
+            let last_freeze_at = self.last_freeze_at.load();
+            let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
+            let distance = last_lsn.widening_sub(last_freeze_at);
+            // Checkpointing the open layer can be triggered by layer size or LSN range.
+            // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
+            // we want to stay below that with a big margin.  The LSN distance determines how
+            // much WAL the safekeepers need to store.
+            if distance >= self.get_checkpoint_distance().into()
+                || open_layer_size > self.get_checkpoint_distance()
+                || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
            {
-                let (last_result_counter, last_result) = &*rx.borrow();
-                if *last_result_counter >= my_flush_request {
-                    if let Err(_err) = last_result {
-                        // We already logged the original error in
-                        // flush_loop. We cannot propagate it to the caller
-                        // here, because it might not be Cloneable
-                        anyhow::bail!(
-                            "Could not flush frozen layer. Request id: {}",
-                            my_flush_request
-                        );
-                    } else {
-                        return Ok(());
-                    }
+                info!(
+                    "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
+                    distance,
+                    open_layer_size,
+                    last_freeze_ts.elapsed()
+                );
+
+                self.freeze_inmem_layer(true);
+                self.last_freeze_at.store(last_lsn);
+                *(self.last_freeze_ts.write().unwrap()) = Instant::now();
+
+                // Launch a task to flush the frozen layer to disk, unless
+                // a task was already running. (If the task was running
+                // at the time that we froze the layer, it must've seen the
+                // layer we just froze before it exited; see comments
+                // in flush_frozen_layers())
+                if let Ok(guard) = self.layer_flush_lock.try_lock() {
+                    drop(guard);
+                    let self_clone = Arc::clone(self);
+                    task_mgr::spawn(
+                        task_mgr::BACKGROUND_RUNTIME.handle(),
+                        task_mgr::TaskKind::LayerFlushTask,
+                        Some(self.tenant_id),
+                        Some(self.timeline_id),
+                        "layer flush task",
+                        false,
+                        async move { self_clone.flush_frozen_layers(false) },
+                    );
                }
            }
-            trace!("waiting for flush to complete");
-            rx.changed().await?;
-            trace!("done")
        }
+        Ok(())
    }

-    fn flush_frozen_layers(&self) {
-        self.layer_flush_start_tx.send_modify(|val| *val += 1);
+    /// Flush all frozen layers to disk.
+    ///
+    /// Only one task at a time can be doing layer-flushing for a
+    /// given timeline. If 'wait' is true, and another task is
+    /// currently doing the flushing, this function will wait for it
+    /// to finish. If 'wait' is false, this function will return
+    /// immediately instead.
+    fn flush_frozen_layers(&self, wait: bool) -> Result<()> {
+        let flush_lock_guard = if wait {
+            self.layer_flush_lock.lock().unwrap()
+        } else {
+            match self.layer_flush_lock.try_lock() {
+                Ok(guard) => guard,
+                Err(TryLockError::WouldBlock) => return Ok(()),
+                Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
+            }
+        };
+
+        let timer = self.metrics.flush_time_histo.start_timer();
+
+        loop {
+            let layers = self.layers.read().unwrap();
+            if let Some(frozen_layer) = layers.frozen_layers.front() {
+                let frozen_layer = Arc::clone(frozen_layer);
+                drop(layers); // to allow concurrent reads and writes
+                self.flush_frozen_layer(frozen_layer)?;
+            } else {
+                // Drop the 'layer_flush_lock' *before* 'layers'. That
+                // way, if you freeze a layer, and then call
+                // flush_frozen_layers(false), it is guaranteed that
+                // if another thread was busy flushing layers and the
+                // call therefore returns immediately, the other
+                // thread will have seen the newly-frozen layer and
+                // will flush that too (assuming no errors).
+                drop(flush_lock_guard);
+                drop(layers);
+                break;
+            }
+        }
+
+        timer.stop_and_record();
+
+        Ok(())
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))]
-    async fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
+    fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -1461,7 +1238,7 @@ impl Timeline {
        &self,
        disk_consistent_lsn: Lsn,
        layer_paths_to_upload: HashMap<PathBuf, LayerFileMetadata>,
-    ) -> anyhow::Result<()> {
+    ) -> Result<()> {
        // We can only save a valid 'prev_record_lsn' value on disk if we
        // flushed *all* in-memory changes to disk. We only track
        // 'prev_record_lsn' in memory for the latest processed record, so we
@@ -1506,7 +1283,7 @@ impl Timeline {
            false,
        )?;

-        if self.can_upload_layers() {
+        if self.upload_layers.load(atomic::Ordering::Relaxed) {
            storage_sync::schedule_layer_upload(
                self.tenant_id,
                self.timeline_id,
@@ -1522,7 +1299,7 @@ impl Timeline {
    fn create_delta_layer(
        &self,
        frozen_layer: &InMemoryLayer,
-    ) -> anyhow::Result<(PathBuf, LayerFileMetadata)> {
+    ) -> Result<(PathBuf, LayerFileMetadata)> {
        // Write it out
        let new_delta = frozen_layer.write_to_disk()?;
        let new_delta_path = new_delta.path();
@@ -1557,7 +1334,92 @@ impl Timeline {
        Ok((new_delta_path, LayerFileMetadata::new(sz)))
    }

-    fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> {
+    pub fn compact(&self) -> anyhow::Result<()> {
+        let last_record_lsn = self.get_last_record_lsn();
+
+        // Last record Lsn could be zero in case the timeline was just created
+        if !last_record_lsn.is_valid() {
+            warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
+            return Ok(());
+        }
+
+        //
+        // High level strategy for compaction / image creation:
+        //
+        // 1. First, calculate the desired "partitioning" of the
+        // currently in-use key space. The goal is to partition the
+        // key space into roughly fixed-size chunks, but also take into
+        // account any existing image layers, and try to align the
+        // chunk boundaries with the existing image layers to avoid
+        // too much churn. Also try to align chunk boundaries with
+        // relation boundaries.  In principle, we don't know about
+        // relation boundaries here, we just deal with key-value
+        // pairs, and the code in pgdatadir_mapping.rs knows how to
+        // map relations into key-value pairs. But in practice we know
+        // that 'field6' is the block number, and the fields 1-5
+        // identify a relation. This is just an optimization,
+        // though.
+        //
+        // 2. Once we know the partitioning, for each partition,
+        // decide if it's time to create a new image layer. The
+        // criteria is: there has been too much "churn" since the last
+        // image layer? The "churn" is a fuzzy concept, it's a
+        // combination of too many delta files, or too much WAL in
+        // total in the delta file. Or perhaps: if creating an image
+        // file would allow to delete some older files.
+        //
+        // 3. After that, we compact all level0 delta files if there
+        // are too many of them.  While compacting, we also garbage
+        // collect any page versions that are no longer needed because
+        // of the new image layers we created in step 2.
+        //
+        // TODO: This high level strategy hasn't been implemented yet.
+        // Below are functions compact_level0() and create_image_layers()
+        // but they are a bit ad hoc and don't quite work like it's explained
+        // above. Rewrite it.
+        let _layer_removal_cs = self.layer_removal_cs.lock().unwrap();
+
+        let target_file_size = self.get_checkpoint_distance();
+
+        // Define partitioning schema if needed
+
+        match self.repartition(
+            self.get_last_record_lsn(),
+            self.get_compaction_target_size(),
+        ) {
+            Ok((partitioning, lsn)) => {
+                // 2. Create new image layers for partitions that have been modified
+                // "enough".
+                let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
+                if !layer_paths_to_upload.is_empty()
+                    && self.upload_layers.load(atomic::Ordering::Relaxed)
+                {
+                    storage_sync::schedule_layer_upload(
+                        self.tenant_id,
+                        self.timeline_id,
+                        layer_paths_to_upload,
+                        None,
+                    );
+                }
+
+                // 3. Compact
+                let timer = self.metrics.compact_time_histo.start_timer();
+                self.compact_level0(target_file_size)?;
+                timer.stop_and_record();
+            }
+            Err(err) => {
+                // no partitioning? This is normal, if the timeline was just created
+                // as an empty timeline. Also in unit tests, when we use the timeline
+                // as a simple key-value store, ignoring the datadir layout. Log the
+                // error but continue.
+                error!("could not compact, repartitioning keyspace failed: {err:?}");
+            }
+        };
+
+        Ok(())
+    }
+
+    fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> {
        let mut partitioning_guard = self.partitioning.lock().unwrap();
        if partitioning_guard.1 == Lsn(0)
            || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold
@@ -1571,7 +1433,7 @@ impl Timeline {
    }

    // Is it time to create a new image layer for the given partition?
-    fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
+    fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<bool> {
        let layers = self.layers.read().unwrap();

        for part_range in &partition.ranges {
@@ -1616,7 +1478,7 @@ impl Timeline {
        partitioning: &KeyPartitioning,
        lsn: Lsn,
        force: bool,
-    ) -> anyhow::Result<HashMap<PathBuf, LayerFileMetadata>> {
+    ) -> Result<HashMap<PathBuf, LayerFileMetadata>> {
        let timer = self.metrics.create_images_time_histo.start_timer();
        let mut image_layers: Vec<ImageLayer> = Vec::new();
        for partition in partitioning.parts.iter() {
@@ -1631,10 +1493,6 @@ impl Timeline {
                    lsn,
                )?;

-                fail_point!("image-layer-writer-fail-before-finish", |_| {
-                    anyhow::bail!("failpoint image-layer-writer-fail-before-finish");
-                });
-
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
@@ -1713,7 +1571,7 @@ impl Timeline {
    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
    /// as Level 1 files.
    ///
-    fn compact_level0(&self, target_file_size: u64) -> anyhow::Result<()> {
+    fn compact_level0(&self, target_file_size: u64) -> Result<()> {
        let layers = self.layers.read().unwrap();
        let mut level0_deltas = layers.get_level0_deltas()?;
        drop(layers);
@@ -1929,11 +1787,6 @@ impl Timeline {
                    },
                )?);
            }
-
-            fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
-            });
-
            writer.as_mut().unwrap().put_value(key, lsn, value)?;
            prev_key = Some(key);
        }
@@ -1985,7 +1838,7 @@ impl Timeline {
        }
        drop(layers);

-        if self.can_upload_layers() {
+        if self.upload_layers.load(atomic::Ordering::Relaxed) {
            storage_sync::schedule_layer_upload(
                self.tenant_id,
                self.timeline_id,
@@ -2028,12 +1881,12 @@ impl Timeline {
    ///
    /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
    /// whether a record is needed for PITR.
-    pub(super) fn update_gc_info(
+    pub fn update_gc_info(
        &self,
        retain_lsns: Vec<Lsn>,
        cutoff_horizon: Lsn,
        pitr: Duration,
-    ) -> anyhow::Result<()> {
+    ) -> Result<()> {
        let mut gc_info = self.gc_info.write().unwrap();

        gc_info.horizon_cutoff = cutoff_horizon;
@@ -2088,8 +1941,8 @@ impl Timeline {
    /// within a layer file. We can only remove the whole file if it's fully
    /// obsolete.
    ///
-    pub(super) fn gc(&self) -> anyhow::Result<GcResult> {
-        let mut result: GcResult = GcResult::default();
+    pub fn gc(&self) -> Result<GcResult> {
+        let mut result: GcResult = Default::default();
        let now = SystemTime::now();

        fail_point!("before-timeline-gc");
@@ -2269,7 +2122,7 @@ impl Timeline {
            fail_point!("after-timeline-gc-removed-layers");
        }

-        if self.can_upload_layers() {
+        if self.upload_layers.load(atomic::Ordering::Relaxed) {
            storage_sync::schedule_layer_delete(
                self.tenant_id,
                self.timeline_id,
@@ -2333,10 +2186,13 @@ impl Timeline {

                let last_rec_lsn = data.records.last().unwrap().0;

-                let img = self
-                    .walredo_mgr
-                    .request_redo(key, request_lsn, base_img, data.records, self.pg_version)
-                    .context("Failed to reconstruct a page image:")?;
+                let img = self.walredo_mgr.request_redo(
+                    key,
+                    request_lsn,
+                    base_img,
+                    data.records,
+                    self.pg_version,
+                )?;

                if img.len() == page_cache::PAGE_SZ {
                    let cache = page_cache::get();
@@ -2355,11 +2211,6 @@ impl Timeline {
            }
        }
    }
-
-    fn can_upload_layers(&self) -> bool {
-        self.upload_layers.load(atomic::Ordering::Relaxed)
-            && self.current_state() != TimelineState::Broken
-    }
 }

 /// Helper function for get_reconstruct_data() to add the path of layers traversed
@@ -2410,11 +2261,11 @@ impl<'a> TimelineWriter<'a> {
    ///
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
-    pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
+    pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
        self.tl.put_value(key, lsn, value)
    }

-    pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
+    pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
        self.tl.put_tombstone(key_range, lsn)
    }

--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -241,7 +241,7 @@ pub async fn shutdown_all_tenants() {
        let tenant_id = tenant.tenant_id();
        debug!("shutdown tenant {tenant_id}");

-        if let Err(err) = tenant.checkpoint().await {
+        if let Err(err) = tenant.checkpoint() {
            error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
        }
    }
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant_tasks.rs
@@ -119,7 +119,7 @@ async fn gc_loop(tenant_id: TenantId) {
            let gc_horizon = tenant.get_gc_horizon();
            let mut sleep_duration = gc_period;
            if gc_horizon > 0 {
-                if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
+                if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
                {
                    sleep_duration = wait_duration;
                    error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
@@ -175,7 +175,7 @@ async fn wait_for_active_tenant(
                        }
                        state => {
                            debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}");
-                            continue;
+                            tokio::time::sleep(wait).await;
                        }
                    }
                }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -319,12 +319,6 @@ impl VirtualFile {

        Ok(result)
    }
-
-    pub fn remove(self) {
-        let path = self.path.clone();
-        drop(self);
-        std::fs::remove_file(path).expect("failed to remove the virtual file");
-    }
 }

 impl Drop for VirtualFile {
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -31,10 +31,10 @@ use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;

 use crate::pgdatadir_mapping::*;
+use crate::reltag::{RelTag, SlruKind};
 use crate::tenant::Timeline;
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
-use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -155,19 +155,22 @@ impl<E: Clone> TaskHandle<E> {

    /// Aborts current task, waiting for it to finish.
    pub async fn shutdown(self) {
-        if let Some(jh) = self.join_handle {
-            self.cancellation.send(()).ok();
-            match jh.await {
-                Ok(Ok(())) => debug!("Shutdown success"),
-                Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
-                Err(join_error) => {
-                    if join_error.is_cancelled() {
-                        error!("Shutdown task was cancelled");
-                    } else {
-                        error!("Shutdown task join error: {join_error}")
+        match self.join_handle {
+            Some(jh) => {
+                self.cancellation.send(()).ok();
+                match jh.await {
+                    Ok(Ok(())) => debug!("Shutdown success"),
+                    Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
+                    Err(join_error) => {
+                        if join_error.is_cancelled() {
+                            error!("Shutdown task was cancelled");
+                        } else {
+                            error!("Shutdown task join error: {join_error}")
+                        }
                    }
                }
            }
+            None => {}
        }
    }
 }
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -12,7 +12,6 @@
 use std::{
    collections::{hash_map, HashMap},
    num::NonZeroU64,
-    ops::ControlFlow,
    sync::Arc,
    time::Duration,
 };
@@ -27,8 +26,7 @@ use etcd_broker::{
    subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
    BrokerUpdate, Client,
 };
-use pageserver_api::models::TimelineState;
-use tokio::{select, sync::watch};
+use tokio::select;
 use tracing::*;

 use crate::{
@@ -60,7 +58,10 @@ pub fn spawn_connection_manager_task(
        TaskKind::WalReceiverManager,
        Some(tenant_id),
        Some(timeline_id),
-        &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
+        &format!(
+            "walreceiver for tenant {} timeline {}",
+            timeline.tenant_id, timeline.timeline_id
+        ),
        false,
        async move {
            info!("WAL receiver broker started, connecting to etcd");
@@ -74,26 +75,24 @@ pub fn spawn_connection_manager_task(
                select! {
                    _ = task_mgr::shutdown_watcher() => {
                        info!("WAL receiver shutdown requested, shutting down");
-                        walreceiver_state.shutdown().await;
+                        // Kill current connection, if any
+                        if let Some(wal_connection) = walreceiver_state.wal_connection.take()
+                        {
+                            wal_connection.connection_task.shutdown().await;
+                        }
                        return Ok(());
                    },
-                    loop_step_result = connection_manager_loop_step(
+
+                    _ = connection_manager_loop_step(
                        &broker_loop_prefix,
                        &mut etcd_client,
                        &mut walreceiver_state,
-                    ) => match loop_step_result {
-                        ControlFlow::Continue(()) => continue,
-                        ControlFlow::Break(()) => {
-                            info!("Connection manager loop ended, shutting down");
-                            walreceiver_state.shutdown().await;
-                            return Ok(());
-                        }
-                    },
+                    ) => {},
                }
            }
        }
        .instrument(
-            info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
+            info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
        ),
    );
 }
@@ -105,17 +104,7 @@ async fn connection_manager_loop_step(
    broker_prefix: &str,
    etcd_client: &mut Client,
    walreceiver_state: &mut WalreceiverState,
-) -> ControlFlow<(), ()> {
-    let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
-
-    match wait_for_active_timeline(&mut timeline_state_updates).await {
-        ControlFlow::Continue(()) => {}
-        ControlFlow::Break(()) => {
-            info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
-            return ControlFlow::Break(());
-        }
-    }
-
+) {
    let id = TenantTimelineId {
        tenant_id: walreceiver_state.timeline.tenant_id,
        timeline_id: walreceiver_state.timeline.timeline_id,
@@ -140,12 +129,10 @@ async fn connection_manager_loop_step(
        //  - change connection if the rules decide so, or if the current connection dies
        //  - receive updates from broker
        //      - this might change the current desired connection
-        //  - timeline state changes to something that does not allow walreceiver to run concurrently
        select! {
            broker_connection_result = &mut broker_subscription.watcher_handle => {
-                info!("Broker connection was closed from the other side, ending current broker loop step");
                cleanup_broker_connection(broker_connection_result, walreceiver_state);
-                return ControlFlow::Continue(());
+                return;
            },

            Some(wal_connection_update) = async {
@@ -198,36 +185,11 @@ async fn connection_manager_loop_step(
                            (&mut broker_subscription.watcher_handle).await,
                            walreceiver_state,
                        );
-                        return ControlFlow::Continue(());
+                        return;
                    }
                }
            },

-            new_event = async {
-                loop {
-                    match timeline_state_updates.changed().await {
-                        Ok(()) => {
-                            let new_state = walreceiver_state.timeline.current_state();
-                            match new_state {
-                                // we're already active as walreceiver, no need to reactivate
-                                TimelineState::Active => continue,
-                                TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state),
-                            }
-                        }
-                        Err(_sender_dropped_error) => return ControlFlow::Break(()),
-                    }
-                }
-            } => match new_event {
-                ControlFlow::Continue(new_state) => {
-                    info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates");
-                    return ControlFlow::Continue(());
-                }
-                ControlFlow::Break(()) => {
-                    info!("Timeline dropped state updates sender, stopping wal connection manager loop");
-                    return ControlFlow::Break(());
-                }
-            },
-
            _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
        }

@@ -254,34 +216,6 @@ async fn connection_manager_loop_step(
    }
 }

-async fn wait_for_active_timeline(
-    timeline_state_updates: &mut watch::Receiver<TimelineState>,
-) -> ControlFlow<(), ()> {
-    let current_state = *timeline_state_updates.borrow();
-    if current_state == TimelineState::Active {
-        return ControlFlow::Continue(());
-    }
-
-    loop {
-        match timeline_state_updates.changed().await {
-            Ok(()) => {
-                let new_state = *timeline_state_updates.borrow();
-                match new_state {
-                    TimelineState::Active => {
-                        debug!("Timeline state changed to active, continuing the walreceiver connection manager");
-                        return ControlFlow::Continue(());
-                    }
-                    state => {
-                        debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}");
-                        continue;
-                    }
-                }
-            }
-            Err(_sender_dropped_error) => return ControlFlow::Break(()),
-        }
-    }
-}
-
 fn cleanup_broker_connection(
    broker_connection_result: Result<Result<(), etcd_broker::BrokerError>, tokio::task::JoinError>,
    walreceiver_state: &mut WalreceiverState,
@@ -789,12 +723,6 @@ impl WalreceiverState {
            self.wal_connection_retries.remove(&node_id);
        }
    }
-
-    async fn shutdown(mut self) {
-        if let Some(wal_connection) = self.wal_connection.take() {
-            wal_connection.connection_task.shutdown().await;
-        }
-    }
 }

 #[derive(Debug, PartialEq, Eq)]
@@ -836,20 +764,15 @@ fn wal_stream_connection_string(
    listen_pg_addr_str: &str,
 ) -> anyhow::Result<String> {
    let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db");
-    sk_connstr
-        .parse()
-        .context("bad url")
-        .and_then(|url: url::Url| {
-            let host = url.host_str().context("host is missing")?;
-            let port = url.port().unwrap_or(5432); // default PG port
-
-            Ok(format!(
-                "host={host} \
-                 port={port} \
-                 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
-            ))
-        })
-        .with_context(|| format!("Failed to parse pageserver connection URL '{sk_connstr}'"))
+    let me_conf = sk_connstr
+        .parse::<postgres::config::Config>()
+        .with_context(|| {
+            format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one")
+        })?;
+    let (host, port) = utils::connstring::connection_host_port(&me_conf);
+    Ok(format!(
+        "host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
+    ))
 }

 #[cfg(test)]
@@ -878,7 +801,6 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
                        safekeeper_connstr: None,
                    },
                    etcd_version: 0,
@@ -895,9 +817,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
-                        safekeeper_connstr: Some("no_commit_lsn".to_string()),
+                        safekeeper_connstr: Some("no commit_lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -913,8 +833,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-                        safekeeper_connstr: Some("no_commit_lsn".to_string()),
+                        safekeeper_connstr: Some("no commit_lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -930,7 +849,6 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
                        safekeeper_connstr: None,
                    },
                    etcd_version: 0,
@@ -990,8 +908,6 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -1008,9 +924,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
-                        safekeeper_connstr: Some("not_advanced_lsn".to_string()),
+                        safekeeper_connstr: Some("not advanced Lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -1026,9 +940,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
-                        safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
+                        safekeeper_connstr: Some("not enough advanced Lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -1062,8 +974,6 @@ mod tests {
                    backup_lsn: None,
                    remote_consistent_lsn: None,
                    peer_horizon_lsn: None,
-                    local_start_lsn: None,
-
                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                },
                etcd_version: 0,
@@ -1096,9 +1006,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
-                        safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
+                        safekeeper_connstr: Some("smaller commit_lsn".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -1114,8 +1022,6 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -1132,8 +1038,6 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
                        safekeeper_connstr: None,
                    },
                    etcd_version: 0,
@@ -1179,8 +1083,6 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -1197,8 +1099,6 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -1268,8 +1168,6 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                    },
                    etcd_version: 0,
@@ -1286,9 +1184,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        local_start_lsn: None,
-
-                        safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
+                        safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()),
                    },
                    etcd_version: 0,
                    latest_update: now,
@@ -1312,7 +1208,7 @@ mod tests {
        );
        assert!(over_threshcurrent_candidate
            .wal_source_connstr
-            .contains("advanced_by_lsn_safekeeper"));
+            .contains("advanced by Lsn safekeeper"));

        Ok(())
    }
@@ -1359,8 +1255,6 @@ mod tests {
                    backup_lsn: None,
                    remote_consistent_lsn: None,
                    peer_horizon_lsn: None,
-                    local_start_lsn: None,
-
                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                },
                etcd_version: 0,
@@ -1432,8 +1326,6 @@ mod tests {
                    backup_lsn: None,
                    remote_consistent_lsn: None,
                    peer_horizon_lsn: None,
-                    local_start_lsn: None,
-
                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
                },
                etcd_version: 0,
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -31,8 +31,8 @@ use crate::{
    walrecord::DecodedWALRecord,
 };
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use pq_proto::ReplicationFeedback;
-use utils::{id::TenantTimelineId, lsn::Lsn};
+use utils::id::TenantTimelineId;
+use utils::{lsn::Lsn, pq_proto::ReplicationFeedback};

 /// Status of the connection.
 #[derive(Debug, Clone)]
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -10,7 +10,7 @@
 //! process. Then we get the page image back. Communication with the
 //! postgres process happens via stdin/stdout
 //!
-//! See pgxn/neon_walredo/walredoproc.c for the other side of
+//! See src/backend/tcop/zenith_wal_redo.c for the other side of
 //! this communication.
 //!
 //! The Postgres process is assumed to be secure against malicious WAL
@@ -43,10 +43,10 @@ use crate::metrics::{
    WAL_REDO_WAIT_TIME,
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
+use crate::reltag::{RelTag, SlruKind};
 use crate::repository::Key;
 use crate::walrecord::NeonWalRecord;
 use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
-use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
 use postgres_ffi::v14::nonrelfile_utils::{
@@ -644,12 +644,14 @@ impl PostgresRedoProcess {
                ),
            ));
        } else {
-            // Limit shared cache for wal-redo-postgres
+            // Limit shared cache for wal-redo-postgres
            let mut config = OpenOptions::new()
                .append(true)
                .open(PathBuf::from(&datadir).join("postgresql.conf"))?;
            config.write_all(b"shared_buffers=128kB\n")?;
            config.write_all(b"fsync=off\n")?;
+            config.write_all(b"shared_preload_libraries=neon\n")?;
+            config.write_all(b"neon.wal_redo=on\n")?;
        }

        // Start postgres itself
@@ -662,15 +664,18 @@ impl PostgresRedoProcess {
            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("PGDATA", &datadir)
-            // The redo process is not trusted, and runs in seccomp mode that
-            // doesn't allow it to open any files. We have to also make sure it
-            // doesn't inherit any file descriptors from the pageserver, that
-            // would allow an attacker to read any files that happen to be open
-            // in the pageserver.
+            // The redo process is not trusted, so it runs in seccomp mode
+            // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't
+            // inherit any file descriptors from the pageserver that would allow
+            // an attacker to read files that happen to be open in the pageserver.
            //
            // The Rust standard library makes sure to mark any file descriptors with
            // as close-on-exec by default, but that's not enough, since we use
            // libraries that directly call libc open without setting that flag.
+            //
+            // One example is the pidfile of the daemonize library, which doesn't
+            // currently mark file descriptors as close-on-exec. Either way, we
+            // want to be on the safe side and prevent accidental regression.
            .close_fds()
            .spawn()
            .map_err(|e| {
@@ -839,7 +844,7 @@ impl PostgresRedoProcess {
 }

 // Functions for constructing messages to send to the postgres WAL redo
-// process. See pgxn/neon_walredo/walredoproc.c for
+// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
 // explanation of the protocol.

 fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,6 +4,7 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	inmem_smgr.o \
 	libpagestore.o \
 	libpqwalproposer.o \
 	pagestore_smgr.o \
--- a/pgxn/neon_walredo/inmem_smgr.c
+++ b/pgxn/neon_walredo/inmem_smgr.c
@@ -3,8 +3,9 @@
 * inmem_smgr.c
 *
 * This is an implementation of the SMGR interface, used in the WAL redo
- * process. It has no persistent storage, the pages that are written out
- * are kept in a small number of in-memory buffers.
+ * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent
+ * storage, the pages that are written out are kept in a small number of
+ * in-memory buffers.
 *
 * Normally, replaying a WAL record only needs to access a handful of
 * buffers, which fit in the normal buffer cache, so this is just for
@@ -14,11 +15,15 @@
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
+ * IDENTIFICATION
+ *	  contrib/neon/inmem_smgr.c
+ *
 *-------------------------------------------------------------------------
 */
 #include "postgres.h"

 #include "access/xlog.h"
+#include "pagestore_client.h"
 #include "storage/block.h"
 #include "storage/buf_internals.h"
 #include "storage/relfilenode.h"
@@ -28,8 +33,6 @@
 #include "access/xlogutils.h"
 #endif

-#include "inmem_smgr.h"
-
 /* Size of the in-memory smgr */
 #define MAX_PAGES 64

@@ -56,34 +59,10 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
 	return -1;
 }

-
-/* neon wal-redo storage manager functionality */
-static void inmem_init(void);
-static void inmem_open(SMgrRelation reln);
-static void inmem_close(SMgrRelation reln, ForkNumber forknum);
-static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
-static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
-static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
-static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
-						 BlockNumber blocknum, char *buffer, bool skipFsync);
-static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
-						   BlockNumber blocknum);
-static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-					   char *buffer);
-static void inmem_write(SMgrRelation reln, ForkNumber forknum,
-						BlockNumber blocknum, char *buffer, bool skipFsync);
-static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
-							BlockNumber blocknum, BlockNumber nblocks);
-static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
-static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
-						   BlockNumber nblocks);
-static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
-
-
 /*
 *	inmem_init() -- Initialize private state
 */
-static void
+void
 inmem_init(void)
 {
 	used_pages = 0;
@@ -92,7 +71,7 @@ inmem_init(void)
 /*
 *	inmem_exists() -- Does the physical file exist?
 */
-static bool
+bool
 inmem_exists(SMgrRelation reln, ForkNumber forknum)
 {
 	for (int i = 0; i < used_pages; i++)
@@ -111,7 +90,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum)
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
-static void
+void
 inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 {
 }
@@ -119,7 +98,7 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 /*
 *	inmem_unlink() -- Unlink a relation.
 */
-static void
+void
 inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
 {
 }
@@ -133,7 +112,7 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
-static void
+void
 inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			 char *buffer, bool skipFsync)
 {
@@ -144,7 +123,7 @@ inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 /*
 *  inmem_open() -- Initialize newly-opened relation.
 */
-static void
+void
 inmem_open(SMgrRelation reln)
 {
 }
@@ -152,7 +131,7 @@ inmem_open(SMgrRelation reln)
 /*
 *	inmem_close() -- Close the specified relation, if it isn't closed already.
 */
-static void
+void
 inmem_close(SMgrRelation reln, ForkNumber forknum)
 {
 }
@@ -160,7 +139,7 @@ inmem_close(SMgrRelation reln, ForkNumber forknum)
 /*
 *	inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
 */
-static bool
+bool
 inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
 	return true;
@@ -169,7 +148,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 /*
 * inmem_writeback() -- Tell the kernel to write pages back to storage.
 */
-static void
+void
 inmem_writeback(SMgrRelation reln, ForkNumber forknum,
 				BlockNumber blocknum, BlockNumber nblocks)
 {
@@ -178,7 +157,7 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
 /*
 *	inmem_read() -- Read the specified block from a relation.
 */
-static void
+void
 inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		   char *buffer)
 {
@@ -198,7 +177,7 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use mdextend().
 */
-static void
+void
 inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			char *buffer, bool skipFsync)
 {
@@ -245,7 +224,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 /*
 *	inmem_nblocks() -- Get the number of blocks stored in a relation.
 */
-static BlockNumber
+BlockNumber
 inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
 {
 	/*
@@ -264,7 +243,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
 /*
 *	inmem_truncate() -- Truncate relation to specified number of blocks.
 */
-static void
+void
 inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 {
 }
@@ -272,7 +251,7 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 /*
 *	inmem_immedsync() -- Immediately sync a relation to stable storage.
 */
-static void
+void
 inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
 {
 }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -419,6 +419,15 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);

+	DefineCustomBoolVariable("neon.wal_redo",
+							 "start in wal-redo mode",
+							 NULL,
+							 &wal_redo,
+							 false,
+							 PGC_POSTMASTER,
+							 0,
+							 NULL, NULL, NULL);
+
 	DefineCustomIntVariable("neon.max_cluster_size",
 							"cluster size limit",
 							NULL,
@@ -443,7 +452,13 @@ pg_init_libpagestore(void)
 	neon_timeline_walproposer = neon_timeline;
 	neon_tenant_walproposer = neon_tenant;

-	if (page_server_connstring && page_server_connstring[0])
+	if (wal_redo)
+	{
+		neon_log(PageStoreTrace, "set inmem_smgr hook");
+		smgr_hook = smgr_inmem;
+		smgr_init_hook = smgr_init_inmem;
+	}
+	else if (page_server_connstring && page_server_connstring[0])
 	{
 		neon_log(PageStoreTrace, "set neon_smgr hook");
 		smgr_hook = smgr_neon;
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -155,6 +155,10 @@ extern int32 max_cluster_size;
 extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode);
 extern void smgr_init_neon(void);

+extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
+extern void smgr_init_inmem(void);
+extern void smgr_shutdown_inmem(void);
+
 /* Neon storage manager functionality */

 extern void neon_init(void);
@@ -184,6 +188,29 @@ extern void neon_truncate(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber nblocks);
 extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);

+/* neon wal-redo storage manager functionality */
+
+extern void inmem_init(void);
+extern void inmem_open(SMgrRelation reln);
+extern void inmem_close(SMgrRelation reln, ForkNumber forknum);
+extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
+extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
+extern void inmem_extend(SMgrRelation reln, ForkNumber forknum,
+						 BlockNumber blocknum, char *buffer, bool skipFsync);
+extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber blocknum);
+extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+					   char *buffer);
+extern void inmem_write(SMgrRelation reln, ForkNumber forknum,
+						BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
+							BlockNumber blocknum, BlockNumber nblocks);
+extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
+extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber nblocks);
+extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
+
 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
 extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -99,6 +99,7 @@ char	   *page_server_connstring;
 /*with substituted password*/
 char	   *neon_timeline;
 char	   *neon_tenant;
+bool		wal_redo = false;
 int32		max_cluster_size;

 /* unlogged relation build states */
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -43,7 +43,6 @@
 #if PG_VERSION_NUM >= 150000
 #include "access/xlogrecovery.h"
 #endif
-#include "storage/fd.h"
 #include "storage/latch.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -70,8 +69,7 @@
 #include "neon.h"
 #include "walproposer.h"
 #include "walproposer_utils.h"
-
-static bool syncSafekeepers = false;
+#include "replication/walpropshim.h"

 char	   *wal_acceptors_list;
 int			wal_acceptor_reconnect_timeout;
@@ -119,8 +117,8 @@ static TimestampTz last_reconnect_attempt;
 static WalproposerShmemState * walprop_shared;

 /* Prototypes for private functions */
-static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId);
-static void WalProposerStart(void);
+static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId);
+static void WalProposerStartImpl(void);
 static void WalProposerLoop(void);
 static void InitEventSet(void);
 static void UpdateEventSet(Safekeeper *sk, uint32 events);
@@ -188,56 +186,9 @@ pg_init_walproposer(void)
 	ProcessInterruptsCallback = backpressure_throttling_impl;

 	WalProposerRegister();
-}

-/*
- * Entry point for `postgres --sync-safekeepers`.
- */
-void
-WalProposerSync(int argc, char *argv[])
-{
-	struct stat stat_buf;
-
-	syncSafekeepers = true;
-#if PG_VERSION_NUM < 150000
-	ThisTimeLineID = 1;
-#endif
-
-	/*
-	 * Initialize postmaster_alive_fds as WaitEventSet checks them.
-	 *
-	 * Copied from InitPostmasterDeathWatchHandle()
-	 */
-	if (pipe(postmaster_alive_fds) < 0)
-		ereport(FATAL,
-				(errcode_for_file_access(),
-					errmsg_internal("could not create pipe to monitor postmaster death: %m")));
-	if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1)
-		ereport(FATAL,
-				(errcode_for_socket_access(),
-					errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m")));
-
-	ChangeToDataDir();
-
-	/* Create pg_wal directory, if it doesn't exist */
-	if (stat(XLOGDIR, &stat_buf) != 0)
-	{
-		ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR)));
-		if (MakePGDirectory(XLOGDIR) < 0)
-		{
-			ereport(ERROR,
-					(errcode_for_file_access(),
-						errmsg("could not create directory \"%s\": %m",
-							   XLOGDIR)));
-			exit(1);
-		}
-	}
-
-	WalProposerInit(0, 0);
-
-	BackgroundWorkerUnblockSignals();
-
-	WalProposerStart();
+	WalProposerInit = &WalProposerInitImpl;
+	WalProposerStart = &WalProposerStartImpl;
 }

 static void
@@ -478,7 +429,7 @@ WalProposerRegister(void)
 }

 static void
-WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
+WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
 {
 	char	   *host;
 	char	   *sep;
@@ -557,7 +508,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
 }

 static void
-WalProposerStart(void)
+WalProposerStartImpl(void)
 {

 	/* Initiate connections to all safekeeper nodes */
--- a/pgxn/neon_test_utils/Makefile
+++ b/pgxn/neon_test_utils/Makefile
@@ -10,6 +10,9 @@ EXTENSION = neon_test_utils
 DATA = neon_test_utils--1.0.sql
 PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"

+PG_CPPFLAGS = -I$(libpq_srcdir)
+SHLIB_LINK_INTERNAL = $(libpq)
+
 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)
--- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql
+++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql
@@ -23,6 +23,11 @@ RETURNS bytea
 AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex'
 LANGUAGE C PARALLEL UNSAFE;

+-- Test helper for measuring sequential scan performance: requests every
+-- block of the main fork of 'rel' from the pageserver and discards the
+-- pages. 'nprefetch' is the number of additional page requests kept in
+-- flight ahead of the block currently being read. Superuser only.
+CREATE FUNCTION neon_seqscan_rel(rel regclass, nprefetch int DEFAULT 0)
+RETURNS void
+AS 'MODULE_PATHNAME', 'neon_seqscan_rel'
+LANGUAGE C PARALLEL UNSAFE;
+
 CREATE FUNCTION neon_xlogflush(lsn pg_lsn)
 RETURNS VOID
 AS 'MODULE_PATHNAME', 'neon_xlogflush'
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -23,8 +23,13 @@
 #include "utils/pg_lsn.h"
 #include "utils/rel.h"
 #include "utils/varlena.h"
+#include "utils/wait_event.h"
 #include "../neon/pagestore_client.h"

+#include "libpq-fe.h"
+#include "libpq/pqformat.h"
+#include "libpq/libpq.h"
+
 PG_MODULE_MAGIC;

 extern void _PG_init(void);
@@ -34,6 +39,7 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache);
 PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
 PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
 PG_FUNCTION_INFO_V1(neon_xlogflush);
+PG_FUNCTION_INFO_V1(neon_seqscan_rel);

 /*
 * Linkage to functions in neon module.
@@ -289,6 +295,238 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
 	}
 }

+
+/*
+ * Interruptible variant of PQgetCopyData().
+ *
+ * Polls libpq in async mode. Whenever libpq reports that no complete
+ * message is available yet (result 0), sleep on the process latch and the
+ * connection's socket, service pending interrupts, and pull in whatever
+ * has arrived before polling again.
+ *
+ * Returns the first non-zero result of PQgetCopyData(); callers in this
+ * file treat -1 and -2 as failures. Raises ERROR if PQconsumeInput() fails.
+ */
+static int
+call_PQgetCopyData(PGconn *conn, char **buffer)
+{
+	for (;;)
+	{
+		int			events;
+		int			nbytes = PQgetCopyData(conn, buffer, 1 /* async */ );
+
+		/* Anything other than 0 is a final answer for the caller */
+		if (nbytes != 0)
+			return nbytes;
+
+		/* Nothing buffered yet: sleep until the latch or the socket fires */
+		events = WaitLatchOrSocket(MyLatch,
+								   WL_LATCH_SET | WL_SOCKET_READABLE |
+								   WL_EXIT_ON_PM_DEATH,
+								   PQsocket(conn),
+								   -1L, PG_WAIT_EXTENSION);
+		ResetLatch(MyLatch);
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* Woken up by the latch only: just poll again */
+		if ((events & WL_SOCKET_READABLE) == 0)
+			continue;
+
+		/* Socket is readable: hand the new bytes to libpq */
+		if (!PQconsumeInput(conn))
+			elog(ERROR, "could not get response from pageserver: %s",
+				 PQerrorMessage(conn));
+	}
+}
+
+static void send_getpage_request(PGconn *pageserver_conn, RelFileNode rnode, BlockNumber blkno, XLogRecPtr lsn);
+
+/*
+ * Fetch all pages of given relation. This simulates a sequential scan
+ * over the table. You can specify the number of blocks to prefetch;
+ * the function will try to keep that many requests "in flight" at all
+ * times. The fetched pages are simply discarded.
+ *
+ * SQL arguments: the relation to scan (regclass) and the prefetch distance
+ * in blocks (int, must not be negative). Only the main fork is read.
+ *
+ * A dedicated libpq connection to the pageserver is opened for the duration
+ * of the call and is closed again on both the success and the error path.
+ * Raises ERROR if the caller is not a superuser, if the connection or the
+ * pagestream handshake fails, or if the pageserver answers with anything
+ * other than a GetPage response.
+ */
+Datum
+neon_seqscan_rel(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	int32		nprefetch_arg = PG_GETARG_INT32(1);
+	BlockNumber nprefetch;
+	Relation	rel;
+	BlockNumber nblocks;
+	PGconn	   *pageserver_conn;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be superuser to use raw page functions")));
+
+	/* A negative distance would otherwise wrap to a huge unsigned value */
+	if (nprefetch_arg < 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("number of pages to prefetch must not be negative")));
+	nprefetch = (BlockNumber) nprefetch_arg;
+
+	rel = relation_open(relid, AccessShareLock);
+
+	nblocks = RelationGetNumberOfBlocks(rel);
+
+	pageserver_conn = PQconnectdb(page_server_connstring);
+	if (PQstatus(pageserver_conn) == CONNECTION_BAD)
+	{
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
+		PQfinish(pageserver_conn);
+		ereport(ERROR,
+				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+				 errmsg("could not establish connection to pageserver"),
+				 errdetail_internal("%s", msg)));
+	}
+
+	/*
+	 * pageserver_conn is deliberately never reassigned inside PG_TRY, so it
+	 * does not need to be volatile: PG_CATCH is the single place that closes
+	 * the connection when an error is thrown.
+	 */
+	PG_TRY();
+	{
+		char	   *query;
+		StringInfoData resp_buff;
+		XLogRecPtr	read_lsn;
+		BlockNumber nsent = 0;
+		BlockNumber blkno;
+
+		read_lsn = GetXLogInsertRecPtr();
+
+		query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
+		if (PQsendQuery(pageserver_conn, query) != 1)
+			elog(ERROR, "could not send pagestream command to pageserver");
+		pfree(query);
+
+		/* Wait for the pagestream command to be acknowledged */
+		while (PQisBusy(pageserver_conn))
+		{
+			int			wc;
+
+			/* Sleep until there's something to do */
+			wc = WaitLatchOrSocket(MyLatch,
+								   WL_LATCH_SET | WL_SOCKET_READABLE |
+								   WL_EXIT_ON_PM_DEATH,
+								   PQsocket(pageserver_conn),
+								   -1L, PG_WAIT_EXTENSION);
+			ResetLatch(MyLatch);
+
+			CHECK_FOR_INTERRUPTS();
+
+			/* Data available in socket? */
+			if (wc & WL_SOCKET_READABLE)
+			{
+				if (!PQconsumeInput(pageserver_conn))
+					elog(ERROR, "could not complete handshake with pageserver: %s",
+						 pchomp(PQerrorMessage(pageserver_conn)));
+			}
+		}
+
+		elog(INFO, "scanning %u blocks, prefetch %u", nblocks, nprefetch);
+
+		for (blkno = 0; blkno < nblocks; blkno++)
+		{
+			NeonResponse *resp;
+			uint64		want;
+
+			/*
+			 * call_PQgetCopyData() only checks for interrupts when it has to
+			 * wait, so make sure a fast scan stays cancellable too.
+			 */
+			CHECK_FOR_INTERRUPTS();
+
+			if (blkno % 1024 == 0)
+				elog(INFO, "blk %u/%u", blkno, nblocks);
+
+			/*
+			 * Top up the pipeline: the current block plus 'nprefetch' blocks
+			 * ahead of it, capped at the end of the relation. Computed in 64
+			 * bits so that a large prefetch distance cannot wrap around.
+			 */
+			want = Min((uint64) blkno + nprefetch + 1, (uint64) nblocks);
+			if (nsent < want)
+			{
+				while (nsent < want)
+					send_getpage_request(pageserver_conn, rel->rd_node, nsent++, read_lsn);
+
+				if (PQflush(pageserver_conn))
+					elog(ERROR, "failed to flush page requests: %s",
+						 PQerrorMessage(pageserver_conn));
+			}
+
+			/* read response */
+			resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data);
+			resp_buff.cursor = 0;
+
+			if (resp_buff.len < 0)
+			{
+				if (resp_buff.len == -1)
+					elog(ERROR, "end of COPY");
+				else if (resp_buff.len == -2)
+					elog(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
+			}
+			resp = nm_unpack_response(&resp_buff);
+
+			/*
+			 * Release libpq's malloc'd buffer before anything below can
+			 * throw; it would not be reclaimed by memory context cleanup.
+			 * NOTE(review): relies on nm_unpack_response() returning a
+			 * self-contained palloc'd copy, as the pagestore client code does.
+			 */
+			PQfreemem(resp_buff.data);
+
+			switch (resp->tag)
+			{
+				case T_NeonGetPageResponse:
+					/* ok */
+					break;
+
+				case T_NeonErrorResponse:
+					ereport(ERROR,
+							(errcode(ERRCODE_IO_ERROR),
+							 errmsg("could not read block %u", blkno),
+							 errdetail("page server returned error: %s",
+									   ((NeonErrorResponse *) resp)->message)));
+					break;
+
+				default:
+					elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			}
+
+			/* The page is discarded; don't accumulate one response per block */
+			pfree(resp);
+		}
+	}
+	PG_CATCH();
+	{
+		PQfinish(pageserver_conn);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	/* Success path: close the connection here as well */
+	PQfinish(pageserver_conn);
+
+	relation_close(rel, AccessShareLock);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * Queue one GetPage request for block 'blkno' of the main fork of 'rnode',
+ * to be read at 'lsn', on the given pageserver connection.
+ *
+ * The message is only handed to libpq with PQputCopyData(); the caller in
+ * this file calls PQflush() afterwards. Raises ERROR if libpq does not
+ * accept the message.
+ */
+static void
+send_getpage_request(PGconn *pageserver_conn, RelFileNode rnode, BlockNumber blkno, XLogRecPtr lsn)
+{
+	NeonGetPageRequest getpage = {
+		.req.tag = T_NeonGetPageRequest,
+		.req.latest = true,
+		.req.lsn = lsn,
+		.rnode = rnode,
+		.forknum = MAIN_FORKNUM,
+		.blkno = blkno
+	};
+	StringInfoData packed = nm_pack_request(&getpage.req);
+	int			rc;
+
+	/*
+	 * Hand the packed message to libpq.
+	 *
+	 * Strictly speaking this may block when the output buffer is full, in
+	 * which case async mode plus interrupt checks would be the right thing.
+	 * The requests are small enough that in practice they always fit in the
+	 * output and TCP buffers.
+	 */
+	rc = PQputCopyData(pageserver_conn, packed.data, packed.len);
+	if (rc <= 0)
+		elog(ERROR, "failed to send page request: %s",
+			 PQerrorMessage(pageserver_conn));
+
+	pfree(packed.data);
+}
+
 /*
 * Directly calls XLogFlush(lsn) to flush WAL buffers.
 */
--- a/pgxn/neon_walredo/Makefile
+++ b/pgxn/neon_walredo/Makefile
@@ -1,22 +0,0 @@
-# pgxs/neon_walredo/Makefile
-
-MODULE_big = neon_walredo
-OBJS = \
-	$(WIN32RES) \
-	inmem_smgr.o \
-	walredoproc.o \
-
-# This really should be guarded by $(with_libseccomp), but I couldn't
-# make that work with pgxs. So we always compile it, but its contents
-# are wrapped in #ifdef HAVE_LIBSECCOMP instead.
-OBJS += seccomp.o
-
-PGFILEDESC = "neon_walredo - helper process that runs in Neon pageserver"
-
-PG_CONFIG = pg_config
-PGXS := $(shell $(PG_CONFIG) --pgxs)
-include $(PGXS)
-
-ifeq ($(with_libseccomp),yes)
-SHLIB_LINK += -lseccomp
-endif
--- a/pgxn/neon_walredo/inmem_smgr.h
+++ b/pgxn/neon_walredo/inmem_smgr.h
@@ -1,17 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * inmem_smgr.h
- *
- *
- * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *-------------------------------------------------------------------------
- */
-#ifndef INMEM_SMGR_H
-#define INMEM_SMGR_H
-
-extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
-extern void smgr_init_inmem(void);
-
-#endif /* INMEM_SMGR_H */
--- a/pgxn/neon_walredo/neon_seccomp.h
+++ b/pgxn/neon_walredo/neon_seccomp.h
@@ -1,22 +0,0 @@
-#ifndef NEON_SECCOMP_H
-#define NEON_SECCOMP_H
-
-#include <seccomp.h>
-
-typedef struct {
-    int    psr_syscall; /* syscall number */
-    uint32 psr_action;  /* libseccomp action, e.g. SCMP_ACT_ALLOW */
-} PgSeccompRule;
-
-#define PG_SCMP(syscall, action)                \
-    (PgSeccompRule) {                           \
-        .psr_syscall = SCMP_SYS(syscall),       \
-        .psr_action = (action),                 \
-    }
-
-#define PG_SCMP_ALLOW(syscall) \
-    PG_SCMP(syscall, SCMP_ACT_ALLOW)
-
-extern void seccomp_load_rules(PgSeccompRule *syscalls, int count);
-
-#endif /* NEON_SECCOMP_H */
--- a/Show More
+++ b/Show More